    def extractOutputFilesFromMetadata(self, arcjobid):
        aj = self.dbarc.getArcJobInfo(arcjobid, columns=["JobID", "appjobid"])
        if not aj or 'JobID' not in aj or not aj['JobID']:
            self.log.error("failed to find arcjobid %s in database" % arcjobid)
            return {}

        jobid = aj['JobID']
        sessionid = jobid[jobid.rfind('/') + 1:]
        try:
            jobinfo = aCTPandaJob(filename=os.path.join(self.tmpdir, sessionid, 'heartbeat.json'))
            metadata = getattr(jobinfo, 'xml')  # travis doesn't like jobinfo.xml
        except Exception as x:
            self.log.error("%s: failed to extract metadata for arcjob %s: %s" % (aj['appjobid'], sessionid, x))
            return {}

        try:
            outputfiles = json.loads(metadata)
        except Exception as e:
            self.log.error("%s: failed to load output file info for arcjob %s: %s" % (aj['appjobid'], sessionid, str(e)))
            return {}

        surls = {}
        for attrs in outputfiles.values():
            try:
                size = attrs['fsize']
                adler32 = attrs['adler32']
                surl = attrs['surl']
                se = arc.URL(str(surl)).Host()
            except Exception as x:
                self.log.error('%s: %s' % (aj['appjobid'], x))
            else:
                checksum = "adler32:" + adler32
                if se not in surls:
                    surls[se] = []
                surls[se] += [{"surl": surl, "fsize": size, "checksum": checksum, "arcjobid": arcjobid}]

        return surls
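    # A minimal sketch of the data the method above assumes and produces (values are
    # illustrative only, not taken from a real job): the 'xml' attribute read from
    # heartbeat.json is expected to be a JSON object keyed by output file name, e.g.
    #   {"EVNT.01234._000001.pool.root.1": {"fsize": 1048576,
    #                                       "adler32": "0a1b2c3d",
    #                                       "surl": "srm://se.example.org/atlas/..."}}
    # and the returned dict groups those entries by storage element host:
    #   {"se.example.org": [{"surl": "srm://se.example.org/atlas/...", "fsize": 1048576,
    #                        "checksum": "adler32:0a1b2c3d", "arcjobid": 42}]}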
    def processFailed(self, arcjobs):
        """
        Process jobs for which the pilot failed (batch exit code is non-zero).
        """
        if not arcjobs:
            return

        self.log.info("processing %d failed jobs" % len(arcjobs))
        for aj in arcjobs:
            jobid = aj['JobID']
            if not jobid:
                # Job was not even submitted, there is no more information
                self.log.warning("%s: Job has not been submitted yet so no information to report", aj['appjobid'])
                continue

            sessionid = jobid[jobid.rfind('/') + 1:]
            date = aj['created'].strftime('%Y-%m-%d')
            outd = os.path.join(self.conf.get(['joblog', 'dir']), date, aj['siteName'])
            # Make sure the path to outd exists
            try:
                os.makedirs(outd, 0o755)
            except:
                pass

            # copy from tmp to outd. tmp dir will be cleaned in validator
            localdir = os.path.join(self.tmpdir, sessionid)
            gmlogerrors = os.path.join(localdir, "gmlog", "errors")
            arcjoblog = os.path.join(outd, "%s.log" % aj['appjobid'])
            if not os.path.exists(arcjoblog):
                try:
                    shutil.copy(gmlogerrors, arcjoblog)
                    os.chmod(arcjoblog, 0o644)
                except:
                    self.log.error("Failed to copy %s" % gmlogerrors)

            pilotlog = aj['stdout']
            if not pilotlog and os.path.exists(localdir):
                pilotlogs = [f for f in os.listdir(localdir)]
                for f in pilotlogs:
                    if '.log' in f:
                        pilotlog = f
            if pilotlog:
                try:
                    shutil.copy(os.path.join(localdir, pilotlog),
                                os.path.join(outd, '%s.out' % aj['appjobid']))
                    os.chmod(os.path.join(outd, '%s.out' % aj['appjobid']), 0o644)
                except Exception as e:
                    self.log.warning("%s: Failed to copy job output for %s: %s" % (aj['appjobid'], jobid, str(e)))

            try:
                smeta = json.loads(str(aj['metadata']))
            except:
                smeta = None

            # fill info for the final heartbeat
            pupdate = aCTPandaJob()
            pupdate.jobId = aj['appjobid']
            pupdate.state = 'failed'
            pupdate.siteName = aj['siteName']
            pupdate.computingElement = urlparse(aj['cluster']).hostname
            try:
                pupdate.schedulerID = smeta['schedulerid']
            except:
                pupdate.schedulerID = self.conf.get(['panda', 'schedulerid'])
            pupdate.pilotID = self.conf.get(["joblog", "urlprefix"]) + "/" + date + "/" + aj['siteName'] + '/' + aj['appjobid'] + ".out|Unknown|Unknown|Unknown|Unknown"
            if len(aj["ExecutionNode"]) > 255:
                pupdate.node = aj["ExecutionNode"][:254]
                self.log.warning("%s: Truncating wn hostname from %s to %s" % (aj['pandaid'], aj['ExecutionNode'], pupdate.node))
            else:
                pupdate.node = aj["ExecutionNode"]
            pupdate.pilotLog = self.createPilotLog(outd, aj['pandaid'])
            pupdate.cpuConsumptionTime = aj['UsedTotalCPUTime']
            pupdate.cpuConsumptionUnit = 'seconds'
            pupdate.cpuConversionFactor = 1
            pupdate.coreCount = aj['corecount'] or 1
            pupdate.pilotTiming = "0|0|%s|0" % aj['UsedTotalWallTime']
            pupdate.errorCode = 9000
            pupdate.errorDiag = aj['Error']
            # set start/endtime
            if aj['EndTime']:
                pupdate.startTime = self.getStartTime(aj['EndTime'], aj['UsedTotalWallTime']).strftime('%Y-%m-%d %H:%M:%S')
                pupdate.endTime = aj['EndTime'].strftime('%Y-%m-%d %H:%M:%S')
                # Sanity check for efficiency > 100%
                cputimepercore = pupdate.cpuConsumptionTime / pupdate.coreCount
                if aj['UsedTotalWallTime'] < cputimepercore:
                    self.log.warning('%s: Adjusting reported walltime %d to CPU time %d' % (aj['appjobid'], aj['UsedTotalWallTime'], cputimepercore))
                    pupdate.startTime = (aj['EndTime'] - datetime.timedelta(0, cputimepercore)).strftime('%Y-%m-%d %H:%M:%S')
            else:
                # Set walltime to cputime per core
                pupdate.startTime = self.getStartTime(datetime.datetime.utcnow(), aj['UsedTotalCPUTime'] / pupdate.coreCount).strftime('%Y-%m-%d %H:%M:%S')
                pupdate.endTime = datetime.datetime.utcnow().strftime('%Y-%m-%d %H:%M:%S')

            # save the heartbeat file to be used by aCTAutopilot panda update
            try:
                if smeta and smeta.get('harvesteraccesspoint'):
                    hbfile = os.path.join(smeta['harvesteraccesspoint'], 'jobReport.json')
                else:
                    hbfile = os.path.join(self.tmpdir, "heartbeats", str(aj['pandaid']) + ".json")
                pupdate.writeToFile(hbfile)
            except Exception as e:
                self.log.warning("%s: Failed to write file %s: %s" % (aj['appjobid'], hbfile, str(e)))
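    # Note on getStartTime used above: it is defined elsewhere in this class and is
    # not shown here. As an assumption (a sketch only, not the actual implementation),
    # it is expected to subtract the consumed walltime from the given end time:
    #   def getStartTime(self, endtime, walltime):
    #       return endtime - datetime.timedelta(seconds=walltime)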
    def copyFinishedFiles(self, arcjobid, extractmetadata):
        """
        - if extractmetadata: (normal arc jobs, not true pilot jobs)
           - store heartbeat file under tmp/heartbeats or under harvester access
             point if specified
        - copy .job.log file to jobs/date/pandaqueue/pandaid.out
        - copy gmlog errors to jobs/date/pandaqueue/pandaid.log
        """
        columns = ['JobID', 'appjobid', 'cluster', 'UsedTotalWallTime', 'arcjobs.EndTime',
                   'ExecutionNode', 'stdout', 'fairshare', 'pandajobs.created', 'metadata']
        select = "arcjobs.id=%d AND arcjobs.id=pandajobs.arcjobid" % arcjobid
        aj = self.dbarc.getArcJobsInfo(select, columns=columns, tables='arcjobs,pandajobs')
        if not aj or 'JobID' not in aj[0] or not aj[0]['JobID']:
            self.log.error('No JobID in arcjob %s: %s' % (str(arcjobid), str(aj)))
            return False
        aj = aj[0]
        jobid = aj['JobID']
        sessionid = jobid[jobid.rfind('/') + 1:]
        date = aj['created'].strftime('%Y-%m-%d')

        if extractmetadata:
            try:
                jobinfo = aCTPandaJob(filename=os.path.join(self.tmpdir, sessionid, 'heartbeat.json'))
            except Exception as x:
                self.log.error("%s: failed to load heartbeat file for arcjob %s: %s" % (aj['appjobid'], jobid, x))
                jobinfo = aCTPandaJob(jobinfo={'jobId': aj['appjobid'], 'state': 'finished'})

            # update heartbeat and dump to tmp/heartbeats
            jobinfo.computingElement = arc.URL(str(aj['cluster'])).Host()
            if hasattr(jobinfo, 'startTime') and hasattr(jobinfo, 'endTime'):
                # take values from the pilot
                jobinfo.startTime = datetime.datetime.utcfromtimestamp(jobinfo.startTime).strftime('%Y-%m-%d %H:%M:%S')
                jobinfo.endTime = datetime.datetime.utcfromtimestamp(jobinfo.endTime).strftime('%Y-%m-%d %H:%M:%S')
            else:
                # Use ARC values
                if aj['EndTime']:
                    # datetime cannot be serialised to json so use string (for harvester)
                    jobinfo.startTime = (aj['EndTime'] - datetime.timedelta(0, aj['UsedTotalWallTime'])).strftime('%Y-%m-%d %H:%M:%S')
                    jobinfo.endTime = aj['EndTime'].strftime('%Y-%m-%d %H:%M:%S')
                    # Sanity check for efficiency > 100%
                    cputimepercore = getattr(jobinfo, 'cpuConsumptionTime', 0) / getattr(jobinfo, 'coreCount', 1)
                    if aj['UsedTotalWallTime'] < cputimepercore:
                        self.log.warning('%s: Adjusting reported walltime %d to CPU time %d' % (aj['appjobid'], aj['UsedTotalWallTime'], cputimepercore))
                        jobinfo.startTime = (aj['EndTime'] - datetime.timedelta(0, cputimepercore)).strftime('%Y-%m-%d %H:%M:%S')
                else:
                    self.log.warning('%s: no endtime found' % aj['appjobid'])

            if len(aj["ExecutionNode"]) > 255:
                jobinfo.node = aj["ExecutionNode"][:254]
                self.log.warning("%s: Truncating wn hostname from %s to %s" % (aj['appjobid'], aj['ExecutionNode'], jobinfo.node))
            else:
                jobinfo.node = aj["ExecutionNode"]

            try:
                smeta = json.loads(aj['metadata'].decode())
            except:
                smeta = None

            if smeta and smeta.get('harvesteraccesspoint'):
                # de-serialise the metadata to json
                try:
                    jobinfo.metaData = json.loads(jobinfo.metaData)
                except Exception as e:
                    self.log.warning("%s: no metaData in pilot metadata: %s" % (aj['appjobid'], str(e)))
                jobinfo.writeToFile(os.path.join(smeta['harvesteraccesspoint'], 'jobReport.json'))
            else:
                jobinfo.writeToFile(os.path.join(self.tmpdir, "heartbeats", "%s.json" % aj['appjobid']))

        # copy to joblog dir files downloaded for the job: gmlog errors and pilot log
        outd = os.path.join(self.conf.get(['joblog', 'dir']), date, aj['fairshare'])
        try:
            os.makedirs(outd, 0o755)
        except:
            pass

        localdir = os.path.join(self.tmpdir, sessionid)
        gmlogerrors = os.path.join(localdir, "gmlog", "errors")
        arcjoblog = os.path.join(outd, "%s.log" % aj['appjobid'])
        if not os.path.exists(arcjoblog):
            try:
                shutil.move(gmlogerrors, arcjoblog)
                os.chmod(arcjoblog, 0o644)
            except:
                self.log.error("Failed to copy %s" % gmlogerrors)

        pilotlog = aj['stdout']
        if not pilotlog and os.path.exists(localdir):
            pilotlogs = [f for f in os.listdir(localdir)]
            for f in pilotlogs:
                if '.log' in f:
                    pilotlog = f
        if pilotlog:
            try:
                shutil.move(os.path.join(localdir, pilotlog),
                            os.path.join(outd, '%s.out' % aj['appjobid']))
                os.chmod(os.path.join(outd, '%s.out' % aj['appjobid']), 0o644)
            except Exception as e:
                self.log.error("Failed to copy file %s: %s" % (os.path.join(localdir, pilotlog), str(e)))
                return False

        return True
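    # For reference, derived from the code above (actual paths depend on the joblog
    # and tmp configuration): a finished job ends up with
    #   <joblog dir>/<YYYY-MM-DD>/<fairshare>/<appjobid>.log   (gmlog errors)
    #   <joblog dir>/<YYYY-MM-DD>/<fairshare>/<appjobid>.out   (pilot log)
    # plus a heartbeat JSON either in <tmpdir>/heartbeats/<appjobid>.json or, in
    # harvester mode, as jobReport.json under the harvester access point.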
class aCTValidator(aCTATLASProcess):
    '''
    Validate output files for finished jobs, cleanup output files for failed jobs.
    '''

    def __init__(self):
        aCTATLASProcess.__init__(self)

        # Use production role proxy for checking and removing files
        # Get DN from configured proxy file
        cred_type = arc.initializeCredentialsType(arc.initializeCredentialsType.SkipCredentials)
        uc = arc.UserConfig(cred_type)
        uc.ProxyPath(str(self.arcconf.get(['voms', 'proxypath'])))
        cred = arc.Credential(uc)
        dn = cred.GetIdentityName()

        actp = aCTProxy(self.log)
        # Beware hard-coded production role
        proxyfile = actp.path(dn, '/atlas/Role=production')
        if not proxyfile:
            raise Exception('Could not find proxy with production role in proxy table')
        self.log.info('set proxy path to %s' % proxyfile)

        self.uc = arc.UserConfig(cred_type)
        self.uc.ProxyPath(str(proxyfile))
        self.uc.UtilsDirPath(arc.UserConfig.ARCUSERDIRECTORY)

        # Possible file status
        self.ok = 0
        self.retry = 1
        self.failed = 2

    def _extractFromSmallFiles(self, aj, filename):
        jobid = aj['JobID']
        sessionid = jobid[jobid.rfind('/'):]
        localdir = str(self.arcconf.get(['tmp', 'dir'])) + sessionid
        smallfiles = tarfile.open(os.path.join(localdir, 'jobSmallFiles.tgz'))
        return smallfiles.extractfile(filename)

    def copyFinishedFiles(self, arcjobid, extractmetadata):
        """
        - if extractmetadata: (normal arc jobs, not true pilot jobs)
           - extract panda_node_struct.pickle from jobSmallFiles.tgz and store it
             under tmp/pickle
           - extract metadata-surl.xml and update the pickle. Store xml under tmp/xml
        - copy .job.log file to jobs/date/cluster/jobid
        - copy gmlog dir to jobs/date/cluster/jobid
        """
        columns = ['JobID', 'appjobid', 'cluster', 'UsedTotalWallTime', 'EndTime', 'ExecutionNode', 'stdout']
        aj = self.dbarc.getArcJobInfo(arcjobid, columns=columns)
        if not aj or 'JobID' not in aj or not aj['JobID']:
            self.log.error('No JobID in arcjob %s: %s' % (str(arcjobid), str(aj)))
            return False
        jobid = aj['JobID']
        sessionid = jobid[jobid.rfind('/') + 1:]
        date = time.strftime('%Y%m%d')
        cluster = arc.URL(str(jobid)).Host()

        if extractmetadata:
            try:
                pandapickle = self._extractFromSmallFiles(aj, "panda_node_struct.pickle")
            except Exception as x:
                self.log.error("%s: failed to extract pickle for arcjob %s: %s" % (aj['appjobid'], sessionid, x))
                pandapickle = None
            try:
                metadata = self._extractFromSmallFiles(aj, "metadata-surl.xml")
            except Exception as x:
                self.log.error("%s: failed to extract metadata-surl.xml for arcjob %s: %s" % (aj['appjobid'], sessionid, x))
                metadata = None

            # update pickle and dump to tmp/pickle
            if pandapickle:
                try:
                    jobinfo = aCTPandaJob(filehandle=pandapickle)
                except:
                    jobinfo = aCTPandaJob(jobinfo={'jobId': aj['appjobid'], 'state': 'finished'})
            else:
                jobinfo = aCTPandaJob(jobinfo={'jobId': aj['appjobid'], 'state': 'finished'})
            if metadata:
                jobinfo.xml = str(metadata.read())
            jobinfo.computingElement = cluster
            jobinfo.schedulerID = self.conf.get(['panda', 'schedulerid'])
            if aj['EndTime']:
                jobinfo.startTime = aj['EndTime'] - datetime.timedelta(0, aj['UsedTotalWallTime'])
                jobinfo.endTime = aj['EndTime']
            else:
                self.log.warning('%s: no endtime found' % aj['appjobid'])
            if len(aj["ExecutionNode"]) > 255:
                jobinfo.node = aj["ExecutionNode"][:254]
                self.log.warning("%s: Truncating wn hostname from %s to %s" % (aj['appjobid'], aj['ExecutionNode'], jobinfo.node))
            else:
                jobinfo.node = aj["ExecutionNode"]

            # Add url of logs
            if 'pilotID' in jobinfo.dictionary().keys() and jobinfo.pilotID:
                t = jobinfo.pilotID.split("|")
            else:
                t = ['Unknown'] * 5
            logurl = os.path.join(self.conf.get(["joblog", "urlprefix"]), date, cluster, sessionid)
            try:  # TODO catch and handle non-ascii
                jobinfo.pilotID = '|'.join([logurl] + t[1:])
            except:
                pass

            jobinfo.writeToFile(self.arcconf.get(['tmp', 'dir']) + "/pickle/" + aj['appjobid'] + ".pickle")
    def updateEvents(self, jobs):
        """
        Handle event service updates for finished jobs
        TOFIX for pilot2
        """
        tlist = []
        for j in jobs:
            eventrangestoupdate = []
            if j['actpandastatus'] == 'finished' \
               and 'plugin=arc' in self.sites[j['siteName']]['catchall'] \
               and re.search('eventService=True', j['pandajob']):

                # Check if we are running in harvester mode
                try:
                    smeta = json.loads(str(j['metadata']))
                    harvesteraccesspoint = smeta.get('harvesteraccesspoint')
                except:
                    harvesteraccesspoint = None

                if not harvesteraccesspoint and j['sendhb'] == 0:
                    continue

                if not j['eventranges'] or j['eventranges'] == '[]':
                    fname = os.path.join(self.tmpdir, "pickle", "%d.pickle" % j['pandaid'])
                    if not os.path.exists(fname):
                        # Jobs which were never submitted should have substatus pilot_noevents so they go to closed
                        # Assume only ARC sites (not condor) run NG-mode ES
                        if j['arcjobid'] == -1 or j['arcjobid'] is None:
                            substatus = 'pilot_noevents'
                            self.log.info('%s: Job did not run and has no eventranges to update, marking pilot_noevents' % j['pandaid'])
                        # Jobs which ran but produced no events have pilot_failed so they go to failed
                        else:
                            substatus = 'pilot_failed'
                            self.log.info('%s: Job ran but has no eventranges to update, marking failed' % j['pandaid'])
                        jobinfo = aCTPandaJob({'jobId': j['pandaid'], 'state': 'closed', 'jobSubStatus': substatus})
                        # Create the empty pickle so that heartbeat code below doesn't fail
                        if harvesteraccesspoint:
                            jobinfo.writeToFile(os.path.join(harvesteraccesspoint, 'jobReport.json'))
                        else:
                            jobinfo.writeToFile(fname)
                    continue

                # If zip is used we need to first send transferring heartbeat
                # with jobMetrics containing the zip file
                # In harvester mode harvester does this itself?
                if 'es_to_zip' in self.sites[j['siteName']]['catchall'] and not harvesteraccesspoint:
                    try:
                        # Load pickled information from pilot
                        fname = os.path.join(self.tmpdir, "pickle", "%d.pickle" % j['pandaid'])
                        jobinfo = aCTPandaJob(filename=fname)
                        jobmetrics = {'jobMetrics': getattr(jobinfo, 'jobMetrics', '')}
                        self.log.info('%s: Sending jobMetrics and transferring state: %s' % (j['pandaid'], jobmetrics))
                    except Exception as x:
                        self.log.error('%s: No pickle info found: %s' % (j['pandaid'], x))
                    else:
                        t = PandaThr(self.getPanda(j['siteName']).updateStatus, j['pandaid'], 'transferring', jobmetrics)
                        aCTUtils.RunThreadsSplit([t], self.nthreads)
                        # If update fails panda won't see the zip and events
                        # will be rescheduled to another job
                        if t.result is None or 'StatusCode' not in t.result:
                            # Strange response from panda
                            continue
                        if t.result['StatusCode'][0] == '60':
                            self.log.error('Failed to contact Panda, proxy may have expired')
                        elif t.result['StatusCode'][0] == '30':
                            self.log.error('Job was already killed')

                eventranges = j['eventranges']
                eventrangeslist = json.loads(eventranges)

                # Get object store ID used
                try:
                    objstoreID = self.sites[j['siteName']]['ddmoses']
                except:
                    self.log.warning('No ES object store defined for %s' % j['siteName'])
                    objstoreID = None

                for eventrange in eventrangeslist:
                    node = {}
                    node['eventRangeID'] = eventrange['eventRangeID']
                    try:
                        node['eventStatus'] = eventrange['status']
                    except:
                        node['eventStatus'] = j['actpandastatus']
                    node['objstoreID'] = objstoreID
                    eventrangestoupdate.append(node)

                self.log.info('%s: updating %i event ranges: %s' % (j['pandaid'], len(eventrangestoupdate), eventrangestoupdate))
                if harvesteraccesspoint:
                    self.log.info('%s: Dumping processed event ranges to %s' % (j['pandaid'], os.path.join(harvesteraccesspoint, 'worker_updateevents.json')))
                    harvesterdict = {j['pandaid']: eventrangestoupdate}
                    with open(os.path.join(harvesteraccesspoint, 'worker_updateevents.json'), 'w') as f:
                        json.dump(harvesterdict, f)
                else:
                    updatenode = {'eventRanges': json.dumps(eventrangestoupdate)}
                    t = PandaEventsThr(self.getPanda(j['siteName']).updateEventRanges, j['pandaid'], updatenode)
                    tlist.append(t)

        aCTUtils.RunThreadsSplit(tlist, self.nthreads)
        for t in tlist:
            # If update fails events will be rescheduled to another job
            if t.result is None or 'StatusCode' not in t.result:
                # Strange response from panda
                continue
            if t.result['StatusCode'][0] == '60':
                self.log.error('Failed to contact Panda, proxy may have expired')
            elif t.result['StatusCode'][0] == '30':
                self.log.warning('%s: Job was already killed' % j['pandaid'])
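    # Illustrative sketch of the event range update payload built above (example
    # values are made up): each node appended to eventrangestoupdate looks like
    #   {"eventRangeID": "1234567-1-2-3-4", "eventStatus": "finished", "objstoreID": 123}
    # and in harvester mode worker_updateevents.json contains these nodes keyed by
    # pandaid, e.g. {1234567: [ ...nodes... ]}.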
    def updatePandaFinishedPilot(self):
        """
        Final status update for completed jobs (finished or failed in athena)
        and cancelled jobs
        """
        jobs = self.dbpanda.getJobs("actpandastatus='finished' or actpandastatus='failed' or actpandastatus='cancelled' limit 1000")
        if not jobs:
            return

        self.log.info("Updating panda for %d finished jobs (%s)" % (len(jobs), ','.join([str(j['pandaid']) for j in jobs])))

        self.updateEvents(jobs)

        tlist = []
        for j in jobs:

            # If true pilot skip heartbeat and just update DB
            if not j['sendhb']:
                jd = {}
                jd['pandastatus'] = None
                jd['actpandastatus'] = 'done'
                if j['actpandastatus'] == 'failed':
                    jd['actpandastatus'] = 'donefailed'
                if j['actpandastatus'] == 'cancelled':
                    jd['actpandastatus'] = 'donecancelled'
                if not j['startTime']:
                    jd['startTime'] = datetime.datetime.utcnow()
                if not j['endTime']:
                    jd['endTime'] = datetime.datetime.utcnow()
                self.dbpanda.updateJob(j['pandaid'], jd)
                continue

            # Cancelled jobs have no heartbeat info
            if j['actpandastatus'] == 'cancelled':
                jobinfo = aCTPandaJob(jobinfo={'jobId': j['pandaid'], 'state': 'failed'})
                jobinfo.pilotErrorCode = 1144
                jobinfo.pilotErrorDiag = "This job was killed by panda server"
                jobinfo.startTime = j['startTime'] if j['startTime'] else datetime.datetime.utcnow()
                jobinfo.endTime = j['endTime'] if j['endTime'] else datetime.datetime.utcnow()
            else:
                try:
                    # Load heartbeat information from pilot
                    fname = os.path.join(self.tmpdir, "heartbeats", "%d.json" % j['pandaid'])
                    jobinfo = aCTPandaJob(filename=fname)
                except Exception as x:
                    self.log.error('%s: %s' % (j['pandaid'], x))
                    # Send some basic info back to panda
                    info = {'jobId': j['pandaid'], 'state': j['pandastatus']}
                    jobinfo = aCTPandaJob(jobinfo=info)
                    jobinfo.errorCode = 9000
                    jobinfo.errorDiag = 'Job failed for unknown reason'
                else:
                    os.remove(fname)

            self.log.debug('%s: final heartbeat: %s' % (j['pandaid'], jobinfo.dictionary()))
            t = PandaThr(self.getPanda(j['siteName']).updateStatus, j['pandaid'], j['pandastatus'], jobinfo.dictionary())
            tlist.append(t)

        aCTUtils.RunThreadsSplit(tlist, self.nthreads)

        for t in tlist:
            if t.result is None:
                continue
            if 'StatusCode' in t.result and t.result['StatusCode'] and t.result['StatusCode'][0] != '0':
                self.log.error('Error updating panda')
                continue
            jd = {}
            jd['pandastatus'] = None
            jd['actpandastatus'] = 'done'
            if t.status == 'failed':
                jd['actpandastatus'] = 'donefailed'
            if 'pilotErrorCode' in t.args and t.args['pilotErrorCode'] == 1144:
                jd['actpandastatus'] = 'donecancelled'
            jd['theartbeat'] = self.dbpanda.getTimeStamp()
            self.dbpanda.updateJob(t.id, jd)
            # Send done message to APFMon
            self.apfmon.updateJob(t.id, 'done' if jd['actpandastatus'] == 'done' else 'fault')

        self.log.info("Threads finished")

        # Clean inputfiles, pickle and eventranges
        for j in jobs:
            pandaid = j['pandaid']
            pandainputdir = os.path.join(self.tmpdir, 'inputfiles', str(pandaid))
            picklefile = os.path.join(self.tmpdir, 'pickle', str(pandaid) + ".pickle")
            eventrangesfile = os.path.join(self.tmpdir, 'eventranges', str(pandaid) + ".json")
            shutil.rmtree(pandainputdir, ignore_errors=True)
            # remove pickle
            if os.path.exists(picklefile):
                os.unlink(picklefile)
            # remove eventrangesfile
            if os.path.exists(eventrangesfile):
                os.unlink(eventrangesfile)