def downloadSmallFiles(self, jobs):
    '''
    This method is for jobs which should be killed and resubmitted. An attempt
    is made to download the heartbeat json, but it is fine for this to fail as
    the job may still be running.
    '''

    for job in jobs:
        if 'JobID' not in job or not job['JobID']:
            continue
        jobid = job['JobID']
        sessionid = jobid[jobid.rfind('/'):]
        localdir = self.tmpdir + sessionid

        try:
            os.makedirs(localdir, 0o755)
        except OSError:
            # Directory may already exist from a previous attempt
            pass

        source = aCTUtils.DataPoint(str(jobid + '/heartbeat.json'), self.uc)
        dest = aCTUtils.DataPoint(str(localdir + '/heartbeat.json'), self.uc)
        dm = arc.DataMover()
        status = dm.Transfer(source.h, dest.h, arc.FileCache(), arc.URLMap())
        if not status:
            self.log.debug('%s: Failed to download %s: %s' %
                           (job['pandaid'], source.h.GetURL().str(), str(status)))
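
# Hedged usage sketch (values and instance name invented, not from aCT): each
# entry in jobs is expected to carry at least 'JobID' and 'pandaid'; entries
# with a missing or empty JobID are skipped above.
#
#   jobs = [{'pandaid': 1234567890,
#            'JobID': 'gsiftp://ce.example.org/jobs/abc123'}]
#   validator.downloadSmallFiles(jobs)
#   # heartbeat.json, if it could be fetched, ends up under self.tmpdir + '/abc123'
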
def removeOutputFiles(self, surls):
    '''
    Remove SURLs.
    '''
    result = {}

    # As yet there is no bulk remove in ARC
    for surl in surls:
        dp = aCTUtils.DataPoint(str(surl['surl']), self.uc)
        if not dp.h or surl['surl'].startswith('root://'):
            # Unsupported protocols and root:// URLs are not removed here,
            # just marked as ok
            self.log.info("Removed %s for %s" % (surl['surl'], surl['arcjobid']))
            result[surl['arcjobid']] = self.ok
            continue
        status = dp.h.Remove()
        if not status:
            if status.Retryable():
                self.log.warning("Failed to delete %s for %s, will retry later: %s" %
                                 (surl['surl'], surl['arcjobid'], str(status)))
                result[surl['arcjobid']] = self.retry
            elif status.GetErrno() == errno.ENOENT:
                self.log.info("File %s for %s does not exist" % (surl['surl'], surl['arcjobid']))
                result[surl['arcjobid']] = self.ok
            else:
                self.log.error("Failed to delete %s for %s: %s" %
                               (surl['surl'], surl['arcjobid'], str(status)))
                result[surl['arcjobid']] = self.failed
        else:
            self.log.info("Removed %s for %s" % (surl['surl'], surl['arcjobid']))
            result[surl['arcjobid']] = self.ok

    return result
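
# Hedged usage sketch (SURLs and instance name invented): removeOutputFiles()
# takes a flat list of {'surl', 'arcjobid'} dicts and returns a dict mapping
# each arcjobid to self.ok, self.retry or self.failed, so the caller can decide
# which deletions to retry later, e.g.
#
#   surls = [{'surl': 'srm://se.example.org/atlas/file1.root', 'arcjobid': 1},
#            {'surl': 'srm://se.example.org/atlas/file2.root', 'arcjobid': 2}]
#   cleanup = validator.removeOutputFiles(surls)
#   toretry = [s for s in surls if cleanup[s['arcjobid']] == validator.retry]
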
def listUrlRecursive(self, url, fname='', filelist=None):
    '''
    Recursively list url/fname and return a flat list of file paths relative
    to url.
    '''
    # Use None rather than a mutable default so results do not leak between calls
    if filelist is None:
        filelist = []
    dp = aCTUtils.DataPoint(url + '/' + fname, self.uc)
    files = dp.h.List(arc.DataPoint.INFO_TYPE_NAME | arc.DataPoint.INFO_TYPE_TYPE)
    if not files[1]:
        self.log.warning("Failed listing %s/%s" % (url, fname))
        return filelist
    for f in files[0]:
        if f.GetType() == f.file_type_file:
            filelist.append((fname + '/' + f.GetName()).strip('/'))
        elif f.GetType() == f.file_type_dir:
            filelist = self.listUrlRecursive(url, (fname + '/' + str(f.GetName())).strip('/'), filelist)
    return filelist
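
# Illustrative sketch only (paths invented): the flat, relative paths returned
# by listUrlRecursive() are what fetchSome() below matches wildcards against,
# directory by directory, with fnmatch, e.g.
#
#   remotefiles = self.listUrlRecursive('gsiftp://ce.example.org/jobs/abc123')
#   # remotefiles -> ['stdout', 'out/histo.root', 'out/log.tgz']
#   matched = [f for f in remotefiles
#              if fnmatch.fnmatch(f, 'out/*.root') and os.path.dirname(f) == 'out']
#   # matched -> ['out/histo.root']
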
def checkOutputFiles(self, surldict):
    '''
    Check if SURLs are working. Returns a dict of arcjobid: file status.
    Does bulk arc.DataPoint.Stat() with a maximum of 100 files per request.
    The SURLs passed here must all belong to the same SE.
    '''

    if self.arcconf.get(['downtime', 'srmdown']) == 'True':
        self.log.info("SRM down, will validate later")
        return dict((surl['arcjobid'], self.retry) for surls in surldict.values() for surl in surls)

    result = {}
    datapointlist = arc.DataPointList()
    surllist = []
    dummylist = []
    bulklimit = 100
    for surls in surldict.values():
        count = 0
        for surl in surls:
            count += 1
            if not surl['surl']:
                self.log.error("Missing surl for %s, cannot validate" % surl['arcjobid'])
                result[surl['arcjobid']] = self.failed
                continue
            dp = aCTUtils.DataPoint(str(surl['surl']), self.uc)
            if not dp or not dp.h:
                self.log.warning("URL %s not supported, skipping validation" % str(surl['surl']))
                result[surl['arcjobid']] = self.ok
                continue
            datapointlist.append(dp.h)
            dummylist.append(dp)  # to not destroy objects
            surllist.append(surl)

            if count % bulklimit != 0 and count != len(surls):
                continue

            # do bulk call
            (files, status) = dp.h.Stat(datapointlist)
            if not status and status.GetErrno() != errno.EOPNOTSUPP:
                # If the call fails it is generally a server or connection problem
                # and in most cases should be retryable
                if status.Retryable():
                    self.log.warning("Failed to query files on %s, will retry later: %s" %
                                     (dp.h.GetURL().Host(), str(status)))
                    result.update(dict((k['arcjobid'], self.retry) for k in surllist))
                else:
                    self.log.error("Failed to query files on %s: %s" %
                                   (dp.h.GetURL().Host(), str(status)))
                    result.update(dict((k['arcjobid'], self.failed) for k in surllist))
            else:
                # files is a list of FileInfo objects. If a file is not found or
                # there is another error in the listing, the FileInfo object is invalid
                for i in range(len(datapointlist)):
                    if status.GetErrno() == errno.EOPNOTSUPP:
                        # Bulk stat was not supported, do non-bulk here
                        f = arc.FileInfo()
                        st = datapointlist[i].Stat(f)
                        if not st or not f:
                            if status.Retryable():
                                self.log.warning("Failed to query files on %s, will retry later: %s" %
                                                 (datapointlist[i].GetURL().Host(), str(st)))
                                result[surllist[i]['arcjobid']] = self.retry
                            else:
                                self.log.warning("%s: Failed to find info on %s" %
                                                 (surllist[i]['arcjobid'], datapointlist[i].GetURL().str()))
                                result[surllist[i]['arcjobid']] = self.failed
                            files.append(None)
                        else:
                            files.append(f)

                    if not files[i]:
                        self.log.warning("%s: Failed to find info on %s" %
                                         (surllist[i]['arcjobid'], datapointlist[i].GetURL().str()))
                        result[surllist[i]['arcjobid']] = self.failed
                    else:
                        # compare metadata
                        try:
                            self.log.debug("File %s for %s: expected size %d, checksum %s, actual size %d, checksum %s" %
                                           (datapointlist[i].GetURL().str(), surllist[i]['arcjobid'],
                                            int(surllist[i]['fsize']), surllist[i]['checksum'],
                                            int(files[i].GetSize()), files[i].GetCheckSum()))
                        except Exception:
                            self.log.warning("Unhandled issue when validating file %d", i)
                            result[surllist[i]['arcjobid']] = self.failed
                            continue
                        if int(surllist[i]['fsize']) != int(files[i].GetSize()):
                            self.log.warning("File %s for %s: size on storage (%d) differs from expected size (%d)" %
                                             (datapointlist[i].GetURL().str(), surllist[i]['arcjobid'],
                                              int(files[i].GetSize()), int(surllist[i]['fsize'])))
                            result[surllist[i]['arcjobid']] = self.failed
                            continue
                        if not files[i].CheckCheckSum():
                            self.log.warning("File %s for %s: no checksum information available" %
                                             (datapointlist[i].GetURL().str(), surllist[i]['arcjobid']))
                        elif surllist[i]['checksum'] != files[i].GetCheckSum():
                            self.log.warning("File %s for %s: checksum on storage (%s) differs from expected checksum (%s)" %
                                             (datapointlist[i].GetURL().str(), surllist[i]['arcjobid'],
                                              files[i].GetCheckSum(), surllist[i]['checksum']))
                            result[surllist[i]['arcjobid']] = self.failed
                            continue

                        self.log.info("File %s validated for %s" %
                                      (datapointlist[i].GetURL().str(), surllist[i]['arcjobid']))
                        # don't overwrite a previous failed file for this job
                        if surllist[i]['arcjobid'] not in result:
                            result[surllist[i]['arcjobid']] = self.ok

            # Clear lists and go to next round
            datapointlist = arc.DataPointList()
            surllist = []
            dummylist = []

    return result
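
# Hedged sketch of the expected input (keys taken from the code above, values
# invented): surldict groups the output files of each job, every entry carries
# the expected size and checksum, and all SURLs in one call should be on the
# same SE so the bulk Stat() hits a single host, e.g.
#
#   surldict = {'1234': [{'arcjobid': '1234',
#                         'surl': 'srm://se.example.org/atlas/file1.root',
#                         'fsize': 1048576,
#                         'checksum': 'adler32:01234567'}]}
#   status = validator.checkOutputFiles(surldict)   # e.g. {'1234': validator.ok}
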
def fetchSome(self, jobs, downloadfiles):

    # Get specified files for the jobs in downloadfiles
    # jobs: id: Job object
    # downloadfiles: id: list of files relative to session dir, with wildcards
    if not jobs or not downloadfiles:
        return ([], [], [])

    # construct datapoint object, initialising connection. Use the same
    # object until the base URL changes. TODO group by base URL.
    datapoint = aCTUtils.DataPoint(jobs.values()[0].JobID, self.uc)
    dp = datapoint.h
    dm = arc.DataMover()
    dm.retry(False)
    dm.passive(True)
    dm.secure(False)
    fetched = []
    notfetched = []
    notfetchedretry = []

    for (id, job) in jobs.items():
        if id not in downloadfiles:
            continue
        jobid = job.JobID

        # If the connection URL is different, reconnect
        if arc.URL(jobid).ConnectionURL() != dp.GetURL().ConnectionURL():
            datapoint = aCTUtils.DataPoint(jobid, self.uc)
            dp = datapoint.h
        localdir = str(self.conf.get(['tmp', 'dir'])) + jobid[jobid.rfind('/'):] + '/'

        files = downloadfiles[id].split(';')
        if re.search(r'[\*\[\]\?]', downloadfiles[id]):
            # found wildcard, need to get the session dir listing
            remotefiles = self.listUrlRecursive(jobid)
            expandedfiles = []
            for wcf in files:
                if re.search(r'[\*\[\]\?]', wcf):
                    # only match wildcards in matching dirs
                    expandedfiles += [rf for rf in remotefiles
                                      if fnmatch.fnmatch(rf, wcf) and os.path.dirname(rf) == os.path.dirname(wcf)]
                else:
                    expandedfiles.append(wcf)
            # remove duplicates from wildcard matching through a set
            files = list(set(expandedfiles))

        for f in files:
            localfile = str(localdir + f)
            localfiledir = localfile[:localfile.rfind('/')]
            # create required local dirs
            try:
                os.makedirs(localfiledir, 0o755)
            except OSError as e:
                if e.errno != errno.EEXIST or not os.path.isdir(localfiledir):
                    self.log.warning('Failed to create directory %s: %s',
                                     localfiledir, os.strerror(e.errno))
                    notfetched.append(jobid)
                    break
            remotefile = arc.URL(str(jobid + '/' + f))
            dp.SetURL(remotefile)
            localdp = aCTUtils.DataPoint(localfile, self.uc)
            # do the copy
            status = dm.Transfer(dp, localdp.h, arc.FileCache(), arc.URLMap())
            # tmp fix for globus error which is always retried
            if not status and str(status).find('File unavailable') == -1:
                if status.Retryable():
                    self.log.warning('Failed to download but will retry %s: %s',
                                     dp.GetURL().str(), str(status))
                    notfetchedretry.append(jobid)
                else:
                    self.log.error('Failed to download with permanent failure %s: %s',
                                   dp.GetURL().str(), str(status))
                    notfetched.append(jobid)
                break
            self.log.info('Downloaded %s', dp.GetURL().str())

        if jobid not in notfetched and jobid not in notfetchedretry:
            fetched.append(jobid)

    return (fetched, notfetched, notfetchedretry)
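
# Hedged sketch of the inputs fetchSome() expects (ids, paths and instance name
# invented): jobs maps the aCT id to the arc Job object, and downloadfiles
# holds a semicolon-separated list of session-dir paths which may contain
# wildcards, e.g.
#
#   jobs = {42: job}   # with job.JobID = 'gsiftp://ce.example.org/jobs/abc123'
#   downloadfiles = {42: 'stdout;out/*.root'}
#   fetched, notfetched, notfetchedretry = fetcher.fetchSome(jobs, downloadfiles)
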
def fetchJobs(self, arcstate, nextarcstate):

    # Get list of jobs in the right state
    jobstofetch = self.db.getArcJobs("arcstate='" + arcstate + "' and cluster='" + self.cluster + "'" + " limit 100")
    if not jobstofetch:
        return
    self.log.info("Fetching %i jobs" % sum(len(v) for v in jobstofetch.values()))

    fetched = []
    notfetched = []
    notfetchedretry = []
    for proxyid, jobs in jobstofetch.items():
        self.uc.CredentialString(self.db.getProxy(proxyid))

        # Clean the download dir just in case something was left from a previous attempt
        for job in jobs:
            shutil.rmtree(self.conf.get(['tmp', 'dir']) + job[2].JobID[job[2].JobID.rfind('/'):], True)

        # Get list of downloadable files for these jobs
        filestodl = self.db.getArcJobsInfo("arcstate='" + arcstate + "' and cluster='" + self.cluster +
                                           "' and proxyid='" + str(proxyid) + "'",
                                           ['id', 'downloadfiles'])
        # id: downloadfiles
        downloadfiles = dict((row['id'], row['downloadfiles']) for row in filestodl)
        # jobs to download all files
        jobs_downloadall = dict((j[0], j[2]) for j in jobs
                                if j[0] in downloadfiles and not downloadfiles[j[0]])
        # jobs to download specific files
        jobs_downloadsome = dict((j[0], j[2]) for j in jobs
                                 if j[0] in downloadfiles and downloadfiles[j[0]])

        # We don't know if a failure from JobSupervisor is retryable or not,
        # so always retry
        (f, r) = self.fetchAll(jobs_downloadall)
        fetched.extend(f)
        notfetchedretry.extend(r)

        (f, n, r) = self.fetchSome(jobs_downloadsome, downloadfiles)
        fetched.extend(f)
        notfetched.extend(n)
        notfetchedretry.extend(r)

    # Check for massive failure, and back off before trying again
    # TODO: downtime awareness
    if (len(notfetched) > 10 and len(notfetched) == len(jobstofetch)) or \
       (len(notfetchedretry) > 10 and len(notfetchedretry) == len(jobstofetch)):
        self.log.error("Failed to get any jobs from %s, sleeping for 5 mins" % self.cluster)
        time.sleep(300)
        return

    for proxyid, jobs in jobstofetch.items():
        for (id, appjobid, job, created) in jobs:
            if job.JobID in notfetchedretry:
                self.log.warning("%s: Could not get output from job %s" % (appjobid, job.JobID))
                # Remove download directory to allow retry
                shutil.rmtree(self.conf.get(['tmp', 'dir']) + job.JobID[job.JobID.rfind('/'):], True)
                # Check if job still exists
                fileinfo = arc.FileInfo()
                self.uc.CredentialString(self.db.getProxy(proxyid))
                dp = aCTUtils.DataPoint(job.JobID, self.uc)
                status = dp.h.Stat(fileinfo)
                # TODO Check other permanent errors
                if not status and status.GetErrno() == errno.ENOENT:
                    self.log.warning("%s: Job %s no longer exists" % (appjobid, job.JobID))
                    self.db.updateArcJob(id, {"arcstate": "donefailed",
                                              "tarcstate": self.db.getTimeStamp()})
                # Otherwise try again next time
            elif job.JobID in notfetched:
                self.log.error("%s: Failed to download job %s" % (appjobid, job.JobID))
                self.db.updateArcJob(id, {"arcstate": "donefailed",
                                          "tarcstate": self.db.getTimeStamp()})
            else:
                self.log.info("%s: Downloaded job %s" % (appjobid, job.JobID))
                self.db.updateArcJob(id, {"arcstate": nextarcstate,
                                          "tarcstate": self.db.getTimeStamp()})
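
# For reference, a hedged sketch of the shape getArcJobs() appears to return,
# inferred only from the unpacking above (all values invented): a dict keyed by
# proxyid whose values are lists of (id, appjobid, Job object, created) tuples,
# e.g.
#
#   jobstofetch = {1: [(42, 'panda-4242', job, '2015-01-01 12:00:00')]}
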