Example 1
    def downloadSmallFiles(self, jobs):
        '''
        This method is for jobs which should be killed and resubmitted. An attempt
        is made to download the heartbeat json, but it is fine for this to fail
        as the job may still be running.
        '''

        for job in jobs:
            if 'JobID' not in job or not job['JobID']:
                continue
            jobid = job['JobID']
            sessionid = jobid[jobid.rfind('/'):]
            localdir = self.tmpdir + sessionid

            try:
                os.makedirs(localdir, 0o755)
            except OSError:
                # The directory may already exist from a previous attempt
                pass

            source = aCTUtils.DataPoint(str(jobid + '/heartbeat.json'), self.uc)
            dest = aCTUtils.DataPoint(str(localdir + '/heartbeat.json'), self.uc)
            dm = arc.DataMover()
            status = dm.Transfer(source.h, dest.h, arc.FileCache(), arc.URLMap())
            if not status:
                self.log.debug('%s: Failed to download %s: %s' % (job['pandaid'], source.h.GetURL().str(), str(status)))
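As a minimal sketch of the same DataPoint/DataMover transfer pattern in isolation (the fetch_one_file helper and the flat aCTUtils import are illustrative assumptions, not part of aCT):

import os
import arc
import aCTUtils  # assumed to be importable as in the surrounding code

def fetch_one_file(remoteurl, localpath, uc, log):
    # Copy a single remote file to a local path, using the same calls as
    # downloadSmallFiles() above; failures are only logged.
    try:
        os.makedirs(os.path.dirname(localpath), 0o755)
    except OSError:
        pass  # directory may already exist
    source = aCTUtils.DataPoint(str(remoteurl), uc)
    dest = aCTUtils.DataPoint(str(localpath), uc)
    dm = arc.DataMover()
    status = dm.Transfer(source.h, dest.h, arc.FileCache(), arc.URLMap())
    if not status:
        log.debug('Failed to download %s: %s' % (remoteurl, str(status)))
        return False
    return True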
Example 2
    def removeOutputFiles(self, surls):
        '''
        Remove SURLs.
        '''
        result = {}

        # As yet there is no bulk remove in ARC
        for surl in surls:
            dp = aCTUtils.DataPoint(str(surl['surl']), self.uc)
            if not dp.h or surl['surl'].startswith('root://'):
                # Unsupported and xrootd URLs are not deleted, just marked as done
                self.log.info("Skipped removal of %s for %s" % (surl['surl'], surl['arcjobid']))
                result[surl['arcjobid']] = self.ok
                continue
            status = dp.h.Remove()
            if not status:
                if status.Retryable():
                    self.log.warning("Failed to delete %s for %s, will retry later: %s" %
                                     (surl['surl'], surl['arcjobid'], str(status)))
                    result[surl['arcjobid']] = self.retry
                elif status.GetErrno() == os.errno.ENOENT:
                    self.log.info("File %s for %s does not exist" % (surl['surl'], surl['arcjobid']))
                    result[surl['arcjobid']] = self.ok
                else:
                    self.log.error("Failed to delete %s for %s: %s" % (surl['surl'], surl['arcjobid'], str(status)))
                    result[surl['arcjobid']] = self.failed
            else:
                self.log.info("Removed %s for %s" % (surl['surl'], surl['arcjobid']))
                result[surl['arcjobid']] = self.ok

        return result
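removeOutputFiles() maps each arcjobid to one of self.ok, self.retry or self.failed. A sketch of how a caller might act on that mapping; the status constants and the requeue/markFailed callables are hypothetical, not part of the source:

OK, RETRY, FAILED = range(3)  # hypothetical stand-ins for self.ok / self.retry / self.failed

def handleRemovalResult(result, requeue, markFailed):
    # result: dict of arcjobid -> status as returned by removeOutputFiles()
    for arcjobid, status in result.items():
        if status == RETRY:
            requeue(arcjobid)     # schedule the removal to be tried again later
        elif status == FAILED:
            markFailed(arcjobid)  # give up on cleaning this job's output
        # OK needs no further action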
Example 3
    def listUrlRecursive(self, url, fname='', filelist=None):
        '''
        Recursively list all files under url, returning paths relative to url.
        '''
        # Avoid a mutable default argument so the list does not persist
        # between top-level calls
        if filelist is None:
            filelist = []
        dp = aCTUtils.DataPoint(url + '/' + fname, self.uc)
        (files, status) = dp.h.List(arc.DataPoint.INFO_TYPE_NAME
                                    | arc.DataPoint.INFO_TYPE_TYPE)
        if not status:
            self.log.warning("Failed listing %s/%s" % (url, fname))
            return filelist
        for f in files:
            if f.GetType() == f.file_type_file:
                filelist.append((fname + '/' + f.GetName()).strip('/'))
            elif f.GetType() == f.file_type_dir:
                filelist = self.listUrlRecursive(
                    url, (fname + '/' + str(f.GetName())).strip('/'), filelist)
        return filelist
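For reference, the shape of the output (file paths relative to the listed root) can be illustrated with a purely local analogue using os.walk; this is an illustration only, not aCT code:

import os

def list_local_recursive(root):
    # Local-filesystem analogue of listUrlRecursive(): returns paths relative
    # to root, e.g. ['heartbeat.json', 'log/stdout'].
    filelist = []
    for dirpath, _dirnames, filenames in os.walk(root):
        for name in filenames:
            filelist.append(os.path.relpath(os.path.join(dirpath, name), root))
    return filelist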
Example 4
    def checkOutputFiles(self, surldict):
        '''
        Check that SURLs exist and match the expected size and checksum. Returns
        a dict of arcjobid: file status. Bulk arc.DataPoint.Stat() is done with
        at most 100 files per request. The surls passed here all belong to the
        same SE.
        '''

        if self.arcconf.get(['downtime', 'srmdown']) == 'True':
            self.log.info("SRM down, will validate later")
            # surldict values are lists of surls, so flatten when building the result
            return dict((surl['arcjobid'], self.retry)
                        for surls in surldict.values() for surl in surls)

        result = {}
        datapointlist = arc.DataPointList()
        surllist = []
        dummylist = []
        bulklimit = 100
        for surls in surldict.values():
            count = 0
            for surl in surls:
                count += 1
                if not surl['surl']:
                    self.log.error("Missing surl for %s, cannot validate" % surl['arcjobid'])
                    result[surl['arcjobid']] = self.failed
                    continue
                dp = aCTUtils.DataPoint(str(surl['surl']), self.uc)
                if not dp or not dp.h:
                    self.log.warning("URL %s not supported, skipping validation" % str(surl['surl']))
                    result[surl['arcjobid']] = self.ok
                    continue
                datapointlist.append(dp.h)
                dummylist.append(dp) # keep the Python wrappers alive so the handles are not destroyed
                surllist.append(surl)

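                # keep accumulating datapoints until the bulk limit is reached or this is the last surl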
                if count % bulklimit != 0 and count != len(surls):
                    continue

                # do bulk call
                (files, status) = dp.h.Stat(datapointlist)
                if not status and status.GetErrno() != os.errno.EOPNOTSUPP:
                    # If call fails it is generally a server or connection problem
                    # and in most cases should be retryable
                    if status.Retryable():
                        self.log.warning("Failed to query files on %s, will retry later: %s" % (dp.h.GetURL().Host(), str(status)))
                        result.update(dict((k['arcjobid'], self.retry) for k in surllist))
                    else:
                        self.log.error("Failed to query files on %s: %s" % (dp.h.GetURL().Host(), str(status)))
                        result.update(dict((k['arcjobid'], self.failed) for k in surllist))

                else:
                    # files is a list of FileInfo objects. If file is not found or has
                    # another error in the listing FileInfo object will be invalid
                    for i in range(len(datapointlist)):
                        if status.GetErrno() == os.errno.EOPNOTSUPP:
                            # Bulk stat was not supported, do non-bulk here
                            f = arc.FileInfo()
                            st = datapointlist[i].Stat(f)
                            if not st or not f:
                                if st.Retryable():
                                    self.log.warning("Failed to query files on %s, will retry later: %s" % (datapointlist[i].GetURL().Host(), str(st)))
                                    result[surllist[i]['arcjobid']] = self.retry
                                else:
                                    self.log.warning("%s: Failed to find info on %s" % (surllist[i]['arcjobid'], datapointlist[i].GetURL().str()))
                                    result[surllist[i]['arcjobid']] = self.failed
                                files.append(None)
                            else:
                                files.append(f)

                        if not files[i]:
                            self.log.warning("%s: Failed to find info on %s" % (surllist[i]['arcjobid'], datapointlist[i].GetURL().str()))
                            result[surllist[i]['arcjobid']] = self.failed
                        else:
                            # compare metadata
                            try:
                                self.log.debug("File %s for %s: expected size %d, checksum %s, actual size %d, checksum %s" %
                                               (datapointlist[i].GetURL().str(), surllist[i]['arcjobid'], int(surllist[i]['fsize']),
                                               surllist[i]['checksum'], int(files[i].GetSize()), files[i].GetCheckSum()))
                            except Exception as e:
                                self.log.warning("Unhandled issue validating file %d: %s" % (i, str(e)))
                                result[surllist[i]['arcjobid']] = self.failed
                                continue
                            if int(surllist[i]['fsize']) != int(files[i].GetSize()):
                                self.log.warning("File %s for %s: size on storage (%d) differs from expected size (%d)" %
                                                 (datapointlist[i].GetURL().str(), surllist[i]['arcjobid'],
                                                  int(files[i].GetSize()), int(surllist[i]['fsize'])))
                                result[surllist[i]['arcjobid']] = self.failed
                                continue
                            if not files[i].CheckCheckSum():
                                self.log.warning("File %s for %s: no checksum information available" %
                                                 (datapointlist[i].GetURL().str(), surllist[i]['arcjobid']))
                            elif surllist[i]['checksum'] != files[i].GetCheckSum():
                                self.log.warning("File %s for %s: checksum on storage (%s) differs from expected checksum (%s)" %
                                                 (datapointlist[i].GetURL().str(), surllist[i]['arcjobid'],
                                                  files[i].GetCheckSum(), surllist[i]['checksum']))
                                result[surllist[i]['arcjobid']] = self.failed
                                continue

                            self.log.info("File %s validated for %s" % (datapointlist[i].GetURL().str(), surllist[i]['arcjobid']))
                            # don't overwrite previous failed file for this job
                            if surllist[i]['arcjobid'] not in result:
                                result[surllist[i]['arcjobid']] = self.ok

                # Clear lists and go to next round
                datapointlist = arc.DataPointList()
                surllist = []
                dummylist = []

        return result
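The accumulation and bulk call above are interleaved with the surl loop; the underlying batching idea can be sketched separately (chunked() is an illustrative helper, not aCT code):

def chunked(items, bulklimit=100):
    # Yield successive slices of at most bulklimit items, mirroring the
    # "bulk call every 100 surls or at the end of the list" logic above.
    for start in range(0, len(items), bulklimit):
        yield items[start:start + bulklimit]

# e.g. for batch in chunked(surls): build a DataPointList for batch and Stat() it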
Example 5
    def fetchSome(self, jobs, downloadfiles):

        # Get specified files for the jobs in downloadfiles
        # jobs: id: Job object
        # downloadfiles: id: list of files relative to session dir, with wildcards
        if not jobs or not downloadfiles:
            return ([], [], [])

        # construct datapoint object, initialising connection. Use the same
        # object until base URL changes. TODO group by base URL.
        datapoint = aCTUtils.DataPoint(list(jobs.values())[0].JobID, self.uc)
        dp = datapoint.h
        dm = arc.DataMover()
        dm.retry(False)
        dm.passive(True)
        dm.secure(False)
        fetched = []
        notfetched = []
        notfetchedretry = []

        for (id, job) in jobs.items():
            if id not in downloadfiles:
                continue
            jobid = job.JobID

            # If connection URL is different reconnect
            if arc.URL(jobid).ConnectionURL() != dp.GetURL().ConnectionURL():
                datapoint = aCTUtils.DataPoint(jobid, self.uc)
                dp = datapoint.h
            localdir = str(self.conf.get(['tmp', 'dir'])) + jobid[jobid.rfind('/'):] + '/'

            files = downloadfiles[id].split(';')
            if re.search(r'[\*\[\]\?]', downloadfiles[id]):
                # found wildcard, need to get sessiondir list
                remotefiles = self.listUrlRecursive(jobid)
                expandedfiles = []
                for wcf in files:
                    if re.search(r'[\*\[\]\?]', wcf):
                        # only match wildcards in matching dirs
                        expandedfiles += [
                            rf for rf in remotefiles
                            if fnmatch.fnmatch(rf, wcf)
                            and os.path.dirname(rf) == os.path.dirname(wcf)
                        ]
                    else:
                        expandedfiles.append(wcf)
                # remove duplicates from wildcard matching through set
                files = list(set(expandedfiles))

            for f in files:
                localfile = str(localdir + f)
                localfiledir = localfile[:localfile.rfind('/')]
                # create required local dirs
                try:
                    os.makedirs(localfiledir, 0o755)
                except OSError as e:
                    if e.errno != errno.EEXIST or not os.path.isdir(
                            localfiledir):
                        self.log.warning('Failed to create directory %s: %s',
                                         localfiledir, os.strerror(e.errno))
                        notfetched.append(jobid)
                        break
                remotefile = arc.URL(str(jobid + '/' + f))
                dp.SetURL(remotefile)
                localdp = aCTUtils.DataPoint(localfile, self.uc)
                # do the copy
                status = dm.Transfer(dp, localdp.h, arc.FileCache(),
                                     arc.URLMap())
                # tmp fix for globus error which is always retried
                if not status and str(status).find('File unavailable') == -1:
                    if status.Retryable():
                        self.log.warning(
                            'Failed to download but will retry %s: %s',
                            dp.GetURL().str(), str(status))
                        notfetchedretry.append(jobid)
                    else:
                        self.log.error(
                            'Failed to download with permanent failure %s: %s',
                            dp.GetURL().str(), str(status))
                        notfetched.append(jobid)
                    break
                self.log.info('Downloaded %s', dp.GetURL().str())
            if jobid not in notfetched and jobid not in notfetchedretry:
                fetched.append(jobid)
        return (fetched, notfetched, notfetchedretry)
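The wildcard handling above only matches entries whose directory equals the pattern's directory; the same filtering can be sketched on its own (expand_patterns is illustrative, not aCT code):

import fnmatch
import os

def expand_patterns(patterns, remotefiles):
    # Expand wildcard patterns against a list of remote paths, matching a
    # pattern only against files in the same directory, as in fetchSome().
    expanded = []
    for wcf in patterns:
        if any(c in wcf for c in '*[]?'):
            expanded += [rf for rf in remotefiles
                         if fnmatch.fnmatch(rf, wcf)
                         and os.path.dirname(rf) == os.path.dirname(wcf)]
        else:
            expanded.append(wcf)
    return list(set(expanded))  # drop duplicates from overlapping patterns

# expand_patterns(['log/*', 'heartbeat.json'], ['log/stdout', 'log/stderr', 'heartbeat.json'])
# -> ['log/stdout', 'log/stderr', 'heartbeat.json'] (order not guaranteed)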
Example 6
    def fetchJobs(self, arcstate, nextarcstate):

        # Get list of jobs in the right state
        jobstofetch = self.db.getArcJobs("arcstate='" + arcstate + "' and cluster='" +
                                         self.cluster + "' limit 100")

        if not jobstofetch:
            return
        totaljobs = sum(len(v) for v in jobstofetch.values())
        self.log.info("Fetching %i jobs" % totaljobs)

        fetched = []
        notfetched = []
        notfetchedretry = []
        for proxyid, jobs in jobstofetch.items():
            self.uc.CredentialString(self.db.getProxy(proxyid))

            # Clean the download dir just in case something was left from previous attempt
            for job in jobs:
                shutil.rmtree(
                    self.conf.get(['tmp', 'dir']) +
                    job[2].JobID[job[2].JobID.rfind('/'):], True)

            # Get list of downloadable files for these jobs
            filestodl = self.db.getArcJobsInfo(
                "arcstate='" + arcstate + "' and cluster='" + self.cluster +
                "' and proxyid='" + str(proxyid) + "'",
                ['id', 'downloadfiles'])
            # id: downloadfiles
            downloadfiles = dict(
                (row['id'], row['downloadfiles']) for row in filestodl)
            # jobs to download all files
            jobs_downloadall = dict(
                (j[0], j[2]) for j in jobs
                if j[0] in downloadfiles and not downloadfiles[j[0]])
            # jobs to download specific files
            jobs_downloadsome = dict(
                (j[0], j[2]) for j in jobs
                if j[0] in downloadfiles and downloadfiles[j[0]])

            # We don't know if a failure from JobSupervisor is retryable or not
            # so always retry
            (f, r) = self.fetchAll(jobs_downloadall)
            fetched.extend(f)
            notfetchedretry.extend(r)

            (f, n, r) = self.fetchSome(jobs_downloadsome, downloadfiles)
            fetched.extend(f)
            notfetched.extend(n)
            notfetchedretry.extend(r)

        # Check for massive failure, and back off before trying again
        # TODO: downtime awareness
        if (len(notfetched) > 10 and len(notfetched) == totaljobs) or \
           (len(notfetchedretry) > 10 and len(notfetchedretry) == totaljobs):
            self.log.error(
                "Failed to get any jobs from %s, sleeping for 5 mins" %
                self.cluster)
            time.sleep(300)
            return

        for proxyid, jobs in jobstofetch.items():
            for (id, appjobid, job, created) in jobs:
                if job.JobID in notfetchedretry:
                    self.log.warning("%s: Could not get output from job %s" %
                                     (appjobid, job.JobID))
                    # Remove download directory to allow retry
                    shutil.rmtree(
                        self.conf.get(['tmp', 'dir']) +
                        job.JobID[job.JobID.rfind('/'):], True)
                    # Check if job still exists
                    fileinfo = arc.FileInfo()
                    self.uc.CredentialString(self.db.getProxy(proxyid))
                    dp = aCTUtils.DataPoint(job.JobID, self.uc)
                    status = dp.h.Stat(fileinfo)
                    # TODO Check other permanent errors
                    if not status and status.GetErrno() == errno.ENOENT:
                        self.log.warning("%s: Job %s no longer exists" %
                                         (appjobid, job.JobID))
                        self.db.updateArcJob(
                            id, {
                                "arcstate": "donefailed",
                                "tarcstate": self.db.getTimeStamp()
                            })
                    # Otherwise try again next time
                elif job.JobID in notfetched:
                    self.log.error("%s: Failed to download job %s" %
                                   (appjobid, job.JobID))
                    self.db.updateArcJob(
                        id, {
                            "arcstate": "donefailed",
                            "tarcstate": self.db.getTimeStamp()
                        })
                else:
                    self.log.info("%s: Downloaded job %s" %
                                  (appjobid, job.JobID))
                    self.db.updateArcJob(
                        id, {
                            "arcstate": nextarcstate,
                            "tarcstate": self.db.getTimeStamp()
                        })
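The per-job bookkeeping at the end of fetchJobs() boils down to a three-way decision; a compact summary of that logic (next_arcstate is illustrative, not aCT code):

def next_arcstate(jobid, notfetched, notfetchedretry, nextarcstate):
    # Retryable failure -> leave the state unchanged (None) so the job is
    # retried next cycle (the method above additionally marks jobs that no
    # longer exist on the cluster as 'donefailed'); permanent failure ->
    # 'donefailed'; otherwise the job was fetched and moves to nextarcstate.
    if jobid in notfetchedretry:
        return None
    if jobid in notfetched:
        return 'donefailed'
    return nextarcstate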