Example #1
    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """ copy input file from SE to local dir """

        error = PilotErrors()

        # Get input parameters from pdict
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        experiment = pdict.get('experiment', '')
        pandaProxySecretKey = pdict.get('pandaProxySecretKey')
        jobSetID = pdict.get('jobsetID')

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 's3objectstorepresignedurl', lfn, guid)

        if path == '': path = './'
        fullname = os.path.join(path, lfn)

        # pandaID, filename, jobSetID, pandaProxySecretKey=None, stageIn=True
        status, output = self.stageIn(jobId, lfn, jobSetID, pandaProxySecretKey, fullname, fsize, fchecksum, experiment)

        if status == 0:
            updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", type="input")
            state = "DONE"
        else:
            errors = PilotErrors()
            state = errors.getErrorName(status)
            if state is None:
                state = "PSTAGE_FAIL"

        # self.__sendReport(state, report)
        self.prepareReport(state, report)
        return status, output
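Note: these movers receive all per-job context through the **pdict keyword dictionary. A minimal hypothetical invocation sketch — the mover instance and every value below are invented; only the keyword names come from the pdict.get() calls above:

# Hypothetical call sketch -- 'mover' and all values are assumptions;
# only the keyword names are taken from the pdict.get() calls in get_data().
status, output = mover.get_data(gpfn, 'EVNT.pool.root', './',
                                fsize=1024, fchecksum='ad:12345678', guid='abc',
                                jobId='4711', workDir='/tmp/job', experiment='ATLAS',
                                jobsetID='1', pandaProxySecretKey='***',
                                report={})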
Example #2
def TransferFiles(job_state, datadir, files, **kwargs):
    """
    Transfers files from list 'files'

    May change CWD with pUtil.chdir (several times)

    :param job_state:
    :param datadir: job data dir
    :param files: list of filenames
    :param kwargs: specific arguments for other purposes
    :return:
    """
    job = job_state.job

    pUtil.chdir(datadir)

    XMLMetadata = pUtil.getMetadata(job_state.site.workdir, job.jobId)
    thisSite = DorE(kwargs, 'thisSite')

    if not setGuids(job_state, files, **kwargs):
        job.result[2] = PilotErrors().ERR_LOSTJOBPFC
        return ReturnCode.FailedJob

    outPFC = updateOutPFC(job, **kwargs)
    if not outPFC:
        return ReturnCode.FailedJob

    dsname = defaultDSname(job.destinationDblock)

    datasetDict = pUtil.getDatasetDict(job.outFiles, job.destinationDblock, job.logFile, job.logDblock)
    if not datasetDict:
        log("Output files will go to default dataset: %s" % (dsname))

    # the cmtconfig is needed by at least the xrdcp site mover
    cmtconfig = pUtil.getCmtconfig(job.cmtconfig)

    tin_0 = os.times()
    rf = None
    _state = ReturnCode.OK
    _msg = ""
    ec = -1
    try:
        # Note: alt stage-out numbers are not saved in recovery mode (job object not returned from this function)
        rc, pilotErrorDiag, rf, rs, job.filesNormalStageOut, job.filesAltStageOut, os_bucket_id = Mover.mover_put_data(
            "xmlcatalog_file:%s" % outPFC, dsname,
            thisSite.sitename, thisSite.computingElement, analysisJob=pUtil.isAnalysisJob(job.trf.split(",")[0]),
            proxycheck=DorE(kwargs, 'proxycheckFlag'),
            pinitdir=DorE(kwargs, 'pilot_initdir'),
            datasetDict=datasetDict,
            stageoutTries=DorE(kwargs, 'stageoutretry'), 
            cmtconfig=cmtconfig, recoveryWorkDir=thisSite.workdir,
            job=job)
    except Exception, e:
        pilotErrorDiag = "Put function can not be called for staging out: %s" % str(e)
        log("!!%s!!1105!! %s" % (env['errorLabel'], pilotErrorDiag))
        ec = PilotErrors().ERR_PUTFUNCNOCALL
        _state = ReturnCode.Holding
        _msg = env['errorLabel']
Example #3
    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """ copy output file from disk to local SE """
        # function is based on dCacheSiteMover put function

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        token = pdict.get('token', '')
        scope = pdict.get('scope', '')
        dsname = pdict.get('dsname', '')
        experiment = pdict.get('experiment', '')
        outputDir = pdict.get('outputDir', '')
        os_bucket_id = pdict.get('os_bucket_id', -1)
        timeout = pdict.get('timeout', None)
        if not timeout:
            timeout = self.timeout

        # get the site information object
        si = getSiteInformation(experiment)

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 's3objectstore', lfn, guid)

        parsed = urlparse.urlparse(destination)
        scheme = parsed.scheme
        hostname = parsed.netloc.partition(':')[0]
        port = int(parsed.netloc.partition(':')[2])
        report['remoteSite'] = '%s://%s:%s' % (scheme, hostname, port)

        filename = os.path.basename(source)
        surl = destination
        self.log("surl=%s, timeout=%s" % (surl, timeout))
        if "log.tgz" in surl:
            surl = surl.replace(lfn, "%s:%s"%(scope,lfn))
        else:
            report['eventType'] = 'put_es'

        status, output, size, checksum = self.stageOut(source, surl, token, experiment, outputDir=outputDir, timeout=timeout, os_bucket_id=os_bucket_id, report=report)
        if status != 0:
            errors = PilotErrors()
            state = errors.getErrorName(status)
            if state is None:
                state = "PSTAGE_FAIL"
            # self.__sendReport(state, report)
            self.prepareReport(state, report)
            return self.put_data_retfail(status, output, surl)

        state = "DONE"
        # self.__sendReport(state, report)
        self.prepareReport(state, report)
        return 0, pilotErrorDiag, surl, size, checksum, self.arch_type
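Note: in example #3, port = int(parsed.netloc.partition(':')[2]) raises ValueError whenever the destination URL carries no explicit port. A more defensive parsing sketch — the default-port fallback is an assumption, not part of the original mover:

# Defensive netloc parsing sketch (Python 2 urlparse); the default_port
# fallback is an assumption, not part of the original mover.
import urlparse

def split_netloc(destination, default_port=443):
    parsed = urlparse.urlparse(destination)
    hostname, _, port = parsed.netloc.partition(':')
    return parsed.scheme, hostname, int(port) if port.isdigit() else default_port

print split_netloc('s3://bucket.example.com:8443/path')  # ('s3', 'bucket.example.com', 8443)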
Example #4
    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """ copy output file from disk to local SE """
        # function is based on dCacheSiteMover put function

        error = PilotErrors()
        pilotErrorDiag = ""


        # Get input parameters from pdict
        alt = pdict.get('alt', False)
        jobId = pdict.get('jobId', '')
        jobSetID = pdict.get('jobsetID', '')
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        #token = pdict.get('token', '')
        scope = pdict.get('scope', '')
        dsname = pdict.get('dsname', '')
        analysisJob = pdict.get('analJob', False)
        testLevel = pdict.get('testLevel', '0')
        extradirs = pdict.get('extradirs', '')
        experiment = pdict.get('experiment', '')
        proxycheck = pdict.get('proxycheck', False)
        prodSourceLabel = pdict.get('prodSourceLabel', '')
        outputDir = pdict.get('outputDir', '')
        timeout = pdict.get('timeout', None)
        pandaProxySecretKey = pdict.get('pandaProxySecretKey')
        if not timeout:
            timeout = self.timeout

        # get the site information object
        si = getSiteInformation(experiment)

        tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 's3objectstorepresignedurl', lfn, guid)

        filename = os.path.basename(source)
        surl = destination
        status, output, size, checksum = self.stageOut(source, jobId, lfn, jobSetID, pandaProxySecretKey, experiment, outputDir=outputDir, timeout=timeout)
        if status != 0:
            errors = PilotErrors()
            state = errors.getErrorName(status)
            if state is None:
                state = "PSTAGE_FAIL"
            # self.__sendReport(state, report)
            self.prepareReport(state, report)
            return self.put_data_retfail(status, output, surl)

        state = "DONE"
        # self.__sendReport(state, report)
        # self.prepareReport(state, report)
        return 0, pilotErrorDiag, surl, size, checksum, self.arch_type
Example #5
    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """ copy input file from SE to local dir """

        error = PilotErrors()

        # Get input parameters from pdict
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        experiment = pdict.get('experiment', '')
        proxycheck = pdict.get('proxycheck', False)

        # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
        useCT = pdict.get('usect', True)
        prodDBlockToken = pdict.get('access', '')

        # get the DQ2 tracing report
        report = self.getStubTracingReport(pdict['report'], 'gfal-copy', lfn,
                                           guid)

        if path == '': path = './'
        fullname = os.path.join(path, lfn)

        # get the site information object
        si = getSiteInformation(experiment)
        ret_path = si.getCopyPrefixPathNew(gpfn, stageIn=True)
        if not ret_path.startswith("s3:"):
            errorLog = "Failed to use copyprefix to convert the current path to S3 path."
            tolog("!!WARNING!!1777!! %s" % (errorLog))
            status = PilotErrors.ERR_STAGEINFAILED
            state = "PSTAGE_FAIL"
            output = errorLog
        else:
            gpfn = ret_path
            status, output = self.stageIn(gpfn, fullname, fsize, fchecksum,
                                          experiment)

        if status == 0:
            updateFileState(lfn,
                            workDir,
                            jobId,
                            mode="file_state",
                            state="transferred",
                            type="input")
            state = "DONE"
        else:
            errors = PilotErrors()
            state = errors.getErrorName(status)
            if state is None:
                state = "PSTAGE_FAIL"

        self.prepareReport(state, report)
        return status, output
Example #6
    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """ copy output file from disk to local SE """
        # function is based on dCacheSiteMover put function

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        alt = pdict.get('alt', False)
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        token = pdict.get('token', '')
        scope = pdict.get('scope', '')
        dsname = pdict.get('dsname', '')
        analysisJob = pdict.get('analJob', False)
        testLevel = pdict.get('testLevel', '0')
        extradirs = pdict.get('extradirs', '')
        experiment = pdict.get('experiment', '')
        proxycheck = pdict.get('proxycheck', False)
        prodSourceLabel = pdict.get('prodSourceLabel', '')

        # get the site information object
        si = getSiteInformation(experiment)

        tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))
        if prodSourceLabel == 'ddm' and analysisJob:
            tolog("Treating PanDA Mover job as a production job during stage-out")
            analysisJob = False

        # get the DQ2 tracing report
        report = self.getStubTracingReport(pdict['report'], 's3objectstore',
                                           lfn, guid)

        filename = os.path.basename(source)
        surl = destination
        status, output, size, checksum = self.stageOut(source, surl, token,
                                                       experiment)
        if status != 0:
            errors = PilotErrors()
            state = errors.getErrorName(status)
            if state is None:
                state = "PSTAGE_FAIL"
            self.prepareReport(state, report)
            return self.put_data_retfail(status, output, surl)

        state = "DONE"
        self.prepareReport(state, report)
        return 0, pilotErrorDiag, surl, size, checksum, self.arch_type
Example #7
File: WatchDog.py Project: vokac/pilot
    def pollChildren(self):
        """
        check children processes, collect zombie jobs, and update jobDic status
        """

        error = PilotErrors()

        # tolog("---pollChildren: %s" % str(jobDic))
        for k in self.__env['jobDic'].keys():
            try:
                _id, rc = os.waitpid(self.__env['jobDic'][k][0], os.WNOHANG)
            except OSError, e:
                try:
                    if self.__env['jobDic'][k][1].result[0] in ("finished", "failed", "holding"):
                        continue
                except:
                    pUtil.tolog("!!FAILED!!1000!! Pilot failed to check the job state: %s" % traceback.format_exc())

                pUtil.tolog("Harmless exception when checking job %s, %s" % (self.__env['jobDic'][k][1].jobId, e))
                if str(e).rstrip() == "[Errno 10] No child processes":
                    pilotErrorDiag = "Exception caught by pilot watchdog: %s" % str(e)
                    pUtil.tolog("!!FAILED!!1000!! Pilot setting state to failed since there are no child processes")
                    pUtil.tolog("!!FAILED!!1000!! %s" % (pilotErrorDiag))
                    self.__env['jobDic'][k][1].result[0] = "failed"
                    self.__env['jobDic'][k][1].currentState = self.__env['jobDic'][k][1].result[0]
                    if self.__env['jobDic'][k][1].result[2] == 0:
                        self.__env['jobDic'][k][1].result[2] = error.ERR_NOCHILDPROCESSES
                    if self.__env['jobDic'][k][1].pilotErrorDiag == "":
                        self.__env['jobDic'][k][1].pilotErrorDiag = pilotErrorDiag
                else:
                    pass
            else:
                if _id:  # finished
                    rc = rc % 255  # exit code
                    if k == "prod":  # production job is done
                        self.__prodJobDone = True
                        pUtil.tolog("Production job is done")
                    if self.__env['jobDic'][k][1].result[0] not in ("finished", "failed", "holding"):
                        if not rc:  # rc=0, ok job
                            if not self.__env['jobDic'][k][1].result[1]:
                                self.__env['jobDic'][k][1].result[1] = rc  # transExitCode (because pilotExitCode is reported back by child job)
                        else:  # rc != 0, failed job
                            self.__env['jobDic'][k][1].result[1] = rc  # transExitCode
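Note: the watchdog polls with os.waitpid(pid, os.WNOHANG) and then derives an exit code with rc % 255. A minimal sketch of the standard status decoding via os.WIFEXITED/os.WEXITSTATUS, shown here only for comparison:

# Non-blocking child reaping sketch using the standard status macros;
# the pilot code above approximates this with rc % 255 instead.
import os

def poll_child(pid):
    reaped, status = os.waitpid(pid, os.WNOHANG)
    if reaped == 0:
        return None                      # child still running
    if os.WIFEXITED(status):
        return os.WEXITSTATUS(status)    # normal exit code
    if os.WIFSIGNALED(status):
        return -os.WTERMSIG(status)      # terminated by a signal
    return status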
Example #8
def verifyMultiTrf(jobParameterList, jobHomePackageList, jobTrfList, jobAtlasRelease):
    """ make sure that a multi-trf (or single trf) job is properly setup """

    error = PilotErrors()

    ec = 0
    pilotErrorDiag = ""
    N_jobParameterList = len(jobParameterList)
    N_jobHomePackageList = len(jobHomePackageList)
    N_jobTrfList = len(jobTrfList)
    N_jobAtlasRelease = len(jobAtlasRelease)

    # test jobs have multiple atlas releases defined, but not real tasks
    if N_jobTrfList > N_jobAtlasRelease and N_jobAtlasRelease == 1:
        # jobAtlasRelease = ['14.0.0'] -> ['14.0.0', '14.0.0']
        jobAtlasRelease = jobAtlasRelease*N_jobTrfList
        N_jobAtlasRelease = len(jobAtlasRelease)

    if (N_jobParameterList == N_jobHomePackageList) and \
       (N_jobHomePackageList == N_jobTrfList) and \
       (N_jobTrfList == N_jobAtlasRelease):
        if N_jobAtlasRelease == 1:
            tolog("Multi-trf verification succeeded (single job)")
        else:
            tolog("Multi-trf verification succeeded")
    else:
        pilotErrorDiag = "Multi-trf verification failed: N(jobPars) eq %d, but N(homepackage,transformation,AtlasRelease) eq (%d,%d,%d)" %\
                         (N_jobParameterList, N_jobHomePackageList, N_jobTrfList, N_jobAtlasRelease)
        tolog("!!FAILED!!2999!! %s" % (pilotErrorDiag))
        ec = error.ERR_SETUPFAILURE

    return ec, pilotErrorDiag, jobAtlasRelease
Example #9
    def put_data(self,
                 pfn,
                 ddm_storage,
                 fsize=0,
                 fchecksum=0,
                 dsname='',
                 extradirs='',
                 **pdict):
        """ Should be generic: executes setup and command after it """

        error = PilotErrors()
        pilotErrorDiag = ""

        if fsize == 0 or fchecksum == 0:
            ec, pilotErrorDiag, fsize, fchecksum = getLocalFileInfo(pfn, csumtype="adler32")
            if ec != 0:
                return SiteMover.put_data_retfail(ec, pilotErrorDiag)

        s, o = commands.getstatusoutput('source %s; %s %s %s' % (self._setup, self._copycmd, pfn, ddm_storage))
        if s != 0:
            check_syserr(s, o)
            pilotErrorDiag = "Error during copy: %s" % (o)
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            return SiteMover.put_data_retfail(error.ERR_STAGEOUTFAILED,
                                              pilotErrorDiag)
        # TODO: how are md5 and size controlled?
        return 0, pilotErrorDiag, ddm_storage, fsize, fchecksum, ARCH_DEFAULT
Example #10
    def put_data(self,
                 source,
                 ddm_storage,
                 fsize=0,
                 fchecksum=0,
                 dsname='',
                 **pdict):
        """ Data transfer using rfcp - generic version
        It's not advisable to use this right now because there's no
        easy way to register the srm space token if the file is
        copied with rfcp """

        error = PilotErrors()

        # Get input parameters from pdict
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 'castorSVC', lfn,
                                           guid)

        pilotErrorDiag = "put_data does not work for this mover"
        tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
        self.prepareReport('NOT_IMPL', report)
        return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)
Example #11
    def core_get_data(self, envsetup, token, source_surl, dest_path,
                      experiment):
        """ stage-in core function, can be overridden (see stormSiteMover) """

        error = PilotErrors()

        # determine which timeout option to use
        timeout_option = "--connect-timeout 300 --max-time %d" % (self.timeout)

        sslCert = self.sslCert
        sslKey = self.sslKey
        sslCertDir = self.sslCertDir

        # used curl options:
        # --cert: <cert[:passwd]> Client certificate file and password (SSL)
        # --capath: <directory> CA directory (made using c_rehash) to verify
        # --location: Follow Location: hints (H)
        # --output: <file> Write output to <file> instead of stdout
        # --silent: Makes Curl mute
        # --show-error: When used with -s it makes curl show error message if it fails
        # Removed for SL6: --ciphers <list of ciphers> (SSL)  Specifies  which  ciphers  to use in the connection.
        """ define curl command string """
        _cmd_str = 'lcg-gt %s https' % (source_surl)
        try:
            s, o = commands.getstatusoutput(_cmd_str)
            tolog("Executing command: %s" % (_cmd_str))
        except Exception, e:
            tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" %
                  (str(e), s, o))
            o = str(e)
Example #12
    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """ copy output file from disk to local SE """
        # function is based on dCacheSiteMover put function

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        alt = pdict.get('alt', False)
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        token = pdict.get('token', '')
        scope = pdict.get('scope', '')
        dsname = pdict.get('dsname', '')
        analysisJob = pdict.get('analJob', False)
        testLevel = pdict.get('testLevel', '0')
        extradirs = pdict.get('extradirs', '')
        experiment = pdict.get('experiment', '')
        proxycheck = pdict.get('proxycheck', False)
        prodSourceLabel = pdict.get('prodSourceLabel', '')

        # get the site information object
        si = getSiteInformation(experiment)

        tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))
        if prodSourceLabel == 'ddm' and analysisJob:
            tolog("Treating PanDA Mover job as a production job during stage-out")
            analysisJob = False

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 'xrdcp', lfn, guid)

        filename = os.path.basename(source)

        # get all the proper paths
        ec, pilotErrorDiag, tracer_error, dst_gpfn, lfcdir, surl = si.getProperPaths(
            error,
            analysisJob,
            token,
            prodSourceLabel,
            dsname,
            filename,
            scope=scope,
            alt=alt,
            sitemover=self)  # quick workaround
        if ec != 0:
            reportState = {}
            reportState["clientState"] = tracer_error
            self.prepareReport(reportState, report)
            return self.put_data_retfail(ec, pilotErrorDiag)

        # get the RSE from ToA
        try:
            _RSE = self.getRSE(surl=surl)
        except Exception, e:
            tolog("Warning: Failed to get RSE: %s (can not add this info to tracing report)" % str(e))
Example #13
        def sig2exc(sig, frm):
            """ signal handler """

            error = PilotErrors()
            runJob.setGlobalPilotErrorDiag("!!FAILED!!3000!! SIGTERM Signal %s is caught in child pid=%d!\n" % (sig, os.getpid()))
            tolog(runJob.getGlobalPilotErrorDiag())
            if sig == signal.SIGTERM:
                runJob.setGlobalErrorCode(error.ERR_SIGTERM)
            elif sig == signal.SIGQUIT:
                runJob.setGlobalErrorCode(error.ERR_SIGQUIT)
            elif sig == signal.SIGSEGV:
                runJob.setGlobalErrorCode(error.ERR_SIGSEGV)
            elif sig == signal.SIGXCPU:
                runJob.setGlobalErrorCode(error.ERR_SIGXCPU)
            elif sig == signal.SIGBUS:
                runJob.setGlobalErrorCode(error.ERR_SIGBUS)
            elif sig == signal.SIGUSR1:
                runJob.setGlobalErrorCode(error.ERR_SIGUSR1)
            else:
                runJob.setGlobalErrorCode(error.ERR_KILLSIGNAL)
            runJob.setFailureCode(runJob.getGlobalErrorCode())
            # print to stderr
            print >> sys.stderr, runJob.getGlobalPilotErrorDiag()
            raise SystemError(sig)
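Note: a handler like sig2exc only fires once it is registered. A registration sketch — the exact signal set is an assumption that mirrors the branches handled above:

# Registration sketch; the signal list is an assumption mirroring the
# branches in sig2exc above.
import signal

for _sig in (signal.SIGTERM, signal.SIGQUIT, signal.SIGSEGV,
             signal.SIGXCPU, signal.SIGBUS, signal.SIGUSR1):
    signal.signal(_sig, sig2exc)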
Example #14
    def core_get_data(self, envsetup, token, source_surl, dest_path,
                      experiment):
        """ stage-in core function, can be overridden (see stormSiteMover) """

        error = PilotErrors()

        # determine which timeout option to use
        if self.isNewLCGVersion("%s lcg-cp" % (envsetup)):
            timeout_option = "--srm-timeout=%d --connect-timeout=300 --sendreceive-timeout=%d" % (
                self.timeout, self.timeout)
        else:
            timeout_option = "-t %d" % (self.timeout)

        # used lcg-cp options:
        # --vo: specifies the Virtual Organization the user belongs to
        #   -t: time-out
        if token:
            # do not use option -b on SL3 clusters running older versions of lcg_utils
            use_b = True
            s, o = commands.getstatusoutput("%s lcg-cr --version" % (envsetup))
            if s != 0:
                # (BDII collects all information coming from site GIISes and stores them in a permanent database)
                tolog("(Probably too old lcg_utils - skipping BDII disabling)")
                use_b = False

            # for the time being
            use_b = False
            if use_b:
                _cmd_str = '%s lcg-cp --vo atlas --srcsetype srmv2 -s %s -b %s %s file://%s' %\
                           (envsetup, token, timeout_option, source_surl, dest_path)
            else:
                tolog("(Skipping space token for the time being)")
                _cmd_str = '%s lcg-cp --vo atlas %s %s file://%s' % (
                    envsetup, timeout_option, source_surl, dest_path)
        else:
            _cmd_str = '%s lcg-cp --vo atlas %s %s file://%s' % (
                envsetup, timeout_option, source_surl, dest_path)

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        # add the full stage-in command to the job setup script
        to_script = _cmd_str.replace("file://%s" % os.path.dirname(dest_path),
                                     "file://`pwd`")
        to_script = to_script.lstrip(' ')  # remove any initial spaces
        if to_script.startswith('/'):
            to_script = 'source ' + to_script
        thisExperiment.updateJobSetupScript(os.path.dirname(dest_path),
                                            to_script=to_script)

        tolog("Executing command: %s" % (_cmd_str))
        s = -1
        o = '(not defined)'
        t0 = os.times()
        try:
            s, o = commands.getstatusoutput(_cmd_str)
        except Exception, e:
            tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" %
                  (str(e), s, o))
            o = str(e)
Example #15
    def pollChildren(self):
        """
        check children processes, collect zombie jobs, and update jobDic status
        """
        
        pUtil.tolog("Watchdog to check children processes")
        error = PilotErrors()

        try:
            _id, rc = os.waitpid(self.__env['jobDic']['prod'][0], os.WNOHANG)
        except OSError, e:
            pUtil.tolog("Harmless exception when checking child process %s, %s" % (self.__env['jobDic']['prod'][0], e))
            if str(e).rstrip() == "[Errno 10] No child processes":
                pilotErrorDiag = "Exception caught by pilot watchdog: %s" % str(e)
                for j in self.__env['jobDic']['prod'][1]:
                    pUtil.tolog("Watchdog. JobID: %s, status [%s]" % (j.jobId, j.result[0])) 
                    if j.result[0] in ["finished", "failed", "holding", "transferring"]:
                        pUtil.tolog("Job: %s already %s" % (j.jobId, j.result[0]))
                    else:    
                        pUtil.tolog("!!FAILED!!1000!! Pilot setting state to failed since there are no child processes")
                        pUtil.tolog("!!FAILED!!1000!! %s" % (pilotErrorDiag))
                        pUtil.tolog("Watchdog will fail JobID: %s  status: [%s]" % (j.jobId, j.result[0]))
                        j.result[0] = "failed"
                        j.currentState = j.result[0]
                        if j.result[2] == 0:
                            j.result[2] = error.ERR_NOCHILDPROCESSES
                        if j.pilotErrorDiag == "":
                            j.pilotErrorDiag = pilotErrorDiag
            else:
                pass
Example #16
class DBReleaseHandler:
    """
    Methods for handling the DBRelease file and possibly skip it in the input file list
    In the presence of $[VO_ATLAS_SW_DIR|OSG_APP]/database, the pilot will use these methods to:
    1. Extract the requested DBRelease version from the job parameters string, if present
    2. Scan the $[VO_ATLAS_SW_DIR|OSG_APP]/database dir for available DBRelease files
    3. If the requested DBRelease file is available, continue [else, abort at this point]
    4. Create a DBRelease setup file containing necessary environment variables
    5. Create a new DBRelease file only containing the setup file in the input file directory
    6. Update the job state file
    7. Remove the DBRelease file from the input file list if all previous steps finished correctly
    """

    # private data members
    __error = PilotErrors()  # PilotErrors object
    __version = ""
    __DBReleaseDir = ""
    __filename = "DBRelease-%s.tar.gz"
    __setupFilename = "setup.py"
    __workdir = ""

    def __init__(self, workdir=""):
        """ Default initialization """

        _path = self.getDBReleaseDir()  # _path is a dummy variable
        self.__workdir = workdir

    def removeDBRelease(self, inputFiles, inFilesGuids, realDatasetsIn,
                        dispatchDblock, dispatchDBlockToken, prodDBlockToken):
        """ remove the given DBRelease files from the input file list """
        # will only remove the DBRelease files that are already available locally

        # identify all DBRelease files in the list (mark all for removal)
        # note: multi-trf jobs tend to have the same DBRelease file listed twice
        position = 0
        positions_list = []
        for f in inputFiles:
            if "DBRelease" in f:
                positions_list.append(position)
                tolog("Will remove file %s from input file list" % (f))
            position += 1

        # remove the corresponding guids, datasets and tokens
        for position in positions_list:
            try:
                del inputFiles[position]
            except Exception, e:
                tolog("!!WARNING!!1990!! Could not delete object %d in inFiles: %s" % (position, str(e)))
            else:
                tolog("Removed item %d in inFiles" % (position))
            try:
                del inFilesGuids[position]
            except Exception, e:
                tolog("!!WARNING!!1990!! Could not delete object %d in inFilesGuids: %s" % (position, str(e)))
            else:
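Note: the loop above deletes entries at the recorded positions in ascending order, so each del shifts the indices of the later entries and can remove the wrong items when more than one DBRelease entry is present. A sketch of the usual fix, iterating in reverse:

# Safer deletion sketch: walking positions_list in reverse keeps the
# remaining recorded indices valid after each del.
for position in sorted(positions_list, reverse=True):
    del inputFiles[position]
    del inFilesGuids[position]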
Example #17
    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """ copy input file from SE to local dir """

        error = PilotErrors()

        # Get input parameters from pdict
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        experiment = pdict.get('experiment', '')
        os_bucket_id = pdict.get('os_bucket_id', -1)

        # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
        useCT = pdict.get('usect', True)
        prodDBlockToken = pdict.get('access', '')

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 's3objectstore',
                                           lfn, guid)

        if path == '': path = './'
        fullname = os.path.join(path, lfn)

        status, output = self.stageIn(gpfn,
                                      fullname,
                                      fsize,
                                      fchecksum,
                                      experiment,
                                      os_bucket_id=os_bucket_id)

        if status == 0:
            updateFileState(lfn,
                            workDir,
                            jobId,
                            mode="file_state",
                            state="transferred",
                            ftype="input")
            state = "DONE"
        else:
            errors = PilotErrors()
            state = errors.getErrorName(status)
            if state is None:
                state = "PSTAGE_FAIL"

        # self.__sendReport(state, report)
        self.prepareReport(state, report)
        return status, output
Example #18
    def core_get_data(self, envsetup, token, source_surl, local_fullname,
                      experiment):
        """ special get function developed for storm sites """

        error = PilotErrors()

        # Transform the surl into a full surl
        full_se_endpoint = self.extractSE(readpar('se').split(",")[0])[1]
        prefix = os.path.commonprefix([source_surl, full_se_endpoint])
        if prefix:
            # Can use the bdii-free form
            source_surl = full_se_endpoint + source_surl[len(prefix):]
            _cmd_str = '%s lcg-gt --nobdii --setype srmv2 "%s" file' % (
                envsetup, source_surl)
        else:
            # Fallback solution, use old lcg-gt form
            # get the TURL using the SURL
            tolog("!!WARNING!!1234!! Source surl does not match %s, cannot use the bdii-independent lcg-gt" % full_se_endpoint)
            _cmd_str = '%s lcg-gt "%s" file' % (envsetup, source_surl)

        tolog("Executing command: %s" % (_cmd_str))
        t0 = os.times()
        s, o = commands.getstatusoutput(_cmd_str)
        t1 = os.times()
        t = t1[4] - t0[4]
        tolog("Command finished after %f s" % (t))
        if s == 0:
            # get the experiment object
            thisExperiment = getExperiment(experiment)

            # add the full stage-in command to the job setup script
            to_script = _cmd_str
            to_script = to_script.lstrip(' ')  # remove any initial spaces
            if to_script.startswith('/'):
                to_script = 'source ' + to_script
            thisExperiment.updateJobSetupScript(
                os.path.dirname(local_fullname), to_script=to_script)

            source_turl, req_token = o.split('\n')
            source_turl = source_turl.replace('file://', '')
            tolog("Creating link from %s to %s" %
                  (source_turl, local_fullname))
            try:
                os.symlink(source_turl, local_fullname)
                _cmd_str = '%s lcg-sd %s %s 0' % (envsetup, source_surl,
                                                  req_token)
                tolog("Executing command: %s" % (_cmd_str))
                s, o = commands.getstatusoutput(_cmd_str)
                # Do we need to check the exit status of lcg-sd? What do we do if it fails?
                tolog("get_data succeeded")
            except Exception, e:
                pilotErrorDiag = "Exception caught: %s" % str(e)
                tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                tolog("get_data failed")
                return error.ERR_STAGEINFAILED, pilotErrorDiag
Example #19
    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """ The local file (local access to the dCache file) is assumed to have a relative path
        that is the same of the relative path in the 'gpfn'
        loc_... are the variables used to access the file in the locally exported file system
        TODO: document GPFN format
        TODO: document better constraint
        """

        error = PilotErrors()
        pilotErrorDiag = ""

        # get the DQ2 tracing report
        try:
            report = pdict['report']
        except:
            report = {}
        else:
            # set the proper protocol
            report['protocol'] = 'HU'
            # mark the relative start
            report['relativeStart'] = time()
            # the current file
            report['filename'] = lfn
            # guid
            report['guid'] = guid.replace('-','')

        # get a proper envsetup
        envsetup = self.getEnvsetup(get=True)

        if self._setup:
            _setup_str = "source %s; " % self._setup
        else:
            _setup_str = envsetup

        try:
            timeout = pdict['timeout']
        except:
            timeout = 5*3600

        if gpfn.find('SFN') != -1:
            s = gpfn.split('SFN=')
            loc_pfn = s[1]
        else:
            _tmp = gpfn.split('/', 3)
            loc_pfn = '/'+_tmp[3]

        _cmd_str = '%snecp %s %s/%s' % (_setup_str, loc_pfn, path, lfn)
        tolog("NECP executing (timeout %s): %s" % (timeout, _cmd_str))
        report['transferStart'] = time()
        try:
            s, telapsed, cout, cerr = timed_command(_cmd_str, timeout)
        except Exception, e:
            tolog("!!WARNING!!2999!! timed_command() threw an exception: %s" % str(e))
            s = 1
            o = str(e)
            telapsed = timeout
Example #20
def getOutFilesGuids(outFiles, workdir, experiment, TURL=False):
    """ get the outFilesGuids from the PFC """

    ec = 0
    pilotErrorDiag = ""
    outFilesGuids = []

    # Get the experiment object and the GUID source filename
    thisExperiment = getExperiment(experiment)
    filename = thisExperiment.getGUIDSourceFilename()

    # If a source file should not be used (ie empty filename string), then generate the GUIDs here
    if filename == "":
        tolog("Pilot will generate GUIDs for the output files")
        for i in range(0, len(outFiles)):
            guid = getGUID()
            if guid == "":
                guid = "- GUID generation failed -"
            outFilesGuids.append(guid)

        return ec, pilotErrorDiag, outFilesGuids
    else:
        tolog("Pilot will get GUIDs for the output files from source: %s" %
              (filename))
        pfcFile = os.path.join(workdir, filename)  # i.e. "%s/PoolFileCatalog.xml" % (workdir)

    # The PFC used for Event Service will be TURL based, use the corresponding file
    if TURL:
        pfcFile = pfcFile.replace(".xml", "TURL.xml")

    # Initialization: make sure the guid list has the same length as the file list
    for i in range(0, len(outFiles)):
        outFilesGuids.append(None)

    # make sure the PFC exists
    if os.path.isfile(pfcFile):
        from xml.dom import minidom
        xmldoc = minidom.parse(pfcFile)
        fileList = xmldoc.getElementsByTagName("File")
        for thisfile in fileList:
            gpfn = str(
                thisfile.getElementsByTagName("pfn")[0].getAttribute("name"))
            guid = str(thisfile.getAttribute("ID"))
            for i in range(0, len(outFiles)):
                if outFiles[i] == gpfn:
                    outFilesGuids[i] = guid
    else:
        pilotErrorDiag = "PFC file does not exist: %s" % (pfcFile)
        tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
        error = PilotErrors()
        ec = error.ERR_MISSINGPFC

    return ec, pilotErrorDiag, outFilesGuids
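Note: getOutFilesGuids() matches each output file name against the pfn "name" attribute and reads the GUID from the File element's ID attribute. A self-contained sketch of the catalog shape this minidom lookup expects — the file name and GUID values are invented for illustration:

# Hypothetical PoolFileCatalog fragment matching the minidom lookup above;
# the file name and GUID values are invented for illustration.
from xml.dom import minidom

pfc = '''<?xml version="1.0" encoding="UTF-8"?>
<POOLFILECATALOG>
  <File ID="8A2C41F0-0000-0000-0000-000000000000">
    <physical>
      <pfn filetype="ROOT_All" name="AOD.01234._000001.pool.root"/>
    </physical>
  </File>
</POOLFILECATALOG>'''

xmldoc = minidom.parseString(pfc)
for thisfile in xmldoc.getElementsByTagName("File"):
    gpfn = str(thisfile.getElementsByTagName("pfn")[0].getAttribute("name"))
    guid = str(thisfile.getAttribute("ID"))
    print gpfn, guid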
Example #21
    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """ copy input file from SE to local dir """

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        experiment = pdict.get('experiment', '')
        transferType = pdict.get('transferType', '')
        proxycheck = pdict.get('proxycheck', False)

        # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
        useCT = pdict.get('usect', True)
        prodDBlockToken = pdict.get('access', '')

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 'local', lfn, guid)

        tolog("transferType=%s" % (transferType))
        status, output = self.getStageInMode(lfn, prodDBlockToken,
                                             transferType)
        tolog("output=%s" % str(output))
        if output["transfer_mode"]:
            updateFileState(lfn,
                            workDir,
                            jobId,
                            mode="transfer_mode",
                            state=output["transfer_mode"],
                            ftype="input")
            tolog("updated file state for lfn=%s, workDir=%s, jobId=%s, state=%s" % (lfn, workDir, jobId, output["transfer_mode"]))
        if status != 0:
            self.prepareReport(output["report"], report)
            return status, output["errorLog"]

        if path == '': path = './'
        fullname = os.path.join(path, lfn)

        status, output = self.stageIn(gpfn, fullname, fsize, fchecksum,
                                      experiment)

        if status == 0:
            updateFileState(lfn,
                            workDir,
                            jobId,
                            mode="file_state",
                            state="transferred",
                            ftype="input")

        self.prepareReport(output["report"], report)
        return status, output["errorLog"]
Example #22
def updateOutPFC(job, **kwargs):
    file_name = "OutPutFileCatalog.xml"
    file_path = os.path.join(DorE(kwargs, 'thisSite').workdir, file_name)
    try:
        guids_status = pUtil.PFCxml(job.experiment, file_path, job.outFiles, fguids=job.outFilesGuids, fntag="pfn",
                                    analJob=pUtil.isAnalysisJob(job.trf.split(",")[0]), jr=True)
    except Exception, e:
        log("!!FAILED!!1105!! Exception caught (Could not generate xml for the remaining output files): %s" % str(e))
        job.result[2] = PilotErrors().ERR_LOSTJOBXML
        return False
Example #23
    def setupNordugridTrf(self, job, analysisJob, wgetCommand, pilot_initdir):
        """ perform the Nordugrid trf setup """

        error = PilotErrors()
        pilotErrorDiag = ""
        cmd = ""

        # assume that the runtime script has already been created
        if 'RUNTIME_CONFIG_DIR' not in os.environ:
            pilotErrorDiag = "Environment variable not set: RUNTIME_CONFIG_DIR"
            tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
            return error.ERR_SETUPFAILURE, pilotErrorDiag, ""

        runtime_script = "%s/APPS/HEP/ATLAS-%s" % (
            os.environ['RUNTIME_CONFIG_DIR'], job.release)
        if os.path.exists(runtime_script):
            cmd = ". %s 1" % (runtime_script)
            if analysisJob:
                # try to download the analysis trf
                status, pilotErrorDiag, trfName = self.getAnalysisTrf(
                    wgetCommand, job.trf, pilot_initdir)
                if status != 0:
                    return status, pilotErrorDiag, ""
                trfName = "./" + trfName
            else:
                trfName = job.trf
                cmd += '; export ATLAS_RELEASE=%s;export AtlasVersion=%s;export AtlasPatchVersion=%s' % (
                    job.homePackage.split('/')[-1],
                    job.homePackage.split('/')[-1],
                    job.homePackage.split('/')[-1])
            cmd += "; %s %s" % (trfName, job.jobPars)
        elif verifyReleaseString(job.release) == "NULL":
            if analysisJob:
                # try to download the analysis trf
                status, pilotErrorDiag, trfName = self.getAnalysisTrf(
                    wgetCommand, job.trf, pilot_initdir)
                if status != 0:
                    return status, pilotErrorDiag, ""
                trfName = "./" + trfName
            else:
                trfName = job.trf
            cmd = "%s %s" % (trfName, job.jobPars)
        else:
            pilotErrorDiag = "Could not locate runtime script: %s" % (
                runtime_script)
            tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
            return error.ERR_SETUPFAILURE, pilotErrorDiag, ""

        # correct for multi-core if necessary (especially important in case coreCount=1 to limit parallel make)
        cmd = self.addMAKEFLAGS(job.coreCount, "") + cmd

        return 0, pilotErrorDiag, cmd
Example #24
    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """ Executes setup and command after it """

        error = PilotErrors()
        ec = 0
        pilotErrorDiag = ""
        s, o = commands.getstatusoutput('source %s; %s %s %s' % (self._setup, self._copycmd, gpfn, path))
        if s != 0:
            pilotErrorDiag = "Error during copy: %s" % (o)
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            ec = error.ERR_STAGEINFAILED
        # TODO: how are md5 and size controlled? some processing?
        return ec, pilotErrorDiag
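Note: the commands module used throughout these movers exists only in Python 2; it was removed in Python 3, where subprocess.getstatusoutput() offers the same (status, output) interface. A portability sketch:

# Portability sketch: commands.getstatusoutput is Python 2-only; on
# Python 3, subprocess.getstatusoutput provides the same interface.
try:
    from commands import getstatusoutput    # Python 2
except ImportError:
    from subprocess import getstatusoutput  # Python 3

s, o = getstatusoutput('echo hello')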
Example #25
File: Diagnosis.py Project: vokac/pilot
class Diagnosis(object):

    # private data members
    __instance = None  # Boolean used by subclasses to become a Singleton
    __error = PilotErrors()  # PilotErrors object

    # Required methods

    def __init__(self):
        """ Default initialization """

        # e.g. self.__errorLabel = errorLabel
        pass
Example #26
    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """ copy output file from disk to local SE """
        # function is based on dCacheSiteMover put function

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        alt = pdict.get('alt', False)
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        token = pdict.get('token', '')
        scope = pdict.get('scope', '')
        dsname = pdict.get('dsname', '')
        analysisJob = pdict.get('analJob', False)
        testLevel = pdict.get('testLevel', '0')
        extradirs = pdict.get('extradirs', '')
        experiment = pdict.get('experiment', '')
        logPath = pdict.get('logPath', '')
        proxycheck = pdict.get('proxycheck', False)
        prodSourceLabel = pdict.get('prodSourceLabel', '')

        # get the site information object
        si = getSiteInformation(experiment)

        tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))
        if prodSourceLabel == 'ddm' and analysisJob:
            tolog("Treating PanDA Mover job as a production job during stage-out")
            analysisJob = False

        # get the DQ2 tracing report
        report = self.getStubTracingReport(pdict['report'],
                                           'xrootdObjectstore', lfn, guid)

        filename = os.path.basename(source)

        if logPath != "":
            surl = logPath
        else:
            surl = os.path.join(destination, lfn)

        # get the DQ2 site name from ToA
        try:
            _dq2SiteName = self.getDQ2SiteName(surl=surl)
        except Exception, e:
            tolog("Warning: Failed to get the DQ2 site name: %s (can not add this info to tracing report)" % str(e))
Example #27
def getFinalState(result):
    """
    Figure out the final job state (finished or failed)
    Simplifies job recovery
    """

    state = "failed"

    # job has failed if transExitCode != 0
    if result[1] != 0:
        state = "failed"
    else:
        error = PilotErrors()
        # job has finished if pilotErrorCode is in the allowed list or recoverable jobs
        if ((error.isRecoverableErrorCode(result[2])) or (result[2] == error.ERR_KILLSIGNAL and result[0] == "holding")):
            state = "finished"

    return state
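Note: getFinalState() reads the job result triplet, where result[0] is the state string, result[1] the transformation exit code and result[2] the pilot error code. A usage sketch — the example values are invented for illustration:

# Usage sketch: result = [state, transExitCode, pilotErrorCode]; the
# example values are invented for illustration.
print getFinalState(["finished", 1, 0])                            # 'failed': transExitCode != 0
print getFinalState(["holding", 0, PilotErrors().ERR_KILLSIGNAL])  # 'finished': killed while in holding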
Example #28
def prepareOutFiles(outFiles, logFile, workdir, fullpath=False):
    """ verify and prepare the output files for transfer """

    # fullpath = True means that the file in outFiles already has a full path, adding it to workdir is then not needed
    ec = 0
    pilotErrorDiag = ""
    outs = []
    modt = []

    from SiteMover import SiteMover
    for outf in outFiles:
        if outf and outf != 'NULL':  # non-empty string and not NULL
            path = os.path.join(workdir, outf)
            if (not os.path.isfile(path) and not fullpath) or (not os.path.isfile(outf) and fullpath):
                pilotErrorDiag = "Expected output file %s does not exist" % (
                    path)
                tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
                error = PilotErrors()
                ec = error.ERR_MISSINGOUTPUTFILE
                break
            else:
                tolog("outf = %s" % (outf))
                if fullpath:
                    # remove the full path here from outf
                    workdir = os.path.dirname(outf)
                    outf = os.path.basename(outf)

                outs.append(outf)

                # get the modification time for the file (needed by NG)
                modt.append(SiteMover.getModTime(workdir, outf))

                tolog("Output file(s):")
                try:
                    _ec, _rs = commands.getstatusoutput("ls -l %s/%s" %
                                                        (workdir, outf))
                except Exception, e:
                    tolog(str(e))
                else:
                    tolog(_rs)
Example #29
    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """
        Move the file from the current local directory to the local pilot init dir

        Parameters are:
        source -- full path of the file in local directory
        destination -- destination SE, method://[hostname[:port]]/full-dir-path/ (NB: no file name) NOT USED (pinitdir is used instead)
        fsize -- file size of the source file (evaluated if 0)
        fchecksum -- MD5 checksum of the source file (evaluated if 0)
        pdict -- to allow additional parameters that may make sense with specific movers
        
        Assume that the pilot init dir is locally mounted and its local path is the same as the remote path
        if both fsize and fchecksum (for the source) are given and !=0 these are assumed without reevaluating them
        returns: exitcode, pilotErrorDiag, gpfn, fsize, fchecksum
        """

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        DN = pdict.get('DN', '')
        dsname = pdict.get('dsname', '')
        analJob = pdict.get('analJob', False)
        sitename = pdict.get('sitename', '')
        testLevel = pdict.get('testLevel', '0')
        pilot_initdir = pdict.get('pinitdir', '')
        experiment = pdict.get('experiment', "ATLAS")

        # get the site information object
        si = getSiteInformation(experiment)

        # are we on a tier 3?
        if si.isTier3():
            outputDir = self.getTier3Path(dsname, DN)
            tolog("Writing output on a Tier 3 site to: %s" % (outputDir))

            # create the dirs if they don't exist
            try:
                self.mkdirWperm(outputDir)
            except Exception, e:
                tolog("!!WARNING!!2999!! Could not create dir: %s, %s" %
                      (outputDir, str(e)))
Example #30
class ErrorDiagnosis(Diagnosis):

    # private data members
    __instance = None  # Boolean used by subclasses to become a Singleton
    __error = PilotErrors()  # PilotErrors object

    def __init__(self):
        """ Default initialization """

        # e.g. self.__errorLabel = errorLabel
        pass

    def __new__(cls, *args, **kwargs):
        """ Override the __new__ method to make the class a singleton """

        if not cls.__instance:
            cls.__instance = super(ErrorDiagnosis, cls).__new__(cls, *args, **kwargs)

        return cls.__instance

    def interpretPayload(self, job, res, getstatusoutput_was_interrupted,
                         current_job_number, runCommandList, failureCode):
        """ Interpret the payload, look for specific errors in the stdout """

        # get the experiment object
        thisExperiment = getExperiment(job.experiment)
        if not thisExperiment:
            job.pilotErrorDiag = "ErrorDiagnosis did not get an experiment object from the factory"
            job.result[2] = self.__error.ERR_GENERALERROR  # change to better/new error code
            tolog("!!WARNING!!3234!! %s" % (job.pilotErrorDiag))
            return job

        ### WARNING: EXPERIMENT SPECIFIC, MOVE LATER
        try:
            ec, pilotErrorDiag = self.processJobReport(job.workdir)
        except Exception, e:
            tolog("!!WARNING!!1114!! Caught exception: %s" % (e))
        else:
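Note: Diagnosis and ErrorDiagnosis both combine a class-level __instance flag with a __new__ override to behave as singletons. A minimal generic sketch of the same pattern:

# Minimal generic sketch of the __new__-based singleton pattern used by
# Diagnosis/ErrorDiagnosis above.
class Singleton(object):
    __instance = None

    def __new__(cls, *args, **kwargs):
        if not cls.__instance:
            cls.__instance = super(Singleton, cls).__new__(cls)
        return cls.__instance

a = Singleton()
b = Singleton()
assert a is b  # both names bind to the single shared instance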