Esempio n. 1
0
    def stageInFile(self, source, destination):
        """Stage in a single file with xrdcp.

        :param source: remote replica to copy from.
        :param destination: local path to copy to.

        Builds the transfer command, registers it in the job setup
        script, then runs it under a timeout via TimerCommand.
        NOTE(review): the tail of this method (status interpretation /
        return) is not visible in this chunk.
        """
        # status/output accumulators; "report" carries tracing fields
        statusRet = 0
        outputRet = {}
        outputRet["errorLog"] = None
        outputRet["report"] = {}
        outputRet["report"]["clientState"] = None

        self.log("StageIn files started.")
        # -np: no progress bar; self._setup is the site setup prefix
        _cmd_str = '%s xrdcp -np %s %s' % (self._setup, source, destination)

        # update job setup script
        thisExperiment = getExperiment(self.__experiment)
        # add the full transfer command to the job setup script,
        # rewriting the destination so the script works from any cwd
        to_script = _cmd_str.replace(destination, "`pwd`/%s" % os.path.basename(destination))
        to_script = to_script.lstrip(' ') # remove any initial spaces
        if to_script.startswith('/'):
            to_script = 'source ' + to_script
        thisExperiment.updateJobSetupScript(os.path.dirname(destination), to_script=to_script)

        self.log('Executing command: %s' % (_cmd_str))
        # defaults in case the command never runs
        s = -1
        o = '(not defined)'
        t0 = os.times()
        # tracing-report timestamps (seconds since epoch)
        outputRet["report"]['relativeStart'] = time()
        outputRet["report"]['transferStart'] = time()
        try:
            # run the copy under a hard timeout
            timerCommand = TimerCommand(_cmd_str)
            s, o = timerCommand.run(timeout=self.timeout)
        except Exception, e:
            tolog("!!WARNING!!2990!! Exception caught by stageInFile(): %s" % (str(e)))
            o = str(e)
Esempio n. 2
0
    def stageInFile(self, source, destination):
        """Stage in a single file with xrdcp.

        :param source: remote replica to copy from.
        :param destination: local path to copy to.

        Builds the transfer command, registers it in the job setup
        script, then runs it under a timeout via TimerCommand.
        NOTE(review): the tail of this method (status interpretation /
        return) is not visible in this chunk.
        """
        # status/output accumulators; "report" carries tracing fields
        statusRet = 0
        outputRet = {}
        outputRet["errorLog"] = None
        outputRet["report"] = {}
        outputRet["report"]["clientState"] = None

        self.log("StageIn files started.")
        # -np: no progress bar; self._setup is the site setup prefix
        _cmd_str = '%s xrdcp -np %s %s' % (self._setup, source, destination)

        # update job setup script
        thisExperiment = getExperiment(self.__experiment)
        # add the full transfer command to the job setup script,
        # rewriting the destination so the script works from any cwd
        to_script = _cmd_str.replace(
            destination, "`pwd`/%s" % os.path.basename(destination))
        to_script = to_script.lstrip(' ')  # remove any initial spaces
        if to_script.startswith('/'):
            to_script = 'source ' + to_script
        thisExperiment.updateJobSetupScript(os.path.dirname(destination),
                                            to_script=to_script)

        self.log('Executing command: %s' % (_cmd_str))
        # defaults in case the command never runs
        s = -1
        o = '(not defined)'
        t0 = os.times()
        # tracing-report timestamps (seconds since epoch)
        outputRet["report"]['relativeStart'] = time()
        outputRet["report"]['transferStart'] = time()
        try:
            # run the copy under a hard timeout
            timerCommand = TimerCommand(_cmd_str)
            s, o = timerCommand.run(timeout=self.timeout)
        except Exception, e:
            tolog("!!WARNING!!2990!! Exception caught by stageInFile(): %s" %
                  (str(e)))
            o = str(e)
Esempio n. 3
0
    def setupHPCEvent(self):
        """Prepare site, node and experiment objects for an HPC event job.

        Side effects: creates the work directory if needed, points the
        pilot log at the configured file, and populates
        self.__jobSite, self.__node and self.__thisExperiment.
        """
        self.__jobSite = Site.Site()
        self.__jobSite.setSiteInfo(self.argumentParser())
        ## For HPC job, we don't need to reassign the workdir
        # reassign workdir for this job
        self.__jobSite.workdir = self.__jobSite.wntmpdir
        if not os.path.exists(self.__jobSite.workdir):
            os.makedirs(self.__jobSite.workdir)

        tolog("runJobHPCEvent.getPilotLogFilename=%s" %
              self.getPilotLogFilename())
        if self.getPilotLogFilename() != "":
            # redirect the pilot log to the configured file
            pUtil.setPilotlogFilename(self.getPilotLogFilename())

        # set node info (hostname + worker-node details from the workdir)
        self.__node = Node.Node()
        self.__node.setNodeName(os.uname()[1])
        self.__node.collectWNInfo(self.__jobSite.workdir)

        # redirect stderr
        #sys.stderr = open("%s/runJobHPCEvent.stderr" % (self.__jobSite.workdir), "w")

        tolog("Current job workdir is: %s" % os.getcwd())
        tolog("Site workdir is: %s" % self.__jobSite.workdir)

        # get the experiment object
        self.__thisExperiment = getExperiment(self.getExperiment())
        tolog("runEvent will serve experiment: %s" %
              (self.__thisExperiment.getExperiment()))
Esempio n. 4
0
    def extractJobInformation(self, job, runCommandList):
        """ Extract relevant job information, e.g. number of events """
        # :param job: job object; nEvents/nEventsW are updated in place.
        # :param runCommandList: trf command list; its length is passed on
        #   as the number of jobs when counting events.
        # Returns the job object on early failure; NOTE(review): the
        # success-path return is not visible in this chunk.

        # get the experiment object
        thisExperiment = getExperiment(job.experiment)
        if not thisExperiment:
            job.pilotErrorDiag = "ErrorDiagnosis did not get an experiment object from the factory"
            job.result[
                2] = self.__error.ERR_GENERALERROR  # change to better/new error code
            tolog("!!WARNING!!3234!! %s" % (job.pilotErrorDiag))
            return job

        # note that this class should not be experiment specific, so move anything related to ATLAS to ATLASExperiment.py
        # and use thisExperiment.whatever() to retrieve it here

        # grab the number of events
        try:
            # nEvents_str can be a string of the form N|N|..|N with the number of jobs in the trf(s) [currently not used]
            # Add to Job class if necessary
            job.nEvents, job.nEventsW, nEvents_str = thisExperiment.getNumberOfEvents(
                job=job, number_of_jobs=len(runCommandList))
        except Exception, e:
            # event counting is best-effort: log and continue
            tolog(
                "!!WARNING!!2999!! Failed to get number of events: %s (ignore)"
                % str(e))
Esempio n. 5
0
    def core_get_data(self, envsetup, token, source_surl, dest_path,
                      experiment):
        """ stage-in core function, can be overridden (see stormSiteMover) """
        # Copies source_surl to dest_path with lcg-cp, after registering
        # the command in the job setup script.
        # :param envsetup: environment setup string prefixed to every command.
        # :param token: space token (currently ignored, see below).
        # NOTE(review): the tail of this method (status interpretation /
        # return) is not visible in this chunk.

        error = PilotErrors()

        # determine which timeout option to use
        if self.isNewLCGVersion("%s lcg-cp" % (envsetup)):
            timeout_option = "--srm-timeout=%d --connect-timeout=300 --sendreceive-timeout=%d" % (
                self.timeout, self.timeout)
        else:
            timeout_option = "-t %d" % (self.timeout)

        # used lcg-cp options:
        # --vo: specifies the Virtual Organization the user belongs to
        #   -t: time-out
        if token:
            # do not use option -b on SL3 clusters running older versions of lcg_utils
            use_b = True
            s, o = commands.getstatusoutput("%s lcg-cr --version" % (envsetup))
            if s != 0:
                # (BDII collects all information coming from site GIISes and stores them in a permanent database)
                tolog("(Probably too old lcg_utils - skipping BDII disabling)")
                use_b = False

            # for the time being: unconditionally disabled, so the token
            # branch below is effectively dead code
            use_b = False
            if use_b:
                _cmd_str = '%s lcg-cp --vo atlas --srcsetype srmv2 -s %s -b %s %s file://%s' %\
                           (envsetup, token, timeout_option, source_surl, dest_path)
            else:
                tolog("(Skipping space token for the time being)")
                _cmd_str = '%s lcg-cp --vo atlas %s %s file://%s' % (
                    envsetup, timeout_option, source_surl, dest_path)
        else:
            _cmd_str = '%s lcg-cp --vo atlas %s %s file://%s' % (
                envsetup, timeout_option, source_surl, dest_path)

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        # add the full transfer command to the job setup script,
        # rewriting the destination dir so the script works from any cwd
        to_script = _cmd_str.replace("file://%s" % os.path.dirname(dest_path),
                                     "file://`pwd`")
        to_script = to_script.lstrip(' ')  # remove any initial spaces
        if to_script.startswith('/'):
            to_script = 'source ' + to_script
        thisExperiment.updateJobSetupScript(os.path.dirname(dest_path),
                                            to_script=to_script)

        tolog("Executing command: %s" % (_cmd_str))
        # defaults in case the command never runs
        s = -1
        o = '(not defined)'
        t0 = os.times()
        try:
            s, o = commands.getstatusoutput(_cmd_str)
        except Exception, e:
            tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" %
                  (str(e), s, o))
            o = str(e)
Esempio n. 6
0
    def setupHPCEvent(self):
        """Prepare site, node and experiment objects for an HPC event job.

        Side effects: creates the work directory if needed, points the
        pilot log at the configured file, and populates
        self.__jobSite, self.__node and self.__thisExperiment.
        """
        self.__jobSite = Site.Site()
        self.__jobSite.setSiteInfo(self.argumentParser())
        ## For HPC job, we don't need to reassign the workdir
        # reassign workdir for this job
        self.__jobSite.workdir = self.__jobSite.wntmpdir
        if not os.path.exists(self.__jobSite.workdir):
            os.makedirs(self.__jobSite.workdir)


        tolog("runJobHPCEvent.getPilotLogFilename=%s"% self.getPilotLogFilename())
        if self.getPilotLogFilename() != "":
            # redirect the pilot log to the configured file
            pUtil.setPilotlogFilename(self.getPilotLogFilename())

        # set node info (hostname + worker-node details from the workdir)
        self.__node = Node.Node()
        self.__node.setNodeName(os.uname()[1])
        self.__node.collectWNInfo(self.__jobSite.workdir)

        # redirect stderr
        #sys.stderr = open("%s/runJobHPCEvent.stderr" % (self.__jobSite.workdir), "w")

        tolog("Current job workdir is: %s" % os.getcwd())
        tolog("Site workdir is: %s" % self.__jobSite.workdir)

        # get the experiment object
        self.__thisExperiment = getExperiment(self.getExperiment())
        tolog("runEvent will serve experiment: %s" % (self.__thisExperiment.getExperiment()))
Esempio n. 7
0
    def verifySetupProxy(self, _setupStr, experiment):
        """Check whether a valid grid proxy is available.

        The check itself is experiment specific, so it is delegated to
        the experiment object obtained from the factory.

        :param _setupStr: environment setup string used for the check.
        :param experiment: experiment name.
        :return: (status, output) pair from verifyProxy().
        """
        exp_obj = getExperiment(experiment)
        return exp_obj.verifyProxy(envsetup=_setupStr)
Esempio n. 8
0
    def verifySetupProxy(self, _setupStr, experiment):
        """Verify that a usable grid proxy exists.

        Forwards the proxy check to the experiment object since it is
        experiment specific; returns the (status, output) pair that
        verifyProxy() produces.
        """
        status, output = getExperiment(experiment).verifyProxy(envsetup=_setupStr)
        return status, output
Esempio n. 9
0
    def core_get_data(self, envsetup, token, source_surl, local_fullname,
                      experiment):
        """ special get function developed for storm sites """
        # Resolves the SURL to a local TURL with lcg-gt, then symlinks the
        # file into place instead of copying it (StoRM exposes the file
        # system directly).  Returns (error code, diagnostics) on failure;
        # NOTE(review): the success-path return is not visible in this chunk.

        error = PilotErrors()

        # Transform the surl into a full surl
        full_se_endpoint = self.extractSE(readpar('se').split(",")[0])[1]
        prefix = os.path.commonprefix([source_surl, full_se_endpoint])
        if prefix:
            # Can use the bdii-free form
            source_surl = full_se_endpoint + source_surl[len(prefix):]
            _cmd_str = '%s lcg-gt --nobdii --setype srmv2 "%s" file' % (
                envsetup, source_surl)
        else:
            # Fallback solution, use old lcg-gt form
            # get the TURL using the SURL
            tolog(
                "!!WARNING!1234!! Source surl does not match %s, cannot use the bdii-independent lcg-gt"
                % full_se_endpoint)
            _cmd_str = '%s lcg-gt "%s" file' % (envsetup, source_surl)

        tolog("Executing command: %s" % (_cmd_str))
        t0 = os.times()
        s, o = commands.getstatusoutput(_cmd_str)
        t1 = os.times()
        # os.times()[4] is elapsed wall-clock time
        t = t1[4] - t0[4]
        tolog("Command finished after %f s" % (t))
        if s == 0:
            # get the experiment object
            thisExperiment = getExperiment(experiment)

            # add the resolve command to the job setup script
            to_script = _cmd_str
            to_script = to_script.lstrip(' ')  # remove any initial spaces
            if to_script.startswith('/'):
                to_script = 'source ' + to_script
            thisExperiment.updateJobSetupScript(
                os.path.dirname(local_fullname), to_script=to_script)

            # lcg-gt prints "<turl>\n<request token>"
            source_turl, req_token = o.split('\n')
            source_turl = source_turl.replace('file://', '')
            tolog("Creating link from %s to %s" %
                  (source_turl, local_fullname))
            try:
                os.symlink(source_turl, local_fullname)
                # release the SRM request with lcg-sd
                _cmd_str = '%s lcg-sd %s %s 0' % (envsetup, source_surl,
                                                  req_token)
                tolog("Executing command: %s" % (_cmd_str))
                s, o = commands.getstatusoutput(_cmd_str)
                # Do we need to check the exit status of lcg-sd? What do we do if it fails?
                tolog("get_data succeeded")
            except Exception, e:
                pilotErrorDiag = "Exception caught: %s" % str(e)
                tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                tolog("get_data failed")
                return error.ERR_STAGEINFAILED, pilotErrorDiag
Esempio n. 10
0
def getOutFilesGuids(outFiles, workdir, experiment, TURL=False):
    """Collect the GUIDs of the output files.

    The GUIDs are read from the PFC (PoolFileCatalog) XML in *workdir*,
    unless the experiment declares no GUID source file, in which case
    the pilot generates them itself.

    :param outFiles: list of output file names to resolve.
    :param workdir: directory containing the PFC file.
    :param experiment: experiment name used to get the experiment object.
    :param TURL: use the TURL-based PFC variant (Event Service).
    :return: tuple (ec, pilotErrorDiag, outFilesGuids); entries that
        could not be resolved from the PFC are left as None.
    """
    ec = 0
    pilotErrorDiag = ""

    # Ask the experiment where GUIDs come from
    thisExperiment = getExperiment(experiment)
    filename = thisExperiment.getGUIDSourceFilename()

    if filename == "":
        # No source file configured: make up the GUIDs locally
        tolog("Pilot will generate GUIDs for the output files")
        outFilesGuids = []
        for _ in outFiles:
            g = getGUID()
            outFilesGuids.append(g if g != "" else "- GUID generation failed -")
        return ec, pilotErrorDiag, outFilesGuids

    tolog("Pilot will get GUIDs for the output files from source: %s" %
          (filename))
    pfcFile = os.path.join(workdir, filename)

    # The PFC used for Event Service is TURL based; switch to that file
    if TURL:
        pfcFile = pfcFile.replace(".xml", "TURL.xml")

    # One slot per output file; None marks "not found in the PFC"
    outFilesGuids = [None] * len(outFiles)

    if not os.path.isfile(pfcFile):
        pilotErrorDiag = "PFC file does not exist: %s" % (pfcFile)
        tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
        ec = PilotErrors().ERR_MISSINGPFC
        return ec, pilotErrorDiag, outFilesGuids

    from xml.dom import minidom
    xmldoc = minidom.parse(pfcFile)
    for fileNode in xmldoc.getElementsByTagName("File"):
        gpfn = str(fileNode.getElementsByTagName("pfn")[0].getAttribute("name"))
        guid = str(fileNode.getAttribute("ID"))
        for idx, lfn in enumerate(outFiles):
            if lfn == gpfn:
                outFilesGuids[idx] = guid

    return ec, pilotErrorDiag, outFilesGuids
Esempio n. 11
0
    def core_get_data(self, envsetup, token, source_surl, dest_path, experiment):
        """ stage-in core function, can be overridden (see stormSiteMover) """
        # Copies source_surl to dest_path with lcg-cp, after registering
        # the command in the job setup script.
        # :param envsetup: environment setup string prefixed to every command.
        # :param token: space token (currently ignored, see below).
        # NOTE(review): the tail of this method (status interpretation /
        # return) is not visible in this chunk.

        error = PilotErrors()

        # determine which timeout option to use
        if self.isNewLCGVersion("%s lcg-cp" % (envsetup)):
            timeout_option = "--srm-timeout=%d --connect-timeout=300 --sendreceive-timeout=%d" % (self.timeout, self.timeout)
        else:
            timeout_option = "-t %d" % (self.timeout)

        # used lcg-cp options:
        # --vo: specifies the Virtual Organization the user belongs to
        #   -t: time-out
        if token:
            # do not use option -b on SL3 clusters running older versions of lcg_utils
            use_b = True
            s, o = commands.getstatusoutput("%s lcg-cr --version" % (envsetup))
            if s != 0:
                # (BDII collects all information coming from site GIISes and stores them in a permanent database)
                tolog("(Probably too old lcg_utils - skipping BDII disabling)")
                use_b = False

            # for the time being: unconditionally disabled, so the token
            # branch below is effectively dead code
            use_b = False
            if use_b:
                _cmd_str = '%s lcg-cp --vo atlas --srcsetype srmv2 -s %s -b %s %s file://%s' %\
                           (envsetup, token, timeout_option, source_surl, dest_path)
            else:
                tolog("(Skipping space token for the time being)")
                _cmd_str = '%s lcg-cp --vo atlas %s %s file://%s' % (envsetup, timeout_option, source_surl, dest_path)
        else:
            _cmd_str = '%s lcg-cp --vo atlas %s %s file://%s' % (envsetup, timeout_option, source_surl, dest_path)

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        # add the full transfer command to the job setup script,
        # rewriting the destination dir so the script works from any cwd
        to_script = _cmd_str.replace("file://%s" % os.path.dirname(dest_path), "file://`pwd`")
        to_script = to_script.lstrip(' ') # remove any initial spaces
        if to_script.startswith('/'):
            to_script = 'source ' + to_script
        thisExperiment.updateJobSetupScript(os.path.dirname(dest_path), to_script=to_script)

        tolog("Executing command: %s" % (_cmd_str))
        # defaults in case the command never runs
        s = -1
        o = '(not defined)'
        t0 = os.times()
        try:
            s, o = commands.getstatusoutput(_cmd_str)
        except Exception, e:
            tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" % (str(e), s, o))
            o = str(e)
Esempio n. 12
0
def getOutFilesGuids(outFiles, workdir, experiment, TURL=False):
    """ get the outFilesGuids from the PFC """
    # :param outFiles: list of output file names (LFNs) to look up.
    # :param workdir: directory holding the PFC (PoolFileCatalog) XML file.
    # :param experiment: experiment name, used to get the experiment object.
    # :param TURL: when True, use the TURL-based PFC variant (Event Service).
    # :return: (ec, pilotErrorDiag, outFilesGuids); a guid entry stays None
    #   when the corresponding file was not found in the PFC.

    ec = 0
    pilotErrorDiag = ""
    outFilesGuids = []

    # Get the experiment object and the GUID source filename
    thisExperiment = getExperiment(experiment)
    filename = thisExperiment.getGUIDSourceFilename()

    # If a source file should not be used (ie empty filename string), then generate the GUIDs here
    if filename == "":
        tolog("Pilot will generate GUIDs for the output files")
        for i in range (0, len(outFiles)):
            guid = getGUID()
            if guid == "":
                guid = "- GUID generation failed -"
            outFilesGuids.append(guid)

        return ec, pilotErrorDiag, outFilesGuids
    else:
        tolog("Pilot will get GUIDs for the output files from source: %s" % (filename))
        pfcFile = os.path.join(workdir, filename) #"%s/PoolFileCatalog.xml" % (workdir)

    # The PFC used for Event Service will be TURL based, use the corresponding file
    if TURL:
        pfcFile = pfcFile.replace(".xml", "TURL.xml")

    # Initialization: make sure the guid list has the same length as the file list
    for i in range (0, len(outFiles)):
        outFilesGuids.append(None)

    # make sure the PFC exists
    if os.path.isfile(pfcFile):
        from xml.dom import minidom
        xmldoc = minidom.parse(pfcFile)
        fileList = xmldoc.getElementsByTagName("File")
        for thisfile in fileList:
            # match every PFC entry against the requested file names
            gpfn = str(thisfile.getElementsByTagName("pfn")[0].getAttribute("name"))
            guid = str(thisfile.getAttribute("ID"))
            for i in range(0, len(outFiles)):
                if outFiles[i] == gpfn:
                    outFilesGuids[i] = guid
    else:
        pilotErrorDiag = "PFC file does not exist: %s" % (pfcFile)
        tolog("!!FAILED!!3000!! %s" % (pilotErrorDiag))
        error = PilotErrors()
        ec = error.ERR_MISSINGPFC

    return ec, pilotErrorDiag, outFilesGuids
Esempio n. 13
0
    def core_get_data(self, envsetup, token, source_surl, local_fullname, experiment):
        """ special get function developed for storm sites """
        # Resolves the SURL to a local TURL with lcg-gt, then symlinks the
        # file into place instead of copying it (StoRM exposes the file
        # system directly).  Returns (error code, diagnostics) on failure;
        # NOTE(review): the success-path return is not visible in this chunk.

        error = PilotErrors()

        # Transform the surl into a full surl
        full_se_endpoint = self.extractSE(readpar('se').split(",")[0])[1]
        prefix = os.path.commonprefix([source_surl, full_se_endpoint])
        if prefix:
            # Can use the bdii-free form
            source_surl = full_se_endpoint + source_surl[len(prefix):]
            _cmd_str = '%s lcg-gt --nobdii --setype srmv2 "%s" file' % (envsetup, source_surl)
        else:
            # Fallback solution, use old lcg-gt form 
            # get the TURL using the SURL
            tolog("!!WARNING!1234!! Source surl does not match %s, cannot use the bdii-independent lcg-gt" % full_se_endpoint)
            _cmd_str = '%s lcg-gt "%s" file' % (envsetup, source_surl)

        tolog("Executing command: %s" % (_cmd_str))
        t0 = os.times()
        s, o = commands.getstatusoutput(_cmd_str)
        t1 = os.times()
        # os.times()[4] is elapsed wall-clock time
        t = t1[4] - t0[4]
        tolog("Command finished after %f s" % (t))
        if s == 0:
            # get the experiment object
            thisExperiment = getExperiment(experiment)

            # add the resolve command to the job setup script
            to_script = _cmd_str
            to_script = to_script.lstrip(' ') # remove any initial spaces
            if to_script.startswith('/'):
                to_script = 'source ' + to_script
            thisExperiment.updateJobSetupScript(os.path.dirname(local_fullname), to_script=to_script)

            # lcg-gt prints "<turl>\n<request token>"
            source_turl, req_token = o.split('\n')
            source_turl = source_turl.replace('file://','')
            tolog("Creating link from %s to %s" % (source_turl, local_fullname))
            try:
                os.symlink(source_turl, local_fullname)
                # release the SRM request with lcg-sd
                _cmd_str = '%s lcg-sd %s %s 0' % (envsetup, source_surl, req_token)
                tolog("Executing command: %s" % (_cmd_str))
                s,o = commands.getstatusoutput(_cmd_str)
                # Do we need to check the exit status of lcg-sd? What do we do if it fails?
                tolog("get_data succeeded")
            except Exception, e:
                pilotErrorDiag = "Exception caught: %s" % str(e)
                tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                tolog("get_data failed")
                return error.ERR_STAGEINFAILED, pilotErrorDiag
Esempio n. 14
0
    def setup(self, experiment):
        """ setup env """
        # Builds and verifies the environment setup string for this mover,
        # caching the result: once verified, later calls return (0, None).
        # Tries the ROOT-based local setup first; on failure retries with
        # the default setup.  Returns (status, output) from verifySetup().
        if self.__isSetuped:
            return 0, None
        self.__experiment = experiment
        thisExperiment = getExperiment(experiment)
        self.useTracingService = thisExperiment.useTracingService()
        si = getSiteInformation(experiment)
        self._defaultSetup = self.getLocalROOTSetup(si)

        _setupStr = self._defaultSetup  #self.getSetup()

        # get the user proxy if available
        envsetupTest = _setupStr.strip()
        if envsetupTest != "" and not envsetupTest.endswith(';'):
            envsetupTest += ";"
        if os.environ.has_key('X509_USER_PROXY'):
            envsetupTest += " export X509_USER_PROXY=%s;" % (
                os.environ['X509_USER_PROXY'])

        self.log("to verify site setup: %s " % envsetupTest)
        status, output = self.verifySetup(envsetupTest, experiment)
        self.log("site setup verifying: status: %s, output: %s" %
                 (status, output["errorLog"]))
        if status == 0:
            # verified: remember the setup string and mark as done
            self._setup = envsetupTest
            self.__isSetuped = True
            return status, output
        else:
            if self._defaultSetup:
                #try to use default setup
                self.log("Try to use default envsetup")
                envsetupTest = self._defaultSetup.strip()
                if envsetupTest != "" and not envsetupTest.endswith(';'):
                    envsetupTest += ";"
                if os.environ.has_key('X509_USER_PROXY'):
                    envsetupTest += " export X509_USER_PROXY=%s;" % (
                        os.environ['X509_USER_PROXY'])

                self.log("verify default setup: %s " % envsetupTest)
                status, output = self.verifySetup(envsetupTest, experiment)
                self.log("default setup verifying: status: %s, output: %s" %
                         (status, output["errorLog"]))
                if status == 0:
                    self._setup = envsetupTest
                    self.__isSetuped = True
                    return status, output

        return status, output
Esempio n. 15
0
    def interpretPayload(self, job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, failureCode):
        """ Interpret the payload, look for specific errors in the stdout """
        # :param job: job object; pilotErrorDiag/result updated in place.
        # Returns the job object on early failure; NOTE(review): the rest
        # of the interpretation logic is not visible in this chunk.

        # get the experiment object
        thisExperiment = getExperiment(job.experiment)
        if not thisExperiment:
            job.pilotErrorDiag = "ErrorDiagnosis did not get an experiment object from the factory"
            job.result[2] = self.__error.ERR_GENERALERROR # change to better/new error code
            tolog("!!WARNING!!3234!! %s" % (job.pilotErrorDiag))
            return job

        ### WARNING: EXPERIMENT SPECIFIC, MOVE LATER
        try:
            ec, pilotErrorDiag = self.processJobReport(job.workdir)
        except Exception, e:
            tolog("!!WARNING!!1114!! Caught exception: %s" % (e))
Esempio n. 16
0
    def interpretPayload(self, job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, failureCode):
        """ Interpret the payload, look for specific errors in the stdout """
        # :param job: job object; pilotErrorDiag/result updated in place.
        # Returns the job object on early failure; NOTE(review): the rest
        # of the interpretation logic is not visible in this chunk.

        # get the experiment object
        thisExperiment = getExperiment(job.experiment)
        if not thisExperiment:
            job.pilotErrorDiag = "ErrorDiagnosis did not get an experiment object from the factory"
            job.result[2] = self.__error.ERR_GENERALERROR # change to better/new error code
            tolog("!!WARNING!!3234!! %s" % (job.pilotErrorDiag))
            return job

        ### WARNING: EXPERIMENT SPECIFIC, MOVE LATER
        try:
            ec, pilotErrorDiag = self.processJobReport(job.workdir)
        except Exception, e:
            tolog("!!WARNING!!1114!! Caught exception: %s" % (e))
Esempio n. 17
0
    def getUtilityInfo(self, node, experiment, workdir):
        """ Add the utility info to the node structure if available """

        # Get the experiment object and check if the special utility (e.g. a memory monitor) was used
        thisExperiment = getExperiment(experiment)
        if thisExperiment.shouldExecuteUtility():

            # Try to get the memory monitor info from the workdir first
            path = os.path.join(workdir, thisExperiment.getUtilityJSONFilename())
            init_path = os.path.join(self.__pilot_initdir, thisExperiment.getUtilityJSONFilename())
            if not os.path.exists(path):
                tolog("File does not exist: %s" % (path))
                if os.path.exists(init_path):
                    path = init_path
                else:
                    tolog("File does not exist either: %s" % (path))
                    path = ""

            if path != "":
                tolog("Reading memory monitoring info from: %s" % (path))

                # Get the dictionary
                d = getJSONDictionary(path)
                if d and d != {}:
                    try:
                        node['maxRSS'] = d['Max']['maxRSS']
                        node['maxVMEM'] = d['Max']['maxVMEM']
                        node['maxSWAP'] = d['Max']['maxSwap']
                        node['maxPSS'] = d['Max']['maxPSS']
                        node['avgRSS'] = d['Avg']['avgRSS']
                        node['avgVMEM'] = d['Avg']['avgVMEM']
                        node['avgSWAP'] = d['Avg']['avgSwap']
                        node['avgPSS'] = d['Avg']['avgPSS']
                    except Exception, e:
                        tolog("!!WARNING!!54541! Exception caught while parsing memory monitor JSON: %s" % (e))
                    else:
                        tolog("Extracted info from memory monitor JSON")

            # Done with the memory monitor for this job (if the file is read from the pilots' init dir), remove the file in case there are other jobs to be run
            if os.path.exists(init_path):
                try:
                    os.system("rm -rf %s" % (init_path))
                except Exception, e:
                    tolog("!!WARNING!!4343!! Failed to remove %s: %s" % (init_path), e)
                else:
                    tolog("Removed %s" % (init_path))
Esempio n. 18
0
    def setup(self, experiment):
        """ setup env """
        # Builds and verifies the environment setup string for this mover,
        # caching the result: once verified, later calls return (0, None).
        # Tries the ROOT-based local setup first; on failure retries with
        # the default setup.  Returns (status, output) from verifySetup().
        if self.__isSetuped:
            return 0, None
        self.__experiment = experiment
        thisExperiment = getExperiment(experiment)
        self.useTracingService = thisExperiment.useTracingService()
        si = getSiteInformation(experiment)
        self._defaultSetup = self.getLocalROOTSetup(si)

        _setupStr = self._defaultSetup #self.getSetup()

        # get the user proxy if available
        envsetupTest = _setupStr.strip()
        if envsetupTest != "" and not envsetupTest.endswith(';'):
            envsetupTest += ";"
        if os.environ.has_key('X509_USER_PROXY'):
            envsetupTest += " export X509_USER_PROXY=%s;" % (os.environ['X509_USER_PROXY'])

        self.log("to verify site setup: %s " % envsetupTest)
        status, output = self.verifySetup(envsetupTest, experiment)
        self.log("site setup verifying: status: %s, output: %s" % (status, output["errorLog"]))
        if status == 0:
            # verified: remember the setup string and mark as done
            self._setup = envsetupTest
            self.__isSetuped = True
            return status, output
        else:
            if self._defaultSetup:
                #try to use default setup
                self.log("Try to use default envsetup")
                envsetupTest = self._defaultSetup.strip()
                if envsetupTest != "" and not envsetupTest.endswith(';'):
                     envsetupTest += ";"
                if os.environ.has_key('X509_USER_PROXY'):
                     envsetupTest += " export X509_USER_PROXY=%s;" % (os.environ['X509_USER_PROXY'])

                self.log("verify default setup: %s " % envsetupTest)
                status, output = self.verifySetup(envsetupTest, experiment)
                self.log("default setup verifying: status: %s, output: %s" % (status, output["errorLog"]))
                if status == 0:
                    self._setup = envsetupTest
                    self.__isSetuped = True
                    return status, output

        return status, output
Esempio n. 19
0
    def getJobMetrics(self, job, workerNode):
        """ Return a properly formatted job metrics string """
        # NOTE(review): only the core-count handling is visible in this
        # chunk; the actual metrics-string assembly and return follow later.

        # style: Number of events read | Number of events written | vmPeak maximum | vmPeak average | RSS average | JEM activation
        # format: nEvents=<int> nEventsW=<int> vmPeakMax=<int> vmPeakMean=<int> RSSMean=<int> JEM=<string>
        #         hs06=<float> shutdownTime=<int> cpuFactor=<float> cpuLimit=<float> diskLimit=<float> jobStart=<int> memLimit=<int> runLimit=<float>

        # get the experiment object
        thisExperiment = getExperiment(job.experiment)

        if job.coreCount:
            # Always use the ATHENA_PROC_NUMBER first, if set
            if os.environ.has_key('ATHENA_PROC_NUMBER'):
                try:
                    job.coreCount = int(os.environ['ATHENA_PROC_NUMBER'])
                except Exception, e:
                    # keep the existing value if the env var is malformed
                    tolog("ATHENA_PROC_NUMBER is not properly set: %s (will use existing job.coreCount value)" % (e))

            coreCount = job.coreCount
Esempio n. 20
0
    def getJobMetrics(self, job, workerNode):
        """ Return a properly formatted job metrics string """
        # NOTE(review): only the core-count handling is visible in this
        # chunk; the actual metrics-string assembly and return follow later.

        # style: Number of events read | Number of events written | vmPeak maximum | vmPeak average | RSS average | JEM activation
        # format: nEvents=<int> nEventsW=<int> vmPeakMax=<int> vmPeakMean=<int> RSSMean=<int> JEM=<string>
        #         hs06=<float> shutdownTime=<int> cpuFactor=<float> cpuLimit=<float> diskLimit=<float> jobStart=<int> memLimit=<int> runLimit=<float>

        # get the experiment object
        thisExperiment = getExperiment(job.experiment)

        if job.coreCount:
            # Always use the ATHENA_PROC_NUMBER first, if set
            if os.environ.has_key('ATHENA_PROC_NUMBER'):
                try:
                    job.coreCount = int(os.environ['ATHENA_PROC_NUMBER'])
                except Exception, e:
                    # keep the existing value if the env var is malformed
                    tolog("ATHENA_PROC_NUMBER is not properly set: %s (will use existing job.coreCount value)" % (e))

            coreCount = job.coreCount
Esempio n. 21
0
    def extractJobInformation(self, job, runCommandList):
        """ Extract relevant job information, e.g. number of events.

        Updates job.nEvents / job.nEventsW in place via the experiment
        object. A failure to obtain the event count is logged and ignored
        (best-effort). Snippet is truncated after the event extraction.

        :param job: Job object to update.
        :param runCommandList: list of run commands (only its length is used here).
        :return: job object (returned early with ERR_GENERALERROR set if no
                 experiment object could be created).
        """

        # get the experiment object
        thisExperiment = getExperiment(job.experiment)
        if not thisExperiment:
            # without an experiment object no diagnosis can be performed; flag a general error
            job.pilotErrorDiag = "ErrorDiagnosis did not get an experiment object from the factory"
            job.result[2] = self.__error.ERR_GENERALERROR # change to better/new error code
            tolog("!!WARNING!!3234!! %s" % (job.pilotErrorDiag))
            return job

        # note that this class should not be experiment specific, so move anything related to ATLAS to ATLASExperiment.py
        # and use thisExperiment.whatever() to retrieve it here

        # grab the number of events
        try:
            # nEvents_str can be a string of the form N|N|..|N with the number of jobs in the trf(s) [currently not used]
            # Add to Job class if necessary
            job.nEvents, job.nEventsW, nEvents_str = thisExperiment.getNumberOfEvents(job=job, number_of_jobs=len(runCommandList))
        except Exception, e:
            # event count is best-effort: log the failure and continue
            tolog("!!WARNING!!2999!! Failed to get number of events: %s (ignore)" % str(e))
Esempio n. 22
0
        # NOTE(review): fragment of a larger RunJob startup routine -- the
        # enclosing function header and the matching except-clause for the
        # try below are outside this snippet.
        if runJob.getPilotLogFilename() != "":
            # propagate the pilot log filename so pUtil logging goes to the right file
            pUtil.setPilotlogFilename(runJob.getPilotLogFilename())

        # set node info (hostname and worker node characteristics)
        node = Node.Node()
        node.setNodeName(os.uname()[1])
        node.collectWNInfo(jobSite.workdir)

        # redirect stderr to a file in the site work directory
        sys.stderr = open("%s/runjob.stderr" % (jobSite.workdir), "w")

        tolog("Current job workdir is: %s" % os.getcwd())
        tolog("Site workdir is: %s" % jobSite.workdir)
        # get the experiment object
        thisExperiment = getExperiment(runJob.getExperiment())

        tolog("RunJob will serve experiment: %s" %
              (thisExperiment.getExperiment()))

        # set the cache (used e.g. by LSST)
        #if runJob.getCache():
        #    thisExperiment.setCache(runJob.getCache())

        #JR = JobRecovery()
        try:
            # build the Job object from the downloaded job definition
            job = Job.Job()
            job.setJobDef(newJobDef.job)
            job.workdir = jobSite.workdir
            job.experiment = runJob.getExperiment()
            # figure out and set payload file names
Esempio n. 23
0
    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """ Copy input file from SE to local dir.

        Visible part: option extraction, env setup verification, optional
        proxy check, and rewriting of srm:// source URLs to the fully
        qualified SE endpoint. Snippet truncated before the actual copy.

        :param gpfn: grid PFN of the source replica (possibly an srm:// URL).
        :param lfn: logical file name (destination file name).
        :param path: local destination directory ('' means current dir).
        :param fsize: expected remote file size (0 if unknown).
        :param fchecksum: expected remote checksum (0 if unknown).
        :param guid: file GUID used for the tracing report.
        :param pdict: extra options (jobId, workDir, experiment, proxycheck,
                      usect, access, report, ...).
        :return: (error code, pilotErrorDiag) on early failure.
        """

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        experiment = pdict.get('experiment', '')
        proxycheck = pdict.get('proxycheck', False)

        # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
        useCT = pdict.get('usect', True)
        prodDBlockToken = pdict.get('access', '')

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 'lcg2', lfn, guid)

        # get a proper envsetup
        envsetup = self.getEnvsetup(get=True)

        ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
        if ec != 0:
            self.prepareReport('RFCP_FAIL', report)
            return ec, pilotErrorDiag

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        if proxycheck:
            # do we have a valid proxy?
            s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup)
            if s != 0:
                self.prepareReport('PROXYFAIL', report)
                return s, pilotErrorDiag
        else:
            tolog("Proxy verification turned off")

        getfile = gpfn

        if path == '': path = './'
        fullname = os.path.join(path, lfn)

        # can not test filesize and checksum if remote values are not known
        if fsize == 0 or fchecksum == 0:
            tolog("!!WARNING!!2999!! Remote file size/checksum not known: %s/%s" % (fsize, fchecksum))

        # Maybe be a comma list but take first always
        # (Remember that se can be a list where the first is used for output but any can be used for input)
        se = readpar('se').split(",")[0]
        _dummytoken, se = self.extractSE(se)
        tolog("Using SE: %s" % (se))

        # se = srm://head01.aglt2.org:8443/srm/managerv2?SFN=
        # for srm protocol, use the full info from 'se'
        if getfile[:3] == "srm":
            try:
                # e.g. tmp = ['srm:', '', 'head01.aglt2.org', 'pnfs/aglt2.org/rucio/panda/dis/08/...']
                tmp = getfile.split('/',3)[2]
            except Exception, e:
                tolog('!!WARNING!!2999!! Could not extract srm protocol for replacement, keeping getfile variable as it is: %s (%s)' %\
                      (getfile, str(e)))
            else:
                # replace srm with 'srm://head01.aglt2.org:8443/srm/managerv2?SFN=' if not there already
                if not '?SFN=' in getfile:
                    # srm = 'srm://head01.aglt2.org'
                    srm = 'srm://' + tmp

                    # does seopt contain any matching srm's?
                    sematch = self.getSEMatchFromSEOpt(srm)
                    if sematch != "":
                        getfile = getfile.replace(srm, sematch)
                        tolog("Replaced %s with %s (from seopt) in getfile: %s" % (srm, sematch, getfile))
                    else:
                        getfile = getfile.replace(srm, se)
                        tolog("Replaced %s with %s (from se) in getfile: %s" % (srm, se, getfile))
                else:
                    tolog("Found SFN part in getfile: %s" % (getfile))

                    # add port number from se to getfile if necessary
                    getfile = self.addPortToPath(se, getfile)
Esempio n. 24
0
def getFileSystemRootPath(experiment):
    """Look up the file system (CVMFS) root path for the given experiment.

    The experiment factory supplies the experiment-specific object, which
    knows its own CVMFS root location.
    """
    return getExperiment(experiment).getCVMFSPath()
Esempio n. 25
0
        # NOTE(review): fragment of a larger runJob startup routine -- the
        # enclosing function header is outside this snippet.
        if pilotlogfilename != "":
            # propagate the pilot log filename so pUtil logging goes to the right file
            pUtil.setPilotlogFilename(pilotlogfilename)
    
        # set node info (hostname and worker node characteristics)
        node = Node.Node()
        node.setNodeName(os.uname()[1])
        node.collectWNInfo(jobSite.workdir)
    
        # redirect stderr to a file in the site work directory
        sys.stderr = open("%s/runjob.stderr" % (jobSite.workdir), "w")
    
        tolog("Current job workdir is: %s" % os.getcwd())
        tolog("Site workdir is: %s" % jobSite.workdir)

        # get the experiment object
        thisExperiment = getExperiment(experiment)
        tolog("runJob will serve experiment: %s" % (thisExperiment.getExperiment()))

        region = readpar('region')
        JR = JobRecovery()
        try:
            # build the Job object from the downloaded job definition
            job = Job.Job()
            job.setJobDef(newJobDef.job)
            job.workdir = jobSite.workdir
            job.experiment = experiment
            # figure out and set payload file names
            job.setPayloadName(thisExperiment.getPayloadName(job))
        except Exception, e:
            # any failure here means the job definition can not be used; fail the job
            pilotErrorDiag = "Failed to process job info: %s" % str(e)
            tolog("!!WARNING!!3000!! %s" % (pilotErrorDiag))
            failJob(0, error.ERR_UNKNOWN, job, pilotserver, pilotport, pilotErrorDiag=pilotErrorDiag)
Esempio n. 26
0
    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """ Copy output file from disk to local SE.

        Function is based on the dCacheSiteMover put function. Visible part:
        option extraction, tracing report setup, local file info, proxy
        check, and SURL preparation. Snippet truncated after the RSE lookup.

        :param source: local path of the file to be staged out.
        :param destination: destination SE (usage truncated in this snippet).
        :param fsize: known file size (0 triggers a local lookup).
        :param fchecksum: known checksum (0 triggers a local lookup).
        :param pdict: extra options (lfn, guid, token, scope, dsname, analJob,
                      testLevel, extradirs, experiment, proxycheck,
                      prodSourceLabel, report, ...).
        :return: put_data_retfail(...) tuple on early failure.
        """

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        token = pdict.get('token', '')
        scope = pdict.get('scope', '')
        dsname = pdict.get('dsname', '')
        analysisJob = pdict.get('analJob', False)
        testLevel = pdict.get('testLevel', '0')
        extradirs = pdict.get('extradirs', '')
        experiment = pdict.get('experiment', '')
        proxycheck = pdict.get('proxycheck', False)
        prodSourceLabel = pdict.get('prodSourceLabel', '')

        # get the site information object
        si = getSiteInformation(experiment)

        tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))
        if prodSourceLabel == 'ddm' and analysisJob:
            tolog("Treating PanDA Mover job as a production job during stage-out")
            analysisJob = False

        # get the Rucio tracing report
        try:
            report = pdict['report']
        except:
            report = {}
        else:
            # set the proper protocol
            report['protocol'] = 'curl'
            # mark the relative start
            report['catStart'] = time()
            # the current file
            report['filename'] = lfn
            # guid (tracing wants it without dashes)
            report['guid'] = guid.replace('-','')

        # preparing variables: look up size/checksum locally if not supplied
        if fsize == 0 or fchecksum == 0:
            ec, pilotErrorDiag, fsize, fchecksum = self.getLocalFileInfo(source, csumtype="adler32")
            if ec != 0:
                self.prepareReport('LOCAL_FILE_INFO_FAIL', report)
                return self.put_data_retfail(ec, pilotErrorDiag)

        # now that the file size is known, add it to the tracing report
        report['filesize'] = fsize

        # get the checksum type
        if fchecksum != 0 and fchecksum != "":
            csumtype = self.getChecksumType(fchecksum)
        else:
            csumtype = "default"

        # get a proper envsetup
        envsetup = self.getEnvsetup()

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        if proxycheck:
            s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup, limit=2)
            if s != 0:
                self.prepareReport('NO_PROXY', report)
                return self.put_data_retfail(error.ERR_NOPROXY, pilotErrorDiag)
        else:
            tolog("Proxy verification turned off")

        filename = os.path.basename(source)

        # get all the proper paths
        ec, pilotErrorDiag, tracer_error, dst_gpfn, lfcdir, surl = si.getProperPaths(error, analysisJob, token, prodSourceLabel, dsname, filename, scope=scope, sitemover=self) # quick workaround
        if ec != 0:
            self.prepareReport(tracer_error, report)
            return self.put_data_retfail(ec, pilotErrorDiag)

        putfile = surl
        full_surl = putfile
        if full_surl[:len('token:')] == 'token:':
            # remove the space token (e.g. at Taiwan-LCG2) from the SURL info
            full_surl = full_surl[full_surl.index('srm://'):]

        # srm://dcache01.tier2.hep.manchester.ac.uk/pnfs/tier2.hep.manchester.ac.uk/data/atlas/dq2/
        #testpanda.destDB/testpanda.destDB.604b4fbc-dbe9-4b05-96bb-6beee0b99dee_sub0974647/
        #86ecb30d-7baa-49a8-9128-107cbfe4dd90_0.job.log.tgz
        tolog("putfile: %s" % (putfile))
        tolog("full_surl: %s" % (full_surl))

        # get https surl
        full_http_surl = full_surl.replace("srm://", "https://")

        # get the RSE from ToA (best-effort; only used for the tracing report)
        try:
            _RSE = self.getRSE(surl=putfile)
        except Exception, e:
            tolog("Warning: Failed to get RSE: %s (can not add this info to tracing report)" % str(e))
Esempio n. 27
0
    def interpretPayloadStdout(self, job, res, getstatusoutput_was_interrupted, current_job_number, runCommandList, failureCode):
        """ Payload error handling.

        Classifies a finished payload's outcome (out of memory, no output,
        missing installation, non-zero transform exit code, ...) and sets
        job.pilotErrorDiag / job.result[2] accordingly.

        :param job: Job object, updated in place and returned.
        :param res: result tuple from payload execution; res[0] is the exit
                    status, res[1] the output text (see usage below).
        :param getstatusoutput_was_interrupted: True if payload execution was
                    interrupted by an exception.
        :param current_job_number: index of the current (sub)job, used to pick
                    the right stdout file.
        :param runCommandList: list of run commands; its length is the number
                    of jobs and its first entry provides the setup script.
        :param failureCode: pre-determined failure code from the caller, if any.
        :return: the updated job object.
        :raises Exception: re-raised when execution was interrupted.
        """

        # NOTE: Move away ATLAS specific info in this method, e.g. vmPeak stuff

        error = PilotErrors()
        #Mancinelli: moved it in experiment class method handleTrfExitcode
        #transExitCode = res[0]%255
        tolog("Mancinellidebug: res = %s res[0] = %s" % (res, res[0]))

        # Get the proper stdout filename
        number_of_jobs = len(runCommandList)
        filename = getStdoutFilename(job.workdir, job.stdout, current_job_number, number_of_jobs)

        # get the experiment object
        thisExperiment = getExperiment(job.experiment)
        if not thisExperiment:
            job.pilotErrorDiag = "ErrorDiagnosis did not get an experiment object from the factory"
            job.result[2] = error.ERR_GENERALERROR # change to better/new error code
            tolog("!!WARNING!!3334!! %s" % (job.pilotErrorDiag))
            return job

        # Try to identify out of memory errors in the stderr
        out_of_memory = thisExperiment.isOutOfMemory(job=job, number_of_jobs=number_of_jobs)
        failed = out_of_memory # failed boolean used below

        # Always look for the max and average VmPeak?
        setup = getSourceSetup(runCommandList[0])
        job.vmPeakMax, job.vmPeakMean, job.RSSMean = findVmPeaks(setup)

        # A killed job can have empty output but still transExitCode == 0
        no_payload_output = False
        installation_error = False
        if getstatusoutput_was_interrupted:
            # interrupted execution: failed either way, distinguish whether any stdout was produced
            if os.path.exists(filename):
                if os.path.getsize(filename) > 0:
                    tolog("Payload produced stdout but was interrupted (getstatusoutput threw an exception)")
                else:
                    no_payload_output = True
                failed = True
            else:
                failed = True
                no_payload_output = True
        elif len(res[1]) < 20: # protect the following comparison against massive outputs
            if res[1] == 'Undefined':
                failed = True
                no_payload_output = True
        elif failureCode:
            failed = True
        else:
            # check for installation error in the first part of the output
            res_tmp = res[1][:1024]
            if res_tmp[0:3] == "sh:" and 'setup.sh' in res_tmp and 'No such file or directory' in res_tmp:
                failed = True
                installation_error = True

        if res[0] or failed:
            #Mancinelli: all this common part with CMS?
            # order matters below: more specific diagnoses take precedence
            if failureCode:
                job.pilotErrorDiag = "Payload failed: Interrupt failure code: %d" % (failureCode)
                # (do not set pilot error code)
            elif getstatusoutput_was_interrupted:
                raise Exception, "Job execution was interrupted (see stderr)"
            elif out_of_memory:
                job.pilotErrorDiag = "Payload ran out of memory"
                job.result[2] = error.ERR_ATHENAOUTOFMEMORY
            elif no_payload_output:
                job.pilotErrorDiag = "Payload failed: No output"
                job.result[2] = error.ERR_NOPAYLOADOUTPUT
            elif installation_error:
                job.pilotErrorDiag = "Payload failed: Missing installation"
                job.result[2] = error.ERR_MISSINGINSTALLATION
            elif res[0]:
                #Mancinelli: calling for experiment class method to manage transformation exit code
                job = thisExperiment.handleTrfExitcode(job, res, error, filename)
            else:
                job.pilotErrorDiag = "Payload failed due to unknown reason (check payload stdout)"
                job.result[2] = error.ERR_UNKNOWN
            tolog("!!FAILED!!3000!! %s" % (job.pilotErrorDiag))


        # note: several errors below are atlas specific (not all), should be handled through ATLASExperiment via thisExperiment object
        # move entire section below to ATLASExperiment, define prototype [empty] methods in Experiment and OtherExperiment classes, implement in ATLASExperiment
        # non experiment specific errors should be handled here (e.g. no_payload_output)

        # handle non-zero failed job return code but do not set pilot error codes to all payload errors
        """
        if transExitCode or failed:
            if failureCode:
                job.pilotErrorDiag = "Payload failed: Interrupt failure code: %d" % (failureCode)
                # (do not set pilot error code)
            elif getstatusoutput_was_interrupted:
                raise Exception, "Job execution was interrupted (see stderr)"
            elif out_of_memory:
                job.pilotErrorDiag = "Payload ran out of memory"
                job.result[2] = error.ERR_ATHENAOUTOFMEMORY
            elif no_payload_output:
                job.pilotErrorDiag = "Payload failed: No output"
                job.result[2] = error.ERR_NOPAYLOADOUTPUT
            elif installation_error:
                job.pilotErrorDiag = "Payload failed: Missing installation"
                job.result[2] = error.ERR_MISSINGINSTALLATION
            elif transExitCode:
                # Handle PandaMover errors
                if transExitCode == 176:
                    job.pilotErrorDiag = "PandaMover staging error: File is not cached"
                    job.result[2] = error.ERR_PANDAMOVERFILENOTCACHED
                elif transExitCode == 86:
                    job.pilotErrorDiag = "PandaMover transfer failure"
                    job.result[2] = error.ERR_PANDAMOVERTRANSFER
                else:
                    # check for specific errors in athena stdout
                    if os.path.exists(filename):
                        e1 = "prepare 5 database is locked"
                        e2 = "Error SQLiteStatement"
                        _out = commands.getoutput('grep "%s" %s | grep "%s"' % (e1, filename, e2))
                        if 'sqlite' in _out:
                            job.pilotErrorDiag = "NFS/SQLite locking problems: %s" % (_out)
                            job.result[2] = error.ERR_NFSSQLITE
                        else:
                            job.pilotErrorDiag = "Job failed: Non-zero failed job return code: %d" % (transExitCode)
                            # (do not set a pilot error code)
                    else:
                        job.pilotErrorDiag = "Job failed: Non-zero failed job return code: %d (%s does not exist)" % (transExitCode, filename)
                        # (do not set a pilot error code)
            else:
                job.pilotErrorDiag = "Payload failed due to unknown reason (check payload stdout)"
                job.result[2] = error.ERR_UNKNOWN
            tolog("!!FAILED!!3000!! %s" % (job.pilotErrorDiag))

        # set the trf diag error
        if res[2] != "":
            tolog("TRF diagnostics: %s" % (res[2]))
            job.exeErrorDiag = res[2]

        job.result[1] = transExitCode
        """
        return job
Esempio n. 28
0
        # NOTE(review): fragment of a larger RunJob startup routine (near
        # duplicate of an earlier fragment in this file) -- the enclosing
        # function header and the matching except-clause for the try below
        # are outside this snippet.
        if runJob.getPilotLogFilename() != "":
            # propagate the pilot log filename so pUtil logging goes to the right file
            pUtil.setPilotlogFilename(runJob.getPilotLogFilename())

        # set node info (hostname and worker node characteristics)
        node = Node.Node()
        node.setNodeName(os.uname()[1])
        node.collectWNInfo(jobSite.workdir)

        # redirect stderr to a file in the site work directory
        sys.stderr = open("%s/runjob.stderr" % (jobSite.workdir), "w")

        tolog("Current job workdir is: %s" % os.getcwd())
        tolog("Site workdir is: %s" % jobSite.workdir)
        # get the experiment object
        thisExperiment = getExperiment(runJob.getExperiment())

        tolog("RunJob will serve experiment: %s" % (thisExperiment.getExperiment()))

        # set the cache (used e.g. by LSST)
        #if runJob.getCache():
        #    thisExperiment.setCache(runJob.getCache())

        #JR = JobRecovery()
        try:
            # build the Job object from the downloaded job definition
            job = Job.Job()
            job.setJobDef(newJobDef.job)
            job.workdir = jobSite.workdir
            job.experiment = runJob.getExperiment()
            # figure out and set payload file names
            job.setPayloadName(thisExperiment.getPayloadName(job))
Esempio n. 29
0
    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """ Copy input file from SE to local dir.

        (Reformatted near-duplicate of an earlier get_data in this file.)
        Visible part: option extraction, env setup verification, optional
        proxy check, and rewriting of srm:// source URLs to the fully
        qualified SE endpoint. Snippet truncated before the actual copy.

        :param gpfn: grid PFN of the source replica (possibly an srm:// URL).
        :param lfn: logical file name (destination file name).
        :param path: local destination directory ('' means current dir).
        :param fsize: expected remote file size (0 if unknown).
        :param fchecksum: expected remote checksum (0 if unknown).
        :param guid: file GUID used for the tracing report.
        :param pdict: extra options (jobId, workDir, experiment, proxycheck,
                      usect, access, report, ...).
        :return: (error code, pilotErrorDiag) on early failure.
        """

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        experiment = pdict.get('experiment', '')
        proxycheck = pdict.get('proxycheck', False)

        # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
        useCT = pdict.get('usect', True)
        prodDBlockToken = pdict.get('access', '')

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict['report'], 'lcg2', lfn, guid)

        # get a proper envsetup
        envsetup = self.getEnvsetup(get=True)

        ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
        if ec != 0:
            self.prepareReport('RFCP_FAIL', report)
            return ec, pilotErrorDiag

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        if proxycheck:
            # do we have a valid proxy?
            s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup)
            if s != 0:
                self.prepareReport('PROXYFAIL', report)
                return s, pilotErrorDiag
        else:
            tolog("Proxy verification turned off")

        getfile = gpfn

        if path == '': path = './'
        fullname = os.path.join(path, lfn)

        # can not test filesize and checksum if remote values are not known
        if fsize == 0 or fchecksum == 0:
            tolog(
                "!!WARNING!!2999!! Remote file size/checksum not known: %s/%s"
                % (fsize, fchecksum))

        # Maybe be a comma list but take first always
        # (Remember that se can be a list where the first is used for output but any can be used for input)
        se = readpar('se').split(",")[0]
        _dummytoken, se = self.extractSE(se)
        tolog("Using SE: %s" % (se))

        # se = srm://head01.aglt2.org:8443/srm/managerv2?SFN=
        # for srm protocol, use the full info from 'se'
        if getfile[:3] == "srm":
            try:
                # e.g. tmp = ['srm:', '', 'head01.aglt2.org', 'pnfs/aglt2.org/rucio/panda/dis/08/...']
                tmp = getfile.split('/', 3)[2]
            except Exception, e:
                tolog('!!WARNING!!2999!! Could not extract srm protocol for replacement, keeping getfile variable as it is: %s (%s)' %\
                      (getfile, str(e)))
            else:
                # replace srm with 'srm://head01.aglt2.org:8443/srm/managerv2?SFN=' if not there already
                if not '?SFN=' in getfile:
                    # srm = 'srm://head01.aglt2.org'
                    srm = 'srm://' + tmp

                    # does seopt contain any matching srm's?
                    sematch = self.getSEMatchFromSEOpt(srm)
                    if sematch != "":
                        getfile = getfile.replace(srm, sematch)
                        tolog(
                            "Replaced %s with %s (from seopt) in getfile: %s" %
                            (srm, sematch, getfile))
                    else:
                        getfile = getfile.replace(srm, se)
                        tolog("Replaced %s with %s (from se) in getfile: %s" %
                              (srm, se, getfile))
                else:
                    tolog("Found SFN part in getfile: %s" % (getfile))

                    # add port number from se to getfile if necessary
                    getfile = self.addPortToPath(se, getfile)
Esempio n. 30
0
    def put_data(self, pfn, destination, fsize=0, fchecksum=0, dsname='', extradirs='', **pdict):
        """ Copy output file from disk to local SE.

        lcg-cr based stage-out: verifies the setup and proxy, prepares the
        destination SURL and LFC directory, then registers/copies the file
        with lcg-cr. Snippet truncated inside the lcg-cr exception handler.

        :param pfn: local path of the file to be staged out.
        :param destination: destination SE (usage truncated in this snippet).
        :param fsize: known file size (0 triggers a local lookup).
        :param fchecksum: known checksum (0 triggers a local lookup).
        :param dsname: dataset name; required (fails if empty).
        :param extradirs: extra directory components (not used in the visible part).
        :param pdict: extra options (lfn, guid, token, scope, logFile, sitename,
                      proxycheck, experiment, analJob, prodSourceLabel, report, ...).
        :return: put_data_retfail(...) tuple on early failure.
        """

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        token = pdict.get('token', '')
        scope = pdict.get('scope', '')
        logFile = pdict.get('logFile', '')
        sitename = pdict.get('sitename', '')
        proxycheck = pdict.get('proxycheck', False)
        experiment = pdict.get('experiment', '')
        analysisJob = pdict.get('analJob', False)
        prodSourceLabel = pdict.get('prodSourceLabel', '')

        # get the site information object
        si = getSiteInformation(experiment)

        if prodSourceLabel == 'ddm' and analysisJob:
            tolog("Treating PanDA Mover job as a production job during stage-out")
            analysisJob = False

        filename = pfn.split('/')[-1]

        # get the DQ2 tracing report
        report = self.getStubTracingReport(pdict['report'], 'lcg', lfn, guid)

        # is the dataset defined?
        if dsname == '':
            pilotErrorDiag = "Dataset name not specified to put_data"
            tolog('!!WARNING!!2990!! %s' % (pilotErrorDiag))
            self.prepareReport('DSN_UNDEF', report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)

        # preparing variables: look up size/checksum locally if not supplied
        if fsize == 0 or fchecksum == 0:
            ec, pilotErrorDiag, fsize, fchecksum = self.getLocalFileInfo(pfn, csumtype="adler32")
            if ec != 0:
                self.prepareReport('LOCAL_FILE_INFO_FAIL', report)
                return self.put_data_retfail(ec, pilotErrorDiag)

        # now that the file size is known, add it to the tracing report
        report['filesize'] = fsize

        # get a proper envsetup
        envsetup = self.getEnvsetup()

        ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
        if ec != 0:
            self.prepareReport('RFCP_FAIL', report)
            return self.put_data_retfail(ec, pilotErrorDiag)

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        # do we need to check the user proxy?
        if proxycheck:
            s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup, limit=2)
            if s != 0:
                self.prepareReport('PROXY_FAIL', report)
                return self.put_data_retfail(error.ERR_NOPROXY, pilotErrorDiag)
        else:
            tolog("Proxy verification turned off")

        # get all the proper paths
        ec, pilotErrorDiag, tracer_error, dst_gpfn, lfcdir, surl = si.getProperPaths(error, analysisJob, token, prodSourceLabel, dsname, filename, scope=scope, sitemover=self) # quick workaround
        if ec != 0:
            self.prepareReport(tracer_error, report)
            return self.put_data_retfail(ec, pilotErrorDiag, surl=dst_gpfn)

        lfclfn = os.path.join(lfcdir, lfn)
        # LFC LFN = /grid/atlas/dq2/testpanda/testpanda.destDB.dfb45803-1251-43bb-8e7a-6ad2b6f205be_sub01000492/
        #364aeb74-8b62-4c8f-af43-47b447192ced_0.job.log.tgz

        # putfile is the SURL
        putfile = surl
        full_surl = putfile
        if full_surl[:len('token:')] == 'token:':
            # remove the space token (e.g. at Taiwan-LCG2) from the SURL info
            full_surl = full_surl[full_surl.index('srm://'):]

        # srm://dcache01.tier2.hep.manchester.ac.uk/pnfs/tier2.hep.manchester.ac.uk/data/atlas/dq2/
        #testpanda.destDB/testpanda.destDB.604b4fbc-dbe9-4b05-96bb-6beee0b99dee_sub0974647/
        #86ecb30d-7baa-49a8-9128-107cbfe4dd90_0.job.log.tgz
        tolog("putfile = %s" % (putfile))
        tolog("full_surl = %s" % (full_surl))

        # get the DQ2 site name from ToA (best-effort; only used for the tracing report)
        try:
            _dq2SiteName = self.getDQ2SiteName(surl=putfile)
        except:
            # WARNING: do not print the exception here since it can sometimes not be converted to a string! (problem seen at Taiwan)
            tolog("Warning: Failed to get the DQ2 site name (can not add this info to tracing report)")
        else:
            report['localSite'], report['remoteSite'] = (_dq2SiteName, _dq2SiteName)
            tolog("DQ2 site name: %s" % (_dq2SiteName))

        # get the absolute (full) path to the file
        fppfn = os.path.abspath(pfn)
        tolog("pfn=%s" % (pfn))

        # create the destination directory in the LFC namespace
        cmd = '%s echo "LFC_HOST=$LFC_HOST"; lfc-mkdir -p %s' % (envsetup, lfcdir)
        # export LFC_HOST=lfc0448.gridpp.rl.ac.uk ; echo "LFC_HOST=$LFC_HOST";
        #lfc-mkdir -p /grid/atlas/dq2/testpanda.destDB/testpanda.destDB.604b4fbc-dbe9-4b05-96bb-6beee0b99dee_sub0974647
        tolog("Executing command: %s" % (cmd))
        s, o = commands.getstatusoutput(cmd)
        if s == 0:
            tolog("LFC setup and mkdir succeeded")
            tolog("Command output: %s" % (o))
        else:
            tolog("!!WARNING!!2990!! LFC setup and mkdir failed. Status=%s Output=%s" % (s, o))
            if o == "Could not establish context":
                # a context failure indicates an expired proxy rather than an LFC problem
                pilotErrorDiag = "Could not establish context: Proxy / VO extension of proxy has probably expired"
                tolog("!!WARNING!!2990!! %s" % (pilotErrorDiag))
                self.dumpExtendedProxy(envsetup)
                self.prepareReport('CONTEXT_FAIL', report)
                return self.put_data_retfail(error.ERR_NOPROXY, pilotErrorDiag, surl=full_surl)
            else:
                pilotErrorDiag = "LFC setup and mkdir failed: %s" % (o)
                self.prepareReport('LFC_SETUP_FAIL', report)
                return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag, surl=full_surl)

        # determine which timeout option to use (newer lcg-cr versions use different flags)
        if self.isNewLCGVersion("%s lcg-cr" % (envsetup)):
            timeout_option = "--srm-timeout=%d --connect-timeout=300 --sendreceive-timeout=%d" % (self.timeout, self.timeout)
        else:
            timeout_option = "-t %d" % (self.timeout)

        # used lcg-cr options:
        # --verbose: verbosity on
        #      --vo: specifies the Virtual Organization the user belongs to
        #        -T: specify SRM version
        #        -s: space token description
        #        -b: BDII disabling
        #        -t: time-out
        #        -l: specifies the Logical File Name associated with the file. If this option is present, an entry is added to the LFC
        #        -g: specifies the Grid Unique IDentifier. If this option is not present, a GUID is generated internally
        #        -d: specifies the destination. It can be the Storage Element fully qualified hostname or an SURL. In the latter case,
        #            the scheme can be sfn: for a classical SE or srm:. If only the fully qualified hostname is given, a filename is
        #            generated in the same format as with the Replica Manager
        if token:
            # Special case for GROUPDISK (do not remove dst: bit before this stage, needed in several places)
            if "dst:" in token:
                token = token[len('dst:'):]
                tolog("Dropped dst: part of space token descriptor; token=%s" % (token))
                token = "ATLASGROUPDISK"
                tolog("Space token descriptor reset to: %s" % (token))

            surl = putfile[putfile.index('srm://'):]
            _cmd_str = '%s which lcg-cr; lcg-cr --version; lcg-cr --verbose --vo atlas -T srmv2 -s %s -b %s -l %s -g %s -d %s file:%s' % (envsetup, token, timeout_option, lfclfn, guid, surl, fppfn)
        else:
            surl = putfile
            _cmd_str = '%s which lcg-cr; lcg-cr --version; lcg-cr --verbose --vo atlas %s -l %s -g %s -d %s file:%s' % (envsetup, timeout_option, lfclfn, guid, surl, fppfn)

        # GoeGrid testing: _cmd_str = '%s which lcg-cr; lcg-cr --version; lcg-crXXX --verbose --vo atlas %s -l %s -g %s -d %s file:%s' % (envsetup, timeout_option, lfclfn, guid, surl, fppfn)

        tolog("Executing command: %s" % (_cmd_str))
        s = -1
        t0 = os.times()
        report['relativeStart'] = time()
        report['transferStart'] =  time()
        try:
            s, o = commands.getstatusoutput(_cmd_str)
        except Exception, e:
            # record the exception text as the command output for later diagnosis
            tolog("!!WARNING!!2990!! Exception caught: %s" % (str(e)))
            o = str(e)
Esempio n. 31
0
    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """
        Copy an input file from the SE to the local directory.

        :param gpfn: full replica URL of the source file on the SE
        :param lfn: logical file name; joined with `path` for the local target
        :param path: local destination directory ('' is treated as './')
        :param fsize: expected file size; 0 triggers an LFC lookup below
        :param fchecksum: expected checksum; 0 triggers an LFC lookup below
        :param guid: file GUID used for the LFC replica query
        :param pdict: options (token, jobId, workDir, experiment, proxycheck,
                      usect, access, report)
        :return: (exit code, pilotErrorDiag); may return ERR_DIRECTIOFILE to
                 signal that the file should be read directly, not copied
        NOTE(review): this excerpt is truncated after the LFC replica lookup;
        the remainder of the transfer logic is not visible here.
        """

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        token = pdict.get('token', None)
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        experiment = pdict.get('experiment', '')
        proxycheck = pdict.get('proxycheck', False)

        # try to get the direct reading control variable (False for direct reading mode; file should not be copied)
        useCT = pdict.get('usect', True)
        prodDBlockToken = pdict.get('access', '')

        # get the DQ2 tracing report
        report = self.getStubTracingReport(pdict['report'], 'lcg', lfn, guid)

        # get a proper envsetup
        envsetup = self.getEnvsetup(get=True)

        # abort early if the site setup command is unusable
        ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
        if ec != 0:
            self.prepareReport('RFCP_FAIL', report)
            return ec, pilotErrorDiag

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        if proxycheck:
            # do we have a valid proxy?
            s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup)
            if s != 0:
                self.prepareReport('PROXYFAIL', report)
                return s, pilotErrorDiag
        else:
            tolog("Proxy verification turned off")

        getfile = gpfn

        if path == '': path = './'
        fullname = os.path.join(path, lfn)

        # should the root file be copied or read directly by athena?
        directIn, useFileStager = self.getTransferModes()
        if directIn:
            if useCT:
                # the copy tool setting overrides direct access
                directIn = False
                tolog("Direct access mode is switched off (file will be transferred with the copy tool)")
                updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="copy_to_scratch", type="input")
            else:
                # determine if the file is a root file according to its name
                rootFile = self.isRootFileName(lfn)

                if prodDBlockToken == 'local' or not rootFile:
                    directIn = False
                    tolog("Direct access mode has been switched off for this file (will be transferred with the copy tool)")
                    updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="copy_to_scratch", type="input")
                elif rootFile:
                    tolog("Found root file according to file name: %s (will not be transferred in direct reading mode)" % (lfn))
                    # no transfer will happen: clear the timing fields in the tracing report
                    report['relativeStart'] = None
                    report['transferStart'] = None
                    self.prepareReport('FOUND_ROOT', report)
                    if useFileStager:
                        updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="file_stager", type="input")
                    else:
                        updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="remote_io", type="input")
                    # special (non-fatal) code telling the caller to use direct access
                    return error.ERR_DIRECTIOFILE, pilotErrorDiag
                else:
                    tolog("Normal file transfer")

        # get remote filesize and checksum
        if fsize == 0 or fchecksum == 0:
            try:
                import lfc
            except Exception, e:
                pilotErrorDiag = "get_data() could not import lfc module: %s" % str(e)
                tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                self.prepareReport('LFC_IMPORT', report)
                return error.ERR_GETLFCIMPORT, pilotErrorDiag

            # the lfc module reads the catalog host from the environment
            os.environ['LFC_HOST'] = readpar('lfchost')
            try:
                ret, res = lfc.lfc_getreplicas([str(guid)],"")
            except Exception, e:
                pilotErrorDiag = "Failed to get LFC replicas: %s" % str(e)
                tolog("!!WARNING!!2990!! Exception caught: %s" % (pilotErrorDiag))
                tolog("Mover get_data finished (failed)")
                self.prepareReport('NO_LFC_REPS', report)
                return error.ERR_FAILEDLFCGETREPS, pilotErrorDiag
Esempio n. 32
0
                node['cpuConsumptionUnit'] = '?'
            node['cpuConversionFactor'] = job.cpuConversionFactor

            # report specific time measures
            # node['pilotTiming'] = "getJob=%s setup=%s stageIn=%s payload=%s stageOut=%s" % (job.timeGetJob, job.timeSetup, job.timeStageIn, job.timeExe, job.timeStageOut)
            node['pilotTiming'] = "%s|%s|%s|%s|%s" % (job.timeGetJob, job.timeStageIn, job.timeExe, job.timeStageOut, job.timeSetup)

        elif job.result[0] == 'holding':
            node['exeErrorCode'] = job.result[2]
            node['exeErrorDiag'] = self.__error.getPilotErrorDiag(job.result[2])

        else:
            node['cpuConsumptionUnit'] = getCPUmodel()

        # Add the utility info if it is available
        thisExperiment = getExperiment(job.experiment)
        if thisExperiment.shouldExecuteUtility():
            utility_node = thisExperiment.getUtilityInfo(job.workdir, self.__pilot_initdir, allowTxtFile=True)
            node = merge_dictionaries(node, utility_node)

        return node

    def getXML(self, job, sitename, workdir, xmlstr=None, jr=False):
        """
        Get the metadata xml for the given job.

        :param job: job object whose metadata is collected
        :param sitename: site name (used beyond this excerpt)
        :param workdir: job work directory
        :param xmlstr: optional pre-built xml string
        :param jr: job recovery flag
        NOTE(review): this excerpt is truncated right after the try statement;
        the except handler and the rest of the method are not visible here.
        """

        node_xml = ""
        tolog("getXML called")

        # for backwards compatibility
        # (older job objects may not define the 'experiment' attribute)
        try:
            experiment = job.experiment
Esempio n. 33
0
    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """
        Copy an output file from local disk to the local SE (lcg2 flavour).

        :param source: path of the local file to upload
        :param destination: SE destination (not used directly in this excerpt)
        :param fsize: known file size; 0 means measure the local file
        :param fchecksum: known checksum; 0 means compute it locally
        :param pdict: options (alt, lfn, guid, token, scope, dsname, analJob,
                      testLevel, extradirs, experiment, proxycheck,
                      prodSourceLabel, report)
        :return: on failure, self.put_data_retfail(...) tuples
        NOTE(review): this excerpt is truncated after the DQ2 site name lookup;
        the actual transfer command is not visible here.
        """
        # function is based on dCacheSiteMover put function

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        alt = pdict.get('alt', False)
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        token = pdict.get('token', '')
        scope = pdict.get('scope', '')
        dsname = pdict.get('dsname', '')
        analysisJob = pdict.get('analJob', False)
        testLevel = pdict.get('testLevel', '0')
        extradirs = pdict.get('extradirs', '')
        experiment = pdict.get('experiment', '')
        proxycheck = pdict.get('proxycheck', False)
        prodSourceLabel = pdict.get('prodSourceLabel', '')

        # get the site information object
        si = getSiteInformation(experiment)

        # PanDA Mover (ddm) transfers follow production stage-out rules
        if prodSourceLabel == 'ddm' and analysisJob:
            tolog(
                "Treating PanDA Mover job as a production job during stage-out"
            )
            analysisJob = False

        # get the DQ2 tracing report
        report = self.getStubTracingReport(pdict['report'], 'lcg2', lfn, guid)

        # preparing variables
        if fsize == 0 or fchecksum == 0:
            ec, pilotErrorDiag, fsize, fchecksum = self.getLocalFileInfo(
                source, csumtype="adler32")
            if ec != 0:
                self.prepareReport('LOCAL_FILE_INFO_FAIL', report)
                return self.put_data_retfail(ec, pilotErrorDiag)

        # now that the file size is known, add it to the tracing report
        report['filesize'] = fsize

        # get the checksum type
        if fchecksum != 0 and fchecksum != "":
            csumtype = self.getChecksumType(fchecksum)
        else:
            csumtype = "default"

        # get a proper envsetup
        if alt:
            # use a cvmfs setup for stage-out to alternative SE
            envsetup = si.getLocalEMISetup()
            if envsetup[-1] != ";":
                envsetup += "; "
        else:
            envsetup = self.getEnvsetup(alt=alt)

        # abort early if the site setup command is unusable
        ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
        if ec != 0:
            self.prepareReport('RFCP_FAIL', report)
            return self.put_data_retfail(ec, pilotErrorDiag)

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        if proxycheck:
            s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup,
                                                           limit=2)
            if s != 0:
                self.prepareReport('NO_PROXY', report)
                return self.put_data_retfail(error.ERR_NOPROXY, pilotErrorDiag)
        else:
            tolog("Proxy verification turned off")

        filename = os.path.basename(source)

        # get all the proper paths
        ec, pilotErrorDiag, tracer_error, dst_gpfn, lfcdir, surl = si.getProperPaths(
            error,
            analysisJob,
            token,
            prodSourceLabel,
            dsname,
            filename,
            scope=scope,
            alt=alt)
        if ec != 0:
            self.prepareReport(tracer_error, report)
            return self.put_data_retfail(ec, pilotErrorDiag, surl=dst_gpfn)

        putfile = surl
        full_surl = putfile
        if full_surl[:len('token:')] == 'token:':
            # remove the space token (e.g. at Taiwan-LCG2) from the SURL info
            full_surl = full_surl[full_surl.index('srm://'):]

        # srm://dcache01.tier2.hep.manchester.ac.uk/pnfs/tier2.hep.manchester.ac.uk/data/atlas/dq2/
        #testpanda.destDB/testpanda.destDB.604b4fbc-dbe9-4b05-96bb-6beee0b99dee_sub0974647/
        #86ecb30d-7baa-49a8-9128-107cbfe4dd90_0.job.log.tgz
        tolog("putfile = %s" % (putfile))
        tolog("full_surl = %s" % (full_surl))

        # get the DQ2 site name from ToA
        # (best-effort: failure only means the tracing report lacks this field)
        try:
            _dq2SiteName = self.getDQ2SiteName(surl=putfile)
        except Exception, e:
            tolog(
                "Warning: Failed to get the DQ2 site name: %s (can not add this info to tracing report)"
                % str(e))
Esempio n. 34
0
    def put_data(self, pfn, ddm_storage, fsize=0, fchecksum=0, dsname="", extradirs="", **pdict):
        """
        Copy an output file to the local SE (Castor flavour).

        :param pfn: local physical file name of the output file
        :param ddm_storage: DDM storage endpoint (only logged in this excerpt)
        :param fsize: known file size (unused in the visible part)
        :param fchecksum: known checksum (unused in the visible part)
        :param dsname: dataset name; its first two dot-separated fields pick
                       the Castor subdirectory
        :param extradirs: extra directory components (unused in the visible part)
        :param pdict: options (lfn, guid, analJob, experiment, report)
        :return: on failure, self.put_data_retfail(...) tuples
        NOTE(review): this excerpt is truncated after the RSE lookup; the
        actual rfcp/stage-out command is not visible here.
        """

        error = PilotErrors()
        pilotErrorDiag = ""

        tolog("put_data() got ddm_storage=%s" % (ddm_storage))

        # Get input parameters from pdict
        lfn = pdict.get("lfn", "")
        guid = pdict.get("guid", "")
        analJob = pdict.get("analJob", False)
        experiment = pdict.get("experiment", "")

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict["report"], "castor", lfn, guid)

        # get a proper envsetup
        envsetup = self.getEnvsetup()

        # abort early if the site setup command is unusable
        ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
        if ec != 0:
            self.prepareReport("RFCP_FAIL", report)
            return self.put_data_retfail(ec, pilotErrorDiag)

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        # do we have a valid proxy?
        s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup, limit=2)
        if s != 0:
            self.prepareReport("PROXYFAIL", report)
            return self.put_data_retfail(s, pilotErrorDiag)
        filename = pfn.split("/")[-1]

        # the current file
        report["filename"] = lfn

        # guid
        report["guid"] = guid.replace("-", "")

        # Destination is the top level Castor store area. Append a subdirectory which is first two fields of dsname, or 'other'
        destination = ""
        if not analJob:
            # seprodpath can have a complex structure in case of space tokens
            # although currently not supported in this site mover, prepare the code anyway
            # (use the first list item only)
            destination = self.getDirList(readpar("seprodpath"))[0]
            if destination == "":
                tolog("!!WARNING!!2999!! seprodpath not defined, using sepath")
                destination = readpar("sepath")
            tolog("Going to store production job output")
        else:
            destination = readpar("sepath")
            tolog("Going to store analysis job output")

        if destination == "":
            pilotErrorDiag = "put_data destination path in SE not defined"
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            self.prepareReport("DEST_PATH_UNDEF", report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)
        else:
            tolog("destination: %s" % (destination))

        if dsname == "":
            pilotErrorDiag = "Dataset name not specified to put_data"
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            self.prepareReport("NO_DSN", report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)
        #        else:
        #            dsname = self.remove_sub(dsname)
        #            tolog("dsname: %s" % (dsname))

        lfcpath, pilotErrorDiag = self.getLFCPath(analJob)
        if lfcpath == "":
            self.prepareReport("LFC_PATH_FAIL", report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)
        tolog("LFC path: %s" % (lfcpath))

        # extract the 'project.type' prefix from the dataset name,
        # e.g. "mc15.simul.whatever" -> "mc15.simul"
        pat = re.compile("([^\.]+\.[^\.]+)\..*")
        mat = pat.match(dsname)
        if mat:
            prefixdir = mat.group(1)
            castor_destination = os.path.join(destination, prefixdir)
        else:
            pilotErrorDiag = "Unexpected dataset name format: %s" % (dsname)
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            self.prepareReport("DSN_FORMAT_FAIL", report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED, pilotErrorDiag)
        tolog("SE destination: %s" % (castor_destination))

        # set up paths differently for analysis and production jobs
        # use conventional LFC paths or production jobs
        # use OSG style for analysis jobs (for the time being)
        if not analJob:
            # return full lfc file path (beginning lfcpath might need to be replaced)
            native_lfc_path = self.to_native_lfn(dsname, filename)
            # /grid/atlas/dq2/testpanda/testpanda.destDB.b7cd4b56-1b5e-465a-a5d7-38d5e2609724_sub01000457/
            # 58f836d5-ff4b-441a-979b-c37094257b72_0.job.log.tgz
            tolog("Native_lfc_path: %s" % (native_lfc_path))

            # replace the default path /grid/atlas/rucio with lfcpath if different
            # (to_native_lfn returns a path begining with /grid/atlas/rucio)
            default_lfcpath = "/grid/atlas/rucio"  # to_native_lfn always returns this at the beginning of the string
            if default_lfcpath != lfcpath:
                final_lfc_path = native_lfc_path.replace(default_lfcpath, lfcpath)
            else:
                final_lfc_path = native_lfc_path

            # name of dir to be created in LFC
            lfcdir = os.path.dirname(final_lfc_path)
            # /grid/atlas/dq2/testpanda/testpanda.destDB.dfb45803-1251-43bb-8e7a-6ad2b6f205be_sub01000492
            tolog("LFC dir: %s" % (lfcdir))

            # dst_gpfn = destination
            # dst_gpfn = os.path.join(destination, os.path.join(dsname, filename))
            # /pnfs/tier2.hep.manchester.ac.uk/data/atlas/dq2/testpanda/testpanda.destDB.dfb45803-1251-43bb-8e7a-6ad2b6f205be_sub01000492
            # tolog("dst_gpfn: %s" % (dst_gpfn))

        else:  # for analysis jobs

            lfcdir = "%s/%s/%s" % (lfcpath, prefixdir, dsname)
            tolog("lfcdir: %s" % (lfcdir))

        report["relativeStart"] = time()

        # name of dir to be created on Castor
        dirname = os.path.join(castor_destination, dsname)

        dst_gpfn = os.path.join(castor_destination, os.path.join(dsname, filename))
        tolog("dst_gpfn: %s" % (dst_gpfn))
        fppfn = os.path.abspath(pfn)

        # get the RSE from ToA
        # (best-effort: failure only means the tracing report lacks this field)
        try:
            _RSE = self.getRSE(surl=dst_gpfn)
        except Exception, e:
            tolog("Warning: Failed to get RSE: %s (can not add this info to tracing report)" % str(e))
Esempio n. 35
0
    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """
        Stage-in function: copy a file from Castor to the local dir with rfcp.

        :param gpfn: source replica path; must contain '/castor/...'
        :param lfn: logical file name; the local copy is stored under this name
        :param path: local destination directory
        :param fsize: expected file size (0 disables the size check)
        :param fchecksum: expected checksum (0 disables the checksum check)
        :param guid: file GUID for the tracing report
        :param pdict: options (usect, jobId, workDir, experiment, access, report)
        :return: (exit code, pilotErrorDiag); 0 on success, or
                 ERR_DIRECTIOFILE when the file is to be read directly
        """

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        useCT = pdict.get("usect", True)
        jobId = pdict.get("jobId", "")
        workDir = pdict.get("workDir", "")
        experiment = pdict.get("experiment", "")
        prodDBlockToken = pdict.get("access", "")

        # get the Rucio tracing report
        report = self.getStubTracingReport(pdict["report"], "castor", lfn, guid)

        # get a proper envsetup
        envsetup = self.getEnvsetup(get=True)

        # abort early if the site setup command is unusable
        ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
        if ec != 0:
            self.prepareReport("RFCP_FAIL", report)
            return ec, pilotErrorDiag

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        # do we have a valid proxy?
        s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup)
        if s != 0:
            self.prepareReport("PROXYFAIL", report)
            return s, pilotErrorDiag

        # Strip off prefix in order to use rfcp directly
        tolog("gpfn: %s" % (gpfn))
        pat = re.compile("^.*(/castor/.*)$")
        mat = pat.match(gpfn)
        if mat:
            getfile = mat.group(1)
        else:
            pilotErrorDiag = "Get file not in castor: %s" % (gpfn)
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            self.prepareReport("NO_FILE", report)
            return error.ERR_STAGEINFAILED, pilotErrorDiag

        # when the file has been copied we will rename it to the lfn (to remove the legacy __DQ2-string on some files)
        dest_path = os.path.join(path, lfn)

        # should the root file be copied or read directly by athena?
        directIn, useFileStager = self.getTransferModes()
        if directIn:
            if useCT:
                # the copy tool setting overrides direct access
                directIn = False
                tolog("Direct access mode is switched off (file will be transferred with the copy tool)")
                updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="copy_to_scratch", ftype="input")
            else:
                # determine if the file is a root file according to its name
                rootFile = self.isRootFileName(lfn)

                if prodDBlockToken == "local" or not rootFile:
                    directIn = False
                    tolog(
                        "Direct access mode has been switched off for this file (will be transferred with the copy tool)"
                    )
                    updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="copy_to_scratch", ftype="input")
                elif rootFile:
                    tolog(
                        "Found root file according to file name: %s (will not be transferred in direct reading mode)"
                        % (lfn)
                    )
                    # no transfer will happen: clear the timing fields in the tracing report
                    report["relativeStart"] = None
                    report["transferStart"] = None
                    self.prepareReport("FOUND_ROOT", report)
                    if useFileStager:
                        updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="file_stager", ftype="input")
                    else:
                        updateFileState(lfn, workDir, jobId, mode="transfer_mode", state="remote_io", ftype="input")
                    # special (non-fatal) code telling the caller to use direct access
                    return error.ERR_DIRECTIOFILE, pilotErrorDiag
                else:
                    tolog("Normal file transfer")

        # transfer the input file with rfcp
        _cmd_str = "%srfcp %s %s" % (envsetup, getfile, dest_path)
        tolog("Executing command: %s" % (_cmd_str))
        report["transferStart"] = time()
        s, o = commands.getstatusoutput(_cmd_str)
        report["validateStart"] = time()
        if s != 0:
            o = o.replace("\n", " ")
            check_syserr(s, o)

            # remove the local file before any get retry is attempted
            _status = self.removeLocal(dest_path)
            if not _status:
                tolog("!!WARNING!!1112!! Failed to remove local file, get retry will fail")

            # map the rfcp failure onto a specific pilot error code
            if o.find("No such file or directory") >= 0:
                if getfile.find("DBRelease") >= 0:
                    pilotErrorDiag = "Missing DBRelease file: %s" % (getfile)
                    tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                    ec = error.ERR_MISSDBREL
                else:
                    pilotErrorDiag = "No such file or directory: %s" % (getfile)
                    tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                    ec = error.ERR_NOSUCHFILE
            else:
                pilotErrorDiag = "rfcp failed: %d, %s" % (s, o)
                tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                ec = error.ERR_STAGEINFAILED
            self.prepareReport("RFCP_FAIL", report)
            return ec, pilotErrorDiag

        # check file size and checksum
        if fsize != 0 or fchecksum != 0:
            # which checksum type are we using?
            if fchecksum != 0 and fchecksum != "":
                csumtype = self.getChecksumType(fchecksum)
            else:
                csumtype = "default"

            # get remote file size and checksum
            ec, pilotErrorDiag, dstfsize, dstfchecksum = self.getLocalFileInfo(dest_path, csumtype=csumtype)
            tolog("File info: %d, %s, %s" % (ec, dstfsize, dstfchecksum))
            if ec != 0:
                self.prepareReport("LOCAL_FILE_INFO_FAIL", report)

                # remove the local file before any get retry is attempted
                _status = self.removeLocal(dest_path)
                if not _status:
                    tolog("!!WARNING!!1112!! Failed to remove local file, get retry will fail")

                return ec, pilotErrorDiag

            # compare remote and local file size
            if fsize != 0 and dstfsize != fsize:
                pilotErrorDiag = "Remote and local file sizes do not match for %s (%s != %s)" % (
                    os.path.basename(gpfn),
                    str(dstfsize),
                    str(fsize),
                )
                tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                self.prepareReport("FS_MISMATCH", report)

                # remove the local file before any get retry is attempted
                _status = self.removeLocal(dest_path)
                if not _status:
                    tolog("!!WARNING!!1112!! Failed to remove local file, get retry will fail")

                return error.ERR_GETWRONGSIZE, pilotErrorDiag

            # compare remote and local file checksum
            # (skipped when the catalog only holds a dummy checksum)
            if fchecksum != 0 and dstfchecksum != fchecksum and not self.isDummyChecksum(fchecksum):
                pilotErrorDiag = "Remote and local checksums (of type %s) do not match for %s (%s != %s)" % (
                    csumtype,
                    os.path.basename(gpfn),
                    dstfchecksum,
                    fchecksum,
                )
                tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))

                # remove the local file before any get retry is attempted
                _status = self.removeLocal(dest_path)
                if not _status:
                    tolog("!!WARNING!!1112!! Failed to remove local file, get retry will fail")

                if csumtype == "adler32":
                    self.prepareReport("AD_MISMATCH", report)
                    return error.ERR_GETADMISMATCH, pilotErrorDiag
                else:
                    self.prepareReport("MD5_MISMATCH", report)
                    return error.ERR_GETMD5MISMATCH, pilotErrorDiag

        # success: mark the file as transferred and send the tracing report
        updateFileState(lfn, workDir, jobId, mode="file_state", state="transferred", ftype="input")
        self.prepareReport("DONE", report)
        return 0, pilotErrorDiag
Esempio n. 36
0
class curlSiteMover(SiteMover.SiteMover):
    """ SiteMover for curl """

    copyCommand = "curl"            # copy tool identifier advertised by this mover
    checksum_command = "adler32"    # checksum algorithm used for verification
    has_mkdir = False               # mover cannot create remote directories
    has_df = False                  # mover cannot query remote free space
    has_getsize = False             # mover cannot query remote file size
    has_md5sum = True
    has_chmod = False               # mover cannot change remote permissions
    timeout = 3600                  # transfer timeout in seconds
    """ get proxy """

    # class-level SSL/proxy configuration, resolved once at class definition
    # time from the site information service
    si = SiteInformation()
    sslCert = si.getSSLCertificate()
    sslKey = sslCert
    sslCertDir = si.getSSLCertificatesDirectory()

    def __init__(self, setup_path, *args, **kwrds):
        """ Store the site setup script path; extra arguments are accepted and ignored """
        self._setup = setup_path

    def get_timeout(self):
        """ Return the transfer timeout in seconds (class attribute `timeout`) """
        return self.timeout

    def check_space(self, ub):
        """
        Report available space for storage elements whose free space cannot
        be queried with curl.

        :param ub: storage element identifier (unused)
        :return: a large constant so that space checks never block a transfer
        """
        dummy_free_space = 999999
        return dummy_free_space

    def core_get_data(self, envsetup, token, source_surl, dest_path,
                      experiment):
        """
        Stage-in core function, can be overridden (see stormSiteMover).

        Probes the SE with lcg-gt for an https turl, then downloads the file
        with curl using the pilot's SSL certificate and key.

        :param envsetup: shell snippet prepended to the transfer command
        :param token: space token (unused in the visible part)
        :param source_surl: srm:// source URL
        :param dest_path: local destination file path
        :param experiment: experiment name, used to fetch the experiment object
        NOTE(review): this excerpt is truncated after the curl invocation; the
        status handling and return value are not visible here.
        """

        error = PilotErrors()

        # determine which timeout option to use
        timeout_option = "--connect-timeout 300 --max-time %d" % (self.timeout)

        sslCert = self.sslCert
        sslKey = self.sslKey
        sslCertDir = self.sslCertDir

        # used curl options:
        # --cert: <cert[:passwd]> Client certificate file and password (SSL)
        # --capath: <directory> CA directory (made using c_rehash) to verify
        # --location: Follow Location: hints (H)
        # --output: <file> Write output to <file> instead of stdout
        # --cilent: Makes Curl mute
        # --show-error: When used with -s it makes curl show error message if it fails
        # Removed for SL6: --ciphers <list of ciphers> (SSL)  Specifies  which  ciphers  to use in the connection.
        """ define curl command string """
        _cmd_str = 'lcg-gt %s https' % (source_surl)
        # NOTE(review): if getstatusoutput raises before s/o are assigned, the
        # except handler below references undefined s and o -> NameError
        try:
            s, o = commands.getstatusoutput(_cmd_str)
            tolog("Executing command: %s" % (_cmd_str))
        except Exception, e:
            tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" %
                  (str(e), s, o))
            o = str(e)
        if s == 0:
            # lcg-gt returned an http turl; use its first token as the source
            tolog("lcg-gt supported, get http path")
            source_surl = o.strip().split()
            source_surl = source_surl[0]
            _cmd_str = '%s curl --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % (
                envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey,
                source_surl, dest_path)
#            _cmd_str = '%s curl --ciphers ALL:NULL --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % (envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey, source_surl, dest_path)
        else:
            # fall back to a naive srm:// -> https:// rewrite of the SURL
            tolog(
                "lcg-gt not supported, get http path by replacing source_surl")
            _cmd_str = '%s curl --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % (
                envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey,
                source_surl, dest_path)
            #            _cmd_str = '%s curl --ciphers ALL:NULL --silent --show-error --cacert %s %s --capath %s --cert %s --key %s -L %s -o %s' % (envsetup, sslCert, timeout_option, sslCertDir, sslCert, sslKey, source_surl, dest_path)
            _cmd_str = _cmd_str.replace("srm://", "https://")
        # add the full stage-out command to the job setup script
        #_cmd_str = _cmd_str.replace("file://", "-o ")

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        to_script = _cmd_str
        to_script = to_script.lstrip(' ')  # remove any initial spaces
        if to_script.startswith('/'):
            to_script = 'source ' + to_script
        thisExperiment.updateJobSetupScript(os.path.dirname(dest_path),
                                            to_script=to_script)

        tolog("Executing command: %s" % (_cmd_str))
        s = -1
        o = '(not defined)'
        t0 = os.times()
        try:
            s, o = commands.getstatusoutput(_cmd_str)
        except Exception, e:
            tolog("!!WARNING!!2990!! Exception caught: %s (%d, %s)" %
                  (str(e), s, o))
            o = str(e)
Esempio n. 37
0
    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """
        Copy an output file from disk to the local SE.

        :param source: path of the local file to upload
        :param destination: SE destination (not used directly in this excerpt)
        :param fsize: known file size; 0 means measure the local file
        :param fchecksum: known checksum; 0 means compute it locally
        :param pdict: options (alt, lfn, guid, token, scope, dsname, analJob,
                      testLevel, extradirs, experiment, proxycheck,
                      prodSourceLabel, report)
        :return: on failure, self.put_data_retfail(...) tuples
        NOTE(review): this excerpt is truncated after the DQ2 site name lookup.
        It reports via self.__sendReport where sibling movers use
        prepareReport — confirm which is current before unifying.
        """
        # function is based on dCacheSiteMover put function

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        alt = pdict.get('alt', False)
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        token = pdict.get('token', '')
        scope = pdict.get('scope', '')
        dsname = pdict.get('dsname', '')
        analysisJob = pdict.get('analJob', False)
        testLevel = pdict.get('testLevel', '0')
        extradirs = pdict.get('extradirs', '')
        experiment = pdict.get('experiment', '')
        proxycheck = pdict.get('proxycheck', False)
        prodSourceLabel = pdict.get('prodSourceLabel', '')

        # get the site information object
        si = getSiteInformation(experiment)

        # PanDA Mover (ddm) transfers follow production stage-out rules
        if prodSourceLabel == 'ddm' and analysisJob:
            tolog("Treating PanDA Mover job as a production job during stage-out")
            analysisJob = False

        # get the DQ2 tracing report
        report = self.getStubTracingReport(pdict['report'], 'lcg2', lfn, guid)

        # preparing variables
        if fsize == 0 or fchecksum == 0:
            ec, pilotErrorDiag, fsize, fchecksum = self.getLocalFileInfo(source, csumtype="adler32")
            if ec != 0:
                self.__sendReport('LOCAL_FILE_INFO_FAIL', report)
                return self.put_data_retfail(ec, pilotErrorDiag)

        # now that the file size is known, add it to the tracing report
        report['filesize'] = fsize

        # get the checksum type
        if fchecksum != 0 and fchecksum != "":
            csumtype = self.getChecksumType(fchecksum)
        else:
            csumtype = "default"

        # get a proper envsetup
        if alt:
            # use a cvmfs setup for stage-out to alternative SE
            envsetup = si.getLocalEMISetup()
            if envsetup[-1] != ";":
                envsetup += "; "
        else:
            envsetup = self.getEnvsetup(alt=alt)

        # abort early if the site setup command is unusable
        ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
        if ec != 0:
            self.__sendReport('RFCP_FAIL', report)
            return self.put_data_retfail(ec, pilotErrorDiag) 

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        if proxycheck:
            s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup, limit=2)
            if s != 0:
                self.__sendReport('NO_PROXY', report)
                return self.put_data_retfail(error.ERR_NOPROXY, pilotErrorDiag)
        else:
            tolog("Proxy verification turned off")

        filename = os.path.basename(source)

        # get all the proper paths
        ec, pilotErrorDiag, tracer_error, dst_gpfn, lfcdir, surl = si.getProperPaths(error, analysisJob, token, prodSourceLabel, dsname, filename, scope=scope, alt=alt)
        if ec != 0:
            self.__sendReport(tracer_error, report)
            return self.put_data_retfail(ec, pilotErrorDiag)

        putfile = surl
        full_surl = putfile
        if full_surl[:len('token:')] == 'token:':
            # remove the space token (e.g. at Taiwan-LCG2) from the SURL info
            full_surl = full_surl[full_surl.index('srm://'):]

        # srm://dcache01.tier2.hep.manchester.ac.uk/pnfs/tier2.hep.manchester.ac.uk/data/atlas/dq2/
        #testpanda.destDB/testpanda.destDB.604b4fbc-dbe9-4b05-96bb-6beee0b99dee_sub0974647/
        #86ecb30d-7baa-49a8-9128-107cbfe4dd90_0.job.log.tgz
        tolog("putfile = %s" % (putfile))
        tolog("full_surl = %s" % (full_surl))

        # get the DQ2 site name from ToA
        # (best-effort: failure only means the tracing report lacks this field)
        try:
            _dq2SiteName = self.getDQ2SiteName(surl=putfile)
        except Exception, e: 
            tolog("Warning: Failed to get the DQ2 site name: %s (can not add this info to tracing report)" % str(e))
Esempio n. 38
0
    def put_data(self,
                 pfn,
                 ddm_storage,
                 fsize=0,
                 fchecksum=0,
                 dsname='',
                 extradirs='',
                 **pdict):
        """ Copy all output file to the local SE

        Castor site mover stage-out. The visible part of this method verifies
        the env setup and the grid proxy, derives the Castor destination path
        (from seprodpath/sepath plus the first two dot-separated fields of the
        dataset name) and the LFC directory for registration, then looks up
        the RSE for the tracing report.

        NOTE(review): the method body appears truncated in this source after
        the RSE lookup -- the actual rfcp transfer and catalogue registration
        steps are not visible here.

        :param pfn: local physical file name of the output file to store
        :param ddm_storage: DDM storage string (only logged in the visible part)
        :param fsize: file size in bytes (0 = unknown; unused in the visible part)
        :param fchecksum: file checksum (0 = unknown; unused in the visible part)
        :param dsname: dataset name; must match '<field1>.<field2>.*'
        :param extradirs: extra directory components (unused in the visible part)
        :param pdict: extra options (lfn, guid, analJob, experiment, report, ...)
        """

        error = PilotErrors()
        pilotErrorDiag = ""

        tolog("put_data() got ddm_storage=%s" % (ddm_storage))

        # Get input parameters from pdict
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        analJob = pdict.get('analJob', False)
        experiment = pdict.get('experiment', '')

        # get the Rucio tracing report (stub filled in as the transfer progresses)
        report = self.getStubTracingReport(pdict['report'], 'castor', lfn,
                                           guid)

        # get a proper envsetup
        envsetup = self.getEnvsetup()

        # abort early if the setup command itself is broken
        ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
        if ec != 0:
            self.prepareReport('RFCP_FAIL', report)
            return self.put_data_retfail(ec, pilotErrorDiag)

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        # do we have a valid proxy? (require at least 2 hours left)
        s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup,
                                                       limit=2)
        if s != 0:
            self.prepareReport('PROXYFAIL', report)
            return self.put_data_retfail(s, pilotErrorDiag)
        filename = pfn.split('/')[-1]

        # the current file
        report['filename'] = lfn

        # guid (tracing report uses the dash-less form)
        report['guid'] = guid.replace('-', '')

        # Destination is the top level Castor store area. Append a subdirectory which is first two fields of dsname, or 'other'
        destination = ""
        if not analJob:
            # seprodpath can have a complex structure in case of space tokens
            # although currently not supported in this site mover, prepare the code anyway
            # (use the first list item only)
            destination = self.getDirList(readpar('seprodpath'))[0]
            if destination == "":
                tolog("!!WARNING!!2999!! seprodpath not defined, using sepath")
                destination = readpar('sepath')
            tolog("Going to store production job output")
        else:
            destination = readpar('sepath')
            tolog("Going to store analysis job output")

        if destination == '':
            pilotErrorDiag = "put_data destination path in SE not defined"
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            self.prepareReport('DEST_PATH_UNDEF', report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED,
                                         pilotErrorDiag)
        else:
            tolog("destination: %s" % (destination))

        if dsname == '':
            pilotErrorDiag = "Dataset name not specified to put_data"
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            self.prepareReport('NO_DSN', report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED,
                                         pilotErrorDiag)
#        else:
#            dsname = self.remove_sub(dsname)
#            tolog("dsname: %s" % (dsname))

        # base LFC path for catalogue registration (depends on job type)
        lfcpath, pilotErrorDiag = self.getLFCPath(analJob)
        if lfcpath == "":
            self.prepareReport('LFC_PATH_FAIL', report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED,
                                         pilotErrorDiag)
        tolog("LFC path: %s" % (lfcpath))

        # extract the first two dot-separated fields of the dataset name,
        # used as the subdirectory under the Castor store area
        pat = re.compile('([^\.]+\.[^\.]+)\..*')
        mat = pat.match(dsname)
        if mat:
            prefixdir = mat.group(1)
            castor_destination = os.path.join(destination, prefixdir)
        else:
            pilotErrorDiag = "Unexpected dataset name format: %s" % (dsname)
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            self.prepareReport('DSN_FORMAT_FAIL', report)
            return self.put_data_retfail(error.ERR_STAGEOUTFAILED,
                                         pilotErrorDiag)
        tolog("SE destination: %s" % (castor_destination))

        # set up paths differently for analysis and production jobs
        # use conventional LFC paths or production jobs
        # use OSG style for analysis jobs (for the time being)
        if not analJob:
            # return full lfc file path (beginning lfcpath might need to be replaced)
            native_lfc_path = self.to_native_lfn(dsname, filename)
            # /grid/atlas/dq2/testpanda/testpanda.destDB.b7cd4b56-1b5e-465a-a5d7-38d5e2609724_sub01000457/
            #58f836d5-ff4b-441a-979b-c37094257b72_0.job.log.tgz
            tolog("Native_lfc_path: %s" % (native_lfc_path))

            # replace the default path /grid/atlas/rucio with lfcpath if different
            # (to_native_lfn returns a path begining with /grid/atlas/rucio)
            default_lfcpath = '/grid/atlas/rucio'  # to_native_lfn always returns this at the beginning of the string
            if default_lfcpath != lfcpath:
                final_lfc_path = native_lfc_path.replace(
                    default_lfcpath, lfcpath)
            else:
                final_lfc_path = native_lfc_path

            # name of dir to be created in LFC
            lfcdir = os.path.dirname(final_lfc_path)
            # /grid/atlas/dq2/testpanda/testpanda.destDB.dfb45803-1251-43bb-8e7a-6ad2b6f205be_sub01000492
            tolog("LFC dir: %s" % (lfcdir))

            # dst_gpfn = destination
            # dst_gpfn = os.path.join(destination, os.path.join(dsname, filename))
            # /pnfs/tier2.hep.manchester.ac.uk/data/atlas/dq2/testpanda/testpanda.destDB.dfb45803-1251-43bb-8e7a-6ad2b6f205be_sub01000492
            # tolog("dst_gpfn: %s" % (dst_gpfn))

        else:  # for analysis jobs

            lfcdir = '%s/%s/%s' % (lfcpath, prefixdir, dsname)
            tolog("lfcdir: %s" % (lfcdir))

        # mark the relative start of the transfer in the tracing report
        report['relativeStart'] = time()

        # name of dir to be created on Castor
        dirname = os.path.join(castor_destination, dsname)

        # full destination path on Castor for this file
        dst_gpfn = os.path.join(castor_destination,
                                os.path.join(dsname, filename))
        tolog("dst_gpfn: %s" % (dst_gpfn))
        fppfn = os.path.abspath(pfn)

        # get the RSE from ToA (best effort; only used for the tracing report)
        try:
            _RSE = self.getRSE(surl=dst_gpfn)
        except Exception, e:
            tolog(
                "Warning: Failed to get RSE: %s (can not add this info to tracing report)"
                % str(e))
Esempio n. 39
0
def getFileSystemRootPath(experiment):
    """Return the proper file system root path (cvmfs).

    Resolves the experiment name to its experiment object and asks it
    for the CVMFS root path.
    """
    return getExperiment(experiment).getCVMFSPath()
Esempio n. 40
0
    def get_data(self, gpfn, lfn, path, fsize=0, fchecksum=0, guid=0, **pdict):
        """ stage-in function

        Copy a single input file from Castor to the local work directory
        with rfcp, unless direct access mode applies (in which case no copy
        is made and ERR_DIRECTIOFILE is returned so the file is read
        remotely). After a successful copy the local file size and checksum
        are compared against the catalogue values; on any failure the
        partially copied local file is removed so a retry can succeed.

        :param gpfn: full replica path; must contain a '/castor/...' component
        :param lfn: logical file name, used as the local destination file name
        :param path: local destination directory
        :param fsize: expected file size (0 = skip the size check)
        :param fchecksum: expected checksum (0 = skip the checksum check)
        :param guid: file guid, used for the tracing report
        :param pdict: extra options (usect, jobId, workDir, experiment, access, report)
        :returns: (exit code, pilot error diagnostics); (0, "") on success
        """

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        useCT = pdict.get('usect', True)
        jobId = pdict.get('jobId', '')
        workDir = pdict.get('workDir', '')
        experiment = pdict.get('experiment', '')
        prodDBlockToken = pdict.get('access', '')

        # get the Rucio tracing report (stub filled in as the transfer progresses)
        report = self.getStubTracingReport(pdict['report'], 'castor', lfn,
                                           guid)

        # get a proper envsetup
        envsetup = self.getEnvsetup(get=True)

        # abort early if the setup command itself is broken
        ec, pilotErrorDiag = verifySetupCommand(error, envsetup)
        if ec != 0:
            self.prepareReport('RFCP_FAIL', report)
            return ec, pilotErrorDiag

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        # do we have a valid proxy?
        s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup)
        if s != 0:
            self.prepareReport('PROXYFAIL', report)
            return s, pilotErrorDiag

        # Strip off prefix in order to use rfcp directly
        tolog("gpfn: %s" % (gpfn))
        pat = re.compile('^.*(/castor/.*)$')
        mat = pat.match(gpfn)
        if mat:
            getfile = mat.group(1)
        else:
            pilotErrorDiag = "Get file not in castor: %s" % (gpfn)
            tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
            self.prepareReport('NO_FILE', report)
            return error.ERR_STAGEINFAILED, pilotErrorDiag

        # when the file has been copied we will rename it to the lfn (to remove the legacy __DQ2-string on some files)
        dest_path = os.path.join(path, lfn)

        # should the root file be copied or read directly by athena?
        directIn, useFileStager = self.getTransferModes()
        if directIn:
            if useCT:
                # the copy tool (useCT) takes precedence over direct access
                directIn = False
                tolog(
                    "Direct access mode is switched off (file will be transferred with the copy tool)"
                )
                updateFileState(lfn,
                                workDir,
                                jobId,
                                mode="transfer_mode",
                                state="copy_to_scratch",
                                ftype="input")
            else:
                # determine if the file is a root file according to its name
                rootFile = self.isRootFileName(lfn)

                if prodDBlockToken == 'local' or not rootFile:
                    # non-root files (and files marked 'local') must be copied
                    directIn = False
                    tolog(
                        "Direct access mode has been switched off for this file (will be transferred with the copy tool)"
                    )
                    updateFileState(lfn,
                                    workDir,
                                    jobId,
                                    mode="transfer_mode",
                                    state="copy_to_scratch",
                                    ftype="input")
                elif rootFile:
                    # root file in direct access mode: skip the transfer entirely
                    tolog(
                        "Found root file according to file name: %s (will not be transferred in direct reading mode)"
                        % (lfn))
                    report['relativeStart'] = None
                    report['transferStart'] = None
                    self.prepareReport('FOUND_ROOT', report)
                    if useFileStager:
                        updateFileState(lfn,
                                        workDir,
                                        jobId,
                                        mode="transfer_mode",
                                        state="file_stager",
                                        ftype="input")
                    else:
                        updateFileState(lfn,
                                        workDir,
                                        jobId,
                                        mode="transfer_mode",
                                        state="remote_io",
                                        ftype="input")
                    # special (non-fatal) code telling the caller to use direct I/O
                    return error.ERR_DIRECTIOFILE, pilotErrorDiag
                else:
                    tolog("Normal file transfer")

        # transfer the input file with rfcp
        _cmd_str = '%srfcp %s %s' % (envsetup, getfile, dest_path)
        tolog("Executing command: %s" % (_cmd_str))
        report['transferStart'] = time()
        s, o = commands.getstatusoutput(_cmd_str)
        report['validateStart'] = time()
        if s != 0:
            # flatten the output for error matching and logging
            o = o.replace('\n', ' ')
            check_syserr(s, o)

            # remove the local file before any get retry is attempted
            _status = self.removeLocal(dest_path)
            if not _status:
                tolog(
                    "!!WARNING!!1112!! Failed to remove local file, get retry will fail"
                )

            # map the rfcp output to a specific pilot error code
            if o.find("No such file or directory") >= 0:
                if getfile.find("DBRelease") >= 0:
                    pilotErrorDiag = "Missing DBRelease file: %s" % (getfile)
                    tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                    ec = error.ERR_MISSDBREL
                else:
                    pilotErrorDiag = "No such file or directory: %s" % (
                        getfile)
                    tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                    ec = error.ERR_NOSUCHFILE
            else:
                pilotErrorDiag = "rfcp failed: %d, %s" % (s, o)
                tolog("!!WARNING!!2999!! %s" % (pilotErrorDiag))
                ec = error.ERR_STAGEINFAILED
            self.prepareReport('RFCP_FAIL', report)
            return ec, pilotErrorDiag

        # check file size and checksum
        if fsize != 0 or fchecksum != 0:
            # which checksum type are we using?
            if fchecksum != 0 and fchecksum != "":
                csumtype = self.getChecksumType(fchecksum)
            else:
                csumtype = "default"

            # get local file size and checksum of the just-copied file
            ec, pilotErrorDiag, dstfsize, dstfchecksum = self.getLocalFileInfo(
                dest_path, csumtype=csumtype)
            tolog("File info: %d, %s, %s" % (ec, dstfsize, dstfchecksum))
            if ec != 0:
                self.prepareReport('LOCAL_FILE_INFO_FAIL', report)

                # remove the local file before any get retry is attempted
                _status = self.removeLocal(dest_path)
                if not _status:
                    tolog(
                        "!!WARNING!!1112!! Failed to remove local file, get retry will fail"
                    )

                return ec, pilotErrorDiag

            # compare remote and local file size
            if fsize != 0 and dstfsize != fsize:
                pilotErrorDiag = "Remote and local file sizes do not match for %s (%s != %s)" %\
                                 (os.path.basename(gpfn), str(dstfsize), str(fsize))
                tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))
                self.prepareReport('FS_MISMATCH', report)

                # remove the local file before any get retry is attempted
                _status = self.removeLocal(dest_path)
                if not _status:
                    tolog(
                        "!!WARNING!!1112!! Failed to remove local file, get retry will fail"
                    )

                return error.ERR_GETWRONGSIZE, pilotErrorDiag

            # compare remote and local file checksum (skip known dummy checksums)
            if fchecksum != 0 and dstfchecksum != fchecksum and not self.isDummyChecksum(
                    fchecksum):
                pilotErrorDiag = "Remote and local checksums (of type %s) do not match for %s (%s != %s)" %\
                                 (csumtype, os.path.basename(gpfn), dstfchecksum, fchecksum)
                tolog('!!WARNING!!2999!! %s' % (pilotErrorDiag))

                # remove the local file before any get retry is attempted
                _status = self.removeLocal(dest_path)
                if not _status:
                    tolog(
                        "!!WARNING!!1112!! Failed to remove local file, get retry will fail"
                    )

                # report a checksum-type specific error code
                if csumtype == "adler32":
                    self.prepareReport('AD_MISMATCH', report)
                    return error.ERR_GETADMISMATCH, pilotErrorDiag
                else:
                    self.prepareReport('MD5_MISMATCH', report)
                    return error.ERR_GETMD5MISMATCH, pilotErrorDiag

        # all good: mark the file as transferred and report success
        updateFileState(lfn,
                        workDir,
                        jobId,
                        mode="file_state",
                        state="transferred",
                        ftype="input")
        self.prepareReport('DONE', report)
        return 0, pilotErrorDiag
Esempio n. 41
0
    def getUtilityInfo(self, node, experiment, workdir):
        """ Add the utility info to the node structure if available """

        # Get the experiment object and check if the special utility (e.g. a memory monitor) was used
        thisExperiment = getExperiment(experiment)
        if thisExperiment.shouldExecuteUtility():

            # Try to get the memory monitor info from the workdir first
            path = os.path.join(workdir, thisExperiment.getUtilityJSONFilename())
            init_path = os.path.join(self.__pilot_initdir, thisExperiment.getUtilityJSONFilename())
            primary_location = False
            if not os.path.exists(path):
                tolog("File does not exist: %s" % (path))
                if os.path.exists(init_path):
                    path = init_path
                else:
                    tolog("File does not exist either: %s" % (path))
                    path = ""
                primary_location = False
            else:
                primary_location = True

            if path != "":
                tolog("Reading memory monitoring info from: %s" % (path))

                # If the file is the primary one (ie the one in the workdir and not the initdir, then also check the modification time)
                read_from_file = True
                if primary_location:
                    # Get the modification time
                    mod_time = None
                    max_time = 120
                    try:
                        file_modification_time = os.path.getmtime(path)
                        current_time = int(time())
                        mod_time = current_time - file_modification_time
                        tolog("File %s was modified %d seconds ago" % (mod_time))
                    except:
                        tolog("!!WARNING!!2323!! Could not read the modification time of %s" % (path))
                        tolog("!!WARNING!!2324!! Will add -1 values for the memory info")
                        node['maxRSS'] = -1
                        node['maxVMEM'] = -1
                        node['maxSWAP'] = -1
                        node['maxPSS'] = -1
                        node['avgRSS'] = -1
                        node['avgVMEM'] = -1
                        node['avgSWAP'] = -1
                        node['avgPSS'] = -1
                        read_from_file = False
                    else:
                        if mod_time > max_time:
                            tolog("!!WARNING!!2325!! File %s was modified over %d s ago, will add -1 values for the memory info" % (path, max_time))
                            node['maxRSS'] = -1
                            node['maxVMEM'] = -1
                            node['maxSWAP'] = -1
                            node['maxPSS'] = -1
                            node['avgRSS'] = -1
                            node['avgVMEM'] = -1
                            node['avgSWAP'] = -1
                            node['avgPSS'] = -1
                            read_from_file = False

                if read_from_file:
                    # Get the dictionary
                    d = getJSONDictionary(path)
                    if d and d != {}:
                        try:
                            # Move to experiment class?
                            node['maxRSS'] = d['Max']['maxRSS']
                            node['maxVMEM'] = d['Max']['maxVMEM']
                            node['maxSWAP'] = d['Max']['maxSwap']
                            node['maxPSS'] = d['Max']['maxPSS']
                            node['avgRSS'] = d['Avg']['avgRSS']
                            node['avgVMEM'] = d['Avg']['avgVMEM']
                            node['avgSWAP'] = d['Avg']['avgSwap']
                            node['avgPSS'] = d['Avg']['avgPSS']
                        except Exception, e:
                            tolog("!!WARNING!!54541! Exception caught while parsing memory monitor JSON: %s" % (e))
                        else:
                            tolog("Extracted info from memory monitor JSON")

            # Done with the memory monitor for this job (if the file is read from the pilots' init dir), remove the file in case there are other jobs to be run
            if os.path.exists(init_path):
                try:
                    os.system("rm -rf %s" % (init_path))
                except Exception, e:
                    tolog("!!WARNING!!4343!! Failed to remove %s: %s" % (init_path), e)
                else:
                    tolog("Removed %s" % (init_path))
Esempio n. 42
0
    def put_data(self, source, destination, fsize=0, fchecksum=0, **pdict):
        """ copy output file from disk to local SE """
        # function is based on dCacheSiteMover put function

        error = PilotErrors()
        pilotErrorDiag = ""

        # Get input parameters from pdict
        lfn = pdict.get('lfn', '')
        guid = pdict.get('guid', '')
        token = pdict.get('token', '')
        scope = pdict.get('scope', '')
        dsname = pdict.get('dsname', '')
        analysisJob = pdict.get('analJob', False)
        testLevel = pdict.get('testLevel', '0')
        extradirs = pdict.get('extradirs', '')
        experiment = pdict.get('experiment', '')
        proxycheck = pdict.get('proxycheck', False)
        prodSourceLabel = pdict.get('prodSourceLabel', '')

        # get the site information object
        si = getSiteInformation(experiment)

        tolog("put_data received prodSourceLabel=%s" % (prodSourceLabel))
        if prodSourceLabel == 'ddm' and analysisJob:
            tolog(
                "Treating PanDA Mover job as a production job during stage-out"
            )
            analysisJob = False

        # get the Rucio tracing report
        try:
            report = pdict['report']
        except:
            report = {}
        else:
            # set the proper protocol
            report['protocol'] = 'curl'
            # mark the relative start
            report['catStart'] = time()
            # the current file
            report['filename'] = lfn
            # guid
            report['guid'] = guid.replace('-', '')

        # preparing variables
        if fsize == 0 or fchecksum == 0:
            ec, pilotErrorDiag, fsize, fchecksum = self.getLocalFileInfo(
                source, csumtype="adler32")
            if ec != 0:
                self.prepareReport('LOCAL_FILE_INFO_FAIL', report)
                return self.put_data_retfail(ec, pilotErrorDiag)

        # now that the file size is known, add it to the tracing report
        report['filesize'] = fsize

        # get the checksum type
        if fchecksum != 0 and fchecksum != "":
            csumtype = self.getChecksumType(fchecksum)
        else:
            csumtype = "default"

        # get a proper envsetup
        envsetup = self.getEnvsetup()

        # get the experiment object
        thisExperiment = getExperiment(experiment)

        if proxycheck:
            s, pilotErrorDiag = thisExperiment.verifyProxy(envsetup=envsetup,
                                                           limit=2)
            if s != 0:
                self.prepareReport('NO_PROXY', report)
                return self.put_data_retfail(error.ERR_NOPROXY, pilotErrorDiag)
        else:
            tolog("Proxy verification turned off")

        filename = os.path.basename(source)

        # get all the proper paths
        ec, pilotErrorDiag, tracer_error, dst_gpfn, lfcdir, surl = si.getProperPaths(
            error,
            analysisJob,
            token,
            prodSourceLabel,
            dsname,
            filename,
            scope=scope,
            sitemover=self)  # quick workaround
        if ec != 0:
            self.prepareReport(tracer_error, report)
            return self.put_data_retfail(ec, pilotErrorDiag)

        putfile = surl
        full_surl = putfile
        if full_surl[:len('token:')] == 'token:':
            # remove the space token (e.g. at Taiwan-LCG2) from the SURL info
            full_surl = full_surl[full_surl.index('srm://'):]

        # srm://dcache01.tier2.hep.manchester.ac.uk/pnfs/tier2.hep.manchester.ac.uk/data/atlas/dq2/
        #testpanda.destDB/testpanda.destDB.604b4fbc-dbe9-4b05-96bb-6beee0b99dee_sub0974647/
        #86ecb30d-7baa-49a8-9128-107cbfe4dd90_0.job.log.tgz
        tolog("putfile: %s" % (putfile))
        tolog("full_surl: %s" % (full_surl))

        # get https surl
        full_http_surl = full_surl.replace("srm://", "https://")

        # get the RSE from ToA
        try:
            _RSE = self.getRSE(surl=putfile)
        except Exception, e:
            tolog(
                "Warning: Failed to get RSE: %s (can not add this info to tracing report)"
                % str(e))