Example #1
 def checkDatasetStatus(self, dataset, kwargs):
     res = self.dbs.dbs.listDatasets(dataset=dataset,
                                     detail=1,
                                     dataset_access_type='*')
     if len(res) > 1:
         raise TaskWorkerException(
             "Found more than one dataset while checking in DBS the status of %s"
             % dataset)
     if len(res) == 0:
         raise TaskWorkerException(
             "Cannot find dataset %s in %s DBS instance" %
             (dataset, self.dbsInstance))
     res = res[0]
     self.logger.info("Input dataset details: %s", pprint.pformat(res))
     accessType = res['dataset_access_type']
     if accessType != 'VALID':
         # as per Dima's suggestion https://github.com/dmwm/CRABServer/issues/4739
         msgForDeprecDS = "Please contact your physics group if you think the dataset should not be deprecated."
         if kwargs['task']['tm_nonvalid_input_dataset'] != 'T':
             msg = "CRAB refuses to proceed in getting the details of the dataset %s from DBS, because the dataset is not 'VALID' but '%s'." % (
                 dataset, accessType)
             if accessType == 'DEPRECATED':
                 msg += " (%s)" % (msgForDeprecDS)
             msg += " To allow CRAB to consider a dataset that is not 'VALID', set Data.allowNonValidInputDataset = True in the CRAB configuration."
             msg += " Notice that this will not force CRAB to run over all files in the dataset;"
             msg += " CRAB will still check if there are any valid files in the dataset and run only over those files."
             raise TaskWorkerException(msg)
         msg = "The input dataset %s is not 'VALID' but '%s'." % (
             dataset, accessType)
         msg += " CRAB will check if there are any valid files in the dataset and run only over those files."
         if accessType == 'DEPRECATED':
             msg += " %s" % (msgForDeprecDS)
         self.uploadWarning(msg, kwargs['task']['user_proxy'],
                            kwargs['task']['tm_taskname'])
     return
Example #2
 def resubmitPublication(self, asourl, asodb, proxy, taskname):
     """
     Resubmit failed publications by resetting the publication
     status in the CouchDB documents.
     """
     server = CouchServer(dburl=asourl, ckey=proxy, cert=proxy)
     try:
         database = server.connectDatabase(asodb)
     except Exception as ex:
         msg = "Error while trying to connect to CouchDB: %s" % (str(ex))
         raise TaskWorkerException(msg)
     try:
         failedPublications = database.loadView('DBSPublisher', 'PublicationFailedByWorkflow',\
                 {'reduce': False, 'startkey': [taskname], 'endkey': [taskname, {}]})['rows']
     except Exception as ex:
         msg = "Error while trying to load view 'DBSPublisher.PublicationFailedByWorkflow' from CouchDB: %s" % (str(ex))
         raise TaskWorkerException(msg)
     msg = "There are %d failed publications to resubmit: %s" % (len(failedPublications), failedPublications)
     self.logger.info(msg)
     for doc in failedPublications:
         docid = doc['id']
         if doc['key'][0] != taskname: # this should never happen...
             msg = "Skipping document %s as it seems to correspond to another task: %s" % (docid, doc['key'][0])
             self.logger.warning(msg)
             continue
         data = {'last_update': time.time(),
                 'retry': str(datetime.datetime.now()),
                 'publication_state': 'not_published',
                }
         try:
             database.updateDocument(docid, 'DBSPublisher', 'updateFile', data)
         except Exception as ex:
             msg = "Error updating document %s in CouchDB: %s" % (docid, str(ex))
             self.logger.error(msg)
     return
Example #3
def getWritePFN(rucioClient=None, siteName='', lfn='', logger=None):
    """
    Convert a single LFN into a PFN which can be used for writing via Rucio.
    Rucio supports the possibility that at some point in the future sites may
    require different protocols or hosts for read or write operations.
    :param rucioClient: Rucio python client, e.g. the object returned by getNativeRucioClient above
    :param siteName: e.g. 'T2_CH_CERN'
    :param lfn: a CMS-style LFN
    :param logger: a valid logger instance
    :return: a CMS-style PFN
    """

    # add a scope to turn LFN into Rucio DID syntax
    did = 'cms:' + lfn
    try:
        didDict = rucioClient.lfns2pfns(siteName, [did], operation='write')
    except Exception as ex:
        logger.warning('Rucio lfn2pfn resolution for Write failed with:\n%s',
                       ex)
        logger.warning("Will try with operation='read'")
        try:
            didDict = rucioClient.lfns2pfns(siteName, [did], operation='read')
        except Exception as ex:
            msg = 'lfn2pfn resolution with Rucio failed for site: %s  LFN: %s' % (
                siteName, lfn)
            msg += ' with exception :\n%s' % str(ex)
            raise TaskWorkerException(msg)

    # lfns2pfns returns a dictionary with did as key and pfn as value:
    #  https://rucio.readthedocs.io/en/latest/api/rse.html
    # {u'cms:/store/user/rucio': u'gsiftp://eoscmsftp.cern.ch:2811/eos/cms/store/user/rucio'}
    pfn = didDict[did]

    return pfn
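
A minimal call sketch for the function above (not from the source). The Rucio client is assumed to come from a helper like the getNativeRucioClient mentioned in the docstring; the config object, site name and LFN are illustrative:

    import logging

    logger = logging.getLogger("StageoutCheck")
    rucioClient = getNativeRucioClient(config=config, logger=logger)  # hypothetical helper, see docstring above
    try:
        pfn = getWritePFN(rucioClient=rucioClient, siteName='T2_CH_CERN',
                          lfn='/store/user/jdoe/crab3check.tmp', logger=logger)
        logger.info("resolved write PFN: %s", pfn)
    except TaskWorkerException as ex:
        logger.error("could not resolve a write PFN: %s", ex)
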
Example #4
 def keepOnlyDisks(self, locationsMap):
     phedex = PhEDEx()  # TODO use certs from the config!
     # get all the PNNs that are of kind 'Disk'
     try:
         diskLocations = set([
             pnn['name'] for pnn in phedex.getNodeMap()['phedex']['node']
             if pnn['kind'] == 'Disk'
         ])
     except HTTPException as ex:
         self.logger.error(ex.headers)
          raise TaskWorkerException("The CRAB3 server backend could not contact PhEDEx to get the list of site storages.\n"+\
                              "This could be a temporary PhEDEx glitch, please try to submit a new task (resubmit will not work)"+\
                              " and contact the experts if the error persists.\nError reason: %s" % str(ex)) # TODO add the PhEDEx nodes so the user can check themselves
     diskLocationsMap = {}
     for block, locations in locationsMap.iteritems():
         locations[:] = [
             x for x in locations if x != 'T3_CH_CERN_OpenData'
         ]  # ignore OpenData until it is accessible by CRAB
         if set(locations) & diskLocations:
             # at least some locations are disk
             diskLocationsMap[block] = locationsMap[block]
         else:
             # no locations are in the disk list, assume that they are tape
             self.tapeLocations = self.tapeLocations.union(
                 set(locations) - diskLocations)
     locationsMap.clear()  # remove all blocks
     locationsMap.update(
         diskLocationsMap)  # add only blocks with disk locations
Example #5
 def execute(self, *args, **kwargs):
     self.logger.info("Data discovery with DBS") ## to be changed into debug
     old_cert_val = os.getenv("X509_USER_CERT")
     old_key_val = os.getenv("X509_USER_KEY")
     os.environ['X509_USER_CERT'] = self.config.TaskWorker.cmscert
     os.environ['X509_USER_KEY'] = self.config.TaskWorker.cmskey
     # DBS3 requires X509_USER_CERT to be set - but we don't want to leak that to other modules
     dbsurl = self.config.Services.DBSUrl
     if kwargs['task']['tm_dbs_url']:
         dbsurl = kwargs['task']['tm_dbs_url']
     dbs = get_dbs(dbsurl)
     #
     if old_cert_val != None:
         os.environ['X509_USER_CERT'] = old_cert_val
     else:
         del os.environ['X509_USER_CERT']
     if old_key_val != None:
         os.environ['X509_USER_KEY'] = old_key_val
     else:
         del os.environ['X509_USER_KEY']
     self.logger.debug("Data discovery through %s for %s" %(dbs, kwargs['task']['tm_taskname']))
     try:
         # Get the list of blocks for the locations and then call dls.
         # The WMCore DBS3 implementation makes one call to dls for each block
         # with locations = True so we are using locations=False and looking up location later
         blocks = [ x['Name'] for x in dbs.getFileBlocksInfo(kwargs['task']['tm_input_dataset'], locations=False)]
      except DBSReaderError as dbsexc:
          # dataset not found in DBS is a known use case
          if 'No matching data' in str(dbsexc):
              raise TaskWorkerException("The CRAB3 server backend could not find dataset %s in this DBS instance: %s" % (kwargs['task']['tm_input_dataset'], dbsurl))
         raise
Example #6
 def execute(self, *args, **kw):
     try:
         return self.executeInternal(*args, **kw)
     except Exception as e:
         msg = "Failed to upload dry run tarball for %s; '%s'" % (
             kw['task']['tm_taskname'], str(e))
         raise TaskWorkerException(msg)
Example #7
def checkMemoryWalltime(info, task, cmd, logger, warningUploader):
    """ Check memory and walltime and if user requires too much:
        - upload warning back to crabserver
        - change walltime to max 47h Issue: #4742
    """

    stdmaxjobruntime = 2750
    runtime = task[cmd+'_maxjobruntime']
    memory = task[cmd+'_maxmemory']
    ncores = task[cmd+'_numcores']
    if ncores is None:
        ncores = 1
    absmaxmemory = max(MAX_MEMORY_SINGLE_CORE, ncores*MAX_MEMORY_PER_CORE)
    if runtime is not None and runtime > stdmaxjobruntime:
        msg = "Task requests %s minutes of runtime, but only %s minutes are guaranteed to be available." % (runtime, stdmaxjobruntime)
        msg += " Jobs may not find a site where to run."
        msg += " CRAB has changed this value to %s minutes." % (stdmaxjobruntime)
        logger.warning(msg)
        if info is not None:
            info['tm_maxjobruntime'] = str(stdmaxjobruntime)
        # somehow TaskAction/uploadWarning wants the user proxy to make a POST to the task DB
        warningUploader(msg, task['user_proxy'], task['tm_taskname'])
    if memory is not None and memory > absmaxmemory:
        msg = "Task requests %s MB of memory, above the allowed maximum of %s" % (memory, absmaxmemory)
        msg += " for a %d core(s) job.\n" % ncores
        logger.error(msg)
        raise TaskWorkerException(msg)
    if memory is not None and memory > MAX_MEMORY_PER_CORE:
        if ncores is not None and ncores < 2:
            msg = "Task requests %s MB of memory, but only %s MB are guaranteed to be available." % (memory, MAX_MEMORY_PER_CORE)
            msg += " Jobs may not find a site where to run and stay idle forever."
            logger.warning(msg)
            # somehow TaskAction/uploadWarning wants the user proxy to make a POST to the task DB
            warningUploader(msg, task['user_proxy'], task['tm_taskname'])
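
A hedged usage sketch for checkMemoryWalltime (not from the source): the task values are made up, warningUploader is assumed to be the bound TaskAction.uploadWarning method, and MAX_MEMORY_SINGLE_CORE / MAX_MEMORY_PER_CORE are the module constants used above:

    # Illustrative task dict; keys mirror the ones read above, values are made up.
    task = {'tm_taskname': '230101_120000:jdoe_crab_test',
            'user_proxy': '/path/to/proxy',
            'tm_maxjobruntime': 5000,   # minutes, above the 2750-minute cap
            'tm_maxmemory': 2000,       # MB, assumed below the per-core limit
            'tm_numcores': 1}
    info = {}
    checkMemoryWalltime(info, task, 'tm', logger, warningUploader)
    # the runtime check runs first: info['tm_maxjobruntime'] is now '2750'
    # and a warning has been uploaded to the task DB
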
Example #8
 def checkPermissions(self, Cmd):
     """
     Execute command and in case of permanent issue, raise error
     If issue unknown, upload warning message and return 1
     Return 0 otherwise
     """
     self.logger.info("Executing command: %s ", Cmd)
     out, err, exitcode = executeCommand(Cmd)
     if exitcode != 0:
         isPermanent, failure, dummyExitCode = isFailurePermanent(err)
         if isPermanent:
             msg = "CRAB3 refuses to send jobs to grid scheduler for %s. Error message: %s" %(self.task['tm_taskname'], failure)
             msg += "\n" + out
             msg += "\n" + err
             self.logger.warning(msg)
             raise TaskWorkerException(msg)
         else:
             # Unknown error. Operators should check it from time to time and add failures if they are permanent.
             self.logger.warning("CRAB3 was not able to identify if failure is permanent. Err: %s Out: %s ExitCode: %s", err, out, exitcode)
             # Upload warning to user about not being able to check stageout
             msg = "The CRAB3 server got a non-critical error while checking stageout permissions. Please use checkwrite to check if everything is fine."
             self.uploadWarning(msg, self.task['user_proxy'], self.task['tm_taskname'])
             self.logger.info("UNKNOWN ERROR. Operator should check if it is permanent, but for now we go ahead and submit a task.")
             return 1
     return 0
Example #9
    def killAll(self, jobConst):

        # We need to keep ROOT, PROCESSING, and TAIL DAGs in hold until periodic remove kicks in.
        # See DagmanSubmitter.py#L390 (dagAd["PeriodicRemove"])
        # This is needed in case user wants to resubmit.
        rootConst = 'stringListMember(TaskType, "ROOT PROCESSING TAIL", " ") && CRAB_ReqName =?= %s' % HTCondorUtils.quote(self.workflow)

        # Holding DAG job does not mean that it will remove all jobs
        # and this must be done separately
        # --------------------------------------
        # From HTCondor documentation
        # http://research.cs.wisc.edu/htcondor/manual/v8.3/2_10DAGMan_Applications.html#SECTION003107000000000000000
        # --------------------------------------
        # After placing the condor_dagman job on hold, no new node jobs will be submitted,
        # and no PRE or POST scripts will be run. Any node jobs already in the HTCondor queue
        # will continue undisturbed. If the condor_dagman job is left on hold, it will remain
        # in the HTCondor queue after all of the currently running node jobs are finished.
        # --------------------------------------
        # TODO: Remove jobConst query when htcondor ticket is solved
        # https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=5175

        with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
            if not parent:
                with self.schedd.transaction() as dummytsc:
                    self.schedd.act(htcondor.JobAction.Hold, rootConst)
                    self.schedd.act(htcondor.JobAction.Remove, jobConst)
        results = rpipe.read()
        if results != "OK":
            msg  = "The CRAB server backend was not able to kill the task,"
            msg += " because the Grid scheduler answered with an error."
            msg += " This is probably a temporary glitch. Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Error reason: %s" % (results)
            raise TaskWorkerException(msg)
Example #10
    def execute(self, *args, **kwargs):
        """
        The execute method of the DagmanKiller class.
        """
        self.executeInternal(*args, **kwargs)
        try:
            ## AndresT: If a task was in FAILED status before the kill, then the new status
            ## after killing some jobs should be FAILED again, not SUBMITTED. However, in
            ## the long term we would like to introduce a final node in the DAG, and I think
            ## the idea would be that the final node will put the task status into FAILED or
            ## COMPLETED (in the TaskDB) once all jobs are finished. In that case I think
            ## also the status method from HTCondorDataWorkflow would not have to return any
            ## adhoc task status anymore (it would just return what is in the TaskDB) and
            ## that also means that FAILED task status would only be a terminal status that
            ## I guess should not accept a kill (because it doesn't make sense to kill a
            ## task for which all jobs have already finished -successfully or not-).
            configreq = {
                'subresource': 'state',
                'workflow': kwargs['task']['tm_taskname'],
                'status': 'KILLED'
            }
            self.logger.debug(
                "Setting the task as successfully killed with %s",
                str(configreq))
            self.crabserver.post(api='workflowdb',
                                 data=urllib.urlencode(configreq))
        except HTTPException as hte:
            self.logger.error(hte.headers)
            msg = "The CRAB server successfully killed the task,"
            msg += " but was unable to update the task status to %s in the database." % (
                configreq['status'])
            msg += " This should be a harmless (temporary) error."
            raise TaskWorkerException(msg)

        return Result.Result(task=kwargs['task'], result='OK')
Example #11
    def executeInternal(self, *args, **kw):
        tempDir = args[0][0]
        inputFiles = args[0][3]
        splitterResult = args[0][4]

        cwd = os.getcwd()
        try:
            os.chdir(tempDir)
            splittingSummary = SplittingSummary(kw['task']['tm_split_algo'])
            for jobgroup in splitterResult:
                jobs = jobgroup.getJobs()
                splittingSummary.addJobs(jobs)
            splittingSummary.dump('splitting-summary.json')
            inputFiles.append('splitting-summary.json')

            self.packSandbox(inputFiles)

            self.logger.info('Uploading dry run tarball to the user file cache')
            ufc = UserFileCache(dict={'cert': kw['task']['user_proxy'], 'key': kw['task']['user_proxy'], 'endpoint': kw['task']['tm_cache_url']})
            result = ufc.uploadLog('dry-run-sandbox.tar.gz')
            os.remove('dry-run-sandbox.tar.gz')
            if 'hashkey' not in result:
                raise TaskWorkerException('Failed to upload dry-run-sandbox.tar.gz to the user file cache: ' + str(result))
            else:
                self.logger.info('Uploaded dry run tarball to the user file cache: ' + str(result))
                update = {'workflow': kw['task']['tm_taskname'], 'subresource': 'state', 'status': 'UPLOADED'}
                self.logger.debug('Updating task status: %s' % str(update))
                self.server.post(self.resturi, data=urllib.urlencode(update))

        finally:
            os.chdir(cwd)

        return Result(task=kw['task'], result=args[0])
Example #12
 def execute(self, *args, **kwargs):
     result = None
     proxycfg = {'vo': kwargs['task']['tm_user_vo'],
                 'logger': self.logger,
                 'myProxySvr': self.config.Services.MyProxy,
                 'proxyValidity' : '144:0',
                 'min_time_left' : 36000, ## do we need this ? or should we use self.myproxylen? 
                 'userDN' : kwargs['task']['tm_user_dn'],
                 'group' : kwargs['task']['tm_user_group'] if kwargs['task']['tm_user_group'] else '',
                 'role' : kwargs['task']['tm_user_role'] if kwargs['task']['tm_user_role'] else '',
                 'server_key': self.config.MyProxy.serverhostkey,
                 'server_cert': self.config.MyProxy.serverhostcert,
                 'serverDN': self.config.MyProxy.serverdn,
                 'uisource': getattr(self.config.MyProxy, 'uisource', ''),
                 'credServerPath': self.config.MyProxy.credpath,
                 'myproxyAccount' : self.server['host'],
                 'cleanEnvironment' : getattr(self.config.MyProxy, 'cleanEnvironment', False)
                }
     proxy = Proxy(proxycfg)
     userproxy = proxy.getProxyFilename(serverRenewer=True)
     proxy.logonRenewMyProxy()
     timeleft = proxy.getTimeLeft(userproxy)
     if timeleft is None or timeleft <= 0:
         msg = "Impossible to retrieve proxy from %s for %s." % (proxycfg['myProxySvr'], proxycfg['userDN'])
         raise TaskWorkerException(msg)
     else:
         kwargs['task']['user_proxy'] = userproxy
         result = Result(task=kwargs['task'], result='OK')
     return result
Example #13
    def executeInternal(self, apmon, *args, **kwargs):
        #Marco: I guess these value errors only happen for development instances
        if 'task' not in kwargs:
            raise ValueError("No task specified.")
        self.task = kwargs['task']
        if 'tm_taskname' not in self.task:
            raise ValueError("No taskname specified")
        self.workflow = self.task['tm_taskname']
        if 'user_proxy' not in self.task:
            raise ValueError("No proxy provided")
        self.proxy = self.task['user_proxy']

        self.logger.info("About to kill workflow: %s." % self.workflow)

        self.workflow = str(self.workflow)
        if not WORKFLOW_RE.match(self.workflow):
            raise Exception("Invalid workflow name.")

        # Query HTCondor for information about running jobs and update Dashboard appropriately
        if self.task['tm_collector']:
            self.backendurls['htcondorPool'] = self.task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        address = ""
        try:
            self.schedd, address = loc.getScheddObjNew(self.task['tm_schedd'])
        except Exception as exp:
            msg  = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s" % (self.workflow, msg))
            raise TaskWorkerException(msg)

        try:
            hostname = socket.getfqdn()
        except:
            hostname = ''

        const = 'CRAB_ReqName =?= %s && TaskType=?="Job"' % HTCondorUtils.quote(self.workflow)
        try:
            for ad in list(self.schedd.xquery(const, ['CRAB_Id', 'CRAB_Retry'])):
                if ('CRAB_Id' not in ad) or ('CRAB_Retry' not in ad):
                    continue
                jobid = str(ad.eval('CRAB_Id'))
                jobretry = str(ad.eval('CRAB_Retry'))
                jinfo = {'broker': hostname,
                         'bossId': jobid,
                         'StatusValue': 'killed',
                        }
                insertJobIdSid(jinfo, jobid, self.workflow, jobretry)
                self.logger.info("Sending kill info to Dashboard: %s" % str(jinfo))
                apmon.sendToML(jinfo)
        except:
            self.logger.exception("Failed to notify Dashboard of job kills") #warning

        # Note that we can not send kills for jobs not in queue at this time; we'll need the
        # DAG FINAL node to be fixed and the node status to include retry number.
        return self.killAll(const)
Example #14
    def executeInternal(self, *args, **kw):
        inputFiles = args[0][2]
        splitterResult = args[0][3][0]

        cwd = os.getcwd()
        try:
            os.chdir(kw['tempDir'])
            splittingSummary = SplittingSummary(kw['task']['tm_split_algo'])
            for jobgroup in splitterResult:
                jobs = jobgroup.getJobs()
                splittingSummary.addJobs(jobs)
            splittingSummary.dump('splitting-summary.json')
            inputFiles.append('splitting-summary.json')

            self.packSandbox(inputFiles)

            self.logger.info(
                'Uploading dry run tarball to the user file cache')
            if 'S3' in kw['task']['tm_cache_url'].upper():
                uploadToS3(crabserver=self.crabserver,
                           filepath='dry-run-sandbox.tar.gz',
                           objecttype='runtimefiles',
                           taskname=kw['task']['tm_taskname'],
                           logger=self.logger)
                result = {
                    'hashkey': 'ok'
                }  # a dummy one to keep same semantics as when using UserFileCache
                os.remove('dry-run-sandbox.tar.gz')
            else:
                ufc = UserFileCache(
                    mydict={
                        'cert': kw['task']['user_proxy'],
                        'key': kw['task']['user_proxy'],
                        'endpoint': kw['task']['tm_cache_url']
                    })
                result = ufc.uploadLog('dry-run-sandbox.tar.gz')
                os.remove('dry-run-sandbox.tar.gz')
            if 'hashkey' not in result:
                raise TaskWorkerException(
                    'Failed to upload dry-run-sandbox.tar.gz to the user file cache: '
                    + str(result))
            self.logger.info(
                'Uploaded dry run tarball to the user file cache: %s',
                str(result))
            update = {
                'workflow': kw['task']['tm_taskname'],
                'subresource': 'state',
                'status': 'UPLOADED'
            }
            self.logger.debug('Updating task status: %s', str(update))
            self.crabserver.post(api='workflowdb',
                                 data=urllib.urlencode(update))

        finally:
            os.chdir(cwd)

        return Result(task=kw['task'], result=args[0])
Example #15
    def execute(self, *args, **kw):
        """
        Main execute
        """
        self.task = kw['task']
        # Do not check it for HC
        # ActivitiesToRunEverywhere is used mainly for HC and there is no need to check for it.
        if hasattr(self.config.TaskWorker, 'ActivitiesToRunEverywhere') and \
                   self.task['tm_activity'] in self.config.TaskWorker.ActivitiesToRunEverywhere:
            self.logger.info("Will not check possibility to write to destination site because activity: %s is in ActivitiesToRunEverywhere", self.task['tm_activity'])
            return
        # If user specified no output and no logs transfer, there is also no need to check it.
        if self.task['tm_save_logs'] == 'F' and self.task['tm_transfer_outputs'] == 'F':
            self.logger.info("Will not check possibility to write to destination site because the user specified not to transfer any output/log files.")
            return
        # Do not need to check if it is dryrun.
        if self.task['tm_dry_run'] == 'T':
            self.logger.info("Will not check possibility to write to destination site. User specified dryrun option.")
            return
        self.workflow = self.task['tm_taskname']
        self.proxy = self.task['user_proxy']

        # In test machines this check is often only annoying
        if hasattr(self.config.TaskWorker, 'checkStageout') and not self.config.TaskWorker.checkStageout:
            self.logger.info("StageoutCheck disabled in this TaskWorker configuration. Skipping.")
            return

        # OK, we are interested in telling if output can be actually transferred to user destination
        # if the user wants to use Rucio, we can only check quota, since the transfer will be done
        # by Rucio robot without using user credentials
        if self.task['tm_output_lfn'].startswith('/store/user/rucio'):
            # to be filled with actual quota check, for the time being.. just go
            return
        # if not using Rucio, old code:
        else:
            cpCmd, rmCmd, append = getCheckWriteCommand(self.proxy, self.logger)
            if not cpCmd:
                self.logger.info("Can not check write permissions. No GFAL2 or LCG commands installed. Continuing")
                return
            self.logger.info("Will check stageout at %s", self.task['tm_asyncdest'])
            filename = re.sub("[-:_]", "", self.task['tm_taskname']) + '_crab3check.tmp'
            try:
                lfn = os.path.join(self.task['tm_output_lfn'], filename)
                pfn = getWritePFN(self.rucioClient, siteName=self.task['tm_asyncdest'], lfn=lfn, logger=self.logger)
                cpCmd += append + os.path.abspath(filename) + " " + pfn
                rmCmd += " " + pfn
                createDummyFile(filename, self.logger)
                self.logger.info("Executing cp command: %s ", cpCmd)
                res = self.checkPermissions(cpCmd)
                if res == 0:
                    self.logger.info("Executing rm command: %s ", rmCmd)
                    self.checkPermissions(rmCmd)
            except IOError as er:
                raise TaskWorkerException("TaskWorker disk is full: %s" % er)
            finally:
                removeDummyFile(filename, self.logger)
            return
Example #16
    def execute(self, *args, **kwargs):
        """ Execute is the core method that submit the task to the schedd.
            The schedd can be defined in the tm_schedd task parameter if the user selected a schedd or it is a retry, otherwise it is empty.
                If it is empty the method will choose one
                If it contains a schedd it will do the duplicate check and try to submit the task to it
                In case of multiple failures it will set a new schedd and return to the action handler for retries.
        """
        task =  kwargs['task']
        schedd = task['tm_schedd']
        info = args[0][0]
        dashboardParams = args[0][1]
        inputFiles = args[0][2]

        self.checkMemoryWalltime(info, task)

        if not schedd:
            schedd = self.pickAndSetSchedd(task)

        self.logger.debug("Starting duplicate check")
        dupRes = self.duplicateCheck(task)
        self.logger.debug("Duplicate check finished with result %s", dupRes)
        if dupRes != None:
            return dupRes


        for retry in range(self.config.TaskWorker.max_retry + 1): #max_retry can be 0
            self.logger.debug("Trying to submit task %s to schedd %s for the %s time.", task['tm_taskname'], schedd, str(retry))
            try:
                execInt = self.executeInternal(info, dashboardParams, inputFiles, **kwargs)
                scheddStats.success(schedd, self.clusterId)
                return execInt
            except Exception as ex: #pylint: disable=broad-except
                scheddStats.failure(schedd)
                msg = "Failed to submit task %s; '%s'"% (task['tm_taskname'], str(ex))
                self.logger.exception(msg)
                scheddStats.taskError(schedd, msg)
                if retry < self.config.TaskWorker.max_retry: #do not sleep on the last retry
                    self.logger.error("Will retry in %s seconds on %s.", self.config.TaskWorker.retry_interval[retry], schedd)
                    time.sleep(self.config.TaskWorker.retry_interval[retry])
            finally:
                self.logger.info(scheddStats)
            ## All the submission retries to the current schedd have failed. Record the
            ## failures.

        ## Returning back to Handler.py for retries, and in case try on a new schedd
        self.logger.debug("Choosing a new schedd and then retrying")
        schedd = self.pickAndSetSchedd(task)

        ## All the submission retries to this schedd have failed.
        msg = "The CRAB server backend was not able to submit the jobs to the Grid schedulers."
        msg += " This could be a temporary glitch. Please try again later."
        msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
        msg += " The submission was retried %s times on %s schedulers." % (sum([len(x) for x in scheddStats.taskErrors.values()]), len(scheddStats.taskErrors))
        msg += " These are the failures per Grid scheduler: %s" % (str(scheddStats.taskErrors))

        raise TaskWorkerException(msg, retry=(schedd != None))
Example #17
 def keepOnlyDisks(self, locationsMap):
     phedex = PhEDEx() #TODO use certs from the config!
     #get all the PNN that are of kind disk
     try:
         diskLocations = set([pnn['name'] for pnn in phedex.getNodeMap()['phedex']['node'] if pnn['kind']=='Disk'])
      except Exception as ex:  # TODO should we catch HTTPException instead?
          self.logger.exception(ex)
          raise TaskWorkerException("The CRAB3 server backend could not contact PhEDEx to get the list of site storages.\n"+\
                              "This could be a temporary PhEDEx glitch, please try to submit a new task (resubmit will not work)"+\
                              " and contact the experts if the error persists.\nError reason: %s" % str(ex)) # TODO add the PhEDEx nodes so the user can check themselves
Example #18
    def execute(self, *args, **kwargs):
        wmwork = Workflow(name=kwargs['task']['tm_taskname'])

        wmsubs = Subscription(
            fileset=args[0],
            workflow=wmwork,
            split_algo=kwargs['task']['tm_split_algo'],
            type=self.jobtypeMapper[kwargs['task']['tm_job_type']])
        splitter = SplitterFactory()
        jobfactory = splitter(subscription=wmsubs)
        splitparam = kwargs['task']['tm_split_args']
        splitparam['algorithm'] = kwargs['task']['tm_split_algo']
        if kwargs['task']['tm_job_type'] == 'Analysis':
            if kwargs['task']['tm_split_algo'] == 'FileBased':
                splitparam['total_files'] = kwargs['task']['tm_totalunits']
            elif kwargs['task']['tm_split_algo'] == 'LumiBased':
                splitparam['total_lumis'] = kwargs['task']['tm_totalunits']
        elif kwargs['task']['tm_job_type'] == 'PrivateMC':
            if 'tm_events_per_lumi' in kwargs['task'] and kwargs['task'][
                    'tm_events_per_lumi']:
                splitparam['events_per_lumi'] = kwargs['task'][
                    'tm_events_per_lumi']
            if 'tm_generator' in kwargs['task'] and kwargs['task'][
                    'tm_generator'] == 'lhe':
                splitparam['lheInputFiles'] = True
        splitparam['applyLumiCorrection'] = True
        factory = jobfactory(**splitparam)
        if len(factory) == 0:
            raise TaskWorkerException("The CRAB3 server backend could not submit any job to the Grid scheduler:\n"+\
                        "splitting task %s on dataset %s with %s method does not generate any job")
        #printing duplicated lumis if any
        lumiChecker = getattr(jobfactory, 'lumiChecker', None)
        if lumiChecker and lumiChecker.splitLumiFiles:
            self.logger.warning(
                "The input dataset contains the following duplicated lumis %s"
                % lumiChecker.splitLumiFiles.keys())
            try:
                configreq = {
                    'subresource':
                    'addwarning',
                    'workflow':
                    kwargs['task']['tm_taskname'],
                    'warning':
                    b64encode(
                        'The CRAB3 server backend detected lumis split across files in the input dataset.'
                        ' Will apply the necessary corrections in the splitting algorithms'
                    )
                }
                self.server.post(self.restURInoAPI + '/task',
                                 data=urllib.urlencode(configreq))
            except Exception as e:
                self.logger.error(getattr(e, 'headers', e))
                self.logger.warning(
                    "Cannot add warning to REST after finding duplicates")
Example #19
def serverCall(ddmServer, cert, key, verbose, call, api, data):
    server = HTTPRequests(url=ddmServer, localcert=cert, localkey=key, verbose=verbose)
    commonAPI = '/registry/request'
    try:
        ddmRequest = getattr(server, call)(commonAPI+'/'+api, data=data)
    except HTTPException as hte:
        msg = "HTTP Error while contacting the DDM server %s:\n%s" % (ddmServer, str(hte))
        msg += "\nHTTP Headers are: %s" % hte.headers
        raise TaskWorkerException(msg, retry=True)

    return ddmRequest[0]
Example #20
 def checkBlocksSize(self, blocks):
     """ Make sure no single blocks has more than 100k lumis. See
         https://hypernews.cern.ch/HyperNews/CMS/get/dmDevelopment/2022/1/1/1/1/1/1/2.html
     """
     MAX_LUMIS = 100000
     for block in blocks:
         blockInfo = self.dbs.getDBSSummaryInfo(block=block)
         if blockInfo['NumberOfLumis'] > MAX_LUMIS:
             msg = "Block %s contains more than %s lumis and cannot be processed for splitting. " % (
                 block, MAX_LUMIS)
             msg += "For memory/time contraint big blocks are not allowed. Use another dataset as input."
             raise TaskWorkerException(msg)
Example #21
 def killJobs(self, ids):
     ad = classad.ClassAd()
     ad['foo'] = ids
     const = "CRAB_ReqName =?= %s && member(CRAB_Id, %s)" % (
         HTCondorUtils.quote(self.workflow), ad.lookup("foo").__repr__())
     with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent,
                                                                rpipe):
         if not parent:
             self.schedd.act(htcondor.JobAction.Remove, const)
     results = rpipe.read()
     if results != "OK":
          raise TaskWorkerException("The CRAB3 server backend could not kill jobs [%s] because the Grid scheduler answered with an error.\n" % ", ".join(ids)+\
                                    "This is probably a temporary glitch, please try again and contact an expert if the error persists.\n"+\
                                    "Error reason: %s" % results)
Example #22
 def checkBlocksSize(self, blocks):
     """ Make sure no single blocks has more than 100k lumis. See
         https://hypernews.cern.ch/HyperNews/CMS/get/dmDevelopment/2022/1/1/1/1/1/1/2.html
     """
     MAX_LUMIS = 100000
     for block in blocks:
         blockInfo = self.dbs.getDBSSummaryInfo(block=block)
         if blockInfo.get('NumberOfLumis', 0) > MAX_LUMIS:
             msg = "Block %s contains more than %s lumis.\nThis blows up CRAB server memory" % (block, MAX_LUMIS)
             msg += "\nCRAB can only split this by ignoring lumi information. You can do this"
             msg += "\nusing FileBased split algorithm and avoiding any additional request"
             msg += "\nwich may cause lumi information to be looked up. See CRAB FAQ for more info:"
             msg += "\nhttps://twiki.cern.ch/twiki/bin/view/CMSPublic/CRAB3FAQ"
             raise TaskWorkerException(msg)
Example #23
 def sendScheddToREST(self, task, schedd):
     """ Try to set the schedd to the oracle database in the REST interface
         Raises TaskWorkerException in case of failure
     """
     task['tm_schedd'] = schedd
     configreq = {'workflow':task['tm_taskname'],
                  'subresource':'updateschedd', 'scheddname':schedd}
     try:
         self.crabserver.post(api='task', data=urllib.urlencode(configreq))
     except HTTPException as hte:
         msg = "Unable to contact cmsweb and update scheduler on which task will be submitted. Error msg: %s" % hte.headers
         self.logger.warning(msg)
         time.sleep(20)
         raise TaskWorkerException(msg) #we already tried 20 times, give up
Example #24
    def killAll(self):

        # Search for and hold the DAG
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(
            self.workflow)

        with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent,
                                                                   rpipe):
            if not parent:
                self.schedd.act(htcondor.JobAction.Hold, rootConst)
        results = rpipe.read()
        if results != "OK":
            raise TaskWorkerException("The CRAB3 server backend could not kill the task because the Grid scheduler answered with an error.\n"\
                                      "This is probably a temporary glitch, please try again and contact an expert if the error persists.\n"+\
                                      "Error reason: %s" % results)
Example #25
    def __init__(self, task, result=None, err=None, warn=None):
        """Inintializer

        :arg TaskWorker.DataObjects.Task task: the task the result is referring to
        :arg * result: the result can actually be any needed type
        :arg * err: the error can actually be any needed type
                       (exception, traceback, int, ...)
        :arg str warn: a warning message."""
        if not task:
            raise TaskWorkerException(
                "Task object missing! Internal error to be fixed.")
        self._task = task
        self._result = result
        self._error = err
        self._warning = warn
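
A minimal construction sketch (not from the source); the task argument is whatever TaskWorker task dictionary the calling action received:

    # Success and failure results as returned by the actions in the other examples
    okResult = Result(task=task, result='OK')
    errResult = Result(task=task, err="error text or traceback", warn="optional message for the user")
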
Example #26
 def killJobs(self, ids):
     ad = classad.ClassAd()
     ad['foo'] = ids
     const = "CRAB_ReqName =?= %s && member(CRAB_Id, %s)" % (HTCondorUtils.quote(self.workflow), ad.lookup("foo").__repr__())
     with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
         if not parent:
             self.schedd.act(htcondor.JobAction.Remove, const)
     results = rpipe.read()
     if results != "OK":
         msg  = "The CRAB server backend was not able to kill these jobs %s," % (ids)
         msg += " because the Grid scheduler answered with an error."
         msg += " This is probably a temporary glitch. Please try again later."
         msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
         msg += " Error reason: %s" % (results)
         raise TaskWorkerException(msg)
Example #27
    def duplicateCheck(self, task):
        """
        Look to see if the task we are about to submit is already in the schedd.
        If so, assume that this task in TaskWorker was run successfully, but killed
        before it could update the frontend.
        """
        workflow = task['tm_taskname']

        if task['tm_collector']:
            self.backendurls['htcondorPool'] = task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        schedd = ""
        try:
            self.logger.debug("Duplicate check is getting the schedd obj. Collector is: %s", task['tm_collector'])
            schedd, _address = loc.getScheddObjNew(task['tm_schedd'])
            self.logger.debug("Got schedd obj for %s ", task['tm_schedd'])
        except Exception as exp:
            msg = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s", workflow, msg)
            raise TaskWorkerException(msg)

        rootConst = 'TaskType =?= "ROOT" && CRAB_ReqName =?= %s && (isUndefined(CRAB_Attempt) || CRAB_Attempt == 0)' % HTCondorUtils.quote(workflow)

        self.logger.debug("Duplicate check is querying the schedd: %s", rootConst)
        results = list(schedd.xquery(rootConst, []))
        self.logger.debug("Schedd queried %s", results)

        if not results:
            # Task not already in schedd
            return None

        configreq = {'workflow': workflow,
                     'status': "SUBMITTED",
                     'jobset': "-1",
                     'subresource': 'success',
                    }
        self.logger.warning("Task %s already submitted to HTCondor; pushing information centrally: %s", workflow, str(configreq))
        data = urllib.urlencode(configreq)
        self.server.post(self.resturi, data=data)

        # Note that we don't re-send Dashboard jobs; we assume this is a rare occurrence and
        # don't want to upset any info already in the Dashboard.

        return Result.Result(task=task, result=(-1))
Example #28
    def submitDirect(self, schedd, cmd, arg, info): #pylint: disable=R0201
        """
        Submit directly to the schedd using the HTCondor module
        """
        dagAd = classad.ClassAd()
        addCRABInfoToClassAd(dagAd, info)

        groups = CMSGroupMapper.map_user_to_groups(dagAd["CRAB_UserHN"])
        if groups:
            dagAd["CMSGroups"] = groups

        # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
        dagAd["Out"] = str(os.path.join(info['scratch'], "request.out"))
        dagAd["Err"] = str(os.path.join(info['scratch'], "request.err"))
        dagAd["CRAB_Attempt"] = 0
        # We switched from local to scheduler universe.  Why?  It seems there's no way in the
        # local universe to change the hold signal at runtime.  That's fairly important for our
        # resubmit implementation.
        #dagAd["JobUniverse"] = 12
        dagAd["JobUniverse"] = 7
        dagAd["HoldKillSig"] = "SIGUSR1"
        dagAd["Cmd"] = cmd
        dagAd['Args'] = arg
        dagAd["TransferInput"] = str(info['inputFilesString'])
        dagAd["LeaveJobInQueue"] = classad.ExprTree("(JobStatus == 4) && ((StageOutFinish =?= UNDEFINED) || (StageOutFinish == 0))")
        dagAd["PeriodicRemove"] = classad.ExprTree("(JobStatus == 5) && (time()-EnteredCurrentStatus > 30*86400)")
        dagAd["TransferOutput"] = info['outputFilesString']
        dagAd["OnExitRemove"] = classad.ExprTree("( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))")
        dagAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
        dagAd["RemoveKillSig"] = "SIGUSR1"
        dagAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")
        dagAd["Environment"] = classad.ExprTree('strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")' % " ".join(info['additional_environment_options'].split(";")))
        dagAd["RemoteCondorSetup"] = info['remote_condor_setup']
        dagAd["Requirements"] = classad.ExprTree('true || false')
        dagAd["TaskType"] = "ROOT"
        dagAd["X509UserProxy"] = info['user_proxy']

        with HTCondorUtils.AuthenticatedSubprocess(info['user_proxy']) as (parent, rpipe):
            if not parent:
                resultAds = []
                schedd.submit(dagAd, 1, True, resultAds)
                schedd.spool(resultAds)
                if resultAds:
                    id = "%s.%s" % (resultAds[0]['ClusterId'], resultAds[0]['ProcId'])
                    schedd.edit([id], "LeaveJobInQueue", classad.ExprTree("(JobStatus == 4) && (time()-EnteredCurrentStatus < 30*86400)"))
        results = rpipe.read()
        if results != "OK":
            raise TaskWorkerException("Failure when submitting task to scheduler. Error reason: '%s'" % results)
Example #29
    def execute(self, *args, **kwargs):
        self.logger.info(
            "Data discovery and splitting for %s using user-provided files" %
            kwargs['task']['tm_taskname'])

        userfiles = kwargs['task']['tm_user_files']
        splitting = kwargs['task']['tm_split_algo']
        total_units = kwargs['task']['tm_totalunits']
        if not userfiles or splitting != 'FileBased':
            if not userfiles:
                msg = "No files specified to process for task %s." % kwargs[
                    'task']['tm_taskname']
            if splitting != 'FileBased':
                msg = "Data.splitting must be set to 'FileBased' when using a custom set of files."
            raise TaskWorkerException(msg)

        if hasattr(self.config.Sites, 'available'):
            locations = self.config.Sites.available
        else:
            with self.config.TaskWorker.envForCMSWEB:
                configDict = {
                    "cacheduration": 1,
                    "pycurl": True
                }  # cache duration is in hours
                resourceCatalog = CRIC(logger=self.logger,
                                       configDict=configDict)
                locations = resourceCatalog.getAllPSNs()

        userFileset = Fileset(name=kwargs['task']['tm_taskname'])
        self.logger.info("There are %d files specified by the user." %
                         len(userfiles))
        if total_units > 0:
            self.logger.info("Will run over the first %d files." % total_units)
        file_counter = 0
        for userfile, idx in zip(userfiles, range(len(userfiles))):
            newFile = File(userfile, size=1000, events=1)
            newFile.setLocation(locations)
            newFile.addRun(Run(1, idx))
            newFile["block"] = 'UserFilesFakeBlock'
            newFile["first_event"] = 1
            newFile["last_event"] = 2
            userFileset.addFile(newFile)
            file_counter += 1
            if total_units > 0 and file_counter >= total_units:
                break

        return Result(task=kwargs['task'], result=userFileset)
Example #30
 def keepOnlyDisks(self, locationsMap):
     phedex = PhEDEx()  # TODO use certs from the config!
     # get all the PNNs that are of kind 'Disk'
     try:
         diskLocations = set([
             pnn['name'] for pnn in phedex.getNodeMap()['phedex']['node']
             if pnn['kind'] == 'Disk'
         ])
     except HTTPException as ex:
         self.logger.error(ex.headers)
          raise TaskWorkerException("The CRAB3 server backend could not contact PhEDEx to get the list of site storages.\n"+\
                              "This could be a temporary PhEDEx glitch, please try to submit a new task (resubmit will not work)"+\
                              " and contact the experts if the error persists.\nError reason: %s" % str(ex)) # TODO add the PhEDEx nodes so the user can check themselves
     for block, locations in locationsMap.iteritems():
         locationsMap[block] = set(locations) & diskLocations
         self.otherLocations = self.otherLocations.union(
             set(locations) - diskLocations)