def killAll(self, jobConst):
        """
        Put the DAG jobs of this workflow on hold and remove its node jobs.

        :arg jobConst: constraint handed to the scheduler to select the jobs
                       that have to be removed.
        :raises TaskWorkerException: if the answer read back from the
                                     authenticated subprocess is not "OK".
        """
        # The ROOT, PROCESSING and TAIL DAGs are only put on hold (not removed):
        # they have to stay in the queue until the periodic remove kicks in
        # (see DagmanSubmitter.py#L390, dagAd["PeriodicRemove"]), otherwise the
        # user would not be able to resubmit.
        dagConst = 'stringListMember(TaskType, "ROOT PROCESSING TAIL", " ") && CRAB_ReqName =?= %s' % HTCondorUtils.quote(self.workflow)

        # Putting a DAG on hold does not remove its node jobs, so they are
        # removed explicitly. From the HTCondor documentation
        # (http://research.cs.wisc.edu/htcondor/manual/v8.3/2_10DAGMan_Applications.html#SECTION003107000000000000000):
        #   After placing the condor_dagman job on hold, no new node jobs will be
        #   submitted, and no PRE or POST scripts will be run. Any node jobs
        #   already in the HTCondor queue will continue undisturbed. If the
        #   condor_dagman job is left on hold, it will remain in the HTCondor
        #   queue after all of the currently running node jobs are finished.
        # TODO: Remove jobConst query when htcondor ticket is solved
        # https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=5175
        with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
            if not parent:
                # Both actions are issued inside one schedd transaction.
                with self.schedd.transaction():
                    self.schedd.act(htcondor.JobAction.Hold, dagConst)
                    self.schedd.act(htcondor.JobAction.Remove, jobConst)

        answer = rpipe.read()
        if answer == "OK":
            return

        msg = "".join([
            "The CRAB server backend was not able to kill the task,",
            " because the Grid scheduler answered with an error.",
            " This is probably a temporary glitch. Please try again later.",
            " If the error persists send an e-mail to %s." % (FEEDBACKMAIL),
            " Error reason: %s" % (answer),
        ])
        raise TaskWorkerException(msg)
 def push_new_proxy_to_schedd(self, schedd, ad, proxy):
     """
     Ask the schedd to refresh the GSI proxy of the job described by ``ad``.

     :arg schedd: scheduler object; it must expose ``refreshGSIProxy``.
     :arg ad: job ad providing the 'ClusterId' and 'ProcID' entries.
     :arg proxy: proxy used both for authentication and as the new proxy.
     :raises NotImplementedError: if ``schedd`` has no ``refreshGSIProxy``.
     :raises Exception: if the answer read back from the authenticated
                        subprocess is not "OK".
     """
     if not hasattr(schedd, 'refreshGSIProxy'):
         raise NotImplementedError()

     with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
         if not parent:
             clusterId = ad['ClusterId']
             procId = ad['ProcID']
             schedd.refreshGSIProxy(clusterId, procId, proxy, -1)

     answer = rpipe.read()
     if answer == "OK":
         return
     raise Exception("Failure when renewing HTCondor task proxy: '%s'" % answer)
 def renew_proxy(self, schedd, ad, proxy):
     """
     Refresh the GSI proxy of a task and record its new expiration time.

     :arg schedd: scheduler object; it must expose ``refreshGSIProxy``.
     :arg ad: job ad providing 'CRAB_ReqName', 'ClusterId' and the proc id.
     :arg proxy: proxy used both for authentication and as the new proxy.
     :raises NotImplementedError: if ``schedd`` has no ``refreshGSIProxy``.
     :raises Exception: if the answer read back from the authenticated
                        subprocess is not "OK".
     """
     # Reference time taken before anything else is done; the new expiration
     # is computed as this time plus the lifetime returned by the schedd.
     startTime = time.time()
     self.logger.info("Renewing proxy for task %s." % ad['CRAB_ReqName'])
     if not hasattr(schedd, 'refreshGSIProxy'):
         raise NotImplementedError()

     with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
         if not parent:
             lifetime = schedd.refreshGSIProxy(ad['ClusterId'], ad['ProcID'], proxy, -1)
             jobSpec = '%s.%s' % (ad['ClusterId'], ad['ProcId'])
             expiration = str(int(startTime + lifetime))
             schedd.edit([jobSpec], 'x509userproxyexpiration', expiration)

     answer = rpipe.read()
     if answer == "OK":
         return
     raise Exception("Failure when renewing HTCondor task proxy: '%s'" % answer)
Exemple #4
0
 def killJobs(self, ids):
     """
     Remove from the scheduler the jobs of this workflow listed in ``ids``.

     :arg ids: list of CRAB job ids; it is matched against the CRAB_Id
               attribute of the jobs.
     :raises TaskWorkerException: if the answer read back from the
                                  authenticated subprocess is not "OK".
     """
     # The list is stored in a ClassAd only to obtain its ClassAd-language
     # representation, which is then embedded in the constraint.
     ad = classad.ClassAd()
     ad['foo'] = ids
     const = "CRAB_ReqName =?= %s && member(CRAB_Id, %s)" % (
         HTCondorUtils.quote(self.workflow), repr(ad.lookup("foo")))
     with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent,
                                                                rpipe):
         if not parent:
             self.schedd.act(htcondor.JobAction.Remove, const)
     results = rpipe.read()
     if results != "OK":
         # str.join() only accepts strings: convert each id explicitly so that
         # non-string ids do not turn the scheduler error into a TypeError.
         idList = ", ".join(str(jobId) for jobId in ids)
         raise TaskWorkerException("The CRAB3 server backend could not kill jobs [%s]. because the Grid scheduler answered with an error\n" % idList +
                                   "This is probably a temporary glitch, please try it again and contact an expert if the error persist\n" +
                                   "Error reason %s" % results)
Exemple #5
0
    def killAll(self):
        """
        Put the ROOT DAG job of this workflow on hold.

        :raises TaskWorkerException: if the answer read back from the
                                     authenticated subprocess is not "OK".
        """
        # Constraint selecting the DAG (TaskType "ROOT") of this workflow.
        quotedName = HTCondorUtils.quote(self.workflow)
        dagConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % quotedName

        with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
            if not parent:
                self.schedd.act(htcondor.JobAction.Hold, dagConst)

        answer = rpipe.read()
        if answer == "OK":
            return

        msg = "The CRAB3 server backend could not kill the task because the Grid scheduler answered with an error\n"
        msg += "This is probably a temporary glitch, please try it again and contact an expert if the error persist\n"
        msg += "Error reason %s" % answer
        raise TaskWorkerException(msg)
Exemple #6
0
 def killJobs(self, ids):
     """
     Remove from the scheduler the jobs of this workflow listed in ``ids``.

     :arg ids: list of CRAB job ids; it is matched against the CRAB_Id
               attribute of the jobs.
     :raises TaskWorkerException: if the answer read back from the
                                  authenticated subprocess is not "OK".
     """
     # The list goes through a ClassAd only to get its ClassAd-language
     # representation for the constraint.
     idsAd = classad.ClassAd()
     idsAd['foo'] = ids
     quotedName = HTCondorUtils.quote(self.workflow)
     idsExpr = idsAd.lookup("foo").__repr__()
     jobConst = "CRAB_ReqName =?= %s && member(CRAB_Id, %s)" % (quotedName, idsExpr)

     with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
         if not parent:
             self.schedd.act(htcondor.JobAction.Remove, jobConst)

     answer = rpipe.read()
     if answer == "OK":
         return

     msg = "".join([
         "The CRAB server backend was not able to kill these jobs %s," % (ids),
         " because the Grid scheduler answered with an error.",
         " This is probably a temporary glitch. Please try again later.",
         " If the error persists send an e-mail to %s." % (FEEDBACKMAIL),
         " Error reason: %s" % (answer),
     ])
     raise TaskWorkerException(msg)
Exemple #7
0
    def submitDirect(self, schedd, cmd, arg, info): #pylint: disable=R0201
        """
        Build the ClassAd of the ROOT DAG job and submit it to ``schedd``
        through the HTCondor python module.

        :arg schedd: scheduler object used to submit, spool and edit the job.
        :arg cmd: value stored in the "Cmd" attribute of the job ad.
        :arg arg: value stored in the "Args" attribute of the job ad.
        :arg info: dictionary with the task information; the keys read here are
                   'scratch', 'inputFilesString', 'outputFilesString',
                   'additional_environment_options', 'remote_condor_setup'
                   and 'user_proxy'.
        :raises TaskWorkerException: if the answer read back from the
                                     authenticated subprocess is not "OK".
        """
        rootAd = classad.ClassAd()
        addCRABInfoToClassAd(rootAd, info)

        userGroups = CMSGroupMapper.map_user_to_groups(rootAd["CRAB_UserHN"])
        if userGroups:
            rootAd["CMSGroups"] = userGroups

        # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
        scratchDir = info['scratch']
        rootAd["Out"] = str(os.path.join(scratchDir, "request.out"))
        rootAd["Err"] = str(os.path.join(scratchDir, "request.err"))
        rootAd["CRAB_Attempt"] = 0
        # Scheduler universe (7) is used instead of the local one (12): in the
        # local universe there seems to be no way to change the hold signal at
        # runtime, and that is fairly important for the resubmit implementation.
        rootAd["JobUniverse"] = 7
        rootAd["HoldKillSig"] = "SIGUSR1"
        rootAd["Cmd"] = cmd
        rootAd['Args'] = arg
        rootAd["TransferInput"] = str(info['inputFilesString'])
        rootAd["LeaveJobInQueue"] = classad.ExprTree("(JobStatus == 4) && ((StageOutFinish =?= UNDEFINED) || (StageOutFinish == 0))")
        rootAd["PeriodicRemove"] = classad.ExprTree("(JobStatus == 5) && (time()-EnteredCurrentStatus > 30*86400)")
        rootAd["TransferOutput"] = info['outputFilesString']
        rootAd["OnExitRemove"] = classad.ExprTree("( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))")
        rootAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
        rootAd["RemoveKillSig"] = "SIGUSR1"
        rootAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")
        # The ';'-separated extra options become space-separated in the environment string.
        extraEnv = info['additional_environment_options'].replace(";", " ")
        rootAd["Environment"] = classad.ExprTree('strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")' % extraEnv)
        rootAd["RemoteCondorSetup"] = info['remote_condor_setup']
        rootAd["Requirements"] = classad.ExprTree('true || false')
        rootAd["TaskType"] = "ROOT"
        rootAd["X509UserProxy"] = info['user_proxy']

        with HTCondorUtils.AuthenticatedSubprocess(info['user_proxy']) as (parent, rpipe):
            if not parent:
                submittedAds = []
                schedd.submit(rootAd, 1, True, submittedAds)
                schedd.spool(submittedAds)
                if submittedAds:
                    firstAd = submittedAds[0]
                    jobSpec = "%s.%s" % (firstAd['ClusterId'], firstAd['ProcId'])
                    # LeaveJobInQueue is set again on the job once it has been submitted.
                    schedd.edit([jobSpec], "LeaveJobInQueue", classad.ExprTree("(JobStatus == 4) && (time()-EnteredCurrentStatus < 30*86400)"))

        answer = rpipe.read()
        if answer == "OK":
            return
        raise TaskWorkerException("Failure when submitting task to scheduler. Error reason: '%s'" % answer)
Exemple #8
0
    def submitDirect(self, schedd, cmd, arg, info): #pylint: disable=R0201
        """
        Build the ClassAd of the ROOT DAG job, dump part of it to the
        'subdag.ad' file and submit it to ``schedd`` through the HTCondor
        python module.

        :arg schedd: scheduler object used to submit, spool and edit the job.
        :arg cmd: value stored in the "Cmd" attribute of the job ad.
        :arg arg: value stored in the "Args" attribute of the job ad.
        :arg info: dictionary with the task information; the keys read here are
                   'CMSGroups', 'user_proxy', 'additional_environment_options',
                   'remote_condor_setup', 'start_time', 'outputFilesString',
                   'scratch' and 'inputFilesString'.
        :returns: the 'ClusterId' stored by the submission in the output object.
        :raises TaskWorkerException: if nothing can be unpickled from the pipe
                                     (retry=False) or if the output message is
                                     not "OK" (retry=True).
        """
        def renderValue(val):
            """Return ``val`` formatted for a line of the 'subdag.ad' file."""
            if isinstance(val, basestring):
                return classad.quote(val)
            if isinstance(val, classad.ExprTree):
                return repr(val)
            if isinstance(val, list):
                # json gives "[a, b]"; the brackets are replaced by braces.
                return "{{{0}}}".format(json.dumps(val)[1:-1])
            return val

        rootAd = classad.ClassAd()
        addCRABInfoToClassAd(rootAd, info)

        cmsGroups = info["CMSGroups"]
        rootAd["CMSGroups"] = ','.join(cmsGroups) if cmsGroups else classad.Value.Undefined

        # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
        rootAd["CRAB_Attempt"] = 0
        # Scheduler universe (7) is used instead of the local one (12): in the
        # local universe there seems to be no way to change the hold signal at
        # runtime, and that is fairly important for the resubmit implementation.
        rootAd["JobUniverse"] = 7
        rootAd["HoldKillSig"] = "SIGUSR1"
        rootAd["X509UserProxy"] = info['user_proxy']
        rootAd["Requirements"] = classad.ExprTree('true || false')
        extraEnv = info['additional_environment_options'].replace(";", " ")
        rootAd["Environment"] = classad.ExprTree('strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")' % extraEnv)
        rootAd["RemoteCondorSetup"] = info['remote_condor_setup']

        rootAd["CRAB_TaskSubmitTime"] = classad.ExprTree("%s" % info["start_time"].encode('ascii', 'ignore'))
        rootAd['CRAB_TaskLifetimeDays'] = TASKLIFETIME // 24 // 60 // 60
        rootAd['CRAB_TaskEndTime'] = int(info["start_time"]) + TASKLIFETIME
        #For task management info see https://github.com/dmwm/CRABServer/issues/4681#issuecomment-302336451
        rootAd["LeaveJobInQueue"] = classad.ExprTree("true")
        rootAd["PeriodicHold"] = classad.ExprTree("time() > CRAB_TaskEndTime")
        rootAd["TransferOutput"] = info['outputFilesString']
        rootAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")
        rootAd["OnExitRemove"] = classad.ExprTree("( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))")
        rootAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
        rootAd["RemoveKillSig"] = "SIGUSR1"

        # Dump the attributes set so far, one '+Name = value' line each. The
        # attributes assigned after this point are not part of the file.
        with open('subdag.ad', 'w') as adFile:
            for attr, val in rootAd.items():
                if attr == 'X509UserProxy':
                    # Only the file name of the proxy is written, not its path.
                    val = os.path.basename(val)
                adFile.write('+{0} = {1}\n'.format(attr, renderValue(val)))

        rootAd["TaskType"] = "ROOT"
        scratchDir = info['scratch']
        rootAd["Out"] = str(os.path.join(scratchDir, "request.out"))
        rootAd["Err"] = str(os.path.join(scratchDir, "request.err"))
        rootAd["Cmd"] = cmd
        rootAd['Args'] = arg
        rootAd["TransferInput"] = str(info['inputFilesString'])

        submitInfo = {}
        with HTCondorUtils.AuthenticatedSubprocess(info['user_proxy'], pickleOut=True, outputObj=submitInfo, logger=self.logger) as (parent, rpipe):
            if not parent:
                submittedAds = []
                submitInfo['ClusterId'] = schedd.submit(rootAd, 1, True, submittedAds)
                schedd.spool(submittedAds)
                # editing the LeaveJobInQueue since the remote submit overwrites it
                # see https://github.com/dmwm/CRABServer/pull/5212#issuecomment-216519749
                if submittedAds:
                    firstAd = submittedAds[0]
                    jobSpec = "%s.%s" % (firstAd['ClusterId'], firstAd['ProcId'])
                    schedd.edit([jobSpec], "LeaveJobInQueue", classad.ExprTree("true"))

        try:
            outcome = pickle.load(rpipe)
        except EOFError:
            #Do not want to retry this since error may happen after submit (during edit for example).
            #And this can cause the task to be submitted twice (although we have a protection in the duplicatedCheck)
            raise TaskWorkerException("Timeout executing condor submit command.", retry=False)

        #notice that the clusterId might be set even if there was a failure. This is if the schedd.submit succeded, but the spool  call failed
        if 'ClusterId' in outcome.outputObj:
            self.logger.debug("Condor cluster ID just submitted is: %s", outcome.outputObj['ClusterId'])
        if outcome.outputMessage != "OK":
            separator = "-"*70
            self.logger.debug("Now printing the environment used for submission:\n" + separator + "\n" + outcome.environmentStr + separator)
            raise TaskWorkerException("Failure when submitting task to scheduler. Error reason: '%s'" % outcome.outputMessage, retry=True)

        #if we don't raise exception above the id is here
        return outcome.outputObj['ClusterId']
Exemple #9
0
    def executeInternal(self, *args, **kwargs):  #pylint: disable=unused-argument
        """
        Resubmit the jobs, or the publications, of an existing task.

        The task is read from ``kwargs['task']``. When the task asks for a
        publication resubmission, only ``self.resubmitPublication`` is called.
        Otherwise the resubmission parameters found in the task (keys named
        'resubmit_<param>') are written into the ROOT DAG job ad, and the DAG
        is held and/or released through the scheduler.

        Missing or None 'resubmit_<param>' entries are treated the same way:
        the corresponding parameter is simply not modified.

        :raises ValueError: if the task, its 'tm_taskname' or its 'user_proxy'
                            is missing.
        :raises TaskWorkerException: if the ASO URL is not set for a publication
                                     resubmission, if the scheduler can not be
                                     contacted, or if the answer read back from
                                     the authenticated subprocess is not "OK".
        """
        #Marco: I guess these value errors only happens for development instances
        if 'task' not in kwargs:
            raise ValueError("No task specified.")
        task = kwargs['task']
        if 'tm_taskname' not in task:
            raise ValueError("No taskname specified.")
        workflow = str(task['tm_taskname'])
        if 'user_proxy' not in task:
            raise ValueError("No proxy provided")
        proxy = task['user_proxy']

        publicationRequested = task.get('resubmit_publication', False)
        resubmitWhat = "publications" if publicationRequested else "jobs"

        self.logger.info("About to resubmit %s for workflow: %s.",
                         resubmitWhat, workflow)
        self.logger.debug("Task info: %s", str(task))

        if publicationRequested:
            asourl = task.get('tm_asourl', None)
            #Let's not assume the db has been updated (mostly for devs), let's default asodb to asynctransfer!
            #Also the "or" takes care of the case were the new code is executed on old task
            #i.e.: tm_asodb is there but empty.
            asodb = task.get('tm_asodb', 'asynctransfer') or 'asynctransfer'
            if not asourl:
                msg = "ASO URL not set. Can not resubmit publication."
                raise TaskWorkerException(msg)
            self.logger.info("Will resubmit failed publications")
            self.resubmitPublication(asourl, asodb, proxy, workflow)
            return

        if task['tm_collector']:
            self.backendurls['htcondorPool'] = task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        try:
            schedd, dummyAddress = loc.getScheddObjNew(task['tm_schedd'])
        except Exception as exp:
            msg = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (
                FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s", workflow, msg)
            raise TaskWorkerException(msg)

        # Check memory and walltime
        stdmaxjobruntime = 2800
        stdmaxmemory = 2500
        maxJobRuntime = task.get('resubmit_maxjobruntime')
        if maxJobRuntime is not None and maxJobRuntime > stdmaxjobruntime:
            msg = "Task requests %s minutes of walltime, but only %s are guaranteed to be available." % (
                maxJobRuntime, stdmaxjobruntime)
            msg += " Jobs may not find a site where to run."
            msg += " CRAB has changed this value to %s minutes." % (
                stdmaxjobruntime)
            self.logger.warning(msg)
            # The requested walltime is capped to the standard maximum.
            task['resubmit_maxjobruntime'] = str(stdmaxjobruntime)
            self.uploadWarning(msg, proxy, task['tm_taskname'])
        maxMemory = task.get('resubmit_maxmemory')
        if maxMemory is not None and maxMemory > stdmaxmemory:
            # For the memory only a warning is issued, the value is left as is.
            msg = "Task requests %s MB of memory, but only %s MB are guaranteed to be available." % (
                maxMemory, stdmaxmemory)
            msg += " Jobs may not find a site where to run and stay idle forever."
            self.logger.warning(msg)
            self.uploadWarning(msg, proxy, task['tm_taskname'])

        # Find only the originally submitted DAG to hold and release: this
        # will re-trigger the scripts and adjust retries and other
        # resubmission parameters.
        #
        # Processing and tail DAGs will be restarted by these scrips on the
        # schedd after the modifications are made.
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(
            workflow)

        ## Calculate new parameters for resubmitted jobs. These parameters will
        ## be (re)written in the _CONDOR_JOB_AD when we do schedd.edit() below.
        ## The mapping is: job ad attribute -> suffix of the 'resubmit_' task key.
        ad = classad.ClassAd()
        params = {
            'CRAB_ResubmitList': 'jobids',
            'CRAB_SiteBlacklist': 'site_blacklist',
            'CRAB_SiteWhitelist': 'site_whitelist',
            'MaxWallTimeMins': 'maxjobruntime',
            'RequestMemory': 'maxmemory',
            'RequestCpus': 'numcores',
            'JobPrio': 'priority'
        }
        overwrite = False
        for taskparam in params.values():
            value = task.get('resubmit_' + taskparam)
            if value is None:
                continue
            # In case resubmission parameters contain a list of unicode strings,
            # convert it to a list of ascii strings because of HTCondor unicode
            # incompatibility.
            # Note that unicode strings that are not in a list are not handled,
            # but so far they don't exist in this part of the code.
            if isinstance(value, list):
                ad[taskparam] = [p.encode('ascii', 'ignore') if isinstance(p, unicode) else p
                                 for p in value]
            if taskparam != 'jobids':
                overwrite = True

        if task.get('resubmit_jobids'):
            with HTCondorUtils.AuthenticatedSubprocess(
                    proxy, logger=self.logger) as (parent, rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    ## Overwrite parameters in the os.environ[_CONDOR_JOB_AD] file. This will affect
                    ## all the jobs, not only the ones we want to resubmit. That's why the pre-job
                    ## is saving the values of the parameters for each job retry in text files (the
                    ## files are in the directory resubmit_info in the schedd).
                    for adparam, taskparam in params.items():
                        if taskparam in ad:
                            schedd.edit(rootConst, adparam,
                                        ad.lookup(taskparam))
                            continue
                        # .get(): a parameter missing from the task is skipped
                        # exactly like one explicitly set to None.
                        value = task.get('resubmit_' + taskparam)
                        if value is not None:
                            schedd.edit(rootConst, adparam, str(value))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)
        elif overwrite:
            self.logger.debug("Resubmitting under condition overwrite = True")
            with HTCondorUtils.AuthenticatedSubprocess(
                    proxy, logger=self.logger) as (parent, rpipe):
                if not parent:
                    for adparam, taskparam in params.items():
                        if taskparam in ad:
                            if taskparam == 'jobids' and len(
                                    list(ad[taskparam])) == 0:
                                # An empty list of job ids is written as "true".
                                self.logger.debug(
                                    "Setting %s = True in the task ad.",
                                    adparam)
                                schedd.edit(rootConst, adparam,
                                            classad.ExprTree("true"))
                            else:
                                schedd.edit(rootConst, adparam,
                                            ad.lookup(taskparam))
                            continue
                        value = task.get('resubmit_' + taskparam)
                        if value is not None:
                            schedd.edit(rootConst, adparam, str(value))
                    schedd.act(htcondor.JobAction.Release, rootConst)
        else:
            ## This should actually not occur anymore in CRAB 3.3.16 or above, because
            ## starting from CRAB 3.3.16 the resubmission parameters are written to the
            ## Task DB with value != None, so the overwrite variable should never be False.
            self.logger.debug("Resubmitting under condition overwrite = False")
            with HTCondorUtils.AuthenticatedSubprocess(
                    proxy, logger=self.logger) as (parent, rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    schedd.edit(rootConst, "CRAB_ResubmitList",
                                classad.ExprTree("true"))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)
        try:
            results = rpipe.read()
        except EOFError:
            results = "Timeout while executing condor commands for resubmission"
        if results != "OK":
            msg = "The CRAB server backend was not able to resubmit the task,"
            msg += " because the Grid scheduler answered with an error."
            msg += " This is probably a temporary glitch. Please try again later."
            msg += " If the error persists send an e-mail to %s." % (
                FEEDBACKMAIL)
            msg += " Error reason: %s" % (results)
            raise TaskWorkerException(msg)
Exemple #10
0
    def execute_internal(self, *args, **kw):
        """
        Resubmit the jobs of an existing task.

        The task is read from ``kw['task']``. Depending on what the task
        contains, either a list of job ids is written in the ROOT DAG job ad,
        or the site lists / priority / resources are updated, or the whole task
        is flagged for resubmission; the DAG is then released.

        :raises ValueError: if the task, its 'tm_taskname' or its 'user_proxy'
                            is missing.
        :raises TaskWorkerException: if the answer read back from the
                                     authenticated subprocess is not "OK".
        """
        #Marco: I guess these value errors only happens for development instances
        if 'task' not in kw:
            raise ValueError("No task specified.")
        task = kw['task']
        if 'tm_taskname' not in task:
            raise ValueError("No taskname specified.")
        workflow = str(task['tm_taskname'])
        if 'user_proxy' not in task:
            raise ValueError("No proxy provided")
        proxy = task['user_proxy']

        self.logger.info("About to resubmit workflow: %s.", workflow)
        self.logger.info("Task info: %s", str(task))

        loc = HTCondorLocator.HTCondorLocator(self.backendurls)
        # Only the schedd object is used; the second returned value is ignored.
        schedd, _ = loc.getScheddObj(workflow)  #TODO wrap

        # Constraint selecting the DAG (TaskType "ROOT") of this workflow.
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(
            workflow)

        # Calculate a new white/blacklist
        ad = classad.ClassAd()
        ad['whitelist'] = task['resubmit_site_whitelist']
        ad['blacklist'] = task['resubmit_site_blacklist']

        # Job ad attribute -> task key, in the order in which they are edited.
        scalarParams = (
            ("JobPrio", 'resubmit_priority'),
            ("RequestCpus", 'resubmit_numcores'),
            ("MaxWallTimeMins", 'resubmit_maxjobruntime'),
            ("RequestMemory", 'resubmit_maxmemory'),
        )

        if ('resubmit_ids' in task) and task['resubmit_ids']:
            ad['resubmit'] = task['resubmit_ids']
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent,
                                                                  rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    schedd.edit(rootConst, "CRAB_ResubmitList", ad['resubmit'])
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)

        elif task['resubmit_site_whitelist'] or task['resubmit_site_blacklist'] or \
                any(task[taskKey] is not None for _, taskKey in scalarParams):
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent,
                                                                  rpipe):
                if not parent:
                    if task['resubmit_site_blacklist']:
                        schedd.edit(rootConst, "CRAB_SiteResubmitBlacklist",
                                    ad['blacklist'])
                    if task['resubmit_site_whitelist']:
                        schedd.edit(rootConst, "CRAB_SiteResubmitWhitelist",
                                    ad['whitelist'])
                    for adparam, taskKey in scalarParams:
                        if task[taskKey] is not None:
                            schedd.edit(rootConst, adparam, task[taskKey])
                    # Here the DAG is only released, it is not put on hold first.
                    schedd.act(htcondor.JobAction.Release, rootConst)

        else:
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent,
                                                                  rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    schedd.edit(rootConst, "CRAB_ResubmitList",
                                classad.ExprTree("true"))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)

        results = rpipe.read()
        if results != "OK":
            raise TaskWorkerException("The CRAB3 server backend could not resubmit your task because the Grid scheduler answered with an error\n" +
                                      "This is probably a temporary glitch, please try it again and contact an expert if the error persist\n" +
                                      "Error reason %s" % results)
Exemple #11
0
    def executeInternal(self, *args, **kwargs):
        """
        Resubmit the jobs, or the publications, of an existing task.

        The task is read from ``kwargs['task']``. When the task asks for a
        publication resubmission, only ``self.resubmitPublication`` is called.
        Otherwise the resubmission parameters found in the task (keys named
        'resubmit_<param>') are written into the ROOT DAG job ad, and the DAG
        is held and/or released through the scheduler.

        Missing or None 'resubmit_<param>' entries are treated the same way:
        the corresponding parameter is simply not modified.

        :raises ValueError: if the task, its 'tm_taskname' or its 'user_proxy'
                            is missing.
        :raises TaskWorkerException: if the ASO URL is not set for a publication
                                     resubmission, if the scheduler can not be
                                     contacted, or if the answer read back from
                                     the authenticated subprocess is not "OK".
        """
        #Marco: I guess these value errors only happens for development instances
        if 'task' not in kwargs:
            raise ValueError("No task specified.")
        task = kwargs['task']
        if 'tm_taskname' not in task:
            raise ValueError("No taskname specified.")
        workflow = str(task['tm_taskname'])
        if 'user_proxy' not in task:
            raise ValueError("No proxy provided")
        proxy = task['user_proxy']

        publicationRequested = task.get('resubmit_publication', False)
        resubmitWhat = "publications" if publicationRequested else "jobs"

        self.logger.info("About to resubmit %s for workflow: %s.",
                         resubmitWhat, workflow)
        self.logger.info("Task info: %s", str(task))

        if publicationRequested:
            asourl = task.get('tm_asourl', None)
            if not asourl:
                msg = "ASO URL not set. Can not resubmit publication."
                raise TaskWorkerException(msg)
            self.logger.info("Will resubmit failed publications")
            self.resubmitPublication(asourl, proxy, workflow)
            return

        if task['tm_collector']:
            self.backendurls['htcondorPool'] = task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)

        try:
            # Only the schedd object is used; the second returned value is ignored.
            schedd, _ = loc.getScheddObjNew(task['tm_schedd'])
        except Exception as exp:
            msg = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " If the error persists send an e-mail to %s." % (
                FEEDBACKMAIL)
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s", workflow, msg)
            raise TaskWorkerException(msg)

        # Check memory and walltime
        stdmaxjobruntime = 2800
        stdmaxmemory = 2500
        maxJobRuntime = task.get('resubmit_maxjobruntime')
        if maxJobRuntime is not None and maxJobRuntime > stdmaxjobruntime:
            msg = "Task requests %s minutes of walltime, but only %s are guaranteed to be available." % (
                maxJobRuntime, stdmaxjobruntime)
            msg += " Jobs may not find a site where to run."
            msg += " CRAB has changed this value to %s minutes." % (
                stdmaxjobruntime)
            self.logger.warning(msg)
            # The requested walltime is capped to the standard maximum.
            task['resubmit_maxjobruntime'] = str(stdmaxjobruntime)
            self.uploadWarning(msg, proxy, task['tm_taskname'])
        maxMemory = task.get('resubmit_maxmemory')
        if maxMemory is not None and maxMemory > stdmaxmemory:
            # For the memory only a warning is issued, the value is left as is.
            msg = "Task requests %s MB of memory, but only %s MB are guaranteed to be available." % (
                maxMemory, stdmaxmemory)
            msg += " Jobs may not find a site where to run and stay idle forever."
            self.logger.warning(msg)
            self.uploadWarning(msg, proxy, task['tm_taskname'])

        # Constraint selecting the DAG (TaskType "ROOT") of this workflow.
        rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(
            workflow)

        ## Calculate new parameters for resubmited jobs. These parameters will
        ## be (re)written in the _CONDOR_JOB_AD when we do schedd.edit() below.
        ## The mapping is: job ad attribute -> suffix of the 'resubmit_' task key.
        ad = classad.ClassAd()
        params = {
            'CRAB_ResubmitList': 'jobids',
            'CRAB_SiteBlacklist': 'site_blacklist',
            'CRAB_SiteWhitelist': 'site_whitelist',
            'MaxWallTimeMins': 'maxjobruntime',
            'RequestMemory': 'maxmemory',
            'RequestCpus': 'numcores',
            'JobPrio': 'priority'
        }
        overwrite = False
        for taskparam in params.values():
            value = task.get('resubmit_' + taskparam)
            if value is None:
                continue
            # Only list-valued parameters are stored in the ClassAd; the other
            # ones are read back from the task when the job ad is edited.
            if isinstance(value, list):
                ad[taskparam] = value
            if taskparam != 'jobids':
                overwrite = True

        if task.get('resubmit_jobids'):
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent,
                                                                  rpipe):
                if not parent:
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    ## Overwrite parameters in the os.environ[_CONDOR_JOB_AD] file. This will affect
                    ## all the jobs, not only the ones we want to resubmit. That's why the pre-job
                    ## is saving the values of the parameters for each job retry in text files (the
                    ## files are in the directory resubmit_info in the schedd).
                    for adparam, taskparam in params.items():
                        if taskparam in ad:
                            schedd.edit(rootConst, adparam, ad[taskparam])
                            continue
                        # .get(): a parameter missing from the task is skipped
                        # exactly like one explicitly set to None.
                        value = task.get('resubmit_' + taskparam)
                        if value is not None:
                            schedd.edit(rootConst, adparam, str(value))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)
        elif overwrite:
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent,
                                                                  rpipe):
                if not parent:
                    self.logger.debug(
                        "Resubmitting under condition overwrite = True")
                    for adparam, taskparam in params.items():
                        if taskparam in ad:
                            if taskparam == 'jobids' and len(
                                    list(ad[taskparam])) == 0:
                                # An empty list of job ids is written as "true".
                                self.logger.debug(
                                    "Setting %s = True in the task ad.",
                                    adparam)
                                schedd.edit(rootConst, adparam,
                                            classad.ExprTree("true"))
                            else:
                                schedd.edit(rootConst, adparam, ad[taskparam])
                            continue
                        value = task.get('resubmit_' + taskparam)
                        if value is not None:
                            schedd.edit(rootConst, adparam, str(value))
                    schedd.act(htcondor.JobAction.Release, rootConst)
        else:
            ## This should actually not occur anymore in CRAB 3.3.16 or above, because
            ## starting from CRAB 3.3.16 the resubmission parameters are written to the
            ## Task DB with value != None, so the overwrite variable should never be False.
            with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent,
                                                                  rpipe):
                if not parent:
                    self.logger.debug(
                        "Resubmitting under condition overwrite = False")
                    schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                    schedd.edit(rootConst, "CRAB_ResubmitList",
                                classad.ExprTree("true"))
                    schedd.act(htcondor.JobAction.Hold, rootConst)
                    schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                    schedd.act(htcondor.JobAction.Release, rootConst)

        results = rpipe.read()
        if results != "OK":
            msg = "The CRAB server backend was not able to resubmit the task,"
            msg += " because the Grid scheduler answered with an error."
            msg += " This is probably a temporary glitch. Please try again later."
            msg += " If the error persists send an e-mail to %s." % (
                FEEDBACKMAIL)
            msg += " Error reason: %s" % (results)
            raise TaskWorkerException(msg)