def killAll(self, jobConst):

    # We need to keep the ROOT, PROCESSING, and TAIL DAGs in hold until the periodic remove kicks in.
    # See DagmanSubmitter.py#L390 (dagAd["PeriodicRemove"])
    # This is needed in case the user wants to resubmit.
    rootConst = 'stringListMember(TaskType, "ROOT PROCESSING TAIL", " ") && CRAB_ReqName =?= %s' % HTCondorUtils.quote(self.workflow)

    # Holding the DAG job does not remove all of its node jobs;
    # that must be done separately.
    # --------------------------------------
    # From the HTCondor documentation
    # http://research.cs.wisc.edu/htcondor/manual/v8.3/2_10DAGMan_Applications.html#SECTION003107000000000000000
    # --------------------------------------
    # After placing the condor_dagman job on hold, no new node jobs will be submitted,
    # and no PRE or POST scripts will be run. Any node jobs already in the HTCondor queue
    # will continue undisturbed. If the condor_dagman job is left on hold, it will remain
    # in the HTCondor queue after all of the currently running node jobs are finished.
    # --------------------------------------
    # TODO: Remove the jobConst query when the htcondor ticket is solved:
    # https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=5175
    with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
        if not parent:
            with self.schedd.transaction() as dummytsc:
                self.schedd.act(htcondor.JobAction.Hold, rootConst)
                self.schedd.act(htcondor.JobAction.Remove, jobConst)
    results = rpipe.read()
    if results != "OK":
        msg = "The CRAB server backend was not able to kill the task,"
        msg += " because the Grid scheduler answered with an error."
        msg += " This is probably a temporary glitch. Please try again later."
        msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
        msg += " Error reason: %s" % (results)
        raise TaskWorkerException(msg)
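# --- Illustrative sketch (not part of the original module) ---
# How the hold constraint above is built and what it selects. The task name
# is hypothetical, and classad.quote() stands in for the HTCondorUtils.quote()
# wrapper used in the code.
import classad

workflow = "170101_000000:jdoe_crab_example"  # hypothetical CRAB_ReqName
rootConst = ('stringListMember(TaskType, "ROOT PROCESSING TAIL", " ")'
             ' && CRAB_ReqName =?= %s' % classad.quote(workflow))
# Selects only this task's ROOT, PROCESSING, and TAIL DAGs:
print(rootConst)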
def push_new_proxy_to_schedd(self, schedd, ad, proxy):
    if not hasattr(schedd, 'refreshGSIProxy'):
        raise NotImplementedError()
    with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
        if not parent:
            schedd.refreshGSIProxy(ad['ClusterId'], ad['ProcID'], proxy, -1)
    results = rpipe.read()
    if results != "OK":
        raise Exception("Failure when renewing HTCondor task proxy: '%s'" % results)
def renew_proxy(self, schedd, ad, proxy):
    now = time.time()
    self.logger.info("Renewing proxy for task %s." % ad['CRAB_ReqName'])
    if not hasattr(schedd, 'refreshGSIProxy'):
        raise NotImplementedError()
    with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
        if not parent:
            lifetime = schedd.refreshGSIProxy(ad['ClusterId'], ad['ProcId'], proxy, -1)
            schedd.edit(['%s.%s' % (ad['ClusterId'], ad['ProcId'])],
                        'x509userproxyexpiration', str(int(now + lifetime)))
    results = rpipe.read()
    if results != "OK":
        raise Exception("Failure when renewing HTCondor task proxy: '%s'" % results)
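# --- Illustrative usage sketch (not part of the original module) ---
# One way to drive renew_proxy(): query the schedd for the task's ROOT DAG ad
# and pass it in. The bare Schedd() constructor, the projection list, and the
# `resubmitter` instance are assumptions, not CRAB code.
import classad
import htcondor

schedd = htcondor.Schedd()  # local schedd; a remote one needs a locate ad
const = 'TaskType =?= "ROOT" && CRAB_ReqName =?= %s' % classad.quote(workflow)
for rootAd in schedd.query(const, ['ClusterId', 'ProcId', 'CRAB_ReqName']):
    resubmitter.renew_proxy(schedd, rootAd, proxy)  # proxy: path to the X509 proxy file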
def killJobs(self, ids):
    ad = classad.ClassAd()
    ad['foo'] = ids
    const = "CRAB_ReqName =?= %s && member(CRAB_Id, %s)" % (
        HTCondorUtils.quote(self.workflow), ad.lookup("foo").__repr__())
    with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
        if not parent:
            self.schedd.act(htcondor.JobAction.Remove, const)
    results = rpipe.read()
    if results != "OK":
        raise TaskWorkerException("The CRAB3 server backend could not kill jobs [%s], because the Grid scheduler answered with an error.\n" % ", ".join(ids) + \
                                  "This is probably a temporary glitch. Please try again and contact an expert if the error persists.\n" + \
                                  "Error reason: %s" % results)
def killAll(self):

    # Search for and hold the DAG
    rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(self.workflow)

    with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
        if not parent:
            self.schedd.act(htcondor.JobAction.Hold, rootConst)
    results = rpipe.read()
    if results != "OK":
        raise TaskWorkerException("The CRAB3 server backend could not kill the task because the Grid scheduler answered with an error.\n" + \
                                  "This is probably a temporary glitch. Please try again and contact an expert if the error persists.\n" + \
                                  "Error reason: %s" % results)
def killJobs(self, ids):
    ad = classad.ClassAd()
    ad['foo'] = ids
    const = "CRAB_ReqName =?= %s && member(CRAB_Id, %s)" % (HTCondorUtils.quote(self.workflow), ad.lookup("foo").__repr__())
    with HTCondorUtils.AuthenticatedSubprocess(self.proxy) as (parent, rpipe):
        if not parent:
            self.schedd.act(htcondor.JobAction.Remove, const)
    results = rpipe.read()
    if results != "OK":
        msg = "The CRAB server backend was not able to kill these jobs %s," % (ids)
        msg += " because the Grid scheduler answered with an error."
        msg += " This is probably a temporary glitch. Please try again later."
        msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
        msg += " Error reason: %s" % (results)
        raise TaskWorkerException(msg)
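# --- Illustrative sketch (not part of the original module) ---
# Why killJobs() stores the ids in a throwaway ClassAd: repr() of the
# looked-up ExprTree serializes the Python list as a ClassAd list literal,
# which is what member() expects in the remove constraint.
import classad

ids = [1, 3, 5]  # hypothetical CRAB_Id values
ad = classad.ClassAd()
ad['foo'] = ids
print(ad.lookup("foo").__repr__())  # a ClassAd list literal, e.g. "{ 1,3,5 }"
# so the constraint ends up as: ... && member(CRAB_Id, { 1,3,5 })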
def submitDirect(self, schedd, cmd, arg, info): #pylint: disable=R0201
    """
    Submit directly to the schedd using the HTCondor module
    """
    dagAd = classad.ClassAd()
    addCRABInfoToClassAd(dagAd, info)

    groups = CMSGroupMapper.map_user_to_groups(dagAd["CRAB_UserHN"])
    if groups:
        dagAd["CMSGroups"] = groups

    # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
    dagAd["Out"] = str(os.path.join(info['scratch'], "request.out"))
    dagAd["Err"] = str(os.path.join(info['scratch'], "request.err"))
    dagAd["CRAB_Attempt"] = 0
    # We switched from local to scheduler universe. Why? It seems there's no way in the
    # local universe to change the hold signal at runtime. That's fairly important for our
    # resubmit implementation.
    #dagAd["JobUniverse"] = 12
    dagAd["JobUniverse"] = 7
    dagAd["HoldKillSig"] = "SIGUSR1"
    dagAd["Cmd"] = cmd
    dagAd['Args'] = arg
    dagAd["TransferInput"] = str(info['inputFilesString'])
    dagAd["LeaveJobInQueue"] = classad.ExprTree("(JobStatus == 4) && ((StageOutFinish =?= UNDEFINED) || (StageOutFinish == 0))")
    dagAd["PeriodicRemove"] = classad.ExprTree("(JobStatus == 5) && (time()-EnteredCurrentStatus > 30*86400)")
    dagAd["TransferOutput"] = info['outputFilesString']
    dagAd["OnExitRemove"] = classad.ExprTree("( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))")
    dagAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
    dagAd["RemoveKillSig"] = "SIGUSR1"
    dagAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")
    dagAd["Environment"] = classad.ExprTree('strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")' % " ".join(info['additional_environment_options'].split(";")))
    dagAd["RemoteCondorSetup"] = info['remote_condor_setup']
    dagAd["Requirements"] = classad.ExprTree('true || false')
    dagAd["TaskType"] = "ROOT"
    dagAd["X509UserProxy"] = info['user_proxy']

    with HTCondorUtils.AuthenticatedSubprocess(info['user_proxy']) as (parent, rpipe):
        if not parent:
            resultAds = []
            schedd.submit(dagAd, 1, True, resultAds)
            schedd.spool(resultAds)
            if resultAds:
                id = "%s.%s" % (resultAds[0]['ClusterId'], resultAds[0]['ProcId'])
                schedd.edit([id], "LeaveJobInQueue", classad.ExprTree("(JobStatus == 4) && (time()-EnteredCurrentStatus < 30*86400)"))
    results = rpipe.read()
    if results != "OK":
        raise TaskWorkerException("Failure when submitting task to scheduler. Error reason: '%s'" % results)
def submitDirect(self, schedd, cmd, arg, info): #pylint: disable=R0201
    """
    Submit directly to the schedd using the HTCondor module
    """
    dagAd = classad.ClassAd()
    addCRABInfoToClassAd(dagAd, info)

    if info["CMSGroups"]:
        dagAd["CMSGroups"] = ','.join(info["CMSGroups"])
    else:
        dagAd["CMSGroups"] = classad.Value.Undefined

    # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
    dagAd["CRAB_Attempt"] = 0
    # We switched from local to scheduler universe. Why? It seems there's no way in the
    # local universe to change the hold signal at runtime. That's fairly important for our
    # resubmit implementation.
    #dagAd["JobUniverse"] = 12
    dagAd["JobUniverse"] = 7
    dagAd["HoldKillSig"] = "SIGUSR1"
    dagAd["X509UserProxy"] = info['user_proxy']
    dagAd["Requirements"] = classad.ExprTree('true || false')
    dagAd["Environment"] = classad.ExprTree('strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")' % " ".join(info['additional_environment_options'].split(";")))
    dagAd["RemoteCondorSetup"] = info['remote_condor_setup']

    dagAd["CRAB_TaskSubmitTime"] = classad.ExprTree("%s" % info["start_time"].encode('ascii', 'ignore'))
    dagAd['CRAB_TaskLifetimeDays'] = TASKLIFETIME // 24 // 60 // 60
    dagAd['CRAB_TaskEndTime'] = int(info["start_time"]) + TASKLIFETIME
    # For task management info see https://github.com/dmwm/CRABServer/issues/4681#issuecomment-302336451
    dagAd["LeaveJobInQueue"] = classad.ExprTree("true")
    dagAd["PeriodicHold"] = classad.ExprTree("time() > CRAB_TaskEndTime")
    dagAd["TransferOutput"] = info['outputFilesString']
    dagAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")
    dagAd["OnExitRemove"] = classad.ExprTree("( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))")
    dagAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
    dagAd["RemoveKillSig"] = "SIGUSR1"

    with open('subdag.ad', 'w') as fd:
        for k, v in dagAd.items():
            if k == 'X509UserProxy':
                v = os.path.basename(v)
            if isinstance(v, basestring):
                value = classad.quote(v)
            elif isinstance(v, classad.ExprTree):
                value = repr(v)
            elif isinstance(v, list):
                value = "{{{0}}}".format(json.dumps(v)[1:-1])
            else:
                value = v
            fd.write('+{0} = {1}\n'.format(k, value))

    dagAd["TaskType"] = "ROOT"
    dagAd["Out"] = str(os.path.join(info['scratch'], "request.out"))
    dagAd["Err"] = str(os.path.join(info['scratch'], "request.err"))
    dagAd["Cmd"] = cmd
    dagAd['Args'] = arg
    dagAd["TransferInput"] = str(info['inputFilesString'])

    condorIdDict = {}
    with HTCondorUtils.AuthenticatedSubprocess(info['user_proxy'], pickleOut=True,
                                               outputObj=condorIdDict,
                                               logger=self.logger) as (parent, rpipe):
        if not parent:
            resultAds = []
            condorIdDict['ClusterId'] = schedd.submit(dagAd, 1, True, resultAds)
            schedd.spool(resultAds)
            # Editing LeaveJobInQueue since the remote submit overwrites it;
            # see https://github.com/dmwm/CRABServer/pull/5212#issuecomment-216519749
            if resultAds:
                id_ = "%s.%s" % (resultAds[0]['ClusterId'], resultAds[0]['ProcId'])
                schedd.edit([id_], "LeaveJobInQueue", classad.ExprTree("true"))

    try:
        results = pickle.load(rpipe)
    except EOFError:
        # We do not want to retry this, since the error may happen after the submit
        # (during the edit, for example). That could cause the task to be submitted
        # twice (although we have a protection in the duplicatedCheck).
        raise TaskWorkerException("Timeout executing condor submit command.", retry=False)

    # Notice that the ClusterId might be set even if there was a failure:
    # this is the case if schedd.submit succeeded but the spool call failed.
    if 'ClusterId' in results.outputObj:
        self.logger.debug("Condor cluster ID just submitted is: %s", results.outputObj['ClusterId'])
    if results.outputMessage != "OK":
        self.logger.debug("Now printing the environment used for submission:\n" + "-"*70 + "\n" + results.environmentStr + "-"*70)
        raise TaskWorkerException("Failure when submitting task to scheduler. Error reason: '%s'" % results.outputMessage, retry=True)

    # If we don't raise the exception above, the id is here.
    return results.outputObj['ClusterId']
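# --- Illustrative note (not part of the original code) ---
# The serialization loop above writes 'subdag.ad' as submit-file style
# "+Attribute = value" lines. With hypothetical values, the file would
# look roughly like:
#
#   +CRAB_ReqName = "170101_000000:jdoe_crab_example"
#   +CRAB_TaskLifetimeDays = 30
#   +CMSGroups = "/cms"
#   +X509UserProxy = "x509up_u12345"
#   +PeriodicHold = time() > CRAB_TaskEndTime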
def executeInternal(self, *args, **kwargs): #pylint: disable=unused-argument
    #Marco: I guess these value errors only happen for development instances
    if 'task' not in kwargs:
        raise ValueError("No task specified.")
    task = kwargs['task']
    if 'tm_taskname' not in task:
        raise ValueError("No taskname specified.")
    workflow = str(task['tm_taskname'])
    if 'user_proxy' not in task:
        raise ValueError("No proxy provided")
    proxy = task['user_proxy']

    if task.get('resubmit_publication', False):
        resubmitWhat = "publications"
    else:
        resubmitWhat = "jobs"

    self.logger.info("About to resubmit %s for workflow: %s.", resubmitWhat, workflow)
    self.logger.debug("Task info: %s", str(task))

    if task.get('resubmit_publication', False):
        asourl = task.get('tm_asourl', None)
        # Let's not assume the db has been updated (mostly for devs): default asodb to asynctransfer!
        # The "or" also takes care of the case where the new code is executed on an old task,
        # i.e. tm_asodb is there but empty.
        asodb = task.get('tm_asodb', 'asynctransfer') or 'asynctransfer'
        if not asourl:
            msg = "ASO URL not set. Can not resubmit publication."
            raise TaskWorkerException(msg)
        self.logger.info("Will resubmit failed publications")
        self.resubmitPublication(asourl, asodb, proxy, workflow)
        return

    if task['tm_collector']:
        self.backendurls['htcondorPool'] = task['tm_collector']
    loc = HTCondorLocator.HTCondorLocator(self.backendurls)
    schedd = ""
    dummyAddress = ""
    try:
        schedd, dummyAddress = loc.getScheddObjNew(task['tm_schedd'])
    except Exception as exp:
        msg = "The CRAB server backend was not able to contact the Grid scheduler."
        msg += " Please try again later."
        msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
        msg += " Message from the scheduler: %s" % (str(exp))
        self.logger.exception("%s: %s", workflow, msg)
        raise TaskWorkerException(msg)

    # Check memory and walltime
    stdmaxjobruntime = 2800
    stdmaxmemory = 2500
    if task['resubmit_maxjobruntime'] is not None and task['resubmit_maxjobruntime'] > stdmaxjobruntime:
        msg = "Task requests %s minutes of walltime, but only %s are guaranteed to be available." % (task['resubmit_maxjobruntime'], stdmaxjobruntime)
        msg += " Jobs may not find a site where to run."
        msg += " CRAB has changed this value to %s minutes." % (stdmaxjobruntime)
        self.logger.warning(msg)
        task['resubmit_maxjobruntime'] = str(stdmaxjobruntime)
        self.uploadWarning(msg, proxy, kwargs['task']['tm_taskname'])
    if task['resubmit_maxmemory'] is not None and task['resubmit_maxmemory'] > stdmaxmemory:
        msg = "Task requests %s MB of memory, but only %s MB are guaranteed to be available." % (task['resubmit_maxmemory'], stdmaxmemory)
        msg += " Jobs may not find a site where to run and stay idle forever."
        self.logger.warning(msg)
        self.uploadWarning(msg, proxy, kwargs['task']['tm_taskname'])

    # Find only the originally submitted DAG to hold and release: this
    # will re-trigger the scripts and adjust retries and other
    # resubmission parameters.
    #
    # Processing and tail DAGs will be restarted by these scripts on the
    # schedd after the modifications are made.
    rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(workflow)

    ## Calculate new parameters for resubmitted jobs. These parameters will
    ## be (re)written in the _CONDOR_JOB_AD when we do schedd.edit() below.
    ad = classad.ClassAd()
    params = {'CRAB_ResubmitList': 'jobids',
              'CRAB_SiteBlacklist': 'site_blacklist',
              'CRAB_SiteWhitelist': 'site_whitelist',
              'MaxWallTimeMins': 'maxjobruntime',
              'RequestMemory': 'maxmemory',
              'RequestCpus': 'numcores',
              'JobPrio': 'priority'}
    overwrite = False
    for taskparam in params.values():
        if ('resubmit_' + taskparam in task) and task['resubmit_' + taskparam] != None:
            # In case resubmission parameters contain a list of unicode strings,
            # convert it to a list of ascii strings because of HTCondor unicode
            # incompatibility.
            # Note that unicode strings that are not in a list are not handled,
            # but so far they don't exist in this part of the code.
            if isinstance(task['resubmit_' + taskparam], list):
                nonUnicodeList = []
                for p in task['resubmit_' + taskparam]:
                    if isinstance(p, unicode):
                        nonUnicodeList.append(p.encode('ascii', 'ignore'))
                    else:
                        nonUnicodeList.append(p)
                ad[taskparam] = nonUnicodeList
            if taskparam != 'jobids':
                overwrite = True

    if ('resubmit_jobids' in task) and task['resubmit_jobids']:
        with HTCondorUtils.AuthenticatedSubprocess(proxy, logger=self.logger) as (parent, rpipe):
            if not parent:
                schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                ## Overwrite parameters in the os.environ[_CONDOR_JOB_AD] file. This will affect
                ## all the jobs, not only the ones we want to resubmit. That's why the pre-job
                ## is saving the values of the parameters for each job retry in text files (the
                ## files are in the directory resubmit_info in the schedd).
                for adparam, taskparam in params.iteritems():
                    if taskparam in ad:
                        schedd.edit(rootConst, adparam, ad.lookup(taskparam))
                    elif task['resubmit_' + taskparam] != None:
                        schedd.edit(rootConst, adparam, str(task['resubmit_' + taskparam]))
                schedd.act(htcondor.JobAction.Hold, rootConst)
                schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                schedd.act(htcondor.JobAction.Release, rootConst)
    elif overwrite:
        self.logger.debug("Resubmitting under condition overwrite = True")
        with HTCondorUtils.AuthenticatedSubprocess(proxy, logger=self.logger) as (parent, rpipe):
            if not parent:
                for adparam, taskparam in params.iteritems():
                    if taskparam in ad:
                        if taskparam == 'jobids' and len(list(ad[taskparam])) == 0:
                            self.logger.debug("Setting %s = True in the task ad.", adparam)
                            schedd.edit(rootConst, adparam, classad.ExprTree("true"))
                        else:
                            schedd.edit(rootConst, adparam, ad.lookup(taskparam))
                    elif task['resubmit_' + taskparam] != None:
                        schedd.edit(rootConst, adparam, str(task['resubmit_' + taskparam]))
                schedd.act(htcondor.JobAction.Release, rootConst)
    else:
        ## This should actually not occur anymore in CRAB 3.3.16 or above, because
        ## starting from CRAB 3.3.16 the resubmission parameters are written to the
        ## Task DB with value != None, so the overwrite variable should never be False.
        self.logger.debug("Resubmitting under condition overwrite = False")
        with HTCondorUtils.AuthenticatedSubprocess(proxy, logger=self.logger) as (parent, rpipe):
            if not parent:
                schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                schedd.edit(rootConst, "CRAB_ResubmitList", classad.ExprTree("true"))
                schedd.act(htcondor.JobAction.Hold, rootConst)
                schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                schedd.act(htcondor.JobAction.Release, rootConst)

    try:
        results = rpipe.read()
    except EOFError:
        results = "Timeout while executing condor commands for resubmission"
    if results != "OK":
        msg = "The CRAB server backend was not able to resubmit the task,"
        msg += " because the Grid scheduler answered with an error."
        msg += " This is probably a temporary glitch. Please try again later."
        msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
        msg += " Error reason: %s" % (results)
        raise TaskWorkerException(msg)
def execute_internal(self, *args, **kw):
    #Marco: I guess these value errors only happen for development instances
    if 'task' not in kw:
        raise ValueError("No task specified.")
    task = kw['task']
    if 'tm_taskname' not in task:
        raise ValueError("No taskname specified.")
    workflow = str(task['tm_taskname'])
    if 'user_proxy' not in task:
        raise ValueError("No proxy provided")
    proxy = task['user_proxy']

    self.logger.info("About to resubmit workflow: %s." % workflow)
    self.logger.info("Task info: %s" % str(task))

    loc = HTCondorLocator.HTCondorLocator(self.backendurls)
    schedd, address = loc.getScheddObj(workflow) #TODO wrap

    # Release the DAG
    rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(workflow)

    # Calculate a new white/blacklist
    ad = classad.ClassAd()
    ad['whitelist'] = task['resubmit_site_whitelist']
    ad['blacklist'] = task['resubmit_site_blacklist']

    if ('resubmit_ids' in task) and task['resubmit_ids']:
        ad['resubmit'] = task['resubmit_ids']
        with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
            if not parent:
                schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                schedd.edit(rootConst, "CRAB_ResubmitList", ad['resubmit'])
                schedd.act(htcondor.JobAction.Hold, rootConst)
                schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                schedd.act(htcondor.JobAction.Release, rootConst)
    elif task['resubmit_site_whitelist'] or task['resubmit_site_blacklist'] or \
            task['resubmit_priority'] != None or task['resubmit_maxmemory'] != None or \
            task['resubmit_numcores'] != None or task['resubmit_maxjobruntime'] != None:
        with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
            if not parent:
                if task['resubmit_site_blacklist']:
                    schedd.edit(rootConst, "CRAB_SiteResubmitBlacklist", ad['blacklist'])
                if task['resubmit_site_whitelist']:
                    schedd.edit(rootConst, "CRAB_SiteResubmitWhitelist", ad['whitelist'])
                if task['resubmit_priority'] != None:
                    schedd.edit(rootConst, "JobPrio", task['resubmit_priority'])
                if task['resubmit_numcores'] != None:
                    schedd.edit(rootConst, "RequestCpus", task['resubmit_numcores'])
                if task['resubmit_maxjobruntime'] != None:
                    schedd.edit(rootConst, "MaxWallTimeMins", task['resubmit_maxjobruntime'])
                if task['resubmit_maxmemory'] != None:
                    schedd.edit(rootConst, "RequestMemory", task['resubmit_maxmemory'])
                schedd.act(htcondor.JobAction.Release, rootConst)
    else:
        with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
            if not parent:
                schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                schedd.edit(rootConst, "CRAB_ResubmitList", classad.ExprTree("true"))
                schedd.act(htcondor.JobAction.Hold, rootConst)
                schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                schedd.act(htcondor.JobAction.Release, rootConst)

    results = rpipe.read()
    if results != "OK":
        raise TaskWorkerException("The CRAB3 server backend could not resubmit your task because the Grid scheduler answered with an error.\n" + \
                                  "This is probably a temporary glitch. Please try again and contact an expert if the error persists.\n" + \
                                  "Error reason: %s" % results)
def executeInternal(self, *args, **kwargs):
    #Marco: I guess these value errors only happen for development instances
    if 'task' not in kwargs:
        raise ValueError("No task specified.")
    task = kwargs['task']
    if 'tm_taskname' not in task:
        raise ValueError("No taskname specified.")
    workflow = str(task['tm_taskname'])
    if 'user_proxy' not in task:
        raise ValueError("No proxy provided")
    proxy = task['user_proxy']

    if task.get('resubmit_publication', False):
        resubmitWhat = "publications"
    else:
        resubmitWhat = "jobs"

    self.logger.info("About to resubmit %s for workflow: %s." % (resubmitWhat, workflow))
    self.logger.info("Task info: %s" % str(task))

    if task.get('resubmit_publication', False):
        asourl = task.get('tm_asourl', None)
        if not asourl:
            msg = "ASO URL not set. Can not resubmit publication."
            raise TaskWorkerException(msg)
        self.logger.info("Will resubmit failed publications")
        self.resubmitPublication(asourl, proxy, workflow)
        return

    if task['tm_collector']:
        self.backendurls['htcondorPool'] = task['tm_collector']
    loc = HTCondorLocator.HTCondorLocator(self.backendurls)
    schedd = ""
    address = ""
    try:
        schedd, address = loc.getScheddObjNew(task['tm_schedd'])
    except Exception as exp:
        msg = "The CRAB server backend was not able to contact the Grid scheduler."
        msg += " Please try again later."
        msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
        msg += " Message from the scheduler: %s" % (str(exp))
        self.logger.exception("%s: %s" % (workflow, msg))
        raise TaskWorkerException(msg)

    # Check memory and walltime
    stdmaxjobruntime = 2800
    stdmaxmemory = 2500
    if task['resubmit_maxjobruntime'] is not None and task['resubmit_maxjobruntime'] > stdmaxjobruntime:
        msg = "Task requests %s minutes of walltime, but only %s are guaranteed to be available." % (task['resubmit_maxjobruntime'], stdmaxjobruntime)
        msg += " Jobs may not find a site where to run."
        msg += " CRAB has changed this value to %s minutes." % (stdmaxjobruntime)
        self.logger.warning(msg)
        task['resubmit_maxjobruntime'] = str(stdmaxjobruntime)
        self.uploadWarning(msg, proxy, kwargs['task']['tm_taskname'])
    if task['resubmit_maxmemory'] is not None and task['resubmit_maxmemory'] > stdmaxmemory:
        msg = "Task requests %s MB of memory, but only %s MB are guaranteed to be available." % (task['resubmit_maxmemory'], stdmaxmemory)
        msg += " Jobs may not find a site where to run and stay idle forever."
        self.logger.warning(msg)
        self.uploadWarning(msg, proxy, kwargs['task']['tm_taskname'])

    # Release the DAG
    rootConst = "TaskType =?= \"ROOT\" && CRAB_ReqName =?= %s" % HTCondorUtils.quote(workflow)

    ## Calculate new parameters for resubmitted jobs. These parameters will
    ## be (re)written in the _CONDOR_JOB_AD when we do schedd.edit() below.
    ad = classad.ClassAd()
    params = {'CRAB_ResubmitList': 'jobids',
              'CRAB_SiteBlacklist': 'site_blacklist',
              'CRAB_SiteWhitelist': 'site_whitelist',
              'MaxWallTimeMins': 'maxjobruntime',
              'RequestMemory': 'maxmemory',
              'RequestCpus': 'numcores',
              'JobPrio': 'priority'}
    overwrite = False
    for taskparam in params.values():
        if ('resubmit_' + taskparam in task) and task['resubmit_' + taskparam] != None:
            if isinstance(task['resubmit_' + taskparam], list):
                ad[taskparam] = task['resubmit_' + taskparam]
            if taskparam != 'jobids':
                overwrite = True

    if ('resubmit_jobids' in task) and task['resubmit_jobids']:
        with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
            if not parent:
                schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                ## Overwrite parameters in the os.environ[_CONDOR_JOB_AD] file. This will affect
                ## all the jobs, not only the ones we want to resubmit. That's why the pre-job
                ## is saving the values of the parameters for each job retry in text files (the
                ## files are in the directory resubmit_info in the schedd).
                for adparam, taskparam in params.iteritems():
                    if taskparam in ad:
                        schedd.edit(rootConst, adparam, ad[taskparam])
                    elif task['resubmit_' + taskparam] != None:
                        schedd.edit(rootConst, adparam, str(task['resubmit_' + taskparam]))
                schedd.act(htcondor.JobAction.Hold, rootConst)
                schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                schedd.act(htcondor.JobAction.Release, rootConst)
    elif overwrite:
        with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
            if not parent:
                self.logger.debug("Resubmitting under condition overwrite = True")
                for adparam, taskparam in params.iteritems():
                    if taskparam in ad:
                        if taskparam == 'jobids' and len(list(ad[taskparam])) == 0:
                            self.logger.debug("Setting %s = True in the task ad." % (adparam))
                            schedd.edit(rootConst, adparam, classad.ExprTree("true"))
                        else:
                            schedd.edit(rootConst, adparam, ad[taskparam])
                    elif task['resubmit_' + taskparam] != None:
                        schedd.edit(rootConst, adparam, str(task['resubmit_' + taskparam]))
                schedd.act(htcondor.JobAction.Release, rootConst)
    else:
        ## This should actually not occur anymore in CRAB 3.3.16 or above, because
        ## starting from CRAB 3.3.16 the resubmission parameters are written to the
        ## Task DB with value != None, so the overwrite variable should never be False.
        with HTCondorUtils.AuthenticatedSubprocess(proxy) as (parent, rpipe):
            if not parent:
                self.logger.debug("Resubmitting under condition overwrite = False")
                schedd.edit(rootConst, "HoldKillSig", 'SIGKILL')
                schedd.edit(rootConst, "CRAB_ResubmitList", classad.ExprTree("true"))
                schedd.act(htcondor.JobAction.Hold, rootConst)
                schedd.edit(rootConst, "HoldKillSig", 'SIGUSR1')
                schedd.act(htcondor.JobAction.Release, rootConst)

    results = rpipe.read()
    if results != "OK":
        msg = "The CRAB server backend was not able to resubmit the task,"
        msg += " because the Grid scheduler answered with an error."
        msg += " This is probably a temporary glitch. Please try again later."
        msg += " If the error persists send an e-mail to %s." % (FEEDBACKMAIL)
        msg += " Error reason: %s" % (results)
        raise TaskWorkerException(msg)