def submitDirect(self, schedd, cmd, arg, info): #pylint: disable=R0201
    """
    Submit directly to the schedd using the HTCondor module.

    A ClassAd is filled from ``info`` (through addCRABInfoToClassAd) plus the
    attributes set below, then submitted and spooled to ``schedd``.

    :param schedd: schedd object; its ``submit``, ``spool`` and ``edit``
        methods are called.
    :param cmd: value stored in the ``Cmd`` attribute of the ad.
    :param arg: value stored in the ``Args`` attribute of the ad.
    :param info: task information dictionary. The keys read directly here are
        ``scratch``, ``inputFilesString``, ``outputFilesString``,
        ``additional_environment_options``, ``remote_condor_setup`` and
        ``user_proxy``.
    :raises TaskWorkerException: when the text read back from the submission
        pipe is anything other than "OK".

    Nothing is returned explicitly (the method falls off the end).
    """
    dagAd = classad.ClassAd()
    addCRABInfoToClassAd(dagAd, info)

    groups = CMSGroupMapper.map_user_to_groups(dagAd["CRAB_UserHN"])
    if groups:
        dagAd["CMSGroups"] = groups

    # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
    dagAd["Out"] = str(os.path.join(info['scratch'], "request.out"))
    dagAd["Err"] = str(os.path.join(info['scratch'], "request.err"))
    dagAd["CRAB_Attempt"] = 0
    # We switched from local (JobUniverse 12) to scheduler universe. Why? It seems there's
    # no way in the local universe to change the hold signal at runtime. That's fairly
    # important for our resubmit implementation.
    dagAd["JobUniverse"] = 7
    dagAd["HoldKillSig"] = "SIGUSR1"
    dagAd["Cmd"] = cmd
    dagAd['Args'] = arg
    dagAd["TransferInput"] = str(info['inputFilesString'])
    dagAd["LeaveJobInQueue"] = classad.ExprTree("(JobStatus == 4) && ((StageOutFinish =?= UNDEFINED) || (StageOutFinish == 0))")
    dagAd["PeriodicRemove"] = classad.ExprTree("(JobStatus == 5) && (time()-EnteredCurrentStatus > 30*86400)")
    dagAd["TransferOutput"] = info['outputFilesString']
    dagAd["OnExitRemove"] = classad.ExprTree("( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))")
    dagAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
    dagAd["RemoveKillSig"] = "SIGUSR1"
    dagAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")
    # The ';'-separated extra environment options become space-separated entries.
    dagAd["Environment"] = classad.ExprTree('strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")' % " ".join(info['additional_environment_options'].split(";")))
    dagAd["RemoteCondorSetup"] = info['remote_condor_setup']
    dagAd["Requirements"] = classad.ExprTree('true || false')
    dagAd["TaskType"] = "ROOT"
    dagAd["X509UserProxy"] = info['user_proxy']

    # NOTE(review): AuthenticatedSubprocess presumably forks and the `not parent`
    # branch runs in the child with the user proxy -- confirm in HTCondorUtils.
    with HTCondorUtils.AuthenticatedSubprocess(info['user_proxy']) as (parent, rpipe):
        if not parent:
            resultAds = []
            schedd.submit(dagAd, 1, True, resultAds)
            schedd.spool(resultAds)
            if resultAds:
                # "<ClusterId>.<ProcId>" of the first submitted ad; named jobId so that
                # the builtin id() is not shadowed.
                jobId = "%s.%s" % (resultAds[0]['ClusterId'], resultAds[0]['ProcId'])
                schedd.edit([jobId], "LeaveJobInQueue", classad.ExprTree("(JobStatus == 4) && (time()-EnteredCurrentStatus < 30*86400)"))
    results = rpipe.read()
    if results != "OK":
        raise TaskWorkerException("Failure when submitting task to scheduler. Error reason: '%s'" % results)
def executeInternal(self, *args, **kwargs):
    """Internal execution to submit to the selected scheduler.

    Before submission a duplicate check is run to see whether the task was
    already submitted at a previous time.

    :param args: ``args[0]`` is indexed as ``(info, dashboardParams, inputFiles)``.
    :param kwargs: must contain ``task`` (a mapping from which ``tm_taskname``,
        ``tm_collector``, ``tm_schedd``, ``tm_username`` and ``user_groups`` are
        read) and ``tempDir`` (the directory to change into while submitting).
    :return: the value of ``self.duplicateCheck(task)`` when it is not None,
        otherwise ``Result.Result(task=..., result=-1)``.
    :raises Exception: when the ``htcondor`` module object is falsy.
    :raises TaskWorkerException: when the schedd cannot be located or contacted.
    """
    if not htcondor:
        raise Exception("Unable to import HTCondor module")

    task = kwargs["task"]
    workflow = task["tm_taskname"]
    info = args[0][0]
    dashboardParams = args[0][1]
    inputFiles = args[0][2]

    self.logger.debug("Starting duplicate check")
    dup = self.duplicateCheck(task)
    self.logger.debug("Duplicate check finished %s", dup)
    if dup is not None:
        return dup

    cwd = os.getcwd()
    os.chdir(kwargs["tempDir"])
    # Everything after the chdir runs under try/finally so that the original
    # working directory is restored even if preparing ``info`` fails.
    try:
        info["inputFilesString"] = ", ".join(inputFiles)
        outputFiles = ["RunJobs.dag.dagman.out", "RunJobs.dag.rescue.001"]
        info["outputFilesString"] = ", ".join(outputFiles)
        arg = "RunJobs.dag"

        info["resthost"] = '"%s"' % (self.server["host"])
        info["resturinoapi"] = '"%s"' % (self.restURInoAPI)

        info["remote_condor_setup"] = ""
        if task["tm_collector"]:
            self.backendurls["htcondorPool"] = task["tm_collector"]
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)
        address = ""
        schedd = ""
        try:
            self.logger.debug("Getting schedd object")
            schedd, address = loc.getScheddObjNew(task["tm_schedd"])
            self.logger.debug("Got schedd object")
        except Exception as exp:
            msg = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s", workflow, msg)
            raise TaskWorkerException(msg)

        try:
            # Only checks that the schedd ad carries a "Machine" entry; the value is unused.
            dummyAddress = loc.scheddAd["Machine"]
        except Exception:
            raise TaskWorkerException("Unable to get schedd address for task %s" % (task["tm_taskname"]))

        # Get location of schedd-specific environment script from schedd ad.
        info["remote_condor_setup"] = loc.scheddAd.get("RemoteCondorSetup", "")

        info["CMSGroups"] = set.union(
            CMSGroupMapper.map_user_to_groups(kwargs["task"]["tm_username"]),
            kwargs["task"]["user_groups"]
        )
        self.logger.info("User %s mapped to local groups %s.", kwargs["task"]["tm_username"], info["CMSGroups"])

        self.logger.debug("Finally submitting to the schedd")
        if address:
            self.clusterId = self.submitDirect(schedd, "dag_bootstrap_startup.sh", arg, info)
        else:
            raise TaskWorkerException("Not able to get schedd address.")
        self.logger.debug("Submission finished")
    finally:
        os.chdir(cwd)

    configreq = {"workflow": kwargs["task"]["tm_taskname"],
                 "status": "SUBMITTED",
                 "subresource": "success"}
    self.logger.debug("Pushing information centrally %s", configreq)
    data = urllib.urlencode(configreq)
    self.server.post(self.resturi, data=data)

    self.sendDashboardJobs(dashboardParams, info["apmon"])

    return Result.Result(task=kwargs["task"], result=(-1))
def executeInternal(self, info, dashboardParams, inputFiles, **kwargs):
    """Internal execution to submit to the selected scheduler.

    :param info: task information dictionary; it is extended here with the
        submission settings (``start_time``, ``inputFilesString``,
        ``outputFilesString``, ``resthost``, ``resturinoapi``,
        ``remote_condor_setup``, ``CMSGroups``) and must already hold ``apmon``.
    :param dashboardParams: forwarded unchanged to ``self.sendDashboardJobs``.
    :param inputFiles: list of input file names; ``subdag.ad`` is appended to it
        when building ``inputFilesString``.
    :param kwargs: must contain ``task`` (a mapping from which ``tm_taskname``,
        ``tm_start_time``, ``tm_collector``, ``tm_schedd``, ``tm_username`` and
        ``user_groups`` are read) and ``tempDir`` (the directory to change into
        while submitting).
    :return: ``Result.Result(task=..., result=-1)``.
    :raises Exception: when the ``htcondor`` module object is falsy.
    :raises TaskWorkerException: (with ``retry=True``) when the schedd cannot be
        located or contacted.
    """
    if not htcondor:
        raise Exception("Unable to import HTCondor module")

    task = kwargs['task']
    workflow = task['tm_taskname']

    cwd = os.getcwd()
    os.chdir(kwargs['tempDir'])
    # Everything after the chdir runs under try/finally so that the original
    # working directory is restored even if preparing ``info`` fails.
    try:
        info['start_time'] = task['tm_start_time']
        info['inputFilesString'] = ", ".join(inputFiles + ['subdag.ad'])
        outputFiles = ["RunJobs.dag.dagman.out", "RunJobs.dag.rescue.001"]
        info['outputFilesString'] = ", ".join(outputFiles)
        arg = "RunJobs.dag"

        info['resthost'] = '"%s"' % (self.server['host'])
        info['resturinoapi'] = '"%s"' % (self.restURInoAPI)

        info['remote_condor_setup'] = ''
        if task['tm_collector']:
            self.backendurls['htcondorPool'] = task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)
        address = ""
        schedd = ""
        try:
            self.logger.debug("Getting schedd object")
            schedd, address = loc.getScheddObjNew(task['tm_schedd'])
            self.logger.debug("Got schedd object")
        except Exception as exp:
            msg = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s", workflow, msg)
            raise TaskWorkerException(msg, retry=True)

        try:
            # Only checks that the schedd ad carries a "Machine" entry; the value is unused.
            dummyAddress = loc.scheddAd['Machine']
        except Exception:
            raise TaskWorkerException("Unable to get schedd address for task %s" % (task['tm_taskname']), retry=True)

        # Get location of schedd-specific environment script from schedd ad.
        info['remote_condor_setup'] = loc.scheddAd.get("RemoteCondorSetup", "")

        info["CMSGroups"] = set.union(CMSGroupMapper.map_user_to_groups(kwargs['task']['tm_username']), kwargs['task']['user_groups'])
        self.logger.info("User %s mapped to local groups %s.", kwargs['task']['tm_username'], info["CMSGroups"])

        self.logger.debug("Finally submitting to the schedd")
        if address:
            self.clusterId = self.submitDirect(schedd, 'dag_bootstrap_startup.sh', arg, info)
        else:
            raise TaskWorkerException("Not able to get schedd address.", retry=True)
        self.logger.debug("Submission finished")
    finally:
        os.chdir(cwd)

    # clusterid is the condor cluster id of the dag (actually dag_bootstrap.sh
    # that becomes that dag if everything goes well).
    configreq = {'workflow': kwargs['task']['tm_taskname'],
                 'status': "SUBMITTED",
                 'subresource': 'success',
                 'clusterid': self.clusterId}
    self.logger.debug("Pushing information centrally %s", configreq)
    data = urllib.urlencode(configreq)
    self.server.post(self.resturi, data=data)

    self.sendDashboardJobs(dashboardParams, info['apmon'])

    return Result.Result(task=kwargs['task'], result=(-1))
def alter_submit(self, crab_retry):
    """
    Copy the content of the generic file Job.submit into a job-specific file
    Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
    Add also parameters that can be overwritten at each manual job resubmission
    (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).

    :param crab_retry: integer retry number of this job; used for CRAB_Retry,
        PostJobPrio2, the log URLs and as key (as a string) in self.resubmit_info.

    Reads self.job_id, self.stage, self.task_ad, self.userWebDirPrx and
    self.resubmit_info; updates self.resubmit_info[str(crab_retry)]; reads
    "Job.submit" and writes "Job.<job_id>.submit" in the current directory.
    """
    ## Start the Job.<job_id>.submit content with the CRAB_Retry.
    new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
    msg = "Setting CRAB_Retry = %s" % (crab_retry)
    self.logger.info(msg)
    ## Add job and postjob log URLs
    job_retry = "%s.%s" % (self.job_id, crab_retry)
    new_submit_text += '+CRAB_JobLogURL = %s\n' % classad.quote(
        os.path.join(self.userWebDirPrx, "job_out." + job_retry + ".txt"))
    new_submit_text += '+CRAB_PostJobLogURL = %s\n' % classad.quote(
        os.path.join(self.userWebDirPrx, "postjob." + job_retry + ".txt"))
    ## For the parameters that can be overwritten at each manual job resubmission,
    ## read them from the task ad, unless there is resubmission information there
    ## and this job is not one that has to be resubmitted, in which case we should
    ## use the same parameters (site black- and whitelists, requested memory, etc)
    ## as used by the previous job retry (which are saved in self.resubmit_info).
    CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
    use_resubmit_info = False
    if CRAB_ResubmitList_in_taskad:
        ## The conversion is done inside the try block: a non-iterable value
        ## (e.g. a boolean) makes map() raise TypeError, and in that case the
        ## task ad values are used.
        try:
            resubmit_jobids = set(map(str, self.task_ad['CRAB_ResubmitList']))
        except TypeError:
            resubmit_jobids = None
        if resubmit_jobids and self.job_id not in resubmit_jobids:
            use_resubmit_info = True
    ## If there is no resubmit_info, we can of course not use it.
    if not self.resubmit_info:
        use_resubmit_info = False
    ## Get the resubmission parameters.
    maxjobruntime = None
    maxmemory = None
    numcores = None
    priority = None
    if not use_resubmit_info:
        if 'MaxWallTimeMinsProbe' in self.task_ad and self.stage == 'probe':
            maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMinsProbe')))
        elif 'MaxWallTimeMinsTail' in self.task_ad and self.stage == 'tail':
            maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMinsTail')))
        elif 'MaxWallTimeMinsRun' in self.task_ad:
            maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMinsRun')))
        if 'RequestMemory' in self.task_ad:
            maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
        if 'RequestCpus' in self.task_ad:
            numcores = int(str(self.task_ad.lookup('RequestCpus')))
        if 'JobPrio' in self.task_ad:
            priority = int(str(self.task_ad['JobPrio']))
        if str(self.job_id) == '0': #jobids can be like 1-1 for subjobs
            priority = 20 #the maximum for splitting jobs
    else:
        ## Walk back to the most recent retry that has saved parameters.
        inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
        while inkey not in self.resubmit_info and int(inkey) > 0:
            inkey = str(int(inkey) - 1)
        ## If not even retry 0 has a record, leave all parameters unset instead
        ## of failing with a KeyError.
        previous = self.resubmit_info.get(inkey, {})
        maxjobruntime = previous.get('maxjobruntime')
        maxmemory = previous.get('maxmemory')
        numcores = previous.get('numcores')
        priority = previous.get('priority')
    ## Save the (new) values of the resubmission parameters in self.resubmit_info
    ## for the current job retry number.
    outkey = str(crab_retry)
    if outkey not in self.resubmit_info:
        self.resubmit_info[outkey] = {}
    self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
    self.resubmit_info[outkey]['maxmemory'] = maxmemory
    self.resubmit_info[outkey]['numcores'] = numcores
    self.resubmit_info[outkey]['priority'] = priority
    self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
    self.resubmit_info[outkey]['CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad
    ## Add the resubmission parameters to the Job.<job_id>.submit content.
    ## Both flags are forced to 0 in the probe stage.
    savelogs = 0 if self.stage == 'probe' else self.task_ad.lookup('CRAB_SaveLogsFlag')
    saveoutputs = 0 if self.stage == 'probe' else self.task_ad.lookup('CRAB_TransferOutputs')
    new_submit_text += '+CRAB_TransferOutputs = {0}\n+CRAB_SaveLogsFlag = {1}\n'.format(saveoutputs, savelogs)
    if maxjobruntime is not None:
        new_submit_text += '+EstimatedWallTimeMins = %s\n' % str(maxjobruntime)
        new_submit_text += '+MaxWallTimeMinsRun = %s\n' % str(maxjobruntime) # how long it can run
        new_submit_text += '+MaxWallTimeMins = %s\n' % str(maxjobruntime) # how long a slot can it match to
    # no plus sign for next 3 attributes, since those are Condor standard ones
    if maxmemory is not None:
        new_submit_text += 'RequestMemory = %s\n' % (str(maxmemory))
    if numcores is not None:
        new_submit_text += 'RequestCpus = %s\n' % (str(numcores))
    if priority is not None:
        new_submit_text += 'JobPrio = %s\n' % (str(priority))

    ## Within the schedd, order the first few jobs in the task before all other tasks of the same priority.
    pre_job_prio = 1
    if int(self.job_id.split('-')[0]) <= 5:
        pre_job_prio = 0
    new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio

    ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
    ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
    ## run the job with the higher PostJobPrio1.
    new_submit_text += '+PostJobPrio1 = -%s\n' % str(self.task_ad.lookup('QDate'))

    ## Order retries before all other jobs in this task
    new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry

    ## Add the site black- and whitelists and the DESIRED_SITES to the
    ## Job.<job_id>.submit content.
    new_submit_text = self.redo_sites(new_submit_text, crab_retry, use_resubmit_info)

    ## Add group information:
    username = self.task_ad.get('CRAB_UserHN')
    if 'CMSGroups' in self.task_ad:
        new_submit_text += '+CMSGroups = %s\n' % classad.quote(self.task_ad['CMSGroups'])
    elif username:
        groups = CMSGroupMapper.map_user_to_groups(username)
        if groups:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)

    ## Finally add (copy) all the content of the generic Job.submit file.
    with open("Job.submit", 'r') as fd:
        new_submit_text += fd.read()
    ## Write the Job.<job_id>.submit file.
    with open("Job.%s.submit" % (self.job_id), 'w') as fd:
        fd.write(new_submit_text)
def alter_submit(self, crab_retry):
    """
    Copy the content of the generic file Job.submit into a job-specific file
    Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
    Add also parameters that can be overwritten at each manual job resubmission
    (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).

    :param crab_retry: integer retry number of this job; used for CRAB_Retry,
        PostJobPrio2 and as key (as a string) in self.resubmit_info.

    Reads self.job_id, self.stage, self.task_ad and self.resubmit_info; updates
    self.resubmit_info[str(crab_retry)]; reads "Job.submit" and writes
    "Job.<job_id>.submit" in the current directory.
    """
    ## Start the Job.<job_id>.submit content with the CRAB_Retry.
    new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
    msg = "Setting CRAB_Retry = %s" % (crab_retry)
    self.logger.info(msg)
    ## For the parameters that can be overwritten at each manual job resubmission,
    ## read them from the task ad, unless there is resubmission information there
    ## and this job is not one that has to be resubmitted, in which case we should
    ## use the same parameters (site black- and whitelists, requested memory, etc)
    ## as used by the previous job retry (which are saved in self.resubmit_info).
    CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
    use_resubmit_info = False
    if CRAB_ResubmitList_in_taskad:
        ## The conversion is done inside the try block: a non-iterable value
        ## (e.g. a boolean) makes map() raise TypeError, and in that case the
        ## task ad values are used.
        try:
            resubmit_jobids = set(map(str, self.task_ad['CRAB_ResubmitList']))
        except TypeError:
            resubmit_jobids = None
        if resubmit_jobids and self.job_id not in resubmit_jobids:
            use_resubmit_info = True
    ## If there is no resubmit_info, we can of course not use it.
    if not self.resubmit_info:
        use_resubmit_info = False
    ## Get the resubmission parameters.
    maxjobruntime = None
    maxmemory = None
    numcores = None
    priority = None
    if not use_resubmit_info:
        if 'MaxWallTimeMinsProbe' in self.task_ad and self.stage == 'probe':
            maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMinsProbe')))
        elif 'MaxWallTimeMinsTail' in self.task_ad and self.stage == 'tail':
            maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMinsTail')))
        elif 'MaxWallTimeMins' in self.task_ad:
            maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMins')))
        if 'RequestMemory' in self.task_ad:
            maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
        if 'RequestCpus' in self.task_ad:
            numcores = int(str(self.task_ad.lookup('RequestCpus')))
        if 'JobPrio' in self.task_ad:
            priority = int(str(self.task_ad['JobPrio']))
        if str(self.job_id) == '0': #jobids can be like 1-1 for subjobs
            priority = 20 #the maximum for splitting jobs
    else:
        ## Walk back to the most recent retry that has saved parameters.
        inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
        while inkey not in self.resubmit_info and int(inkey) > 0:
            inkey = str(int(inkey) - 1)
        ## If not even retry 0 has a record, leave all parameters unset instead
        ## of failing with a KeyError.
        previous = self.resubmit_info.get(inkey, {})
        maxjobruntime = previous.get('maxjobruntime')
        maxmemory = previous.get('maxmemory')
        numcores = previous.get('numcores')
        priority = previous.get('priority')
    ## Save the (new) values of the resubmission parameters in self.resubmit_info
    ## for the current job retry number.
    outkey = str(crab_retry)
    if outkey not in self.resubmit_info:
        self.resubmit_info[outkey] = {}
    self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
    self.resubmit_info[outkey]['maxmemory'] = maxmemory
    self.resubmit_info[outkey]['numcores'] = numcores
    self.resubmit_info[outkey]['priority'] = priority
    self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
    self.resubmit_info[outkey]['CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad
    ## Add the resubmission parameters to the Job.<job_id>.submit content.
    if maxjobruntime is not None:
        new_submit_text += '+EstimatedWallTimeMins = %s\n' % str(maxjobruntime)
        new_submit_text += '+MaxWallTimeMins = (JobStatus=?=1) ? EstimatedWallTimeMins : %s\n' % str(maxjobruntime)
    if maxmemory is not None:
        new_submit_text += '+RequestMemory = %s\n' % (str(maxmemory))
    if numcores is not None:
        new_submit_text += '+RequestCpus = %s\n' % (str(numcores))
    if priority is not None:
        new_submit_text += '+JobPrio = %s\n' % (str(priority))

    ## Within the schedd, order the first few jobs in the task before all other tasks of the same priority.
    pre_job_prio = 1
    if int(self.job_id.split('-')[0]) <= 5:
        pre_job_prio = 0
    new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio

    ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
    ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
    ## run the job with the higher PostJobPrio1.
    new_submit_text += '+PostJobPrio1 = -%s\n' % str(self.task_ad.lookup('QDate'))

    ## Order retries before all other jobs in this task
    new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry

    ## Add the site black- and whitelists and the DESIRED_SITES to the
    ## Job.<job_id>.submit content.
    new_submit_text = self.redo_sites(new_submit_text, crab_retry, use_resubmit_info)

    ## Add group information:
    username = self.task_ad.get('CRAB_UserHN')
    if 'CMSGroups' in self.task_ad:
        new_submit_text += '+CMSGroups = %s\n' % classad.quote(self.task_ad['CMSGroups'])
    elif username:
        groups = CMSGroupMapper.map_user_to_groups(username)
        if groups:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)

    ## Finally add (copy) all the content of the generic Job.submit file.
    with open("Job.submit", 'r') as fd:
        new_submit_text += fd.read()
    ## Write the Job.<job_id>.submit file.
    with open("Job.%s.submit" % (self.job_id), 'w') as fd:
        fd.write(new_submit_text)
def alter_submit(self, crab_retry):
    """
    Copy the content of the generic file Job.submit into a job-specific file
    Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
    Add also parameters that can be overwritten at each manual job resubmission
    (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).

    :param crab_retry: integer retry number of this job; used for CRAB_Retry,
        PostJobPrio2 and as key (as a string) in self.resubmit_info.

    Reads self.job_id (compared and formatted as an integer here), self.task_ad
    and self.resubmit_info; updates self.resubmit_info[str(crab_retry)]; reads
    "Job.submit" and writes "Job.<job_id>.submit" in the current directory.
    """
    ## Start the Job.<job_id>.submit content with the CRAB_Retry.
    new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
    msg = "Setting CRAB_Retry = %s" % (crab_retry)
    self.logger.info(msg)
    ## For the parameters that can be overwritten at each manual job resubmission,
    ## read them from the task ad, unless there is resubmission information there
    ## and this job is not one that has to be resubmitted, in which case we should
    ## use the same parameters (site black- and whitelists, requested memory, etc)
    ## as used by the previous job retry (which are saved in self.resubmit_info).
    CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
    use_resubmit_info = False
    if CRAB_ResubmitList_in_taskad:
        ## A value that cannot be turned into a set (e.g. a boolean) raises
        ## TypeError; in that case the task ad values are used.
        try:
            resubmit_jobids = set(self.task_ad['CRAB_ResubmitList'])
        except TypeError:
            resubmit_jobids = None
        if resubmit_jobids and self.job_id not in resubmit_jobids:
            use_resubmit_info = True
    ## If there is no resubmit_info, we can of course not use it.
    if not self.resubmit_info:
        use_resubmit_info = False
    ## Get the resubmission parameters.
    maxjobruntime = None
    maxmemory = None
    numcores = None
    priority = None
    if not use_resubmit_info:
        if 'MaxWallTimeMins' in self.task_ad:
            maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMins')))
        if 'RequestMemory' in self.task_ad:
            maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
        if 'RequestCpus' in self.task_ad:
            numcores = int(str(self.task_ad.lookup('RequestCpus')))
        if 'JobPrio' in self.task_ad:
            priority = int(str(self.task_ad['JobPrio']))
    else:
        ## Walk back to the most recent retry that has saved parameters.
        inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
        while inkey not in self.resubmit_info and int(inkey) > 0:
            inkey = str(int(inkey) - 1)
        ## If not even retry 0 has a record, leave all parameters unset instead
        ## of failing with a KeyError.
        previous = self.resubmit_info.get(inkey, {})
        maxjobruntime = previous.get('maxjobruntime')
        maxmemory = previous.get('maxmemory')
        numcores = previous.get('numcores')
        priority = previous.get('priority')
    ## Save the (new) values of the resubmission parameters in self.resubmit_info
    ## for the current job retry number.
    outkey = str(crab_retry)
    if outkey not in self.resubmit_info:
        self.resubmit_info[outkey] = {}
    self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
    self.resubmit_info[outkey]['maxmemory'] = maxmemory
    self.resubmit_info[outkey]['numcores'] = numcores
    self.resubmit_info[outkey]['priority'] = priority
    self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
    self.resubmit_info[outkey]['CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad
    ## Add the resubmission parameters to the Job.<job_id>.submit content.
    if maxjobruntime is not None:
        new_submit_text += '+MaxWallTimeMins = %s\n' % (str(maxjobruntime))
    if maxmemory is not None:
        new_submit_text += '+RequestMemory = %s\n' % (str(maxmemory))
    if numcores is not None:
        new_submit_text += '+RequestCpus = %s\n' % (str(numcores))
    if priority is not None:
        new_submit_text += '+JobPrio = %s\n' % (str(priority))

    ## Within the schedd, order the first few jobs in the task before all other tasks of the same priority.
    pre_job_prio = 1
    if self.job_id <= 5:
        pre_job_prio = 0
    new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio

    ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
    ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
    ## run the job with the higher PostJobPrio1.
    new_submit_text += '+PostJobPrio1 = -%s\n' % str(self.task_ad.lookup('QDate'))

    ## Order retries before all other jobs in this task
    new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry

    ## Add the site black- and whitelists and the DESIRED_SITES to the
    ## Job.<job_id>.submit content.
    new_submit_text = self.redo_sites(new_submit_text, crab_retry, use_resubmit_info)

    ## Add group information:
    username = self.task_ad.get('CRAB_UserHN')
    if 'CMSGroups' in self.task_ad:
        new_submit_text += '+CMSGroups = %s\n' % classad.quote(self.task_ad['CMSGroups'])
    elif username:
        groups = CMSGroupMapper.map_user_to_groups(username)
        if groups:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)

    ## Finally add (copy) all the content of the generic Job.submit file.
    with open("Job.submit", 'r') as fd:
        new_submit_text += fd.read()
    ## Write the Job.<job_id>.submit file.
    with open("Job.%d.submit" % (self.job_id), 'w') as fd:
        fd.write(new_submit_text)
def executeInternal(self, info, dashboardParams, inputFiles, **kwargs):
    """Internal execution to submit to the selected scheduler.

    :param info: task information dictionary; it is extended here with the
        submission settings (``start_time``, ``inputFilesString``,
        ``outputFilesString``, ``resthost``, ``dbinstance``,
        ``remote_condor_setup``, ``CMSGroups``) and must already hold ``apmon``.
    :param dashboardParams: forwarded unchanged to ``self.sendDashboardJobs``.
    :param inputFiles: list of input file names; ``subdag.ad`` is appended to it
        when building ``inputFilesString``.
    :param kwargs: must contain ``task`` (a mapping from which ``tm_taskname``,
        ``tm_start_time``, ``tm_collector``, ``tm_schedd``, ``tm_username`` and
        ``user_groups`` are read) and ``tempDir`` (the directory to change into
        while submitting).
    :return: ``Result.Result(task=..., result='OK')``.
    :raises Exception: when the ``htcondor`` module object is falsy; any
        exception raised by ``self.submitDirect`` is re-raised after a removal
        request for the task's jobs has been sent to the schedd.
    :raises TaskWorkerException: (with ``retry=True``) when the schedd cannot be
        located or contacted, or when no CMS group could be mapped.
    """
    if not htcondor:
        raise Exception("Unable to import HTCondor module")

    task = kwargs['task']
    workflow = task['tm_taskname']

    cwd = os.getcwd()
    os.chdir(kwargs['tempDir'])
    # Everything after the chdir runs under try/finally so that the original
    # working directory is restored even if preparing ``info`` fails.
    try:
        info['start_time'] = task['tm_start_time']
        info['inputFilesString'] = ", ".join(inputFiles + ['subdag.ad'])
        outputFiles = ["RunJobs.dag.dagman.out", "RunJobs.dag.rescue.001"]
        info['outputFilesString'] = ", ".join(outputFiles)
        arg = "RunJobs.dag"

        info['resthost'] = '"%s"' % (self.crabserver.server['host'])
        info['dbinstance'] = '"%s"' % (self.crabserver.getDbInstance())

        info['remote_condor_setup'] = ''
        if task['tm_collector']:
            self.backendurls['htcondorPool'] = task['tm_collector']
        loc = HTCondorLocator.HTCondorLocator(self.backendurls)
        address = ""
        schedd = ""
        try:
            self.logger.debug("Getting schedd object")
            schedd, address = loc.getScheddObjNew(task['tm_schedd'])
            self.logger.debug("Got schedd object")
        except Exception as exp:
            msg = "The CRAB server backend was not able to contact the Grid scheduler."
            msg += " Please try again later."
            msg += " Message from the scheduler: %s" % (str(exp))
            self.logger.exception("%s: %s", workflow, msg)
            raise TaskWorkerException(msg, retry=True)

        try:
            # Only checks that the schedd ad carries a "Machine" entry; the value is unused.
            dummyAddress = loc.scheddAd['Machine']
        except Exception:
            raise TaskWorkerException("Unable to get schedd address for task %s" % (task['tm_taskname']), retry=True)

        # Get location of schedd-specific environment script from schedd ad.
        info['remote_condor_setup'] = loc.scheddAd.get("RemoteCondorSetup", "")

        info["CMSGroups"] = set.union(CMSGroupMapper.map_user_to_groups(kwargs['task']['tm_username']), kwargs['task']['user_groups'])
        self.logger.info("User %s mapped to local groups %s.", kwargs['task']['tm_username'], info["CMSGroups"])
        if not info["CMSGroups"]:
            raise TaskWorkerException("CMSGroups can not be empty. Failing task %s" % (task['tm_taskname']), retry=True)

        self.logger.debug("Finally submitting to the schedd")
        if address:
            try:
                self.clusterId = self.submitDirect(schedd, 'dag_bootstrap_startup.sh', arg, info)
            except Exception as submissionError:
                msg = "Something went wrong: %s \n" % str(submissionError)
                # The assignment above did not happen, so the attribute may not
                # exist yet on this instance; do not fail inside the handler.
                clusterId = getattr(self, 'clusterId', None)
                if clusterId:
                    msg += "But a dagman_bootstrap was submitted with clusterId %s." % clusterId
                else:
                    msg += 'No clusterId was returned to DagmanSubmitter.'
                msg += " Clean up condor queue before trying again."
                self.logger.error(msg)
                constrain = 'crab_reqname=="%s"' % kwargs['task']['tm_taskname']
                constrain = str(constrain)  # beware unicode, it breaks htcondor binding
                self.logger.error("Sending: condor_rm -constrain '%s'", constrain)
                schedd.act(htcondor.JobAction.Remove, constrain)
                # raise again to communicate failure upstream; a bare raise keeps
                # the original traceback
                raise
        else:
            raise TaskWorkerException("Not able to get schedd address.", retry=True)
        self.logger.debug("Submission finished")
    finally:
        os.chdir(cwd)

    # clusterid is the condor cluster id of the dag_bootstrap.sh
    configreq = {'workflow': kwargs['task']['tm_taskname'],
                 'status': "SUBMITTED",
                 'subresource': 'success',
                 'clusterid': self.clusterId}
    self.logger.debug("Pushing information centrally %s", configreq)
    data = urllib.urlencode(configreq)
    self.crabserver.post(api='workflowdb', data=data)

    self.sendDashboardJobs(dashboardParams, info['apmon'])

    return Result.Result(task=kwargs['task'], result='OK')
def alter_submit(self, crab_retry):
    """
    Copy the content of the generic file Job.submit into a job-specific file
    Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
    Add also parameters that can be overwritten at each manual job resubmission
    (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).

    :param crab_retry: integer retry number of this job; used for CRAB_Retry,
        PostJobPrio2 and as key (as a string) in self.resubmit_info.

    Reads self.job_id (compared and formatted as an integer here), self.task_ad
    and self.resubmit_info; updates self.resubmit_info[str(crab_retry)]; reads
    "Job.submit" and, when present, "proxied_webdir"; writes
    "Job.<job_id>.submit" in the current directory.
    """
    ## Start the Job.<job_id>.submit content with the CRAB_Retry.
    new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
    msg = "Setting CRAB_Retry = %s" % (crab_retry)
    self.logger.info(msg)
    ## For the parameters that can be overwritten at each manual job resubmission,
    ## read them from the task ad, unless there is resubmission information there
    ## and this job is not one that has to be resubmitted, in which case we should
    ## use the same parameters (site black- and whitelists, requested memory, etc)
    ## as used by the previous job retry (which are saved in self.resubmit_info).
    CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
    use_resubmit_info = False
    if CRAB_ResubmitList_in_taskad:
        ## A value that cannot be turned into a set (e.g. a boolean) raises
        ## TypeError; in that case the task ad values are used.
        try:
            resubmit_jobids = set(self.task_ad['CRAB_ResubmitList'])
        except TypeError:
            resubmit_jobids = None
        if resubmit_jobids and self.job_id not in resubmit_jobids:
            use_resubmit_info = True
    ## If there is no resubmit_info, we can of course not use it.
    if not self.resubmit_info:
        use_resubmit_info = False
    ## Get the resubmission parameters.
    maxjobruntime = None
    maxmemory = None
    numcores = None
    priority = None
    if not use_resubmit_info:
        if 'MaxWallTimeMins' in self.task_ad:
            maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMins')))
        if 'RequestMemory' in self.task_ad:
            maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
        if 'RequestCpus' in self.task_ad:
            numcores = int(str(self.task_ad.lookup('RequestCpus')))
        if 'JobPrio' in self.task_ad:
            priority = int(str(self.task_ad['JobPrio']))
    else:
        ## Walk back to the most recent retry that has saved parameters.
        inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
        while inkey not in self.resubmit_info and int(inkey) > 0:
            inkey = str(int(inkey) - 1)
        ## If not even retry 0 has a record, leave all parameters unset instead
        ## of failing with a KeyError.
        previous = self.resubmit_info.get(inkey, {})
        maxjobruntime = previous.get('maxjobruntime')
        maxmemory = previous.get('maxmemory')
        numcores = previous.get('numcores')
        priority = previous.get('priority')
    ## Save the (new) values of the resubmission parameters in self.resubmit_info
    ## for the current job retry number.
    outkey = str(crab_retry)
    if outkey not in self.resubmit_info:
        self.resubmit_info[outkey] = {}
    self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
    self.resubmit_info[outkey]['maxmemory'] = maxmemory
    self.resubmit_info[outkey]['numcores'] = numcores
    self.resubmit_info[outkey]['priority'] = priority
    self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
    self.resubmit_info[outkey]['CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad
    ## Add the resubmission parameters to the Job.<job_id>.submit content.
    if maxjobruntime is not None:
        new_submit_text += '+MaxWallTimeMins = %s\n' % (str(maxjobruntime))
    if maxmemory is not None:
        new_submit_text += '+RequestMemory = %s\n' % (str(maxmemory))
    if numcores is not None:
        new_submit_text += '+RequestCpus = %s\n' % (str(numcores))
    if priority is not None:
        new_submit_text += '+JobPrio = %s\n' % (str(priority))

    ## Within the schedd, order the first few jobs in the task before all other tasks of the same priority.
    pre_job_prio = 1
    if self.job_id <= 5:
        pre_job_prio = 0
    new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio

    ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
    ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
    ## run the job with the higher PostJobPrio1.
    new_submit_text += '+PostJobPrio1 = -%s\n' % str(self.task_ad.lookup('QDate'))

    ## Order retries before all other jobs in this task
    new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry

    ## This is used to send to dashbord the location of the logfiles
    try:
        storage_rules = htcondor.param['CRAB_StorageRules']
    except Exception:
        ## Fall back to the built-in default when the setting cannot be read.
        storage_rules = "^/home/remoteGlidein,http://submit-5.t2.ucsd.edu/CSstoragePath"
    new_submit_text += '+CRAB_UserWebDir = "%s"\n' % getWebdirForDb(str(self.task_ad.get('CRAB_ReqName')), storage_rules)

    try:
        with open('proxied_webdir') as fd:
            proxied_webdir = fd.read()
    except IOError as e:
        self.logger.error("'I/O error(%s): %s', when looking for the proxied_webdir file. Might be normal"
                          " if the schedd does not have a proxiedurl in the REST external config.",
                          e.errno, e.strerror)
    else:
        new_submit_text += '+CRAB_UserWebDirPrx = "%s"\n' % proxied_webdir

    ## Add the site black- and whitelists and the DESIRED_SITES to the
    ## Job.<job_id>.submit content.
    new_submit_text = self.redo_sites(new_submit_text, crab_retry, use_resubmit_info)

    ## Add group information:
    username = self.task_ad.get('CRAB_UserHN')
    if 'CMSGroups' in self.task_ad:
        new_submit_text += '+CMSGroups = %s\n' % classad.quote(self.task_ad['CMSGroups'])
    elif username:
        groups = CMSGroupMapper.map_user_to_groups(username)
        if groups:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)

    ## Finally add (copy) all the content of the generic Job.submit file.
    with open("Job.submit", 'r') as fd:
        new_submit_text += fd.read()
    ## Write the Job.<job_id>.submit file.
    with open("Job.%d.submit" % (self.job_id), 'w') as fd:
        fd.write(new_submit_text)