def makeAds( config ):
    """Print JobRouter ads (to stdout) that rewrite DESIRED_Sites for tasks.

    Two rule kinds are produced from config['modifications']:
      * ReplaceSiteWhitelist: replace the task's site list outright.
      * AddWhitelist: collect overflow requests per site; a second pass emits
        one overflow rule per destination site, gated on the job's
        ExtDESIRED_Sites matching one of the origin sites listed in
        config['reversed_mapping'][site].
    """
    reversed_mapping = config['reversed_mapping']
    # site -> set of task names that asked for that site to be added
    needs_site = defaultdict(set)
    for workflow, tasks in config['modifications'].items():
        for taskname,specs in tasks.items():
            anAd = classad.ClassAd()
            anAd["GridResource"] = "condor localhost localhost"
            anAd["TargetUniverse"] = 5
            # Match jobs of this task by regexp on the subtask name.
            exp = 'regexp(target.WMAgent_SubTaskName, %s)'% classad.quote(str(taskname))
            anAd["Requirements"] = classad.ExprTree(str(exp))
            if "ReplaceSiteWhitelist" in specs:
                anAd["Name"] = str("Site Replacement for %s"% taskname)
                anAd["eval_set_DESIRED_Sites"] = str(",".join(specs['ReplaceSiteWhitelist']))
                print anAd
            elif "AddWhitelist" in specs:
                # Defer: overflow rules are grouped by destination site below.
                for site in specs['AddWhitelist']:
                    needs_site[site].add(taskname)
    for site in needs_site:
        # Only sites with a reverse mapping (origins allowed to overflow here).
        if not site in reversed_mapping: continue
        anAd = classad.ClassAd()
        anAd["GridResource"] = "condor localhost localhost"
        anAd["TargetUniverse"] = 5
        anAd["Name"] = str("Overflow rule to go to %s"%site)
        anAd["OverflowTasknames"] = map(str, needs_site[site])
        #exp = classad.ExprTree('regexp(%s, ExtDESIRED_Sites) && member(target.WMAgent_SubTaskName, OverflowTasknames)' % classad.quote(str(site)))
        # Job must belong to one of the collected tasks AND already target one
        # of the origin sites that map to this overflow destination.
        exprs = ['regexp(%s, ExtDESIRED_Sites)'% classad.quote(str(origin)) for origin in reversed_mapping[site]]
        exp = classad.ExprTree('member(target.WMAgent_SubTaskName, OverflowTasknames) && ( %s )' % str("||".join( exprs )))
        anAd["Requirements"] = classad.ExprTree(str(exp))
        #anAd["eval_set_DESIRED_Sites"] = classad.Function("strcat", str(",".join( reversed_mapping[site]+[''] )), classad.Attribute("ExtDESIRED_Sites"))
        # Prepend the overflow site to the job's existing ExtDESIRED_Sites.
        anAd["eval_set_DESIRED_Sites"] = classad.Function("strcat", str(site), classad.Attribute("ExtDESIRED_Sites"))
        print anAd
def getSubmitFileAdditions(resource_ad):
    """Returns additions to a submit file (as list of strings) to make the
    submitted job match the given resource"""
    global _logger
    result = []
    if 'grid_resource' in resource_ad:
        result.append('+GridResource = %s' % classad.quote(resource_ad['grid_resource']))
    if 'Transform' not in resource_ad:
        return result
    transform_ad = resource_ad['Transform']
    # Collect the three supported transform kinds separately so they can be
    # emitted in a fixed order: copy, set, eval_set.
    sets = []
    copies = []
    eval_sets = []
    for key, value in transform_ad.iteritems():
        if key.startswith('set_'):
            sets.append('+%s = %s' % (key.replace('set_', '', 1), value))
        elif key.startswith('copy_'):
            if value in resource_ad:
                copies.append('+%s = %s' % (key.replace('copy_', '', 1), resource_ad[value]))
            else:
                _logger.warning("Ignoring '%s': '%s' missing from Resource Ad" % (key, value))
        elif key.startswith('eval_set_'):
            # eval_set transforms are evaluated in the context of the resource ad
            eval_sets.append('+%s = %s' % (key.replace('eval_set_', '', 1), resource_ad.eval(value)))
        elif key.startswith('delete_'):
            _logger.warning("Ignoring '%s': 'delete' transforms not supported", key)
        else:
            _logger.warning("Ignoring '%s': unknown transform type", key)
    return result + copies + sets + eval_sets
def updateSiteInformation(self, jobs, siteName, excludeSite):
    """
    _updateSiteInformation_

    Allow or disallow jobs to run at a site.
    Called externally by Ops scripts if a site enters or leaves Down, Draining or Aborted.
    Kill job if after removing site from allowed sites it has nowhere to run.

    Parameters:    excludeSite = False when moving to Normal
                   excludeSite = True when moving to Down, Draining or Aborted

    Returns the list of jobs that must be killed (no site left to run at).
    """
    schedd = htcondor.Schedd()
    jobtokill = []
    try:
        # Only idle (JobStatus == 1) jobs owned by this agent are candidates.
        itobj = schedd.xquery('WMAgent_AgentName =?= %s && JobStatus =?= 1' % classad.quote(self.agent),
                              ['ClusterId', 'ProcId', 'DESIRED_Sites', 'ExtDESIRED_Sites'])
    except Exception as ex:
        logging.error("Failed to query condor schedd.")
        logging.exception(ex)
        return jobtokill
    else:
        # Index the condor ads by "ClusterId.ProcId" so each WMBS job can be
        # matched to its ad via job['gridid'].
        jobInfo = {}
        for jobAd in itobj:
            gridId = "%s.%s" % (jobAd['ClusterId'], jobAd['ProcId'])
            jobInfo[gridId] = jobAd
        for job in jobs:
            jobAd = jobInfo.get(job['gridid'], None)
            if jobAd:
                desiredSites = jobAd.get('DESIRED_Sites').split(',')
                extDesiredSites = jobAd.get('ExtDESIRED_Sites').split(',')
                if excludeSite:
                    # Remove siteName from DESIRED_Sites if job has it
                    if siteName in desiredSites:
                        if len(desiredSites) > 1:
                            desiredSites.remove(siteName)
                            desiredSites = ','.join(desiredSites)
                            try:
                                schedd.edit([job['gridid']], 'DESIRED_Sites', classad.ExprTree('"%s"' % desiredSites))
                            except Exception as ex:
                                logging.error("Failed to edit sites for job %s" % job['gridid'])
                                logging.exception(ex)
                        else:
                            # siteName was the only allowed site: nowhere to run
                            jobtokill.append(job)
                else:
                    # Add siteName to DESIRED_Sites if ExtDESIRED_Sites has it (moving back to Normal)
                    if siteName not in desiredSites and siteName in extDesiredSites:
                        desiredSites.append(siteName)
                        desiredSites = ','.join(sorted(desiredSites))
                        try:
                            schedd.edit([job['gridid']], 'DESIRED_Sites', classad.ExprTree('"%s"' % desiredSites))
                        except Exception as ex:
                            logging.error("Failed to edit sites for job %s" % job['gridid'])
                            logging.exception(ex)
    return jobtokill
def getClassAds(self):
    """
    _getClassAds_

    Grab CONDOR classAds using CONDOR-PYTHON

    This looks at the schedd running on the
    Submit-Host and edit/remove jobs

    Returns (jobInfo dict keyed by WMAgent_JobID, schedd handle),
    or (None, None) if the schedd query failed.
    """
    jobInfo = {}
    schedd = condor.Schedd()
    try:
        logging.debug("Start: Retrieving classAds using Condor Python XQuery")
        itobj = schedd.xquery(
            'WMAgent_JobID =!= "UNDEFINED" && WMAgent_AgentName == %s' % classad.quote(str(self.agent)),
            ["JobStatus", "EnteredCurrentStatus", "JobStartDate", "QDate", "DESIRED_Sites",
             "ExtDESIRED_Sites", "MATCH_EXP_JOBGLIDEIN_CMSSite", "WMAgent_JobID"]
        )
        logging.debug("Finish: Retrieving classAds using Condor Python XQuery")
    except:
        msg = "Query to condor schedd failed in PyCondorPlugin"
        logging.debug(msg)
        return None, None
    else:
        # Consume the query iterator in slices of 1000 ads at a time.
        for slicedAds in grouper(itobj, 1000):
            for jobAd in slicedAds:
                ### This condition ignores jobs that are Removed, but stay in the X state
                ### For manual condor_rm removal, job wont be in the queue \
                ### and status of the jobs will be read from condor log
                if jobAd["JobStatus"] == 3:
                    continue
                else:
                    ## For some strange race condition, schedd sometimes does not publish StartDate for a Running Job
                    ## Get the entire classad for such a job
                    ## Do not crash WMA, wait for next polling cycle to get all the info.
                    if jobAd["JobStatus"] == 2 and jobAd.get("JobStartDate") is None:
                        logging.debug("THIS SHOULD NOT HAPPEN. JobStartDate is MISSING from the CLASSAD.")
                        logging.debug("Could be caused by some race condition. Wait for the next Polling Cycle")
                        logging.debug("%s", str(jobAd))
                        continue
                    tmpDict = {}
                    tmpDict["JobStatus"] = int(jobAd.get("JobStatus", 100))
                    tmpDict["stateTime"] = int(jobAd["EnteredCurrentStatus"])
                    tmpDict["runningTime"] = int(jobAd.get("JobStartDate", 0))
                    tmpDict["submitTime"] = int(jobAd["QDate"])
                    tmpDict["DESIRED_Sites"] = jobAd["DESIRED_Sites"]
                    tmpDict["ExtDESIRED_Sites"] = jobAd["ExtDESIRED_Sites"]
                    tmpDict["runningCMSSite"] = jobAd.get("MATCH_EXP_JOBGLIDEIN_CMSSite", None)
                    tmpDict["WMAgentID"] = int(jobAd["WMAgent_JobID"])
                    jobInfo[tmpDict["WMAgentID"]] = tmpDict
        logging.info("Retrieved %i classAds", len(jobInfo))
    return jobInfo, schedd
def get_schedd_ads(environ):
    """Return a list of schedd classads for the pool/name configured in environ.

    With no pool configured, locate the local schedd; with a pool but no name,
    return every schedd in the pool; otherwise return the single named schedd.
    """
    pool = _get_pool(environ)
    coll = htcondor.Collector(pool)
    if not pool:
        return [coll.locate(htcondor.DaemonTypes.Schedd)]
    name = _get_name(environ)
    if not name:
        return coll.query(htcondor.AdTypes.Schedd, "true")
    matches = coll.query(htcondor.AdTypes.Schedd, "Name=?=%s" % classad.quote(name))
    return [matches[0]]
def killWorkflowJobs(self, workflow):
    """
    _killWorkflowJobs_

    Kill all the jobs belonging to a specific workflow.
    """
    logging.debug("Going to remove all the jobs for workflow %s", workflow)
    schedd = condor.Schedd()
    constraint = 'WMAgent_RequestName == %s' % classad.quote(str(workflow))
    schedd.act(condor.JobAction.Remove, constraint)
    return
def updateJobInformation(self, workflow, task, **kwargs):
    """
    _updateJobInformation_

    Update job information for all jobs in the workflow and task,
    the change will take effect if the job is Idle or becomes idle.

    The currently supported changes are only priority for which both the task (taskPriority)
    and workflow priority (requestPriority) must be provided.
    """
    sd = condor.Schedd()
    if 'taskPriority' in kwargs and 'requestPriority' in kwargs:
        # Combined priority: request-level priority plus a task-level offset.
        priority = (int(kwargs['requestPriority']) +
                    int(kwargs['taskPriority'] * self.maxTaskPriority))
        # BUG FIX: the constraint previously interpolated classad.Literal(...)
        # into %d, which raises TypeError (swallowed by the except clause), so
        # the priority edit silently never happened. Interpolate the int.
        constraint = ('WMAgent_JobID =!= "UNDEFINED" && WMAgent_SubTaskName == %s'
                      ' && WMAgent_RequestName == %s && JobPrio != %d'
                      % (classad.quote(str(task)), classad.quote(str(workflow)), int(priority)))
        try:
            sd.edit(constraint, "JobPrio", classad.Literal(int(priority)))
        except Exception:
            msg = "Couldn\'t edit classAd to change job Priority for WMAgent_SubTaskName=%s, WMAgent_RequestName=%s " % (classad.quote(str(task)), classad.quote(str(workflow)))
            logging.debug(msg)
    return
def makeReleaseAds(config): """ Create a set of rules to release a task to match """ for task,where in config.get('release',{}).items(): anAd = classad.ClassAd() anAd["Name"] = str("Releasing task %s"%(task)) anAd["GridResource"] = "condor localhost localhost" exp = '(HasBeenSetHeld is true) && (target.WMAgent_SubTaskName =?= %s)' % classad.quote(str(task)) anAd["Requirements"] = classad.ExprTree(str(exp)) anAd["copy_Held_DESIRED_Sites"] = "DESIRED_Sites" anAd["set_HasBeenRouted"] = False anAd["set_HasBeenSetHeld"] = False print anAd
def main():
    """Count idle jobs per local user matching a site, across one or more pools.

    Reads a user list from a file (one name per line, '#' comments), queries
    every schedd in every configured pool, and accumulates idle-job counts in
    idle_count plus a user -> x509 proxy subject map in user_map.
    """
    opts = parse_opts()
    users = set()
    # Load the local user whitelist; '*' means "count every user seen".
    for line in open(opts.local_users):
        line = line.strip()
        if line.startswith("#"): continue
        users.add(line)
    collectors = set()
    for pool in opts.pool:
        coll = htcondor.Collector(pool)
        collectors.add(coll)
        if not opts.quiet:
            print >> sys.stderr, "Querying collector %s for schedds matching" % pool, opts.const
    # Idle jobs whose site whitelist includes the requested site.
    reqs = '(JobStatus == 1) && stringListMember(%s, DESIRED_Sites)' % classad.quote(opts.site)
    idle_count = {}
    for user in users:
        if user == "*": continue
        idle_count.setdefault(user, 0)
    user_map = {}
    if not opts.quiet:
        print >> sys.stderr, "Schedd job requirements:", reqs
    for coll in collectors:
        for schedd_ad in coll.query(htcondor.AdTypes.Schedd, opts.const, ['MyAddress', 'CondorVersion', 'Name', 'ScheddIpAddr']):
            if not opts.quiet:
                print >> sys.stderr, "Querying", schedd_ad.get('Name', "Unknown")
            schedd = htcondor.Schedd(schedd_ad)
            try:
                if opts.jobs_only:
                    # Per-job query: count one per ad.
                    schedd_data = schedd.xquery(requirements=reqs, projection=["x509userproxysubject", "CRAB_UserHN", "JobStatus"])
                else:
                    # AutoCluster query: ads carry a JobCount attribute instead.
                    schedd_data = schedd.xquery(requirements=reqs, projection=["x509userproxysubject", "CRAB_UserHN", "JobStatus"], opts=htcondor.QueryOpts.AutoCluster)
            except RuntimeError, e:
                if not opts.quiet:
                    print >> sys.stderr, "Error querying %s: %s" % (schedd_ad.get('Name', "Unknown"), e)
            if not opts.jobs_only:
                for cluster in schedd_data:
                    user = cluster.get("CRAB_UserHN")
                    if (user in users) or ("*" in users):
                        idle_count.setdefault(user, 0)
                        idle_count[user] += int(cluster.get("JobCount", 0))
                        if 'x509userproxysubject' in cluster:
                            user_map[user] = cluster['x509userproxysubject']
            if opts.jobs_only:
                for job in schedd_data:
                    user = job.get("CRAB_UserHN")
                    if (user in users) or ("*" in users):
                        idle_count.setdefault(user, 0)
                        idle_count[user] += 1
                        if 'x509userproxysubject' in job:
                            user_map[user] = job['x509userproxysubject']
def updateJobInformation(self, workflow, task, **kwargs):
    """
    _updateJobInformation_

    Update job information for all jobs in the workflow and task,
    the change will take effect if the job is Idle or becomes idle.

    The currently supported changes are only priority for which both the task (taskPriority)
    and workflow priority (requestPriority) must be provided.
    """
    schedd = htcondor.Schedd()
    if 'taskPriority' not in kwargs or 'requestPriority' not in kwargs:
        return
    newPriority = int(kwargs['requestPriority']) + int(kwargs['taskPriority'] * self.maxTaskPriority)
    try:
        # Build the constraint piece by piece: this task, this workflow,
        # and only jobs whose priority actually differs.
        clauses = [
            "WMAgent_SubTaskName =?= %s" % classad.quote(str(task)),
            "WMAgent_RequestName =?= %s" % classad.quote(str(workflow)),
            "JobPrio =!= %d" % newPriority,
        ]
        schedd.edit(" && ".join(clauses), 'JobPrio', classad.Literal(newPriority))
    except Exception as ex:
        logging.error("Failed to update JobPrio for WMAgent_SubTaskName=%s", task)
        logging.exception(ex)
    return
def killWorkflowJobs(self, workflow):
    """
    _killWorkflowJobs_

    Kill all the jobs belonging to a specific workflow.
    """
    logging.info("Going to remove all the jobs for workflow %s", workflow)
    schedd = htcondor.Schedd()
    try:
        schedd.act(htcondor.JobAction.Remove,
                   "WMAgent_RequestName == %s" % classad.quote(str(workflow)))
    except RuntimeError:
        # logging.warn is a deprecated alias for logging.warning
        logging.warning("Error while killing jobs on the schedd: WMAgent_RequestName=%s", workflow)
    return
def executeAll(self, joblist=None, attributes=None, values=None):
    """
    Given equal sized lists of job ids, attributes and values,
    executes in one large transaction a single qedit for each job.

    Raises QueryError on mismatched argument lengths or on any schedd failure.
    """
    global disk_cache
    joblist = joblist or []
    attributes = attributes or []
    values = values or []
    if not (len(joblist) == len(attributes) == len(values)):
        raise QueryError(
            "Arguments to QEdit.executeAll should have the same length")
    try:
        htcondor_full_reload()
        if self.pool_name:
            collector = htcondor.Collector(str(self.pool_name))
        else:
            collector = htcondor.Collector()
        if self.schedd_name:
            # The schedd location ad is cached on disk to avoid a collector
            # round-trip on every call.
            schedd_ad = disk_cache.get(self.schedd_name + '.locate')
            if schedd_ad is None:
                schedd_ad = collector.locate(htcondor.DaemonTypes.Schedd, self.schedd_name)
                disk_cache.save(self.schedd_name + '.locate', schedd_ad)
            schedd = htcondor.Schedd(schedd_ad)
        else:
            schedd = htcondor.Schedd()
        # All edits commit (or roll back) together in one transaction.
        with schedd.transaction() as _:
            for jobid, attr, val in zip(joblist, attributes, values):
                schedd.edit([jobid], attr, classad.quote(val))
    except Exception as ex:
        s = 'default'
        if self.schedd_name is not None:
            s = self.schedd_name
        p = 'default'
        if self.pool_name is not None:
            p = self.pool_name
        try:
            # jobid/attr/val are unbound if the failure happened before the
            # edit loop started; fall back to 'unknown' in that case.
            j1 = jobid
            j2 = attr
            j3 = val
        except:
            j1 = j2 = j3 = 'unknown'
        err_str = 'Error querying schedd %s in pool %s using python bindings (qedit of job/attr/val %s/%s/%s): %s' % (
            s, p, j1, j2, j3, ex)
        raise QueryError(err_str)
def getClassAds(self):
    """
    _getClassAds_

    Grab CONDOR classAds using CONDOR-PYTHON

    This looks at the schedd running on the
    Submit-Host and edit/remove jobs

    Returns (jobInfo dict keyed by WMAgent_JobID, schedd handle),
    or (None, None) if the schedd query failed.
    """
    jobInfo = {}
    schedd = condor.Schedd()
    try:
        logging.debug("Start: Retrieving classAds using Condor Python XQuery")
        itobj = schedd.xquery(
            'WMAgent_JobID =!= "UNDEFINED" && WMAgent_AgentName == %s' % classad.quote(str(self.agent)),
            ["JobStatus", "EnteredCurrentStatus", "JobStartDate", "QDate", "DESIRED_Sites",
             "ExtDESIRED_Sites", "MATCH_EXP_JOBGLIDEIN_CMSSite", "WMAgent_JobID"]
        )
        logging.debug("Finish: Retrieving classAds using Condor Python XQuery")
    except:
        msg = "Query to condor schedd failed in PyCondorPlugin"
        logging.error(msg)
        return None, None
    else:
        for jobAd in itobj:
            ### This condition ignores jobs that are Removed, but stay in the X state
            ### For manual condor_rm removal, job wont be in the queue \
            ### and status of the jobs will be read from condor log
            if jobAd["JobStatus"] == 3:
                continue
            else:
                tmpDict = {}
                tmpDict["JobStatus"] = int(jobAd.get("JobStatus", 100))
                tmpDict["stateTime"] = int(jobAd["EnteredCurrentStatus"])
                tmpDict["runningTime"] = int(jobAd.get("JobStartDate", 0))
                tmpDict["submitTime"] = int(jobAd["QDate"])
                tmpDict["DESIRED_Sites"] = jobAd["DESIRED_Sites"]
                tmpDict["ExtDESIRED_Sites"] = jobAd["ExtDESIRED_Sites"]
                tmpDict["runningCMSSite"] = jobAd.get("MATCH_EXP_JOBGLIDEIN_CMSSite", None)
                tmpDict["WMAgentID"] = int(jobAd["WMAgent_JobID"])
                jobInfo[tmpDict["WMAgentID"]] = tmpDict
        logging.info("Retrieved %i classAds", len(jobInfo))
    return jobInfo, schedd
def shared_submit_descriptors(unique_id=None, requirements=None):
    """Return the submit-descriptor dict shared by all transfer jobs."""
    if requirements is None:
        requirements = "true"
    # UniqueID is quoted for ClassAd use; empty string when not provided.
    uid = classad.quote(unique_id) if unique_id is not None else ''
    return {
        "executable": THIS_FILE.as_posix(),
        "My.Is_Transfer_Job": "true",
        "My.WantFlocking": "true",
        "keep_claim_idle": "300",
        "request_disk": "1GB",
        "requirements": requirements,
        "My.UniqueID": "{}".format(uid),
    }
def shared_submit_descriptors(
    executable: Optional[Path] = None,
    unique_id: Optional[str] = None,
    requirements: Optional[str] = None,
) -> Dict[str, str]:
    """Return the submit-descriptor dict shared by all transfer jobs.

    Defaults the executable to this script itself when none is given.
    """
    exe = THIS_FILE if executable is None else executable
    descriptors = {
        "executable": exe.as_posix(),
        "keep_claim_idle": "300",
        "request_disk": "1GB",
        "requirements": requirements or "true",
        "My.Is_Transfer_Job": "true",
        "My.WantFlocking": "true",
        # special attribute for the CHTC pool, not necessary at other sites
        "My.UniqueID": classad.quote(unique_id) if unique_id else "",
    }
    return descriptors
def makeAds(config):
    """Print JobRouter ads implementing site replacement and per-site overflow.

    Replacement rules fire once per task (guarded by HasBeenReplaced); overflow
    rules fire once per (task set, site) pair (guarded by HasBeenRouted_<site>)
    and prepend the overflow site to the job's previous site list.
    """
    reversed_mapping = config['reversed_mapping']
    # site -> set of task names that asked for that site to be added
    needs_site = defaultdict(set)
    for workflow, tasks in config['modifications'].items():
        for taskname,specs in tasks.items():
            anAd = classad.ClassAd()
            anAd["GridResource"] = "condor localhost localhost"
            anAd["TargetUniverse"] = 5
            exp = '(HasBeenReplaced isnt true) && (target.WMAgent_SubTaskName =?= %s)' % classad.quote(str(taskname))
            anAd["Requirements"] = classad.ExprTree(str(exp))
            if "ReplaceSiteWhitelist" in specs:
                anAd["Name"] = str("Site Replacement for %s"% taskname)
                #if ("T2_CH_CERN_HLT" in specs['ReplaceSiteWhitelist']) and not g_is_cern: specs['ReplaceSiteWhitelist'].remove("T2_CH_CERN_HLT")
                anAd["eval_set_DESIRED_Sites"] = str(",".join(specs['ReplaceSiteWhitelist']))
                # Prefer slots at a site the job already listed in ExtDESIRED_Sites.
                anAd['set_Rank'] = classad.ExprTree("stringlistmember(GLIDEIN_CMSSite, ExtDESIRED_Sites)")
                anAd["set_HasBeenReplaced"] = True
                anAd["set_HasBeenRouted"] = False
                print anAd
            elif "AddWhitelist" in specs:
                for site in specs['AddWhitelist']:
                    needs_site[site].add(taskname)
    for site in needs_site:
        if not site in reversed_mapping: continue
        #if site == "T2_CH_CERN_HLT" and not g_is_cern: continue
        anAd = classad.ClassAd()
        anAd["GridResource"] = "condor localhost localhost"
        anAd["TargetUniverse"] = 5
        anAd["Name"] = str("Overflow rule to go to %s"%site)
        # ClassAds trick: assign the task list to get its properly-escaped
        # ClassAd list representation, then drop the attribute again
        # (ClassAd keys are case-insensitive, so the del below matches).
        anAd["OverflowTasknames"] = map(str, needs_site[site])
        overflow_names_escaped = anAd.lookup('OverflowTasknames').__repr__()
        del anAd['OverflowTaskNames']
        # Job must already target one of the origin sites mapped to this one,
        # and must not have been routed to this site before.
        exprs = ['regexp(%s, target.ExtDESIRED_Sites)'% classad.quote(str(origin)) for origin in reversed_mapping[site]]
        exp = classad.ExprTree('member(target.WMAgent_SubTaskName, %s) && ( %s ) && (target.HasBeenRouted_%s =!= true)' % (overflow_names_escaped, str("||".join( exprs )), str(site)))
        anAd["Requirements"] = classad.ExprTree(str(exp))
        anAd["copy_DESIRED_Sites"] = "Prev_DESIRED_Sites"
        # Prepend the overflow site; use sortStringSet when the function exists
        # on this version of condor, plain strcat otherwise.
        anAd["eval_set_DESIRED_Sites"] = classad.ExprTree('ifThenElse(sortStringSet("") isnt error, sortStringSet(strcat(%s, ",", Prev_DESIRED_Sites)), strcat(%s, ",", Prev_DESIRED_Sites))' % (classad.quote(str(site)), classad.quote(str(site))))
        anAd['set_Rank'] = classad.ExprTree("stringlistmember(GLIDEIN_CMSSite, ExtDESIRED_Sites)")
        anAd['set_HasBeenRouted'] = False
        anAd['set_HasBeenRouted_%s' % str(site)] = True
        print anAd
def makeHoldAds(config): """ Create a set of rules to hold a task from matching """ for task,where in config.get('hold',{}).items(): # task is the task name # where is either an empty list=all sites, or a list of sites (not implemented) anAd = classad.ClassAd() anAd["Name"] = str("Holding task %s from %s"%(task, where)) anAd["GridResource"] = "condor localhost localhost" anAd["TargetUniverse"] = 5 exp = '(HasBeenSetHeld isnt true) && (target.WMAgent_SubTaskName =?= %s)' % classad.quote(str(task)) anAd["Requirements"] = classad.ExprTree(str(exp)) ## we use the site whitelist to prevent matching anAd["copy_DESIRED_Sites"] = "Held_DESIRED_Sites" anAd["set_DESIRED_Sites"] = "T2_NW_NOWHERE" anAd["set_HasBeenRouted"] = False anAd["set_HasBeenSetHeld"] = True print anAd
def killWorkflowJobs(self, workflow):
    """
    _killWorkflowJobs_

    Kill all the jobs belonging to a specific workflow.
    """
    logging.info("Going to remove all the jobs for workflow %s", workflow)
    schedd = htcondor.Schedd()
    try:
        schedd.act(htcondor.JobAction.Remove,
                   "WMAgent_RequestName == %s" % classad.quote(workflow))
    except RuntimeError:
        # logging.warn is a deprecated alias for logging.warning
        logging.warning(
            "Error while killing jobs on the schedd: WMAgent_RequestName=%s", workflow)
    return
def makeHoldAds(config): """ Create a set of rules to hold a task from matching """ for task, where in config.get('hold', {}).items(): # task is the task name # where is either an empty list=all sites, or a list of sites (not implemented) anAd = classad.ClassAd() anAd["Name"] = str("Holding task %s from %s" % (task, where)) anAd["GridResource"] = "condor localhost localhost" anAd["TargetUniverse"] = 5 exp = '(HasBeenSetHeld isnt true) && (target.WMAgent_SubTaskName =?= %s)' % classad.quote( str(task)) anAd["Requirements"] = classad.ExprTree(str(exp)) ## we use the site whitelist to prevent matching anAd["copy_DESIRED_Sites"] = "Held_DESIRED_Sites" anAd["set_DESIRED_Sites"] = "T2_NW_NOWHERE" anAd["set_HasBeenRouted"] = False anAd["set_HasBeenSetHeld"] = True print anAd
def executeAll(self, joblist=None, attributes=None, values=None):
    """
    Given equal sized lists of job ids, attributes and values,
    executes in one large transaction a single qedit for each job.

    Raises QueryError on mismatched argument lengths or on any schedd failure.
    """
    joblist = joblist or []
    attributes = attributes or []
    values = values or []
    if not (len(joblist) == len(attributes) == len(values)):
        raise QueryError("Arguments to QEdit.executeAll should have the same length")
    try:
        htcondor.reload_config()
        if self.pool_name:
            collector = htcondor.Collector(str(self.pool_name))
        else:
            collector = htcondor.Collector()
        if self.schedd_name:
            schedd_ad = collector.locate(htcondor.DaemonTypes.Schedd, self.schedd_name)
            schedd = htcondor.Schedd(schedd_ad)
        else:
            schedd = htcondor.Schedd()
        # All edits commit (or roll back) together in one transaction.
        with schedd.transaction() as _:
            for jobid, attr, val in zip(joblist, attributes, values):
                schedd.edit([jobid], attr, classad.quote(val))
    except Exception as ex:
        s = 'default'
        if self.schedd_name is not None:
            s = self.schedd_name
        p = 'default'
        if self.pool_name is not None:
            p = self.pool_name
        try:
            # jobid/attr/val are unbound if the failure happened before the
            # edit loop started; fall back to 'unknown' in that case.
            j1 = jobid
            j2 = attr
            j3 = val
        except:
            j1 = j2 = j3 = 'unknown'
        err_str = 'Error querying schedd %s in pool %s using python bindings (qedit of job/attr/val %s/%s/%s): %s' % (s, p, j1, j2, j3, ex)
        raise QueryError(err_str)
def updateClassAd(collector, daemon, name, statistics='All:2', direct=False):
    """Returns an updated ClassAd from a HTCondor daemon.

    Returns a (classad, count) tuple; exits the process if a regular query
    returns no ads.
    """
    query_args = {'statistics': statistics}
    if direct:
        # Direct query: target one specific daemon by type and name;
        # always yields exactly one ClassAd.
        query_args['daemon_type'] = htcondor.DaemonTypes.names[daemon]
        query_args['name'] = name
        return (collector.directQuery(**query_args), 1)
    # Regular query: ask the collector, constrained to the hostname if given.
    query_args['ad_type'] = htcondor.AdTypes.names[daemon]
    if name:
        query_args['constraint'] = 'Name =?= {0}'.format(classad.quote(name))
    else:
        name = '(unspecified)'
    ads = collector.query(**query_args)
    # Multiple ClassAds can come back, e.g. duplicate daemons on one host;
    # zero means the daemon was not found at all.
    if len(ads) == 0:
        sys.stderr.write(
            ('Error: Received {0} ClassAds from the {1} named {2} '
             'from the Collector at {3}.\n').format(
                 len(ads), daemon, name, POOL))
        sys.exit(1)
    return (ads[0], len(ads))
def submitDirect(self, schedd, cmd, arg, info): #pylint: disable=R0201
    """ Submit directly to the schedd using the HTCondor module.

    Builds the DAG root-job ClassAd from *info*, dumps it to subdag.ad,
    then submits and spools inside an authenticated child process,
    reading the pickled result back over a pipe.

    Returns the condor ClusterId; raises TaskWorkerException on failure.
    """
    dagAd = classad.ClassAd()
    addCRABInfoToClassAd(dagAd, info)
    if info["CMSGroups"]:
        dagAd["CMSGroups"] = ','.join(info["CMSGroups"])
    else:
        dagAd["CMSGroups"] = classad.Value.Undefined
    # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
    dagAd["CRAB_Attempt"] = 0
    # We switched from local to scheduler universe. Why? It seems there's no way in the
    # local universe to change the hold signal at runtime. That's fairly important for our
    # resubmit implementation.
    #dagAd["JobUniverse"] = 12
    dagAd["JobUniverse"] = 7
    dagAd["HoldKillSig"] = "SIGUSR1"
    dagAd["X509UserProxy"] = info['user_proxy']
    dagAd["Requirements"] = classad.ExprTree('true || false')
    dagAd["Environment"] = classad.ExprTree('strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")' % " ".join(info['additional_environment_options'].split(";")))
    dagAd["RemoteCondorSetup"] = info['remote_condor_setup']
    dagAd["CRAB_TaskSubmitTime"] = classad.ExprTree("%s" % info["start_time"].encode('ascii', 'ignore'))
    dagAd['CRAB_TaskLifetimeDays'] = TASKLIFETIME // 24 // 60 // 60
    dagAd['CRAB_TaskEndTime'] = int(info["start_time"]) + TASKLIFETIME
    #For task management info see https://github.com/dmwm/CRABServer/issues/4681#issuecomment-302336451
    dagAd["LeaveJobInQueue"] = classad.ExprTree("true")
    dagAd["PeriodicHold"] = classad.ExprTree("time() > CRAB_TaskEndTime")
    dagAd["TransferOutput"] = info['outputFilesString']
    dagAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")
    dagAd["OnExitRemove"] = classad.ExprTree("( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))")
    dagAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
    dagAd["RemoveKillSig"] = "SIGUSR1"
    # Dump the ad (submit-file syntax) for later resubmission/debugging.
    with open('subdag.ad' ,'w') as fd:
        for k, v in dagAd.items():
            if k == 'X509UserProxy':
                # Only the basename makes sense outside this host.
                v = os.path.basename(v)
            if isinstance(v, basestring):
                value = classad.quote(v)
            elif isinstance(v, classad.ExprTree):
                value = repr(v)
            elif isinstance(v, list):
                value = "{{{0}}}".format(json.dumps(v)[1:-1])
            else:
                value = v
            fd.write('+{0} = {1}\n'.format(k, value))
    dagAd["TaskType"] = "ROOT"
    dagAd["Out"] = str(os.path.join(info['scratch'], "request.out"))
    dagAd["Err"] = str(os.path.join(info['scratch'], "request.err"))
    dagAd["Cmd"] = cmd
    dagAd['Args'] = arg
    dagAd["TransferInput"] = str(info['inputFilesString'])
    condorIdDict = {}
    # The child process (parent == False) runs with the user proxy and does
    # the actual submit; results come back pickled over rpipe.
    with HTCondorUtils.AuthenticatedSubprocess(info['user_proxy'], pickleOut=True, outputObj=condorIdDict, logger=self.logger) as (parent, rpipe):
        if not parent:
            resultAds = []
            condorIdDict['ClusterId'] = schedd.submit(dagAd, 1, True, resultAds)
            schedd.spool(resultAds)
            # editing the LeaveJobInQueue since the remote submit overwrites it
            # see https://github.com/dmwm/CRABServer/pull/5212#issuecomment-216519749
            if resultAds:
                id_ = "%s.%s" % (resultAds[0]['ClusterId'], resultAds[0]['ProcId'])
                schedd.edit([id_], "LeaveJobInQueue", classad.ExprTree("true"))
    try:
        results = pickle.load(rpipe)
    except EOFError:
        #Do not want to retry this since error may happen after submit (during edit for example).
        #And this can cause the task to be submitted twice (although we have a protection in the duplicatedCheck)
        raise TaskWorkerException("Timeout executing condor submit command.", retry=False)
    #notice that the clusterId might be set even if there was a failure. This is if the schedd.submit succeded, but the spool call failed
    if 'ClusterId' in results.outputObj:
        self.logger.debug("Condor cluster ID just submitted is: %s", results.outputObj['ClusterId'])
    if results.outputMessage != "OK":
        self.logger.debug("Now printing the environment used for submission:\n" + "-"*70 + "\n" + results.environmentStr + "-"*70)
        raise TaskWorkerException("Failure when submitting task to scheduler. Error reason: '%s'" % results.outputMessage, retry=True)
    #if we don't raise exception above the id is here
    return results.outputObj['ClusterId']
def main():
    """Command dispatcher for the transfer tool.

    Subcommands: sync (submit the outer DAG), generate (file listing),
    write_subdag (inner transfer DAG), exec, verify, verify_remote, analyze.
    """
    logging.basicConfig(format="%(asctime)s %(message)s", level=logging.INFO)
    args = parse_args()
    print("Called with args: {}".format(args))
    if args.cmd == "sync":
        if args.unique_id:
            # Refuse to submit a second sync with the same UniqueId while one
            # is still in the queue (JobStatus 4 == completed).
            schedd = htcondor.Schedd()
            existing_job = schedd.query(
                constraint="UniqueId == {} && JobStatus =!= 4".format(
                    classad.quote(args.unique_id)),
                attr_list=[],
                limit=1,
            )
            if len(existing_job) > 0:
                logging.warning(
                    'Jobs already found in queue with UniqueId == "%s", exiting',
                    args.unique_id,
                )
                sys.exit()
        print("Will synchronize {} at source to {} at destination".format(
            args.src, args.dest))
        cluster_id = submit_outer_dag(
            args.working_dir,
            args.src,
            args.dest,
            requirements=read_requirements_file(args.requirements_file)
            or args.requirements,
            unique_id=args.unique_id,
            test_mode=args.test_mode,
        )
        print("Parent job running in cluster {}".format(cluster_id))
    elif args.cmd == "generate":
        logging.info("Generating file listing for %s", args.src)
        generate_file_listing(args.src, Path("source_manifest.txt"), test_mode=args.test_mode)
    elif args.cmd == "write_subdag":
        logging.info(
            "Generating SUBGDAG for transfer of %s->%s",
            args.source_prefix,
            args.dest_prefix,
        )
        write_inner_dag(
            args.source_prefix,
            args.source_manifest,
            args.dest_prefix,
            requirements=read_requirements_file(args.requirements_file)
            or args.requirements,
            test_mode=args.test_mode,
            unique_id=args.unique_id,
        )
    elif args.cmd == "exec":
        xfer_exec(args.src)
    elif args.cmd == "verify":
        with args.json.open(mode="r") as f:
            cmd_info = json.load(f)
        # Split the DAG job name to get the cmd_info key
        info = cmd_info[args.fileid.split(":")[-1]]
        verify(
            Path(info["dest_prefix"]),
            Path(info["dest"]),
            Path("{}.metadata".format(info['src_file_noslash'])),
            Path(info["transfer_manifest"]),
        )
    elif args.cmd == "verify_remote":
        verify_remote(args.src)
    elif args.cmd == "analyze":
        analyze(args.transfer_manifest)
def makeOverflowAds(config):
    """Print JobRouter ads for site replacement and grouped overflow routing.

    AddWhitelist requests are grouped by their (sorted) site list so each task
    is routed exactly once by a single "master overflow" rule using the
    siteMapping() ClassAd function.
    """
    # Mapping from source to a list of destinations.
    # key can be read by site in values
    reversed_mapping = config['reversed_mapping']
    # whitelist-key (comma-joined sorted sites) -> list of task names
    overflow_tasks = {}
    for workflow, tasks in config.get('modifications', {}).items():
        for taskname, specs in tasks.items():
            anAd = classad.ClassAd()
            anAd["GridResource"] = "condor localhost localhost"
            anAd["TargetUniverse"] = 5
            exp = '(HasBeenReplaced isnt true) && (target.WMAgent_SubTaskName =?= %s)' % classad.quote(
                str(taskname))
            anAd["Requirements"] = classad.ExprTree(str(exp))
            add_whitelist = specs.get("AddWhitelist")
            if "ReplaceSiteWhitelist" in specs:
                anAd["Name"] = str("Site Replacement for %s" % taskname)
                anAd["eval_set_DESIRED_Sites"] = str(",".join(
                    specs['ReplaceSiteWhitelist']))
                anAd['set_Rank'] = classad.ExprTree(
                    "stringlistmember(GLIDEIN_CMSSite, ExtDESIRED_Sites)")
                anAd["set_HasBeenReplaced"] = True
                anAd["set_HasBeenRouted"] = False
                print anAd
            elif add_whitelist:
                # Sort so tasks with the same site set share one rule below.
                add_whitelist.sort()
                add_whitelist_key = ",".join(add_whitelist)
                tasks = overflow_tasks.setdefault(add_whitelist_key, [])
                tasks.append(taskname)
    # Create a source->dests mapping from the provided reverse_mapping.
    source_to_dests = {}
    for dest, sources in reversed_mapping.items():
        for source in sources:
            dests = source_to_dests.setdefault(source, set())
            dests.add(dest)
    tmp_source_to_dests = source_to_dests
    # For each unique set of site whitelists, create a new rule. Each task
    # should appear on just one of these ads, meaning it should only get routed
    # once.
    for whitelist_sites, tasks in overflow_tasks.items():
        ## these are the sites that need to be added in whitelist.
        whitelist_sites_set = set(whitelist_sites.split(","))
        # Create an updated source_to_dests, where the dests are filtered
        # on the whitelist.
        source_to_dests = {}
        for source, dests in tmp_source_to_dests.items():
            new_dests = [str(i) for i in dests if i in whitelist_sites_set]
            if new_dests:
                source_to_dests[str(source)] = new_dests
        anAd = classad.ClassAd()
        anAd["GridResource"] = "condor localhost localhost"
        anAd["TargetUniverse"] = 5
        anAd["Name"] = "Master overflow rule to run at %s in addition" % str(
            whitelist_sites)
        # ClassAds trick to create a properly-formatted ClassAd list.
        anAd["OverflowTasknames"] = map(str, tasks)
        overflow_names_escaped = anAd.lookup('OverflowTasknames').__repr__()
        del anAd['OverflowTaskNames']
        exp = classad.ExprTree(
            'member(target.WMAgent_SubTaskName, %s) && (HasBeenRouted_Overflow isnt true)'
            % overflow_names_escaped)
        anAd["Requirements"] = classad.ExprTree(str(exp))
        anAd["copy_DESIRED_Sites"] = "Pre_DESIRED_Sites"
        # Use siteMapping() when this condor version supports it, else keep
        # the previous site list unchanged.
        anAd["eval_set_DESIRED_Sites"] = classad.ExprTree(
            'ifThenElse(siteMapping("", []) isnt error, siteMapping(Pre_DESIRED_Sites, %s), Pre_DESIRED_Sites)'
            % str(classad.ClassAd(source_to_dests)))
        # Where possible, prefer to run at a site where the input can be read locally.
        anAd['set_Rank'] = classad.ExprTree(
            "stringlistmember(GLIDEIN_CMSSite, ExtDESIRED_Sites)")
        anAd['set_HasBeenRouted'] = False
        anAd['set_HasBeenRouted_Overflow'] = True
        print anAd
def test_quote(self):
    """classad.quote escapes for ClassAd string syntax; unquote round-trips it.

    assertEquals is a deprecated unittest alias; use assertEqual.
    """
    self.assertEqual(classad.quote("foo"), '"foo"')
    self.assertEqual(classad.quote('"foo'), '"\\"foo"')
    # Round-trip: unquote(quote(s)) must return s, including embedded
    # quotes and backslashes.
    for i in ["foo", '"foo', '"\\"foo']:
        self.assertEqual(i, classad.unquote(classad.quote(i)))
def customizePerJob(self, job):
    """
    JDL additions just for this implementation. Over-ridden in sub-classes.
    These are the Glide-in specific bits.

    :param job: dict describing one job (location, possible/potential sites,
                request name, task name/type, resource estimates, ...).
    :return: list of JDL lines (each newline-terminated); empty list when the
             job has no valid location.
    """
    jdl = []
    jobCE = job['location']
    if not jobCE:
        # Then we ended up with a site that doesn't exist?
        logging.error("Job for non-existant site %s", job['location'])
        return jdl

    # Site lists: use the WMS-provided lists when in WMS mode, otherwise
    # pin the job to its single assigned CE.
    if self.submitWMSMode and len(job.get('possibleSites', [])) > 0:
        strg = ','.join(map(str, job.get('possibleSites')))
        jdl.append('+DESIRED_Sites = \"%s\"\n' % strg)
    else:
        jdl.append('+DESIRED_Sites = \"%s\"\n' % (jobCE))
    if self.submitWMSMode and len(job.get('potentialSites', [])) > 0:
        strg = ','.join(map(str, job.get('potentialSites')))
        jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % strg)
    else:
        jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % (jobCE))

    if job.get('proxyPath'):
        jdl.append('x509userproxy = %s\n' % job['proxyPath'])

    jdl.append('+WMAgent_RequestName = "%s"\n' % job['requestName'])

    # CMSGroups is derived from the request name when it matches the
    # expected pattern; otherwise left undefined.
    match = GROUP_NAME_RE.match(job['requestName'])
    if match:
        jdl.append('+CMSGroups = %s\n' % classad.quote(match.groups()[0]))
    else:
        jdl.append('+CMSGroups = undefined\n')

    jdl.append('+WMAgent_SubTaskName = "%s"\n' % job['taskName'])
    jdl.append('+CMS_JobType = "%s"\n' % job['taskType'])

    # Handling for AWS, cloud and opportunistic resources
    jdl.append('+AllowOpportunistic = %s\n' % job.get('allowOpportunistic', False))

    # dataset info
    if job.get('inputDataset'):
        jdl.append('+DESIRED_CMSDataset = "%s"\n' % job['inputDataset'])
    else:
        jdl.append('+DESIRED_CMSDataset = undefined\n')
    if job.get('inputDatasetLocations'):
        jdl.append('+DESIRED_CMSDataLocations = "%s"\n' % ','.join(job['inputDatasetLocations']))
    else:
        jdl.append('+DESIRED_CMSDataLocations = undefined\n')

    # HighIO jobs
    jdl.append('+Requestioslots = %d\n' % job.get('highIOjob', 0))

    # Performance and resource estimates
    # Defaults: 1 core, 1000 MB memory, 20 GB/core disk, 12 h wall time
    # (units assumed from defaults -- TODO confirm against submit docs).
    numberOfCores = job.get('numberOfCores', 1)
    requestMemory = int(job['estimatedMemoryUsage']) if job.get('estimatedMemoryUsage', None) else 1000
    requestDisk = int(job['estimatedDiskUsage']) if job.get('estimatedDiskUsage', None) else 20*1000*1000*numberOfCores
    maxWallTimeMins = int(job['estimatedJobTime'])/60.0 if job.get('estimatedJobTime', None) else 12*60
    jdl.append('request_memory = %d\n' % requestMemory)
    jdl.append('request_disk = %d\n' % requestDisk)
    jdl.append('+MaxWallTimeMins = %d\n' % maxWallTimeMins)

    # How many cores job is using
    jdl.append('machine_count = 1\n')
    jdl.append('request_cpus = %s\n' % numberOfCores)

    # Add OS requirements for jobs
    if job.get('scramArch') is not None and job.get('scramArch').startswith("slc6_"):
        jdl.append('+REQUIRED_OS = "rhel6"\n')
    else:
        jdl.append('+REQUIRED_OS = "any"\n')

    return jdl
def test_quote(input, expected): assert classad.quote(input) == expected
def makeOverflowAds(config):
    """
    Print one routing ClassAd per task modification in ``config``.

    For tasks with a 'ReplaceSiteWhitelist' a "Site Replacement" ad is
    printed that overwrites DESIRED_Sites. Tasks with an 'AddWhitelist'
    are grouped by their (sorted) whitelist and a single "Master overflow"
    ad is printed per group, extending DESIRED_Sites through the
    siteMapping() ClassAd function applied to ExtDESIRED_Sites.

    ``config`` must contain 'reversed_mapping' (dest -> list of sources)
    and 'modifications' ({workflow: {taskname: specs}}).
    Ads go to stdout (consumed by a condor job router, presumably --
    TODO confirm caller).
    """
    # Mapping from source to a list of destinations.
    reversed_mapping = config['reversed_mapping']

    # Tasks grouped by their comma-joined AddWhitelist site list.
    overflow_tasks = {}
    for workflow, tasks in config['modifications'].items():
        for taskname,specs in tasks.items():
            anAd = classad.ClassAd()
            anAd["GridResource"] = "condor localhost localhost"
            anAd["TargetUniverse"] = 5
            # Match only this task's jobs, at most once (HasBeenReplaced guard).
            exp = '(HasBeenReplaced isnt true) && (target.WMAgent_SubTaskName =?= %s)' % classad.quote(str(taskname))
            anAd["Requirements"] = classad.ExprTree(str(exp))
            add_whitelist = specs.get("AddWhitelist")
            if "ReplaceSiteWhitelist" in specs:
                anAd["Name"] = str("Site Replacement for %s"% taskname)
                anAd["eval_set_DESIRED_Sites"] = str(",".join(specs['ReplaceSiteWhitelist']))
                # Prefer sites where the input is local.
                anAd['set_Rank'] = classad.ExprTree("stringlistmember(GLIDEIN_CMSSite, ExtDESIRED_Sites)")
                anAd["set_HasBeenReplaced"] = True
                anAd["set_HasBeenRouted"] = False
                print anAd
            elif add_whitelist:
                # Defer: tasks sharing a whitelist get one combined overflow ad.
                add_whitelist.sort()
                add_whitelist_key = ",".join(add_whitelist)
                tasks = overflow_tasks.setdefault(add_whitelist_key, [])
                tasks.append(taskname)

    # Create a source->dests mapping from the provided reverse_mapping.
    source_to_dests = {}
    for dest, sources in reversed_mapping.items():
        for source in sources:
            dests = source_to_dests.setdefault(source, set())
            dests.add(dest)
    tmp_source_to_dests = source_to_dests

    # For each unique set of site whitelists, create a new rule. Each task
    # should appear on just one of these ads, meaning it should only get routed
    # once.
    for whitelist_sites, tasks in overflow_tasks.items():
        whitelist_sites_set = set(whitelist_sites.split(","))

        # Create an updated source_to_dests, where the dests are filtered
        # on the whitelist.
        source_to_dests = {}
        for source, dests in tmp_source_to_dests.items():
            new_dests = [str(i) for i in dests if i in whitelist_sites_set]
            if new_dests:
                source_to_dests[str(source)] = new_dests

        anAd = classad.ClassAd()
        anAd["GridResource"] = "condor localhost localhost"
        anAd["TargetUniverse"] = 5
        anAd["Name"] = "Master overflow rule for %s" % str(whitelist_sites)
        # ClassAds trick to create a properly-formatted ClassAd list.
        anAd["OverflowTasknames"] = map(str, tasks)
        overflow_names_escaped = anAd.lookup('OverflowTasknames').__repr__()
        # ClassAd attribute names are case-insensitive, so this removes the
        # attribute set above; it only existed to produce the repr.
        del anAd['OverflowTaskNames']
        # Route each matching task only once (HasBeenRouted_Overflow guard).
        exp = classad.ExprTree('member(target.WMAgent_SubTaskName, %s) && (HasBeenRouted_Overflow isnt true)' % overflow_names_escaped)
        anAd["Requirements"] = classad.ExprTree(str(exp))
        # siteMapping will apply the source->dest rules, given the current set of sources in ExtDESIRED_Sites.
        anAd["eval_set_DESIRED_Sites"] = classad.ExprTree('ifThenElse(siteMapping("", []) isnt error, siteMapping(ExtDESIRED_Sites, %s), ExtDESIRED_Sites)' % str(classad.ClassAd(source_to_dests)))
        # Where possible, prefer to run at a site where the input can be read locally.
        anAd['set_Rank'] = classad.ExprTree("stringlistmember(GLIDEIN_CMSSite, ExtDESIRED_Sites)")
        anAd['set_HasBeenRouted'] = False
        anAd['set_HasBeenRouted_Overflow'] = True
        print anAd
def make_inner_dag(
    direction: TransferDirection,
    requirements: Optional[str],
    transfer_cmd_info: T_CMD_INFO,
    verify_cmd_info: T_CMD_INFO,
    unique_id: Optional[str] = None,
    test_mode: bool = False,
):
    """
    Build the inner DAG: a transfer layer (named after *direction*) followed
    by a "verify" layer that fetches remote metadata; each layer runs a
    post-script that re-invokes this file with Commands.POST_TRANSFER.

    :param direction: TransferDirection.PUSH or .PULL; also the layer name.
    :param requirements: extra ClassAd requirements (may be None).
    :param transfer_cmd_info: per-file VARS for the transfer layer.
    :param verify_cmd_info: per-file VARS for the verify layer.
    :param unique_id: optional id forwarded to shared_submit_descriptors().
    :param test_mode: when True, throttle TRANSFER_JOBS to one at a time.
    :return: the assembled htcondor.dags.DAG.
    """
    # Only import htcondor.dags submit-side
    import htcondor.dags as dags

    inner_dag = dags.DAG(
        max_jobs_by_category={"TRANSFER_JOBS": 1} if test_mode else None)

    # Outputs common to both layers: the metadata file, remapped per-file.
    tof = [METADATA_FILE_NAME]
    tor = {METADATA_FILE_NAME: "$(flattened_name).metadata"}

    # For PULL transfers the sandbox itself must come back as well.
    pull_tof = [SANDBOX_FILE_NAME]
    pull_tor = {SANDBOX_FILE_NAME: "$(flattened_name)"}

    shared_descriptors = shared_submit_descriptors(unique_id=unique_id,
                                                   requirements=requirements)

    inner_dag.layer(
        name=direction,
        submit_description=htcondor.Submit({
            "output": "$(flattened_name).out",
            "error": "$(flattened_name).err",
            "log": "transfer_file.log",
            "arguments": classad.quote("{} '$(remote_file)'".format(
                DIRECTION_TO_COMMAND[direction])),
            "should_transfer_files": "yes",
            "transfer_input_files": "$(local_file)"
            if direction is TransferDirection.PUSH else "",
            "transfer_output_files": ", ".join(tof + (
                pull_tof if direction is TransferDirection.PULL else [])),
            # BUG FIX: this previously tested the truthiness of
            # TransferDirection.PULL (always true), so the sandbox remap was
            # also applied to PUSH transfers. Test the actual direction,
            # consistent with the two conditionals above.
            "transfer_output_remaps": classad.quote(" ; ".join(
                "{} = {}".format(k, v) for k, v in {
                    **tor,
                    **(pull_tor if direction is TransferDirection.PULL else {}),
                }.items())),
            **shared_descriptors,
        }),
        vars=transfer_cmd_info,
        post=dags.Script(
            executable=THIS_FILE,
            arguments=[
                Commands.POST_TRANSFER,
                "--cmd-info",
                TRANSFER_COMMANDS_FILE_NAME,
                "--key",
                "$JOB",
            ],
        ),
    )

    inner_dag.layer(
        name="verify",
        submit_description=htcondor.Submit({
            "output": "$(flattened_name).out",
            "error": "$(flattened_name).err",
            "log": "verify_file.log",
            "arguments": classad.quote("{} '$(remote_file)'".format(
                Commands.GET_REMOTE_METADATA)),
            "should_transfer_files": "yes",
            "transfer_output_files": ", ".join(tof),
            "transfer_output_remaps": classad.quote(" ; ".join(
                "{} = {}".format(k, v) for k, v in tor.items())),
            **shared_descriptors,
        }),
        vars=verify_cmd_info,
        post=dags.Script(
            executable=THIS_FILE,
            arguments=[
                Commands.POST_TRANSFER,
                "--cmd-info",
                VERIFY_COMMANDS_FILE_NAME,
                "--key",
                "$JOB",
                "--only-verify",
            ],
        ),
    )

    logging.info("Inner DAG shape:\n{}".format(inner_dag.describe()))

    return inner_dag
"transfer_input_files": "$(item)", "output": "test-$(ProcID).out", "error": "test-$(ProcID).err", "request_cpus": "1", "request_memory": "1GB", "request_disk": "1GB", "hold": "true", "My.HoldReason": classad.quote("Spooling input files"), "My.HoldReasonCode": "16", "My.LeaveJobInQueue": f"JobStatus == {COMPLETED} && ( {COMPLETION_DATE} =?= UNDEFINED || {COMPLETION_DATE} == 0 || ((time() - {COMPLETION_DATE}) < {REMOVAL_DELAY}) )", "transfer_output_remaps": classad.quote( "_condor_stdout=test-$(ProcID).out ; _condor_stderr=test-$(ProcID).err" ), }) collector = htcondor.Collector("cm.chtc.wisc.edu") schedd_ad = collector.locate(htcondor.DaemonTypes.Schedd, "submittest0000.chtc.wisc.edu")
def track(self, jobs):
    """
    _track_

    Track the jobs while in condor.

    Returns a three-way tuple:
      - runningList: jobs still known to the schedd and not finished,
      - changeList: jobs whose status changed since the last poll,
      - completeList: jobs that finished or disappeared from the schedd.
    On a failed schedd query all three lists are returned empty.
    """
    # gridid -> (status string, site) for every ad the schedd returned.
    jobInfo = {}
    changeList = []
    completeList = []
    runningList = []

    # get info about all active and recent jobs
    logging.debug("SimpleCondorPlugin is going to track %s jobs", len(jobs))

    schedd = htcondor.Schedd()

    logging.debug("Start: Retrieving classAds using Condor Python XQuery")
    try:
        itobj = schedd.xquery(
            "WMAgent_AgentName == %s" % classad.quote(self.agent),
            ['ClusterId', 'ProcId', 'JobStatus', 'MachineAttrGLIDEIN_CMSSite0'])
        for jobAd in itobj:
            gridId = "%s.%s" % (jobAd['ClusterId'], jobAd['ProcId'])
            # Map the numeric condor JobStatus onto the plugin's status strings.
            jobStatus = SimpleCondorPlugin.exitCodeMap().get(
                jobAd.get('JobStatus'), 'Unknown')
            location = jobAd.get('MachineAttrGLIDEIN_CMSSite0', None)
            jobInfo[gridId] = (jobStatus, location)
    except Exception as ex:
        # Best-effort: on any query failure report nothing rather than crash.
        logging.error("Query to condor schedd failed in SimpleCondorPlugin.")
        logging.error("Returning empty lists for all job types...")
        logging.exception(ex)
        return runningList, changeList, completeList

    logging.debug("Finished retrieving %d classAds from Condor", len(jobInfo))

    # now go over the jobs and see what we have
    for job in jobs:
        # if the schedd doesn't know a job, consider it complete
        # doing any further checks is not cost effective
        if job['gridid'] not in jobInfo:
            (newStatus, location) = ('Completed', None)
        else:
            (newStatus, location) = jobInfo[job['gridid']]

        # check for status changes
        if newStatus != job['status']:
            # update location info for Idle->Running transition
            if newStatus == 'Running' and job['status'] == 'Idle':
                if location:
                    job['location'] = location
                    logging.debug(
                        "JobAdInfo: Job location for jobid=%i gridid=%s changed to %s",
                        job['jobid'], job['gridid'], location)
            job['status'] = newStatus
            job['status_time'] = int(time.time())
            logging.debug(
                "JobAdInfo: Job status for jobid=%i gridid=%s changed to %s",
                job['jobid'], job['gridid'], job['status'])
            changeList.append(job)

        job['globalState'] = SimpleCondorPlugin.stateMap().get(newStatus)

        # stop tracking finished jobs
        if job['globalState'] in ['Complete', 'Error']:
            completeList.append(job)
        else:
            runningList.append(job)

    logging.debug(
        "SimpleCondorPlugin tracking : %i/%i/%i (Executing/Changing/Complete)",
        len(runningList), len(changeList), len(completeList))

    return runningList, changeList, completeList
def alter_submit(self, crab_retry):
    """
    Copy the content of the generic file Job.submit into a job-specific file
    Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
    Add also parameters that can be overwritten at each manual job resubmission
    (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).

    Side effects: updates self.resubmit_info for this retry number and writes
    Job.<job_id>.submit in the current working directory.
    """
    ## Start the Job.<job_id>.submit content with the CRAB_Retry.
    new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
    msg = "Setting CRAB_Retry = %s" % (crab_retry)
    self.logger.info(msg)

    ## Add job and postjob log URLs
    job_retry = "%s.%s" % (self.job_id, crab_retry)
    new_submit_text += '+CRAB_JobLogURL = %s\n' % classad.quote(
        os.path.join(self.userWebDirPrx, "job_out." + job_retry + ".txt"))
    new_submit_text += '+CRAB_PostJobLogURL = %s\n' % classad.quote(
        os.path.join(self.userWebDirPrx, "postjob." + job_retry + ".txt"))

    ## For the parameters that can be overwritten at each manual job resubmission,
    ## read them from the task ad, unless there is resubmission information there
    ## and this job is not one that has to be resubmitted, in which case we should
    ## use the same parameters (site black- and whitelists, requested memory, etc)
    ## as used by the previous job retry (which are saved in self.resubmit_info).
    CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
    use_resubmit_info = False
    resubmit_jobids = []
    if 'CRAB_ResubmitList' in self.task_ad:
        resubmit_jobids = map(str, self.task_ad['CRAB_ResubmitList'])
        try:
            resubmit_jobids = set(resubmit_jobids)
            if resubmit_jobids and self.job_id not in resubmit_jobids:
                use_resubmit_info = True
        except TypeError:
            # A non-iterable CRAB_ResubmitList is taken to mean "all jobs".
            resubmit_jobids = True
    ## If there is no resubmit_info, we can of course not use it.
    if not self.resubmit_info:
        use_resubmit_info = False

    ## Get the resubmission parameters.
    maxjobruntime = None
    maxmemory = None
    numcores = None
    priority = None
    if not use_resubmit_info:
        #if 'MaxWallTimeMins_RAW' in self.task_ad:
        #    if self.task_ad['MaxWallTimeMins_RAW'] != 1315:
        #        maxjobruntime = self.task_ad.lookup('MaxWallTimeMins_RAW')
        #        self.resubmit_info['maxjobruntime'] = maxjobruntime
        # Pick the wall-time attribute matching the current task stage.
        if 'MaxWallTimeMinsProbe' in self.task_ad and self.stage == 'probe':
            maxjobruntime = int(
                str(self.task_ad.lookup('MaxWallTimeMinsProbe')))
        elif 'MaxWallTimeMinsTail' in self.task_ad and self.stage == 'tail':
            maxjobruntime = int(
                str(self.task_ad.lookup('MaxWallTimeMinsTail')))
        elif 'MaxWallTimeMinsRun' in self.task_ad:
            maxjobruntime = int(
                str(self.task_ad.lookup('MaxWallTimeMinsRun')))
        if 'RequestMemory' in self.task_ad:
            maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
        if 'RequestCpus' in self.task_ad:
            numcores = int(str(self.task_ad.lookup('RequestCpus')))
        if 'JobPrio' in self.task_ad:
            priority = int(str(self.task_ad['JobPrio']))
        if str(self.job_id) == '0': #jobids can be like 1-1 for subjobs
            priority = 20 #the maximum for splitting jobs
    else:
        # Walk back through resubmit_info to the most recent retry that
        # actually recorded parameters.
        inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
        while inkey not in self.resubmit_info and int(inkey) > 0:
            inkey = str(int(inkey) - 1)
        maxjobruntime = self.resubmit_info[inkey].get('maxjobruntime')
        maxmemory = self.resubmit_info[inkey].get('maxmemory')
        numcores = self.resubmit_info[inkey].get('numcores')
        priority = self.resubmit_info[inkey].get('priority')

    ## Save the (new) values of the resubmission parameters in self.resubmit_info
    ## for the current job retry number.
    outkey = str(crab_retry)
    if outkey not in self.resubmit_info:
        self.resubmit_info[outkey] = {}
    self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
    self.resubmit_info[outkey]['maxmemory'] = maxmemory
    self.resubmit_info[outkey]['numcores'] = numcores
    self.resubmit_info[outkey]['priority'] = priority
    self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
    self.resubmit_info[outkey][
        'CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad

    ## Add the resubmission parameters to the Job.<job_id>.submit content.
    # Probe jobs never transfer logs or outputs.
    savelogs = 0 if self.stage == 'probe' else self.task_ad.lookup(
        'CRAB_SaveLogsFlag')
    saveoutputs = 0 if self.stage == 'probe' else self.task_ad.lookup(
        'CRAB_TransferOutputs')
    new_submit_text += '+CRAB_TransferOutputs = {0}\n+CRAB_SaveLogsFlag = {1}\n'.format(
        saveoutputs, savelogs)
    if maxjobruntime is not None:
        new_submit_text += '+EstimatedWallTimeMins = %s\n' % str(
            maxjobruntime)
        new_submit_text += '+MaxWallTimeMinsRun = %s\n' % str(
            maxjobruntime)  # how long it can run
        new_submit_text += '+MaxWallTimeMins = %s\n' % str(
            maxjobruntime)  # how long a slot can it match to
    # no plus sign for next 3 attributes, since those are Condor standard ones
    if maxmemory is not None:
        new_submit_text += 'RequestMemory = %s\n' % (str(maxmemory))
    if numcores is not None:
        new_submit_text += 'RequestCpus = %s\n' % (str(numcores))
    if priority is not None:
        new_submit_text += 'JobPrio = %s\n' % (str(priority))

    ## Within the schedd, order the first few jobs in the task before all other tasks of the same priority.
    pre_job_prio = 1
    if int(self.job_id.split('-')[0]) <= 5:
        pre_job_prio = 0
    new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio

    ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
    ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
    ## run the job with the higher PostJobPrio1.
    new_submit_text += '+PostJobPrio1 = -%s\n' % str(
        self.task_ad.lookup('QDate'))

    ## Order retries before all other jobs in this task
    new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry

    ## Add the site black- and whitelists and the DESIRED_SITES to the
    ## Job.<job_id>.submit content.
    new_submit_text = self.redo_sites(new_submit_text, crab_retry,
                                      use_resubmit_info)

    ## Add group information:
    username = self.task_ad.get('CRAB_UserHN')
    if 'CMSGroups' in self.task_ad:
        new_submit_text += '+CMSGroups = %s\n' % classad.quote(
            self.task_ad['CMSGroups'])
    elif username:
        groups = CMSGroupMapper.map_user_to_groups(username)
        if groups:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)

    ## Finally add (copy) all the content of the generic Job.submit file.
    with open("Job.submit", 'r') as fd:
        new_submit_text += fd.read()

    ## Write the Job.<job_id>.submit file.
    with open("Job.%s.submit" % (self.job_id), 'w') as fd:
        fd.write(new_submit_text)
def make_inner_dag( requirements: Optional[str], xfer_cmd_info: T_CMD_INFO, verify_cmd_info: T_CMD_INFO, unique_id: Optional[str] = None, test_mode: bool = False, ): # Only import htcondor.dags submit-side import htcondor.dags as dags inner_dag = dags.DAG( max_jobs_by_category={"TRANSFER_JOBS": 1} if test_mode else None) inner_dag.layer( name="xfer", submit_description=htcondor.Submit({ "output": "$(src_file_noslash).out", "error": "$(src_file_noslash).err", "log": "xfer_file.log", "arguments": classad.quote("exec '$(src_file)'"), "should_transfer_files": "yes", "transfer_output_files": "{}, metadata".format(SANDBOX_FILE_NAME), "transfer_output_remaps": classad.quote( "{} = $(dest); metadata = $(src_file_noslash).metadata".format( SANDBOX_FILE_NAME)), **shared_submit_descriptors(unique_id, requirements), }), vars=xfer_cmd_info, post=dags.Script( executable=THIS_FILE, arguments=[ "verify", "--json=xfer_commands.json", "--fileid", "$JOB" ], ), ) inner_dag.layer( name="verify", submit_description=htcondor.Submit({ "output": "$(src_file_noslash).out", "error": "$(src_file_noslash).err", "log": "verify_file.log", "arguments": classad.quote("verify_remote '$(src_file)'"), "should_transfer_files": "yes", "transfer_output_files": "metadata", "transfer_output_remaps": classad.quote("metadata = $(src_file_noslash).metadata"), **shared_submit_descriptors(unique_id, requirements), }), vars=verify_cmd_info, post=dags.Script( executable=THIS_FILE, arguments=[ "verify", "--json=verify_commands.json", "--fileid", "$JOB" ], ), ) return inner_dag
def main(): opts = parse_opts() users = set() for line in open(opts.local_users): line = line.strip() if line.startswith("#"): continue users.add(line) collectors = set() for pool in opts.pool: coll = htcondor.Collector(pool) collectors.add(coll) if not opts.quiet: print >> sys.stderr, "Querying collector %s for schedds matching" % pool, opts.const reqs = '(JobStatus == 1) && stringListMember(%s, DESIRED_Sites)' % classad.quote( opts.site) idle_count = {} for user in users: if user == "*": continue idle_count.setdefault(user, 0) user_map = {} if not opts.quiet: print >> sys.stderr, "Schedd job requirements:", reqs for coll in collectors: for schedd_ad in coll.query( htcondor.AdTypes.Schedd, opts.const, ['MyAddress', 'CondorVersion', 'Name', 'ScheddIpAddr']): if not opts.quiet: print >> sys.stderr, "Querying", schedd_ad.get( 'Name', "Unknown") schedd = htcondor.Schedd(schedd_ad) try: if opts.jobs_only: schedd_data = schedd.xquery(requirements=reqs, projection=[ "x509userproxysubject", "CRAB_UserHN", "JobStatus" ]) else: schedd_data = schedd.xquery( requirements=reqs, projection=[ "x509userproxysubject", "CRAB_UserHN", "JobStatus" ], opts=htcondor.QueryOpts.AutoCluster) except RuntimeError, e: if not opts.quiet: print >> sys.stderr, "Error querying %s: %s" % ( schedd_ad.get('Name', "Unknown"), e) if not opts.jobs_only: for cluster in schedd_data: user = cluster.get("CRAB_UserHN") if (user in users) or ("*" in users): idle_count.setdefault(user, 0) idle_count[user] += int(cluster.get("JobCount", 0)) if 'x509userproxysubject' in cluster: user_map[user] = cluster['x509userproxysubject'] if opts.jobs_only: for job in schedd_data: user = job.get("CRAB_UserHN") if (user in users) or ("*" in users): idle_count.setdefault(user, 0) idle_count[user] += 1 if 'x509userproxysubject' in job: user_map[user] = job['x509userproxysubject']
def main():
    """
    AdjustSites entry point, run inside the scheduler from a condor job ad.

    Requires _CONDOR_JOB_AD in the environment. Sets up the task web
    directory on first run and uploads its URL to the CRAB REST server,
    clears the automatic blacklist, adjusts DAG node logs and retry counts
    for manual resubmissions, applies any site-ad update, and holds/releases
    processing and tail DAGs around those edits for automatic splitting.
    """
    setupLog()

    if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists(
            os.environ["_CONDOR_JOB_AD"]):
        printLog(
            "Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist"
        )
        sys.exit(0)

    printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" %
             os.environ['_CONDOR_JOB_AD'])

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOne(fd)
    printLog("Parsed ad: %s" % ad)

    # instantiate a server object to talk with crabserver
    host = ad['CRAB_RestHost']
    dbInstance = ad['CRAB_DbInstance']
    cert = ad['X509UserProxy']
    crabserver = CRABRest(host, cert, cert, retry=3, userAgent='CRABSchedd')
    crabserver.setDbInstance(dbInstance)

    checkTaskInfo(crabserver, ad)

    # is this the first time this script runs for this task ? (it runs at each resubmit as well !)
    if not os.path.exists('WEB_DIR'):
        makeWebDir(ad)
        printLog(
            "Webdir has been set up. Uploading the webdir URL to the REST")

        # Retry the upload up to maxRetries times with growing backoff.
        retries = 0
        exitCode = 1
        maxRetries = 3
        while retries < maxRetries and exitCode != 0:
            exitCode = uploadWebDir(crabserver, ad)
            if exitCode != 0:
                time.sleep(retries * 20)
            retries += 1
        if exitCode != 0:
            printLog(
                "Exiting AdjustSites because the webdir upload failed %d times."
                % maxRetries)
            sys.exit(1)
        printLog(
            "Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir"
            % exitCode)

        saveProxiedWebdir(crabserver, ad)
        printLog("Proxied webdir saved")

    printLog(
        "Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions"
    )

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            # Non-iterable value means "resubmit everything".
            resubmitJobIds = True

    # Hold and release processing and tail DAGs here so that modifications
    # to the submission and log files will be picked up.
    schedd = htcondor.Schedd()
    tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote(
        ad.get("CRAB_ReqName"))
    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Holding processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGKILL')
        schedd.act(htcondor.JobAction.Hold, tailconst)

    if resubmitJobIds:
        adjustedJobIds = []
        filenames = getGlob(ad, "RunJobs.dag.nodes.log",
                            "RunJobs[1-9]*.subdag.nodes.log")
        for fn in filenames:
            if hasattr(htcondor, 'lock'):
                # While dagman is not running at this point, the schedd may be writing events to this
                # file; hence, we only edit the file while holding an appropriate lock.
                # Note this lock method didn't exist until 8.1.6; prior to this, we simply
                # run dangerously.
                with htcondor.lock(open(fn, 'a'), htcondor.LockType.WriteLock):
                    adjustedJobIds.extend(
                        adjustPostScriptExitStatus(resubmitJobIds, fn))
            else:
                adjustedJobIds.extend(
                    adjustPostScriptExitStatus(resubmitJobIds, fn))
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parseOne(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Releasing processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1')
        schedd.act(htcondor.JobAction.Release, tailconst)

    printLog("Exiting AdjustSite")
def updateSiteInformation(self, jobs, siteName, excludeSite): """ _updateSiteInformation_ Allow or disallow jobs to run at a site. Called externally by Ops scripts if a site enters or leaves Down, Draining or Aborted. Kill job if after removing site from allowed sites it has nowhere to run. Parameters: excludeSite = False when moving to Normal excludeSite = True when moving to Down, Draining or Aborted """ schedd = htcondor.Schedd() jobtokill = [] try: itobj = schedd.xquery( 'WMAgent_AgentName =?= %s && JobStatus =?= 1' % classad.quote(self.agent), ['ClusterId', 'ProcId', 'DESIRED_Sites', 'ExtDESIRED_Sites']) except Exception as ex: logging.error("Failed to query condor schedd.") logging.exception(ex) return jobtokill else: jobInfo = {} for jobAd in itobj: gridId = "%s.%s" % (jobAd['ClusterId'], jobAd['ProcId']) jobInfo[gridId] = jobAd for job in jobs: jobAd = jobInfo.get(job['gridid'], None) if jobAd: desiredSites = jobAd.get('DESIRED_Sites').split(',') extDesiredSites = jobAd.get('ExtDESIRED_Sites').split(',') if excludeSite: # Remove siteName from DESIRED_Sites if job has it if siteName in desiredSites: if len(desiredSites) > 1: desiredSites.remove(siteName) desiredSites = ','.join(desiredSites) try: schedd.edit([job['gridid']], 'DESIRED_Sites', classad.ExprTree('"%s"' % desiredSites)) except Exception as ex: logging.error( "Failed to edit sites for job %s" % job['gridid']) logging.exception(ex) else: jobtokill.append(job) else: # Add siteName to DESIRED_Sites if ExtDESIRED_Sites has it (moving back to Normal) if siteName not in desiredSites and siteName in extDesiredSites: desiredSites.append(siteName) desiredSites = ','.join(sorted(desiredSites)) try: schedd.edit([job['gridid']], 'DESIRED_Sites', classad.ExprTree('"%s"' % desiredSites)) except Exception as ex: logging.error( "Failed to edit sites for job %s" % job['gridid']) logging.exception(ex) return jobtokill
def customizePerJob(self, job):
    """
    JDL additions just for this implementation. Over-ridden in sub-classes.
    These are the Glide-in specific bits.

    This variant supports resizable (multicore) jobs: RequestCpus,
    RequestMemory and MaxWallTimeMins are ClassAd expressions that adapt
    to the matched slot when WMCore_ResizeJob is true.

    :param job: dict describing one job (location, site lists, task info,
                resource estimates, resize flags, ...).
    :return: list of JDL lines (each newline-terminated); empty list when
             the job has no valid location.
    """
    jdl = []
    jobCE = job['location']
    if not jobCE:
        # Then we ended up with a site that doesn't exist?
        logging.error("Job for non-existant site %s", job['location'])
        return jdl

    # Site lists: use the WMS-provided lists when in WMS mode, otherwise
    # pin the job to its single assigned CE.
    if self.submitWMSMode and len(job.get('possibleSites', [])) > 0:
        strg = ','.join(map(str, job.get('possibleSites')))
        jdl.append('+DESIRED_Sites = \"%s\"\n' % strg)
    else:
        jdl.append('+DESIRED_Sites = \"%s\"\n' % (jobCE))
    if self.submitWMSMode and len(job.get('potentialSites', [])) > 0:
        strg = ','.join(map(str, job.get('potentialSites')))
        jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % strg)
    else:
        jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % (jobCE))

    if job.get('proxyPath'):
        jdl.append('x509userproxy = %s\n' % job['proxyPath'])

    jdl.append('+WMAgent_RequestName = "%s"\n' % job['requestName'])

    match = GROUP_NAME_RE.match(job['requestName'])
    if match:
        jdl.append('+CMSGroups = %s\n' % classad.quote(match.groups()[0]))
    else:
        jdl.append('+CMSGroups = undefined\n')

    jdl.append('+WMAgent_SubTaskName = "%s"\n' % job['taskName'])
    jdl.append('+CMS_JobType = "%s"\n' % job['taskType'])

    # Handling for AWS, cloud and opportunistic resources
    jdl.append('+AllowOpportunistic = %s\n' % job.get('allowOpportunistic', False))

    # dataset info
    if job.get('inputDataset'):
        jdl.append('+DESIRED_CMSDataset = "%s"\n' % job['inputDataset'])
    else:
        jdl.append('+DESIRED_CMSDataset = undefined\n')
    if job.get('inputDatasetLocations'):
        jdl.append('+DESIRED_CMSDataLocations = "%s"\n' % ','.join(job['inputDatasetLocations']))
    else:
        jdl.append('+DESIRED_CMSDataLocations = undefined\n')

    # HighIO and repack jobs handling
    highio = 1 if job['taskType'] in ["Merge", "Cleanup", "LogCollect"] else 0
    repackjob = 1 if job['taskType'] == 'Repack' else 0
    jdl.append('+Requestioslots = %d\n' % highio)
    jdl.append('+RequestRepackslots = %d\n' % repackjob)

    # Performance and resource estimates (including JDL magic tweaks)
    origCores = job.get('numberOfCores', 1)
    estimatedMins = int(job['estimatedJobTime']/60.0) if job.get('estimatedJobTime') else 12*60
    estimatedMinsSingleCore = estimatedMins * origCores
    # For now, assume a 15 minute job startup overhead -- condor will round this up further
    jdl.append('+EstimatedSingleCoreMins = %d\n' % estimatedMinsSingleCore)
    jdl.append('+OriginalMaxWallTimeMins = %d\n' % estimatedMins)
    jdl.append('+MaxWallTimeMins = WMCore_ResizeJob ? (EstimatedSingleCoreMins/RequestCpus + 15) : OriginalMaxWallTimeMins\n')

    requestMemory = int(job['estimatedMemoryUsage']) if job.get('estimatedMemoryUsage', None) else 1000
    jdl.append('+OriginalMemory = %d\n' % requestMemory)
    jdl.append('+ExtraMemory = %d\n' % self.extraMem)
    jdl.append('+RequestMemory = OriginalMemory + ExtraMemory * (WMCore_ResizeJob ? (RequestCpus-OriginalCpus) : 0)\n')

    requestDisk = int(job['estimatedDiskUsage']) if job.get('estimatedDiskUsage', None) else 20*1000*1000*origCores
    jdl.append('request_disk = %d\n' % requestDisk)

    # Set up JDL for multithreaded jobs.
    # By default, RequestCpus will evaluate to whatever CPU request was in the workflow.
    # If the job is labelled as resizable, then the logic is more complex:
    # - If the job is running in a slot with N cores, this should evaluate to N
    # - If the job is being matched against a machine, match all available CPUs, provided
    #   they are between min and max CPUs.
    # - Otherwise, just use the original CPU count.
    jdl.append('machine_count = 1\n')
    minCores = int(job.get('minCores', max(1, origCores/2)))
    maxCores = max(int(job.get('maxCores', origCores)), origCores)
    jdl.append('+MinCores = %d\n' % minCores)
    jdl.append('+MaxCores = %d\n' % maxCores)
    # Advertise the original CPU setting, in case someone needs this for monitoring
    jdl.append('+OriginalCpus = %d\n' % origCores)

    # Prefer slots that are closest to our MaxCores without going over.
    # If the slot size is _greater_ than our MaxCores, we prefer not to
    # use it - we might unnecessarily fragment the slot.
    jdl.append('rank = isUndefined(Cpus) ? 0 : ifThenElse(Cpus > MaxCores, -Cpus, Cpus)\n')

    # Record the number of CPUs utilized at match time. We'll use this later
    # for monitoring and accounting. Defaults to 0; once matched, it'll
    # put an attribute in the job MATCH_EXP_JOB_GLIDEIN_Cpus = 4
    jdl.append('+JOB_GLIDEIN_Cpus = "$$(Cpus:0)"\n')

    # Make sure the resize request stays within MinCores and MaxCores.
    jdl.append('+RequestResizedCpus = (Cpus>MaxCores) ? MaxCores : ((Cpus < MinCores) ? MinCores : Cpus)\n')

    # If the job is running, then we should report the matched CPUs in RequestCpus - but only if there are sane
    # values. Otherwise, we just report the original CPU request
    jdl.append('+JobCpus = ((JobStatus =!= 1) && (JobStatus =!= 5) && !isUndefined(MATCH_EXP_JOB_GLIDEIN_Cpus) && (int(MATCH_EXP_JOB_GLIDEIN_Cpus) isnt error)) ? int(MATCH_EXP_JOB_GLIDEIN_Cpus) : OriginalCpus\n')

    # Cpus is taken from the machine ad - hence it is only defined when we are doing negotiation.
    # Otherwise, we use either the cores in the running job (if available) or the original cores.
    jdl.append('+RequestCpus = WMCore_ResizeJob ? (!isUndefined(Cpus) ? RequestResizedCpus : JobCpus) : OriginalCpus\n')
    jdl.append('+WMCore_ResizeJob = %s\n' % bool(job.get('resizeJob', False)))

    # Add OS requirements for jobs
    if job.get('scramArch') is not None and job.get('scramArch').startswith("slc6_"):
        jdl.append('+REQUIRED_OS = "rhel6"\n')
    else:
        jdl.append('+REQUIRED_OS = "any"\n')

    return jdl
def getJobParameters(self, jobList):
    """
    _getJobParameters_

    Return a list of dictionaries with submit parameters per job.

    Each returned dict maps HTCondor submit-description keys to string values.
    Keys prefixed with 'My.' become custom job-ad attributes; un-prefixed keys
    ('Arguments', 'request_memory', ...) are regular submit commands.

    :param jobList: list of job dictionaries (WMAgent job info: 'cache_dir',
                    'sandbox', 'packageDir', 'id', 'retry_count', etc.)
    :return: list of dicts, one per job, in the same order as jobList.
    """
    undefined = 'UNDEFINED'
    jobParameters = []

    for job in jobList:
        ad = {}

        # Basic execution environment: working dir, sandbox + job package +
        # unpacker script transferred in, job report transferred back out.
        ad['initial_Dir'] = job['cache_dir']
        ad['transfer_input_files'] = "%s,%s/%s,%s" % (
            job['sandbox'], job['packageDir'], 'JobPackage.pkl', self.unpacker)
        ad['Arguments'] = "%s %i %s" % (os.path.basename(
            job['sandbox']), job['id'], job["retry_count"])
        ad['transfer_output_files'] = "Report.%i.pkl,wmagentJob.log" % job[
            "retry_count"]

        # Do not define Requirements and X509 ads for Volunteer resources
        if self.reqStr and "T3_CH_Volunteer" not in job.get(
                'possibleSites'):
            ad['Requirements'] = self.reqStr
            ad['My.x509userproxy'] = classad.quote(self.x509userproxy)

        # Site lists are advertised as comma-separated, sorted strings.
        sites = ','.join(sorted(job.get('possibleSites')))
        ad['My.DESIRED_Sites'] = classad.quote(str(sites))
        sites = ','.join(sorted(job.get('potentialSites')))
        ad['My.ExtDESIRED_Sites'] = classad.quote(str(sites))
        ad['My.CMS_JobRetryCount'] = str(job['retry_count'])
        ad['My.WMAgent_RequestName'] = classad.quote(job['request_name'])
        # The group is the first token after the leading "<user>_" prefix of
        # the request name, e.g. "user_GROUP-rest" -> "GROUP".
        match = re.compile("^[a-zA-Z0-9_]+_([a-zA-Z0-9]+)-").match(
            job['request_name'])
        if match:
            ad['My.CMSGroups'] = classad.quote(match.groups()[0])
        else:
            ad['My.CMSGroups'] = undefined
        ad['My.WMAgent_JobID'] = str(job['jobid'])
        ad['My.WMAgent_SubTaskName'] = classad.quote(job['task_name'])
        ad['My.CMS_JobType'] = classad.quote(job['task_type'])
        ad['My.CMS_Type'] = classad.quote(activityToType(job['activity']))

        # Handling for AWS, cloud and opportunistic resources
        ad['My.AllowOpportunistic'] = str(
            job.get('allowOpportunistic', False))

        # Input data bookkeeping; absent values are advertised as UNDEFINED.
        if job.get('inputDataset'):
            ad['My.DESIRED_CMSDataset'] = classad.quote(
                job['inputDataset'])
        else:
            ad['My.DESIRED_CMSDataset'] = undefined
        if job.get('inputDatasetLocations'):
            sites = ','.join(sorted(job['inputDatasetLocations']))
            ad['My.DESIRED_CMSDataLocations'] = classad.quote(str(sites))
        else:
            ad['My.DESIRED_CMSDataLocations'] = undefined
        if job.get('inputPileup'):
            cmsPileups = ','.join(sorted(job['inputPileup']))
            ad['My.DESIRED_CMSPileups'] = classad.quote(str(cmsPileups))
        else:
            ad['My.DESIRED_CMSPileups'] = undefined

        # HighIO and repack jobs
        ad['My.Requestioslots'] = str(
            1 if job['task_type'] in ["Merge", "Cleanup", "LogCollect"] else 0)
        ad['My.RequestRepackslots'] = str(1 if job['task_type'] == 'Repack'
                                          else 0)

        # Performance and resource estimates (including JDL magic tweaks)
        origCores = job.get('numberOfCores', 1)
        # Default wall time is 12 hours when no estimate is available.
        estimatedMins = int(
            job['estimatedJobTime'] /
            60.0) if job.get('estimatedJobTime') else 12 * 60
        estimatedMinsSingleCore = estimatedMins * origCores
        # For now, assume a 15 minute job startup overhead -- condor will round this up further
        ad['My.EstimatedSingleCoreMins'] = str(estimatedMinsSingleCore)
        ad['My.OriginalMaxWallTimeMins'] = str(estimatedMins)
        # The following values are ClassAd *expressions* evaluated by condor
        # at match/run time, hence stored unquoted.
        ad['My.MaxWallTimeMins'] = 'WMCore_ResizeJob ? (EstimatedSingleCoreMins/RequestCpus + 15) : OriginalMaxWallTimeMins'
        requestMemory = int(job['estimatedMemoryUsage']) if job.get(
            'estimatedMemoryUsage', None) else 1000
        ad['My.OriginalMemory'] = str(requestMemory)
        ad['My.ExtraMemory'] = str(self.extraMem)
        ad['request_memory'] = 'OriginalMemory + ExtraMemory * (WMCore_ResizeJob ? (RequestCpus-OriginalCpus) : 0)'
        # Default disk: 20 GB per core when no estimate is available (in KB).
        requestDisk = int(job['estimatedDiskUsage']) if job.get(
            'estimatedDiskUsage', None) else 20 * 1000 * 1000 * origCores
        ad['request_disk'] = str(requestDisk)

        # Set up JDL for multithreaded jobs.
        # By default, RequestCpus will evaluate to whatever CPU request was in the workflow.
        # If the job is labelled as resizable, then the logic is more complex:
        # - If the job is running in a slot with N cores, this should evaluate to N
        # - If the job is being matched against a machine, match all available CPUs, provided
        # they are between min and max CPUs.
        # - Otherwise, just use the original CPU count.
        # NOTE(review): origCores / 2 is float division on py3 — str() of a
        # float ends up in MinCores; confirm intended (compare the int() cast
        # used in the JDL-based sibling method).
        ad['My.MinCores'] = str(job.get('minCores', max(1, origCores / 2)))
        ad['My.MaxCores'] = str(
            max(int(job.get('maxCores', origCores)), origCores))
        ad['My.OriginalCpus'] = str(origCores)
        # Prefer slots that are closest to our MaxCores without going over.
        # If the slot size is _greater_ than our MaxCores, we prefer not to
        # use it - we might unnecessarily fragment the slot.
        ad['Rank'] = 'isUndefined(Cpus) ? 0 : ifThenElse(Cpus > MaxCores, -Cpus, Cpus)'
        # Record the number of CPUs utilized at match time. We'll use this later
        # for monitoring and accounting. Defaults to 0; once matched, it'll
        # put an attribute in the job MATCH_EXP_JOB_GLIDEIN_Cpus = 4
        ad['My.JOB_GLIDEIN_Cpus'] = classad.quote("$$(Cpus:0)")
        # Make sure the resize request stays within MinCores and MaxCores.
        ad['My.RequestResizedCpus'] = '(Cpus>MaxCores) ? MaxCores : ((Cpus < MinCores) ? MinCores : Cpus)'
        # If the job is running, then we should report the matched CPUs in RequestCpus - but only if there are sane
        # values. Otherwise, we just report the original CPU request
        ad['My.JobCpus'] = (
            '((JobStatus =!= 1) && (JobStatus =!= 5) && !isUndefined(MATCH_EXP_JOB_GLIDEIN_Cpus) '
            '&& (int(MATCH_EXP_JOB_GLIDEIN_Cpus) isnt error)) ? int(MATCH_EXP_JOB_GLIDEIN_Cpus) : OriginalCpus'
        )
        # Cpus is taken from the machine ad - hence it is only defined when we are doing negotiation.
        # Otherwise, we use either the cores in the running job (if available) or the original cores.
        ad['request_cpus'] = 'WMCore_ResizeJob ? (!isUndefined(Cpus) ? RequestResizedCpus : JobCpus) : OriginalCpus'
        ad['My.WMCore_ResizeJob'] = str(job.get('resizeJob', False))

        # Priorities: workflow priority dominates; later jobs in large tasks
        # and higher task ids are de-prioritized via the PostJobPrio tie-breakers.
        taskPriority = int(job.get('taskPriority', 1))
        priority = int(job.get('wf_priority', 0))
        ad['My.JobPrio'] = str(int(priority + taskPriority * 1))
        ad['My.PostJobPrio1'] = str(
            int(-1 * len(job.get('potentialSites', []))))
        ad['My.PostJobPrio2'] = str(int(-1 * job['task_id']))

        # Add OS requirements for jobs
        requiredOSes = self.scramArchtoRequiredOS(job.get('scramArch'))
        ad['My.REQUIRED_OS'] = classad.quote(requiredOSes)
        # assumes job['swVersion'] is a list of version strings — a bare
        # string would be split into characters by join; TODO confirm caller.
        cmsswVersions = ','.join(job.get('swVersion'))
        ad['My.CMSSW_Versions'] = classad.quote(cmsswVersions)

        jobParameters.append(ad)

    return jobParameters
def test_quote_unquote_is_symmetric(input):
    """Round-tripping a value through classad.quote/unquote yields it back."""
    quoted = classad.quote(input)
    assert classad.unquote(quoted) == input
def customizePerJob(self, job):
    """
    JDL additions just for this implementation. Over-ridden in sub-classes
    These are the Glide-in specific bits

    :param job: job dictionary (expects 'location', 'requestName', 'taskName',
                'taskType', resource-estimate keys, etc.)
    :return: list of JDL lines (each newline-terminated); empty list when the
             job has no valid location.
    """
    jdl = []

    jobCE = job['location']
    if not jobCE:
        # Then we ended up with a site that doesn't exist?
        logging.error("Job for non-existant site %s",
                      job['location'])
        return jdl

    # In WMS mode the full site lists are advertised; otherwise only the
    # single CE the job was assigned to.
    if self.submitWMSMode and len(job.get('possibleSites', [])) > 0:
        strg = ','.join([str(x) for x in job.get('possibleSites')])
        jdl.append('+DESIRED_Sites = \"%s\"\n' % strg)
    else:
        jdl.append('+DESIRED_Sites = \"%s\"\n' % (jobCE))

    if self.submitWMSMode and len(job.get('potentialSites', [])) > 0:
        strg = ','.join([str(x) for x in job.get('potentialSites')])
        jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % strg)
    else:
        jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % (jobCE))

    if job.get('proxyPath'):
        jdl.append('x509userproxy = %s\n' % job['proxyPath'])

    jdl.append('+WMAgent_RequestName = "%s"\n' % job['requestName'])

    # Group is extracted from the request name; undefined when no match.
    match = GROUP_NAME_RE.match(job['requestName'])
    if match:
        jdl.append('+CMSGroups = %s\n' % classad.quote(match.groups()[0]))
    else:
        jdl.append('+CMSGroups = undefined\n')

    jdl.append('+WMAgent_SubTaskName = "%s"\n' % job['taskName'])
    jdl.append('+CMS_JobType = "%s"\n' % job['taskType'])

    # Handling for AWS, cloud and opportunistic resources
    jdl.append('+AllowOpportunistic = %s\n' % job.get('allowOpportunistic', False))

    # dataset info
    if job.get('inputDataset'):
        jdl.append('+DESIRED_CMSDataset = "%s"\n' % job['inputDataset'])
    else:
        jdl.append('+DESIRED_CMSDataset = undefined\n')
    if job.get('inputDatasetLocations'):
        jdl.append('+DESIRED_CMSDataLocations = "%s"\n' % ','.join(job['inputDatasetLocations']))
    else:
        jdl.append('+DESIRED_CMSDataLocations = undefined\n')

    # HighIO and repack jobs handling
    highio = 1 if job['taskType'] in ["Merge", "Cleanup", "LogCollect"] else 0
    repackjob = 1 if job['taskType'] == 'Repack' else 0
    jdl.append('+Requestioslots = %d\n' % highio)
    jdl.append('+RequestRepackslots = %d\n' % repackjob)

    # Performance and resource estimates (including JDL magic tweaks)
    origCores = job.get('numberOfCores', 1)
    # Default wall time: 12 hours when no estimate is available.
    estimatedMins = int(job['estimatedJobTime'] / 60.0) if job.get('estimatedJobTime') else 12 * 60
    estimatedMinsSingleCore = estimatedMins * origCores
    # For now, assume a 15 minute job startup overhead -- condor will round this up further
    jdl.append('+EstimatedSingleCoreMins = %d\n' % estimatedMinsSingleCore)
    jdl.append('+OriginalMaxWallTimeMins = %d\n' % estimatedMins)
    jdl.append(
        '+MaxWallTimeMins = WMCore_ResizeJob ? (EstimatedSingleCoreMins/RequestCpus + 15) : OriginalMaxWallTimeMins\n'
    )
    requestMemory = int(job['estimatedMemoryUsage']) if job.get(
        'estimatedMemoryUsage', None) else 1000
    jdl.append('+OriginalMemory = %d\n' % requestMemory)
    jdl.append('+ExtraMemory = %d\n' % self.extraMem)
    jdl.append(
        '+RequestMemory = OriginalMemory + ExtraMemory * (WMCore_ResizeJob ? (RequestCpus-OriginalCpus) : 0)\n'
    )
    # Default disk: 20 GB per core when no estimate is available (in KB).
    requestDisk = int(job['estimatedDiskUsage']) if job.get(
        'estimatedDiskUsage', None) else 20 * 1000 * 1000 * origCores
    jdl.append('request_disk = %d\n' % requestDisk)

    # Set up JDL for multithreaded jobs.
    # By default, RequestCpus will evaluate to whatever CPU request was in the workflow.
    # If the job is labelled as resizable, then the logic is more complex:
    # - If the job is running in a slot with N cores, this should evaluate to N
    # - If the job is being matched against a machine, match all available CPUs, provided
    # they are between min and max CPUs.
    # - Otherwise, just use the original CPU count.
    jdl.append('machine_count = 1\n')
    minCores = int(job.get('minCores', max(1, origCores / 2)))
    maxCores = max(int(job.get('maxCores', origCores)), origCores)
    jdl.append('+MinCores = %d\n' % minCores)
    jdl.append('+MaxCores = %d\n' % maxCores)
    # Advertise the original CPU setting, in case someone needs this for monitoring
    jdl.append('+OriginalCpus = %d\n' % origCores)
    # Prefer slots that are closest to our MaxCores without going over.
    # If the slot size is _greater_ than our MaxCores, we prefer not to
    # use it - we might unnecessarily fragment the slot.
    jdl.append(
        'rank = isUndefined(Cpus) ? 0 : ifThenElse(Cpus > MaxCores, -Cpus, Cpus)\n'
    )
    # Record the number of CPUs utilized at match time. We'll use this later
    # for monitoring and accounting. Defaults to 0; once matched, it'll
    # put an attribute in the job MATCH_EXP_JOB_GLIDEIN_Cpus = 4
    jdl.append('+JOB_GLIDEIN_Cpus = "$$(Cpus:0)"\n')
    # Make sure the resize request stays within MinCores and MaxCores.
    jdl.append(
        '+RequestResizedCpus = (Cpus>MaxCores) ? MaxCores : ((Cpus < MinCores) ? MinCores : Cpus)\n'
    )
    # If the job is running, then we should report the matched CPUs in RequestCpus - but only if there are sane
    # values. Otherwise, we just report the original CPU request
    jdl.append(
        '+JobCpus = ((JobStatus =!= 1) && (JobStatus =!= 5) && !isUndefined(MATCH_EXP_JOB_GLIDEIN_Cpus) '
        '&& (int(MATCH_EXP_JOB_GLIDEIN_Cpus) isnt error)) ? int(MATCH_EXP_JOB_GLIDEIN_Cpus) : OriginalCpus\n'
    )
    # Cpus is taken from the machine ad - hence it is only defined when we are doing negotiation.
    # Otherwise, we use either the cores in the running job (if available) or the original cores.
    jdl.append(
        '+RequestCpus = WMCore_ResizeJob ? (!isUndefined(Cpus) ? RequestResizedCpus : JobCpus) : OriginalCpus\n'
    )
    jdl.append('+WMCore_ResizeJob = %s\n' % bool(job.get('resizeJob', False)))

    # Add OS requirements for jobs
    requiredOSes = self.scramArchtoRequiredOS(job.get('scramArch'))
    jdl.append('+REQUIRED_OS = "%s"\n' % requiredOSes)

    return jdl
def track(self, jobs):
    """
    _track_

    Track the jobs while in condor

    This returns a three-way ntuple
    First, the total number of jobs still running
    Second, the jobs that need to be changed
    Third, the jobs that need to be completed
    """
    runningList = []
    changeList = []
    completeList = []
    adInfo = {}

    # Pull status + matched site for every job belonging to this agent.
    logging.debug("SimpleCondorPlugin is going to track %s jobs", len(jobs))
    schedd = htcondor.Schedd()
    logging.debug("Start: Retrieving classAds using Condor Python XQuery")
    constraint = "WMAgent_AgentName == %s" % classad.quote(self.agent)
    projection = ['ClusterId', 'ProcId', 'JobStatus', 'MATCH_EXP_JOBGLIDEIN_CMSSite']
    try:
        for jobAd in schedd.xquery(constraint, projection):
            gridId = "%s.%s" % (jobAd['ClusterId'], jobAd['ProcId'])
            statusName = SimpleCondorPlugin.exitCodeMap().get(jobAd.get('JobStatus'), 'Unknown')
            adInfo[gridId] = (statusName, jobAd.get('MATCH_EXP_JOBGLIDEIN_CMSSite', None))
    except Exception as ex:
        # Best-effort: on any schedd failure, report nothing rather than crash.
        logging.error("Query to condor schedd failed in SimpleCondorPlugin.")
        logging.error("Returning empty lists for all job types...")
        logging.exception(ex)
        return runningList, changeList, completeList

    logging.debug("Finished retrieving %d classAds from Condor", len(adInfo))

    # Classify each tracked job against what the schedd reported.
    for job in jobs:
        # A job the schedd no longer knows about is treated as complete;
        # doing any further checks is not cost effective.
        newStatus, location = adInfo.get(job['gridid'], ('Completed', None))

        if newStatus != job['status']:
            # On the Idle->Running transition, record where the job landed.
            if newStatus == 'Running' and job['status'] == 'Idle' and location:
                job['location'] = location
                logging.debug("JobAdInfo: Job location for jobid=%i gridid=%s changed to %s",
                              job['jobid'], job['gridid'], location)
            job['status'] = newStatus
            job['status_time'] = int(time.time())
            logging.debug("JobAdInfo: Job status for jobid=%i gridid=%s changed to %s",
                          job['jobid'], job['gridid'], job['status'])
            changeList.append(job)

        job['globalState'] = SimpleCondorPlugin.stateMap().get(newStatus)
        if job['globalState'] in ['Complete', 'Error']:
            # stop tracking finished jobs
            completeList.append(job)
        else:
            runningList.append(job)

    logging.debug("SimpleCondorPlugin tracking : %i/%i/%i (Executing/Changing/Complete)",
                  len(runningList), len(changeList), len(completeList))

    return runningList, changeList, completeList
def alter_submit(self, crab_retry):
    """
    Copy the content of the generic file Job.submit into a job-specific file
    Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
    Add also parameters that can be overwritten at each manual job resubmission
    (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).

    :param crab_retry: integer retry number for this job.
    :return: None. Side effect: writes the file Job.<job_id>.submit and
             updates self.resubmit_info for the current retry.
    """
    ## Start the Job.<job_id>.submit content with the CRAB_Retry.
    new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
    msg = "Setting CRAB_Retry = %s" % (crab_retry)
    self.logger.info(msg)
    ## For the parameters that can be overwritten at each manual job resubmission,
    ## read them from the task ad, unless there is resubmission information there
    ## and this job is not one that has to be resubmitted, in which case we should
    ## use the same parameters (site black- and whitelists, requested memory, etc)
    ## as used by the previous job retry (which are saved in self.resubmit_info).
    CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
    use_resubmit_info = False
    resubmit_jobids = []
    if 'CRAB_ResubmitList' in self.task_ad:
        resubmit_jobids = self.task_ad['CRAB_ResubmitList']
        try:
            resubmit_jobids = set(resubmit_jobids)
            if resubmit_jobids and self.job_id not in resubmit_jobids:
                use_resubmit_info = True
        except TypeError:
            # A non-iterable CRAB_ResubmitList means "resubmit all jobs".
            resubmit_jobids = True
    ## If there is no resubmit_info, we can of course not use it.
    if not self.resubmit_info:
        use_resubmit_info = False
    ## Get the resubmission parameters.
    maxjobruntime = None
    maxmemory = None
    numcores = None
    priority = None
    if not use_resubmit_info:
        #if 'MaxWallTimeMins_RAW' in self.task_ad:
        #    if self.task_ad['MaxWallTimeMins_RAW'] != 1315:
        #        maxjobruntime = self.task_ad.lookup('MaxWallTimeMins_RAW')
        #        self.resubmit_info['maxjobruntime'] = maxjobruntime
        # lookup() returns a ClassAd value; int(str(...)) normalizes it to int.
        if 'MaxWallTimeMins' in self.task_ad:
            maxjobruntime = int(str(
                self.task_ad.lookup('MaxWallTimeMins')))
        if 'RequestMemory' in self.task_ad:
            maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
        if 'RequestCpus' in self.task_ad:
            numcores = int(str(self.task_ad.lookup('RequestCpus')))
        if 'JobPrio' in self.task_ad:
            priority = int(str(self.task_ad['JobPrio']))
    else:
        # Walk back to the most recent retry for which resubmit info exists.
        inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
        while inkey not in self.resubmit_info and int(inkey) > 0:
            inkey = str(int(inkey) - 1)
        maxjobruntime = self.resubmit_info[inkey].get('maxjobruntime')
        maxmemory = self.resubmit_info[inkey].get('maxmemory')
        numcores = self.resubmit_info[inkey].get('numcores')
        priority = self.resubmit_info[inkey].get('priority')
    ## Save the (new) values of the resubmission parameters in self.resubmit_info
    ## for the current job retry number.
    outkey = str(crab_retry)
    if outkey not in self.resubmit_info:
        self.resubmit_info[outkey] = {}
    self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
    self.resubmit_info[outkey]['maxmemory'] = maxmemory
    self.resubmit_info[outkey]['numcores'] = numcores
    self.resubmit_info[outkey]['priority'] = priority
    self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
    self.resubmit_info[outkey][
        'CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad
    ## Add the resubmission parameters to the Job.<job_id>.submit content.
    if maxjobruntime is not None:
        new_submit_text += '+MaxWallTimeMins = %s\n' % (str(maxjobruntime))
    if maxmemory is not None:
        new_submit_text += '+RequestMemory = %s\n' % (str(maxmemory))
    if numcores is not None:
        new_submit_text += '+RequestCpus = %s\n' % (str(numcores))
    if priority is not None:
        new_submit_text += '+JobPrio = %s\n' % (str(priority))
    ## Within the schedd, order the first few jobs in the task before all other tasks of the same priority.
    pre_job_prio = 1
    if self.job_id <= 5:
        pre_job_prio = 0
    new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio
    ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
    ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
    ## run the job with the higher PostJobPrio1.
    new_submit_text += '+PostJobPrio1 = -%s\n' % str(
        self.task_ad.lookup('QDate'))
    ## Order retries before all other jobs in this task
    new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry
    ## Add the site black- and whitelists and the DESIRED_SITES to the
    ## Job.<job_id>.submit content.
    new_submit_text = self.redo_sites(new_submit_text, crab_retry,
                                      use_resubmit_info)
    ## Add group information:
    username = self.task_ad.get('CRAB_UserHN')
    if 'CMSGroups' in self.task_ad:
        new_submit_text += '+CMSGroups = %s\n' % classad.quote(
            self.task_ad['CMSGroups'])
    elif username:
        groups = CMSGroupMapper.map_user_to_groups(username)
        if groups:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)
    ## Finally add (copy) all the content of the generic Job.submit file.
    with open("Job.submit", 'r') as fd:
        new_submit_text += fd.read()
    ## Write the Job.<job_id>.submit file.
    with open("Job.%d.submit" % (self.job_id), 'w') as fd:
        fd.write(new_submit_text)
def parse_job_set_file(self, job_set_file):
    """
    Parse a job set description file and populate self.name, self.itemdata
    and self.jobs.

    The file format is line-oriented: blank lines and '#' comments are
    skipped; recognized commands are 'name = <str>', 'iterator = table
    <cols...> <source>' and 'job [var=newvar,...] <source>', where <source>
    is either a filename or '{' opening an inline block closed by '}'.

    :param job_set_file: path of the job set description file.
    :raises IndexError: on a malformed command line.
    :raises ValueError: on unknown commands, duplicate name/iterator,
                        bad mappings or unclosed inline brackets.
    :raises IOError: when a referenced table/submit file cannot be opened.
    """
    commands = {"name", "iterator", "job"}
    iterator_types = {"table"}
    lineno = 0
    with open(job_set_file, "rt") as f:
        # NOTE: 'while f' is always true for an open file object; the loop
        # is actually terminated by the readline() == "" EOF check below.
        while f:
            line = f.readline()
            if line == "":
                break
            lineno += 1
            line = line.strip()
            if line == "" or line.startswith("#"):
                continue
            # The command is the first token, up to an optional '='.
            try:
                command = line.split()[0].split("=")[0].casefold()
            except IndexError:
                raise IndexError(
                    f"""Malformed command in {job_set_file} at line {lineno}."""
                )
            if command not in commands:
                raise ValueError(
                    f"""Unrecognized command "{command}" in {job_set_file} at line {lineno}."""
                )
            if command == "name":
                if self.name is not None:
                    raise ValueError(
                        f"""Job set name can only be set once, second name found in {job_set_file} at line {lineno}."""
                    )
                # Value is everything after '=', with trailing comment stripped.
                try:
                    value = line.split("#")[0].split("=")[1].strip()
                except IndexError:
                    raise IndexError(
                        f"""Malformed {command} command in {job_set_file} at line {lineno}."""
                    )
                if value.strip() == "":
                    raise ValueError(
                        f"""Blank job set name found in {job_set_file} at line {lineno}."""
                    )
                self.name = value
            elif command == "iterator":
                if self.itemdata is not None:
                    raise ValueError(
                        f"""Job set iterator can only be set once, second iterator found in {job_set_file} at line {lineno}."""
                    )
                try:
                    value = line.split("#")[0].split("=")[1].strip()
                except IndexError:
                    raise IndexError(
                        f"""Malformed {command} command in {job_set_file} at line {lineno}."""
                    )
                # Need at least: <type> <one column name> <source>.
                if len(value.split()) < 3:
                    raise ValueError(
                        f"""Unparseable iterator "{value}" in {job_set_file} at line {lineno}."""
                    )
                iterator_type = value.split()[0]
                if iterator_type not in iterator_types:
                    raise ValueError(
                        f"""Unknown iterator type "{iterator_type}" in {job_set_file} at line {lineno}."""
                    )
                if iterator_type == "table":
                    # Get the column names
                    iterator_names = value.replace(",", " ").split()[1:-1]
                    iterator_names = [x.strip() for x in iterator_names]
                    # Read the iterator values into a itemdata list of dicts
                    iterator_source = value.split()[-1]
                    if iterator_source == "{":
                        # Inline table: consume lines until the closing '}'.
                        inline = "{"
                        inlineno = 0
                        inline_data = ""
                        while inline != "":
                            inline = f.readline()
                            inlineno += 1
                            if inline.strip() == "":
                                continue
                            if inline.split("#")[0].strip() == "}":
                                break
                            # Assume that a newly opened bracket without
                            # a closing bracket means that there was an error.
                            try:
                                if inline.split(
                                        "#")[0].split()[-1].strip() == "{":
                                    raise ValueError(
                                        f"""Unclosed bracket in {job_set_file} starting at line {lineno}."""
                                    )
                            except IndexError:
                                pass  # Let the parser handle this situation
                            inline_data += inline
                        else:
                            # while-else: EOF reached without a closing '}'.
                            raise ValueError(
                                f"""Unclosed bracket in {job_set_file} starting at line {lineno}."""
                            )
                        self.itemdata = self.parse_columnar_itemdata(
                            iterator_names, inline_data, lineno=lineno,
                            fname=job_set_file)
                        lineno += inlineno
                    else:
                        # External table file.
                        try:
                            with open(iterator_source, "rt") as f_iter:
                                self.itemdata = self.parse_columnar_itemdata(
                                    iterator_names, f_iter.read(),
                                    fname=iterator_source)
                        except IOError as e:
                            raise IOError(
                                f"Error opening table file {iterator_source} in {job_set_file} at line {lineno}:\n{str(e)}"
                            )
            elif command == "job":
                try:
                    value = " ".join(
                        line.split("#")[0].strip().split()[1:])
                except IndexError:
                    raise IndexError(
                        f"""Malformed {command} command in {job_set_file} at line {lineno}."""
                    )
                # Get the variable name mappings
                mappings = []
                if len(value.split()) > 1:
                    mapping_strs = ",".join(value.split()[:-1])
                    if "=" in mapping_strs:
                        for mapping_str in mapping_strs.split(","):
                            mapping = tuple(
                                x.strip() for x in mapping_str.split("="))
                            if len(mapping) != 2:
                                raise ValueError(
                                    f"""Unsupported mapping "{mapping_str}" in {job_set_file} at line {lineno}."""
                                )
                            mappings.append(mapping)
                    else:
                        raise ValueError(
                            f"""Unsupported mapping "{' '.join(value.split()[:-1])}" in {job_set_file} at line {lineno}."""
                        )
                mappings = dict(mappings)
                # Read the job submit description into a Submit object
                job_source = value.split()[-1]
                if job_source == "{":
                    # Inline submit description: consume lines until '}'.
                    inline = "{"
                    inlineno = 0
                    inline_data = ""
                    while inline != "":
                        inline = f.readline()
                        inlineno += 1
                        if inline.strip() == "":
                            continue
                        if inline.split("#")[0].strip() == "}":
                            break
                        # Assume that a newly opened bracket without
                        # a closing bracket means that there was an error.
                        try:
                            if inline.split(
                                    "#")[0].split()[-1].strip() == "{":
                                raise ValueError(
                                    f"""Unclosed bracket in {job_set_file} starting at line {lineno}."""
                                )
                        except IndexError:
                            pass  # Let the parser handle this situation
                        inline_data += inline.lstrip()
                    else:
                        # while-else: EOF reached without a closing '}'.
                        raise ValueError(
                            f"""Unclosed bracket in {job_set_file} starting at line {lineno}."""
                        )
                    lineno += inlineno
                    submit_obj = htcondor.Submit(inline_data)
                    #Set s_method to HTC_JOBSET_SUBMIT
                    submit_obj.setSubmitMethod(JSM_HTC_JOBSET_SUBMIT, True)
                else:
                    # External submit description file.
                    try:
                        with open(job_source, "rt") as f_sub:
                            submit_obj = htcondor.Submit(f_sub.read())
                            #Set s_method to HTC_JOBSET_SUBMIT
                            submit_obj.setSubmitMethod(
                                JSM_HTC_JOBSET_SUBMIT, True)
                    except IOError as e:
                        raise IOError(
                            f"Error opening submit description file {job_source} in {job_set_file} at line {lineno}:\n{str(e)}"
                        )
                # Remap variables in the Submit object
                submit_obj = self.remap_submit_variables(
                    mappings, submit_obj)
                # Store each job
                self.jobs.append(submit_obj)
    # Add job set name to each job's Submit object
    # NOTE(review): if the file never set a name, self.name is still None
    # here and classad.quote(None) would fail — presumably a name command
    # is mandatory; confirm against the caller.
    for i_job, job in enumerate(self.jobs):
        job["MY.JobSetName"] = classad.quote(self.name)
        job["MY.InJobSet"] = True
def submitDirect(self, schedd, cmd, arg, info):  # pylint: disable=R0201
    """
    Submit directly to the schedd using the HTCondor module

    :param schedd: htcondor.Schedd object to submit to.
    :param cmd: executable for the dagman bootstrap job.
    :param arg: argument string for the job.
    :param info: dict of task parameters (user_proxy, scratch, CMSGroups,
                 inputFilesString, start_time, ...).
    :return: the ClusterId of the submitted task.
    :raises TaskWorkerException: when the submission subprocess reports failure.
    """
    dagAd = classad.ClassAd()
    addCRABInfoToClassAd(dagAd, info)

    if info["CMSGroups"]:
        dagAd["CMSGroups"] = ",".join(info["CMSGroups"])
    else:
        dagAd["CMSGroups"] = classad.Value.Undefined

    # NOTE: Changes here must be synchronized with the job_submit in DagmanCreator.py in CAFTaskWorker
    dagAd["CRAB_Attempt"] = 0
    # We switched from local to scheduler universe. Why? It seems there's no way in the
    # local universe to change the hold signal at runtime. That's fairly important for our
    # resubmit implementation.
    # dagAd["JobUniverse"] = 12
    dagAd["JobUniverse"] = 7
    dagAd["HoldKillSig"] = "SIGUSR1"
    dagAd["X509UserProxy"] = info["user_proxy"]
    dagAd["Requirements"] = classad.ExprTree("true || false")
    dagAd["TaskType"] = "ROOT"
    dagAd["Environment"] = classad.ExprTree(
        'strcat("PATH=/usr/bin:/bin CRAB3_VERSION=3.3.0-pre1 CONDOR_ID=", ClusterId, ".", ProcId," %s")'
        % " ".join(info["additional_environment_options"].split(";"))
    )
    dagAd["RemoteCondorSetup"] = info["remote_condor_setup"]

    # Dump the ad (proxy path reduced to its basename) for later inspection;
    # strings are classad-quoted, expressions written via repr.
    with open("subdag.ad", "w") as fd:
        for k, v in dagAd.items():
            if k == "X509UserProxy":
                v = os.path.basename(v)
            if isinstance(v, basestring):
                value = classad.quote(v)
            elif isinstance(v, classad.ExprTree):
                value = repr(v)
            else:
                value = v
            fd.write("+{0} = {1}\n".format(k, value))

    dagAd["Out"] = str(os.path.join(info["scratch"], "request.out"))
    dagAd["Err"] = str(os.path.join(info["scratch"], "request.err"))
    dagAd["Cmd"] = cmd
    dagAd["Args"] = arg
    dagAd["TransferInput"] = str(info["inputFilesString"])
    dagAd["CRAB_TaskSubmitTime"] = classad.ExprTree("%s" % info["start_time"].encode("ascii", "ignore"))
    # Putting JobStatus == 4 since LeaveJobInQueue is for completed jobs (probably redundant)
    LEAVE_JOB_IN_QUEUE_EXPR = "(JobStatus == 4) && ((time()-CRAB_TaskSubmitTime) < %s)" % TASKLIFETIME
    dagAd["LeaveJobInQueue"] = classad.ExprTree(LEAVE_JOB_IN_QUEUE_EXPR)
    # Removing a task after the expiration date no matter what its status is
    dagAd["PeriodicRemove"] = classad.ExprTree("((time()-CRAB_TaskSubmitTime) > %s)" % TASKLIFETIME)
    dagAd["TransferOutput"] = info["outputFilesString"]
    dagAd["OnExitRemove"] = classad.ExprTree(
        "( ExitSignal =?= 11 || (ExitCode =!= UNDEFINED && ExitCode >=0 && ExitCode <= 2))"
    )
    dagAd["OtherJobRemoveRequirements"] = classad.ExprTree("DAGManJobId =?= ClusterId")
    dagAd["RemoveKillSig"] = "SIGUSR1"
    dagAd["OnExitHold"] = classad.ExprTree("(ExitCode =!= UNDEFINED && ExitCode != 0)")

    # The actual submit runs in a forked child authenticated with the user
    # proxy; the child pickles its results back through rpipe.
    condorIdDict = {}
    with HTCondorUtils.AuthenticatedSubprocess(info["user_proxy"], pickleOut=True, outputObj=condorIdDict) as (
        parent,
        rpipe,
    ):
        if not parent:
            resultAds = []
            condorIdDict["ClusterId"] = schedd.submit(dagAd, 1, True, resultAds)
            schedd.spool(resultAds)
            # editing the LeaveJobInQueue since the remote submit overwrites it
            # see https://github.com/dmwm/CRABServer/pull/5212#issuecomment-216519749
            if resultAds:
                id_ = "%s.%s" % (resultAds[0]["ClusterId"], resultAds[0]["ProcId"])
                schedd.edit([id_], "LeaveJobInQueue", classad.ExprTree(LEAVE_JOB_IN_QUEUE_EXPR))
    results = pickle.load(rpipe)
    # notice that the clusterId might be set even if there was a failure. This is if the schedd.submit succeded, but the spool call failed
    if "ClusterId" in results.outputObj:
        self.logger.debug("Condor cluster ID just submitted is: %s", results.outputObj["ClusterId"])
    if results.outputMessage != "OK":
        self.logger.debug(
            "Now printing the environment used for submission:\n" + "-" * 70 + "\n" + results.environmentStr + "-" * 70
        )
        raise TaskWorkerException(
            "Failure when submitting task to scheduler. Error reason: '%s'" % results.outputMessage, retry=True
        )

    # if we don't raise exception above the id is here
    return results.outputObj["ClusterId"]
def main():
    """
    AdjustSites entry point: runs on the schedd inside a CRAB task.

    Reads the job ad from _CONDOR_JOB_AD, sets up and uploads the webdir,
    adjusts DAG node logs and max retries for resubmitted jobs, applies any
    pending site-ad update, and holds/releases tail DAGs around the edits.
    Exits 0 when there is nothing to do, 1 when the webdir upload fails.
    """
    setupLog()

    # Without the job ad there is nothing to adjust — exit successfully.
    if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists(os.environ["_CONDOR_JOB_AD"]):
        printLog("Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist")
        sys.exit(0)

    printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" % os.environ['_CONDOR_JOB_AD'])

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)

    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

    # Retry the upload up to 3 times with a growing (0s, 20s, 40s) backoff.
    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries * 20)
        retries += 1
    if exitCode != 0:
        printLog("Exiting AdjustSites because the webdir upload failed three times.")
        sys.exit(1)

    printLog("Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode)

    saveProxiedWebdir(ad)

    printLog("Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions")

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            # A non-iterable CRAB_ResubmitList means "resubmit all failed jobs".
            resubmitJobIds = True

    # Hold and release processing and tail DAGs here so that modifications
    # to the submission and log files will be picked up.
    schedd = htcondor.Schedd()
    tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote(ad.get("CRAB_ReqName"))
    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Holding processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGKILL')
        schedd.act(htcondor.JobAction.Hold, tailconst)

    if resubmitJobIds:
        adjustedJobIds = []
        filenames = getGlob(ad, "RunJobs.dag.nodes.log", "RunJobs[1-9]*.subdag.nodes.log")
        for fn in filenames:
            if hasattr(htcondor, 'lock'):
                # While dagman is not running at this point, the schedd may be writing events to this
                # file; hence, we only edit the file while holding an appropriate lock.
                # Note this lock method didn't exist until 8.1.6; prior to this, we simply
                # run dangerously.
                with htcondor.lock(open(fn, 'a'), htcondor.LockType.WriteLock):
                    adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn))
            else:
                adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn))
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    # Merge any pending site-ad update into the on-disk site.ad file.
    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Releasing processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1')
        schedd.act(htcondor.JobAction.Release, tailconst)

    printLog("Exiting AdjustSite")
def alter_submit(self, crab_retry):
    """
    Copy the content of the generic file Job.submit into a job-specific file
    Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
    Add also parameters that can be overwritten at each manual job resubmission
    (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).
    """
    ## Start the Job.<job_id>.submit content with the CRAB_Retry.
    new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
    msg = "Setting CRAB_Retry = %s" % (crab_retry)
    self.logger.info(msg)
    ## For the parameters that can be overwritten at each manual job resubmission,
    ## read them from the task ad, unless there is resubmission information there
    ## and this job is not one that has to be resubmitted, in which case we should
    ## use the same parameters (site black- and whitelists, requested memory, etc)
    ## as used by the previous job retry (which are saved in self.resubmit_info).
    CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
    use_resubmit_info = False
    resubmit_jobids = []
    if 'CRAB_ResubmitList' in self.task_ad:
        resubmit_jobids = self.task_ad['CRAB_ResubmitList']
        try:
            ## The str conversion must happen inside the try block: when the ad
            ## value is not iterable (meaning "resubmit all failed jobs"),
            ## map()/set() raise TypeError and we fall back to True below.
            ## Previously map() ran before the try and crashed in that case.
            resubmit_jobids = set(map(str, resubmit_jobids))
            if resubmit_jobids and self.job_id not in resubmit_jobids:
                use_resubmit_info = True
        except TypeError:
            resubmit_jobids = True
    ## If there is no resubmit_info, we can of course not use it.
    if not self.resubmit_info:
        use_resubmit_info = False
    ## Get the resubmission parameters.
    maxjobruntime = None
    maxmemory = None
    numcores = None
    priority = None
    if not use_resubmit_info:
        ## Probe and tail stages have their own wall-time limits in the task ad.
        if 'MaxWallTimeMinsProbe' in self.task_ad and self.stage == 'probe':
            maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMinsProbe')))
        elif 'MaxWallTimeMinsTail' in self.task_ad and self.stage == 'tail':
            maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMinsTail')))
        elif 'MaxWallTimeMins' in self.task_ad:
            maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMins')))
        if 'RequestMemory' in self.task_ad:
            maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
        if 'RequestCpus' in self.task_ad:
            numcores = int(str(self.task_ad.lookup('RequestCpus')))
        if 'JobPrio' in self.task_ad:
            priority = int(str(self.task_ad['JobPrio']))
        if str(self.job_id) == '0':  # jobids can be like 1-1 for subjobs
            priority = 20  # the maximum for splitting jobs
    else:
        ## Walk back from the previous retry number until we find a key in
        ## resubmit_info that actually has saved parameters.
        inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
        while inkey not in self.resubmit_info and int(inkey) > 0:
            inkey = str(int(inkey) - 1)
        maxjobruntime = self.resubmit_info[inkey].get('maxjobruntime')
        maxmemory = self.resubmit_info[inkey].get('maxmemory')
        numcores = self.resubmit_info[inkey].get('numcores')
        priority = self.resubmit_info[inkey].get('priority')
    ## Save the (new) values of the resubmission parameters in self.resubmit_info
    ## for the current job retry number.
    outkey = str(crab_retry)
    if outkey not in self.resubmit_info:
        self.resubmit_info[outkey] = {}
    self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
    self.resubmit_info[outkey]['maxmemory'] = maxmemory
    self.resubmit_info[outkey]['numcores'] = numcores
    self.resubmit_info[outkey]['priority'] = priority
    self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
    self.resubmit_info[outkey]['CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad
    ## Add the resubmission parameters to the Job.<job_id>.submit content.
    if maxjobruntime is not None:
        new_submit_text += '+EstimatedWallTimeMins = %s\n' % str(maxjobruntime)
        new_submit_text += '+MaxWallTimeMins = (JobStatus=?=1) ? EstimatedWallTimeMins : %s\n' % str(maxjobruntime)
    if maxmemory is not None:
        new_submit_text += '+RequestMemory = %s\n' % (str(maxmemory))
    if numcores is not None:
        new_submit_text += '+RequestCpus = %s\n' % (str(numcores))
    if priority is not None:
        new_submit_text += '+JobPrio = %s\n' % (str(priority))
    ## Within the schedd, order the first few jobs in the task before all other tasks of the same priority.
    pre_job_prio = 1
    if int(self.job_id.split('-')[0]) <= 5:
        pre_job_prio = 0
    new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio
    ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
    ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
    ## run the job with the higher PostJobPrio1.
    new_submit_text += '+PostJobPrio1 = -%s\n' % str(self.task_ad.lookup('QDate'))
    ## Order retries before all other jobs in this task
    new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry
    ## Add the site black- and whitelists and the DESIRED_SITES to the
    ## Job.<job_id>.submit content.
    new_submit_text = self.redo_sites(new_submit_text, crab_retry, use_resubmit_info)
    ## Add group information:
    username = self.task_ad.get('CRAB_UserHN')
    if 'CMSGroups' in self.task_ad:
        new_submit_text += '+CMSGroups = %s\n' % classad.quote(self.task_ad['CMSGroups'])
    elif username:
        groups = CMSGroupMapper.map_user_to_groups(username)
        if groups:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)
    ## Finally add (copy) all the content of the generic Job.submit file.
    with open("Job.submit", 'r') as fd:
        new_submit_text += fd.read()
    ## Write the Job.<job_id>.submit file.
    with open("Job.%s.submit" % (self.job_id), 'w') as fd:
        fd.write(new_submit_text)
def updateSiteInformation(self, jobs, siteName, excludeSite):
    """
    _updateSiteInformation_

    Allow or disallow jobs to run at a site.
    Called externally by Ops scripts if a site enters or leaves Down, Draining or Aborted.
    Kill job if after removing site from allowed sites it has nowhere to run.

    Parameters:    excludeSite = False when moving to Normal
                   excludeSite = True when moving to Down, Draining or Aborted
    """
    schedd = htcondor.Schedd()
    killCandidateIds = []
    jobtokill = []
    # Distinct (DESIRED_Sites, ExtDESIRED_Sites) string pairs seen in the queue.
    sitePairs = set()
    try:
        queryIter = schedd.xquery('WMAgent_AgentName =?= %s && JobStatus =?= 1' % classad.quote(self.agent),
                                  ['WMAgent_JobID', 'DESIRED_Sites', 'ExtDESIRED_Sites'])
        for jobAd in queryIter:
            wmJobId = jobAd.get('WMAgent_JobID')
            desired = jobAd.get('DESIRED_Sites')
            extDesired = jobAd.get('ExtDESIRED_Sites')
            if excludeSite and siteName == desired:
                # The excluded site is the only one the job can run at: kill it.
                killCandidateIds.append(wmJobId)
            else:
                sitePairs.add((desired, extDesired))
        logging.info("Set of %d site list condor combinations", len(sitePairs))
    except Exception as ex:
        msg = "Failed to query condor schedd: %s" % str(ex)
        logging.exception(msg)
        return jobtokill

    with schedd.transaction():
        for origDesired, origExt in sitePairs:
            desiredSet = set(site.strip() for site in origDesired.split(","))
            extSet = set(site.strip() for site in origExt.split(","))
            if excludeSite:
                if siteName not in desiredSet:
                    continue
                desiredSet.remove(siteName)
                extSet.add(siteName)
            else:  # well, then include
                if siteName in desiredSet or siteName not in extSet:
                    continue
                desiredSet.add(siteName)
                extSet.remove(siteName)
            # now put it back in the string format expected by condor
            newDesired = ",".join(desiredSet)
            newExt = ",".join(extSet)
            matchExpr = 'DESIRED_Sites =?= %s && ExtDESIRED_Sites =?= %s' % (classad.quote(origDesired),
                                                                             classad.quote(origExt))
            try:
                schedd.edit(matchExpr, "DESIRED_Sites", classad.quote(str(newDesired)))
                schedd.edit(matchExpr, "ExtDESIRED_Sites", classad.quote(str(newExt)))
            except RuntimeError as ex:
                msg = 'Failed to condor edit job sites. Could be that no jobs were in condor anymore: %s' % str(ex)
                logging.warning(msg)

    # now update the list of jobs to be killed
    jobtokill = [job for job in jobs if job['id'] in killCandidateIds]
    return jobtokill
def classad_quote(input_value): import classad return classad.quote(str(input_value))
def find_job_event_logs(
    users=None,
    cluster_ids=None,
    files=None,
    batches=None,
    collector=None,
    schedd=None,
):
    """
    Discover job event logs to read events from based on various methods.

    Parameters
    ----------
    users
        Find job event logs for these user's active jobs.
    cluster_ids
        Find job event logs for these clusters.
    files
        Find these job event logs (basically, these just get passed straight through).
    batches
        Find job event logs for these batch names.
    collector
        Query this collector to find the schedd. Defaults to the local collector.
    schedd
        Query this schedd for users, cluster_ids, and batches.
        Defaults to the local schedd.
    """
    if users is None:
        users = []
    if cluster_ids is None:
        cluster_ids = []
    if files is None:
        files = []
    if batches is None:
        batches = []

    constraint = " || ".join(
        itertools.chain(
            ("Owner == {}".format(classad.quote(u)) for u in users),
            ("ClusterId == {}".format(cid) for cid in cluster_ids),
            # Batch names are strings and must be quoted just like Owner;
            # an unquoted name would be parsed as a ClassAd attribute
            # reference instead of a string literal and never match.
            ("JobBatchName == {}".format(classad.quote(b)) for b in batches),
        ))

    clusters = set()
    event_logs = set()
    batch_names = {}
    already_warned_missing_log = set()
    dagman_job_cluster_id_to_log_path = {}
    dagman_job_cluster_ids = set()

    # Explicitly-given files are passed straight through (normalized).
    for path in files:
        event_logs.add(os.path.abspath(path))

    for ad in get_ads(constraint, collector, schedd):
        cluster_id = ad["ClusterId"]
        clusters.add(cluster_id)
        batch_names[cluster_id] = ad.get("JobBatchName")

        if "DAGManNodesLog" in ad:
            log_path = dagman_job_cluster_id_to_log_path[cluster_id] = ad[
                "DAGManNodesLog"]
        elif "UserLog" in ad:
            log_path = ad["UserLog"]
        else:
            # Warn only once per cluster about a missing event log.
            if cluster_id not in already_warned_missing_log:
                warning(
                    "Cluster {} does not have a job event log file (set log=<path> in the submit description)"
                    .format(cluster_id))
                already_warned_missing_log.add(cluster_id)
            continue

        # Relative log paths are resolved against the job's initial working dir.
        if not os.path.isabs(log_path):
            log_path = os.path.abspath(os.path.join(ad["Iwd"], log_path))

        event_logs.add(log_path)

        # this job is the actual DAGMan controller job
        if "OtherJobRemoveRequirements" in ad:
            dagman_job_cluster_ids.add(cluster_id)

    return (
        clusters,
        constraint,
        event_logs,
        batch_names,
        dagman_job_cluster_id_to_log_path,
        dagman_job_cluster_ids,
    )
def customizePerJob(self, job):
    """
    JDL additions just for this implementation. Over-ridden in sub-classes
    These are the Glide-in specific bits
    """
    jdl = []
    jobCE = job['location']
    if not jobCE:
        # Then we ended up with a site that doesn't exist?
        logging.error("Job for non-existant site %s", job['location'])
        return jdl

    # Site lists: full lists in WMS mode, otherwise just the job's CE.
    possibleSites = job.get('possibleSites', [])
    if self.submitWMSMode and len(possibleSites) > 0:
        siteStr = ','.join(map(str, job.get('possibleSites')))
        jdl.append('+DESIRED_Sites = \"%s\"\n' % siteStr)
    else:
        jdl.append('+DESIRED_Sites = \"%s\"\n' % (jobCE))
    potentialSites = job.get('potentialSites', [])
    if self.submitWMSMode and len(potentialSites) > 0:
        extSiteStr = ','.join(map(str, job.get('potentialSites')))
        jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % extSiteStr)
    else:
        jdl.append('+ExtDESIRED_Sites = \"%s\"\n' % (jobCE))

    if job.get('proxyPath'):
        jdl.append('x509userproxy = %s\n' % job['proxyPath'])

    jdl.append('+WMAgent_RequestName = "%s"\n' % job['requestName'])
    match = GROUP_NAME_RE.match(job['requestName'])
    if match:
        jdl.append('+CMSGroups = %s\n' % classad.quote(match.groups()[0]))
    else:
        jdl.append('+CMSGroups = undefined\n')

    jdl.append('+WMAgent_SubTaskName = "%s"\n' % job['taskName'])
    jdl.append('+CMS_JobType = "%s"\n' % job['taskType'])

    # Handling for AWS, cloud and opportunistic resources
    jdl.append('+AllowOpportunistic = %s\n' % job.get('allowOpportunistic', False))

    # dataset info
    if job.get('inputDataset'):
        jdl.append('+DESIRED_CMSDataset = "%s"\n' % job['inputDataset'])
    else:
        jdl.append('+DESIRED_CMSDataset = undefined\n')
    if job.get('inputDatasetLocations'):
        jdl.append('+DESIRED_CMSDataLocations = "%s"\n' % ','.join(job['inputDatasetLocations']))
    else:
        jdl.append('+DESIRED_CMSDataLocations = undefined\n')

    # HighIO and repack jobs handling
    highio = 1 if job['taskType'] in ["Merge", "Cleanup", "LogCollect"] else 0
    repackjob = 1 if job['taskType'] == 'Repack' else 0
    jdl.append('+Requestioslots = %d\n' % highio)
    jdl.append('+RequestRepackslots = %d\n' % repackjob)

    # Performance and resource estimates, with defaults when not estimated.
    numberOfCores = job.get('numberOfCores', 1)
    if job.get('estimatedMemoryUsage', None):
        requestMemory = int(job['estimatedMemoryUsage'])
    else:
        requestMemory = 1000
    if job.get('estimatedDiskUsage', None):
        requestDisk = int(job['estimatedDiskUsage'])
    else:
        requestDisk = 20 * 1000 * 1000 * numberOfCores
    if job.get('estimatedJobTime', None):
        maxWallTimeMins = int(job['estimatedJobTime']) / 60.0
    else:
        maxWallTimeMins = 12 * 60
    jdl.append('request_memory = %d\n' % requestMemory)
    jdl.append('request_disk = %d\n' % requestDisk)
    jdl.append('+MaxWallTimeMins = %d\n' % maxWallTimeMins)

    # How many cores job is using
    jdl.append('machine_count = 1\n')
    jdl.append('request_cpus = %s\n' % numberOfCores)

    # Add OS requirements for jobs
    scramArch = job.get('scramArch')
    if scramArch is not None and scramArch.startswith("slc6_"):
        jdl.append('+REQUIRED_OS = "rhel6"\n')
    else:
        jdl.append('+REQUIRED_OS = "any"\n')

    return jdl
def updateSiteInformation(self, jobs, siteName, excludeSite):
    """
    _updateSiteInformation_

    Allow or disallow jobs to run at a site.
    Called externally by Ops scripts if a site enters or leaves Down, Draining or Aborted.
    Kill job if after removing site from allowed sites it has nowhere to run.

    Parameters:    excludeSite = False when moving to Normal
                   excludeSite = True when moving to Down, Draining or Aborted
    """
    schedd = htcondor.Schedd()
    idsToKill = []
    jobtokill = []
    # Distinct (DESIRED_Sites, ExtDESIRED_Sites) combinations found in the queue.
    siteCombos = set()
    try:
        adIterator = schedd.xquery(
            'WMAgent_AgentName =?= %s && JobStatus =?= 1' % classad.quote(self.agent),
            ['WMAgent_JobID', 'DESIRED_Sites', 'ExtDESIRED_Sites'])
        for jobAd in adIterator:
            wmJobId = jobAd.get('WMAgent_JobID')
            desiredSites = jobAd.get('DESIRED_Sites')
            extDesiredSites = jobAd.get('ExtDESIRED_Sites')
            if excludeSite and siteName == desiredSites:
                # Excluded site is the only allowed one: nowhere left to run.
                idsToKill.append(wmJobId)
            else:
                siteCombos.add((desiredSites, extDesiredSites))
        logging.info("Set of %d site list condor combinations", len(siteCombos))
    except Exception as ex:
        msg = "Failed to query condor schedd: %s" % str(ex)
        logging.exception(msg)
        return jobtokill

    with schedd.transaction():
        for oldDesired, oldExt in siteCombos:
            desiredSet = set([site.strip() for site in oldDesired.split(",")])
            extSet = set([site.strip() for site in oldExt.split(",")])
            if excludeSite:
                if siteName not in desiredSet:
                    continue
                desiredSet.remove(siteName)
                extSet.add(siteName)
            else:  # well, then include
                if siteName in desiredSet or siteName not in extSet:
                    continue
                desiredSet.add(siteName)
                extSet.remove(siteName)
            # now put it back in the string format expected by condor
            desiredListStr = ",".join(desiredSet)
            extDesiredListStr = ",".join(extSet)
            selector = 'DESIRED_Sites =?= %s && ExtDESIRED_Sites =?= %s' % (
                classad.quote(oldDesired), classad.quote(oldExt))
            try:
                schedd.edit(selector, "DESIRED_Sites", classad.quote(str(desiredListStr)))
                schedd.edit(selector, "ExtDESIRED_Sites", classad.quote(str(extDesiredListStr)))
            except RuntimeError as ex:
                msg = 'Failed to condor edit job sites. Could be that no jobs were in condor anymore: %s' % str(ex)
                logging.warning(msg)

    # now update the list of jobs to be killed
    jobtokill = [job for job in jobs if job['id'] in idsToKill]
    return jobtokill
def alter_submit(self, crab_retry):
    """
    Copy the content of the generic file Job.submit into a job-specific file
    Job.<job_id>.submit and add attributes that are job-specific (e.g. CRAB_Retry).
    Add also parameters that can be overwritten at each manual job resubmission
    (e.g. MaxWallTimeMins, RequestMemory, RequestCpus, JobPrio, DESIRED_SITES).
    """
    ## Start the Job.<job_id>.submit content with the CRAB_Retry.
    new_submit_text = '+CRAB_Retry = %d\n' % (crab_retry)
    msg = "Setting CRAB_Retry = %s" % (crab_retry)
    self.logger.info(msg)
    ## For the parameters that can be overwritten at each manual job resubmission,
    ## read them from the task ad, unless there is resubmission information there
    ## and this job is not one that has to be resubmitted, in which case we should
    ## use the same parameters (site black- and whitelists, requested memory, etc)
    ## as used by the previous job retry (which are saved in self.resubmit_info).
    CRAB_ResubmitList_in_taskad = ('CRAB_ResubmitList' in self.task_ad)
    use_resubmit_info = False
    resubmit_jobids = []
    if 'CRAB_ResubmitList' in self.task_ad:
        resubmit_jobids = self.task_ad['CRAB_ResubmitList']
        try:
            resubmit_jobids = set(resubmit_jobids)
            if resubmit_jobids and self.job_id not in resubmit_jobids:
                use_resubmit_info = True
        except TypeError:
            ## A non-iterable ad value (e.g. True) means "resubmit all failed jobs".
            resubmit_jobids = True
    ## If there is no resubmit_info, we can of course not use it.
    if not self.resubmit_info:
        use_resubmit_info = False
    ## Get the resubmission parameters.
    maxjobruntime = None
    maxmemory = None
    numcores = None
    priority = None
    if not use_resubmit_info:
        if 'MaxWallTimeMins' in self.task_ad:
            maxjobruntime = int(str(self.task_ad.lookup('MaxWallTimeMins')))
        if 'RequestMemory' in self.task_ad:
            maxmemory = int(str(self.task_ad.lookup('RequestMemory')))
        if 'RequestCpus' in self.task_ad:
            numcores = int(str(self.task_ad.lookup('RequestCpus')))
        if 'JobPrio' in self.task_ad:
            priority = int(str(self.task_ad['JobPrio']))
    else:
        ## Walk back from the previous retry number until we find a key in
        ## resubmit_info that actually has saved parameters.
        inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
        while inkey not in self.resubmit_info and int(inkey) > 0:
            inkey = str(int(inkey) - 1)
        maxjobruntime = self.resubmit_info[inkey].get('maxjobruntime')
        maxmemory = self.resubmit_info[inkey].get('maxmemory')
        numcores = self.resubmit_info[inkey].get('numcores')
        priority = self.resubmit_info[inkey].get('priority')
    ## Save the (new) values of the resubmission parameters in self.resubmit_info
    ## for the current job retry number.
    outkey = str(crab_retry)
    if outkey not in self.resubmit_info:
        self.resubmit_info[outkey] = {}
    self.resubmit_info[outkey]['maxjobruntime'] = maxjobruntime
    self.resubmit_info[outkey]['maxmemory'] = maxmemory
    self.resubmit_info[outkey]['numcores'] = numcores
    self.resubmit_info[outkey]['priority'] = priority
    self.resubmit_info[outkey]['use_resubmit_info'] = use_resubmit_info
    self.resubmit_info[outkey]['CRAB_ResubmitList_in_taskad'] = CRAB_ResubmitList_in_taskad
    ## Add the resubmission parameters to the Job.<job_id>.submit content.
    if maxjobruntime is not None:
        new_submit_text += '+MaxWallTimeMins = %s\n' % (str(maxjobruntime))
    if maxmemory is not None:
        new_submit_text += '+RequestMemory = %s\n' % (str(maxmemory))
    if numcores is not None:
        new_submit_text += '+RequestCpus = %s\n' % (str(numcores))
    if priority is not None:
        new_submit_text += '+JobPrio = %s\n' % (str(priority))
    ## Within the schedd, order the first few jobs in the task before all other tasks of the same priority.
    pre_job_prio = 1
    if self.job_id <= 5:
        pre_job_prio = 0
    new_submit_text += '+PreJobPrio1 = %d\n' % pre_job_prio
    ## The schedd will use PostJobPrio1 as a secondary job-priority sorting key: it
    ## will first run jobs by JobPrio; then, for jobs with the same JobPrio, it will
    ## run the job with the higher PostJobPrio1.
    new_submit_text += '+PostJobPrio1 = -%s\n' % str(self.task_ad.lookup('QDate'))
    ## Order retries before all other jobs in this task
    new_submit_text += '+PostJobPrio2 = %d\n' % crab_retry
    ## This is used to send to dashbord the location of the logfiles
    try:
        storage_rules = htcondor.param['CRAB_StorageRules']
    except Exception:
        ## Was a bare 'except:'; narrowed so SystemExit/KeyboardInterrupt are not
        ## swallowed, while keeping the best-effort hard-coded default.
        storage_rules = "^/home/remoteGlidein,http://submit-5.t2.ucsd.edu/CSstoragePath"
    new_submit_text += '+CRAB_UserWebDir = "%s"\n' % getWebdirForDb(str(self.task_ad.get('CRAB_ReqName')), storage_rules)
    try:
        with open('proxied_webdir') as fd:
            proxied_webdir = fd.read()
        new_submit_text += '+CRAB_UserWebDirPrx = "%s"\n' % proxied_webdir
    except IOError as e:
        self.logger.error(("'I/O error(%s): %s', when looking for the proxied_webdir file. Might be normal"
                           " if the schedd does not have a proxiedurl in the REST external config." % (e.errno, e.strerror)))
    ## Add the site black- and whitelists and the DESIRED_SITES to the
    ## Job.<job_id>.submit content.
    new_submit_text = self.redo_sites(new_submit_text, crab_retry, use_resubmit_info)
    ## Add group information:
    username = self.task_ad.get('CRAB_UserHN')
    if 'CMSGroups' in self.task_ad:
        new_submit_text += '+CMSGroups = %s\n' % classad.quote(self.task_ad['CMSGroups'])
    elif username:
        groups = CMSGroupMapper.map_user_to_groups(username)
        if groups:
            new_submit_text += '+CMSGroups = %s\n' % classad.quote(groups)
    ## Finally add (copy) all the content of the generic Job.submit file.
    with open("Job.submit", 'r') as fd:
        new_submit_text += fd.read()
    ## Write the Job.<job_id>.submit file.
    with open("Job.%d.submit" % (self.job_id), 'w') as fd:
        fd.write(new_submit_text)