def test_warnings(self):
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        classad.parseOld("foo = 1\nbar = 2")
        self.assertEqual(len(w), 1)
        self.assertTrue(issubclass(w[-1].category, DeprecationWarning))
        self.assertTrue("deprecated" in str(w[-1].message))
def main(): """ Need a doc string here. """ ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD'])) makeWebDir(ad) retries = 0 exitCode = 1 while retries < 3 and exitCode != 0: exitCode = updateWebDir(ad) if exitCode != 0: time.sleep(retries*20) retries += 1 clearAutomaticBlacklist() resubmitJobIds = [] if 'CRAB_ResubmitList' in ad: resubmitJobIds = set(ad['CRAB_ResubmitList']) dagJobId = '%d.%d' % (ad['ClusterId'], ad['ProcId']) ad['foo'] = [] try: htcondor.Schedd().edit([dagJobId], 'CRAB_ResubmitList', ad['foo']) except RuntimeError, reerror: print "ERROR: %s" % str(reerror)
def main():
    ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD']))
    make_webdir(ad)
    make_job_submit(ad)

    retries = 0
    exit_code = 1
    while retries < 3 and exit_code != 0:
        exit_code = updatewebdir(ad)
        if exit_code != 0:
            time.sleep(retries * 20)
        retries += 1

    clear_automatic_blacklist(ad)

    blacklist = set()
    if 'CRAB_SiteBlacklist' in ad:
        blacklist = set(ad['CRAB_SiteBlacklist'])

    whitelist = set()
    if 'CRAB_SiteWhitelist' in ad:
        whitelist = set(ad['CRAB_SiteWhitelist'])

    resubmit = []
    if 'CRAB_ResubmitList' in ad:
        resubmit = set(ad['CRAB_ResubmitList'])

    id = '%d.%d' % (ad['ClusterId'], ad['ProcId'])
    ad['foo'] = []
    try:
        htcondor.Schedd().edit([id], 'CRAB_ResubmitList', ad['foo'])
    except RuntimeError, reerror:
        print "ERROR: %s" % str(reerror)
def get_job_ad_from_condor_q(self):
    """
    Need a doc string here.
    """
    if self.dag_clusterid == -1:
        return

    shutil.copy("job_log", "job_log.%s" % str(self.dag_jobid))
    p = subprocess.Popen(["condor_q", "-debug", "-l", "-userlog", "job_log.%s" % str(self.dag_jobid), str(self.dag_jobid)],
                         stdout=subprocess.PIPE, stderr=sys.stderr)
    output, _ = p.communicate()
    status = p.returncode

    try:
        os.unlink("job_log.%s" % str(self.dag_jobid))
    except:
        pass

    if status:
        raise FatalError("Failed to query condor user log:\n%s" % output)

    for text_ad in output.split("\n\n"):
        try:
            ad = classad.parseOld(text_ad)
        except SyntaxError:
            continue
        if ad:
            self.ads.append(ad)
    self.ad = self.ads[-1]
def get_job_ad_from_file(self): """ Need a doc string here """ self.ads.append(self.ad) if self.dag_retry == 0: msg = "This is job retry number 0. Will not try to search and load previous job ads." self.logger.info(msg) return for dag_retry in range(self.dag_retry): job_ad_file = os.path.join(".", "finished_jobs", "job.%s.%d" % (self.job_id, dag_retry)) if os.path.isfile(job_ad_file): try: with open(job_ad_file) as fd: ad = classad.parseOld(fd) except Exception: msg = "Unable to parse classads from file %s. Continuing." % ( job_ad_file) self.logger.warning(msg) continue if ad: self.ads.append(ad) else: msg = "File %s does not exist. Continuing." % (job_ad_file) self.logger.warning(msg)
def get_job_ad(self): try: cluster = int(self.cluster.split(".")[0]) if cluster == -1: return except ValueError: pass shutil.copy("job_log", "job_log.%s" % str(self.cluster)) p = subprocess.Popen(["condor_q", "-debug", "-l", "-userlog", "job_log.%s" % str(self.cluster), str(self.cluster)], stdout=subprocess.PIPE, stderr=sys.stderr) output, _ = p.communicate() status = p.returncode try: os.unlink("job_log.%s" % str(self.cluster)) except: pass if status: raise FatalError("Failed to query condor user log:\n%s" % output) self.ads = [] for text_ad in output.split("\n\n"): try: ad = classad.parseOld(text_ad) except SyntaxError: continue if ad: self.ads.append(ad) self.ad = self.ads[-1] if 'JOBGLIDEIN_CMSSite' in self.ad: self.site = self.ad['JOBGLIDEIN_CMSSite']
def get_task_ad(self): """ Need a doc string here. """ self.task_ad = {} try: self.task_ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD'])) except: msg = "Got exception while trying to parse the job ad." self.logger.exception(msg)
def get_task_ad(self): """ Need a doc string here. """ self.task_ad = {} try: self.logger.info("Loading classads from: %s" % os.environ['_CONDOR_JOB_AD']) self.task_ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD'])) except: msg = "Got exception while trying to parse the job ad." self.logger.exception(msg)
def test_old_classad(self): ad = classad.parseOld(open("tests/test.old.ad")) contents = open("tests/test.old.ad").read() keys = [] for line in contents.splitlines(): info = line.split(" = ") if len(info) != 2: continue self.assertTrue(info[0] in ad) self.assertEqual(ad.lookup(info[0]).__repr__(), info[1]) keys.append(info[0]) for key in ad: self.assertTrue(key in keys)
def get_task_ad(self): """ Need a doc string here. """ self.task_ad = {} try: self.logger.info("Loading classads from: %s" % os.environ['_CONDOR_JOB_AD']) self.task_ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD'])) self.logger.info(os.listdir('.')) self.logger.info(str(self.task_ad)) except: msg = "Got exception while trying to parse the job ad." self.logger.exception(msg)
def main(): """ Need a doc string here. """ ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD'])) makeWebDir(ad) retries = 0 exitCode = 1 while retries < 3 and exitCode != 0: exitCode = updateWebDir(ad) if exitCode != 0: time.sleep(retries*20) retries += 1 clearAutomaticBlacklist() resubmitJobIds = [] if 'CRAB_ResubmitList' in ad: resubmitJobIds = ad['CRAB_ResubmitList'] try: resubmitJobIds = set(resubmitJobIds) resubmitJobIds = [str(i) for i in resubmitJobIds] except TypeError: resubmitJobIds = True if resubmitJobIds: adjustedJobIds = [] if hasattr(htcondor, 'lock'): # While dagman is not running at this point, the schedd may be writing events to this # file; hence, we only edit the file while holding an appropriate lock. # Note this lock method didn't exist until 8.1.6; prior to this, we simply # run dangerously. with htcondor.lock(open("RunJobs.dag.nodes.log", 'a'), htcondor.LockType.WriteLock) as lock: adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds) else: adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds) ## Adjust the maximum allowed number of retries only for the job ids for which ## the POST script exit status was adjusted. Why only for these job ids and not ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as ## a general rule means "all failed job ids", we don't have a way to know if a ## job is in failed status or not just from the RunJobs.dag file, while job ids ## in adjustedJobIds correspond only to failed jobs. adjustMaxRetries(adjustedJobIds, ad) if 'CRAB_SiteAdUpdate' in ad: newSiteAd = ad['CRAB_SiteAdUpdate'] with open("site.ad") as fd: siteAd = classad.parse(fd) siteAd.update(newSiteAd) with open("site.ad", "w") as fd: fd.write(str(siteAd))
def parseHistoryFile(self, historyFile):
    xml_out = {}
    with open(historyFile) as fd:
        job_ad = classad.parseOld(fd)
    for key in job_ad.keys():
        temp = ''
        try:
            temp = str(int(job_ad[key]))  # force boolean values to be converted to integer
        except:
            temp = job_ad[key]
        if key not in ['GlobalJobId', 'TaskType', 'Owner', 'CRAB_ReqName', 'CRAB_JobSW',
                       'CRAB_AsyncDest', 'MATCH_EXP_JOB_GLIDEIN_Entry_Name', 'MATCH_EXP_JOB_GLIDEIN_CMSSite']:
            continue
        xml_out[key] = temp
    if len(xml_out) > 0:
        self.xmlBuffer = xml_out
        self.totalParsed += 1
        return 1
    return 0
def get_job_ad_from_file(self): """ Need a doc string here """ self.ads = [] self.ads.append(self.ad) if self.crab_retry == 0: print 'Job is retry num 0. Will not try to search and load previous job ads.' return for crab_retry in range(1, int(self.crab_retry + 1)): job_ad_file = "./finished_jobs/job.%d.%d" % (self.job_id, crab_retry) if os.path.isfile(job_ad_file): with open(job_ad_file, "r") as fd: text_ad = fd.read_lines() try: ad = classad.parseOld(text_ad) except SyntaxError as e: print 'Unable to parse classads from file %s' % job_Ad continue if ad: self.ads.append(ad) else: print 'File %s does not exist. Continuing' % job_ad_file
def get_job_ad_from_file(self): """ Need a doc string here """ self.ads.append(self.ad) if self.dag_retry == 0: msg = "This is job retry number 0. Will not try to search and load previous job ads." self.logger.info(msg) return for dag_retry in range(self.dag_retry): job_ad_file = os.path.join(".", "finished_jobs", "job.%d.%d" % (self.job_id, dag_retry)) if os.path.isfile(job_ad_file): try: with open(job_ad_file) as fd: ad = classad.parseOld(fd) except Exception: msg = "Unable to parse classads from file %s. Continuing." % (job_ad_file) self.logger.warning(msg) continue if ad: self.ads.append(ad) else: msg = "File %s does not exist. Continuing." % (job_ad_file) self.logger.warning(msg)
def main():
    opts = parse_opts()
    config = get_config()
    setup_log(level=config["log_level"], logfile=config["log_file"],
              syslog_facility=config["syslog_facility"], debug=opts.debug)

    route_ad = classad.ClassAd(sys.stdin.readline())
    #logger.debug("Route Ad: %s", route_ad.__str__())

    separator_line = sys.stdin.readline()
    try:
        assert separator_line == "------\n"
    except AssertionError:
        logger.error("Separator line was not second line of STDIN")
        return(FAILURE)

    try:
        ad = classad.parseOld(sys.stdin)
    except SyntaxError:
        logger.error("Unable to parse classad")
        return(FAILURE)
#    try:
#        ad = classad.parse(input_classad)
#    except SyntaxError:
#        try:
#            ad = classad.parseOld(input_classad)
#        except SyntaxError:
#            logger.error("Unable to parse classad")
#            return(FAILURE)

    # Set some variables based on incoming job ad
    jobid = "%s.%s" % (ad["ClusterId"], ad["ProcId"])

    # Perform transformations normally done by condor when a hook is not used
    # The version that fixes this is not yet defined so comparing against 9.9.9
    condor_version = classad.version()
    if StrictVersion(condor_version) < StrictVersion('9.9.9'):
        vanillaToGrid(ad, route_ad)

    # Test if job is a pilot
    #if "x509UserProxyFirstFQAN" in ad and "/local/Role=pilot" in ad.eval("x509UserProxyFirstFQAN"):
    if "x509UserProxyFirstFQAN" in ad and "/Role=pilot" in ad.eval("x509UserProxyFirstFQAN"):
        logger.debug("Job=%s x509UserProxyFirstFQAN='%s' is a pilot", jobid, ad["x509UserProxyFirstFQAN"])
        pilot_job = True
    else:
        logger.debug("Job=%s x509UserProxyFirstFQAN='%s' is not a pilot", jobid, ad.get("x509UserProxyFirstFQAN", "None"))
        pilot_job = False

    # TEST
    #if ad["Owner"] == "treydock":
    #    logger.error("Job=%s Invalid. Reason='TEST', setting JobStatus=5.", jobid)
    #    ad["JobStatus"] = 5
    #    ad["SITELocalUser"] = False
    #    ad["HoldReason"] = "Job invalid - TEST"
    #    print ad.printOld()
    #    return(SUCCESS)
    # END TEST

    # If not a pilot then return unmodified ad
    if not pilot_job:
        logger.debug("Job=%s is not a pilot job, returning ad", jobid)
        print ad.printOld()
        return(SUCCESS)

    # If owner or route are in ignore_users or ignore_routes then return unmodified ad
    if config["ignore_users"] and ad["owner"] in config["ignore_users"]:
        logger.debug("Job=%s Owner=%s is in ignore_users list, returning ad", jobid, ad["owner"])
        print ad.printOld()
        return(SUCCESS)
    if config["ignore_routes"] and route_ad["name"] in config["ignore_routes"]:
        logger.debug("Job=%s Route=%s is in ignore_routes list, returning ad", jobid, route_ad["name"])
        print ad.printOld()
        return(SUCCESS)

    # Get pending requests data
    pending_requests = get_pending_requests(data_file=config["user_requests_json"])

    # If unable to determine pending requests, mark job invalid
    if not pending_requests or "idle" not in pending_requests or "users" not in pending_requests:
        return(mark_job_invalid(ad=ad, jobid=jobid, reason="pending requests missing required data"))

    # If no idle users defined, mark job invalid
    idle_users = pending_requests["idle"]
    if not idle_users:
        return(mark_job_invalid(ad=ad, jobid=jobid, reason="pending requests contains no idle users"))

    # If no idle user DNs, mark job invalid
    pending_user_dns = pending_requests["users"]
    if not pending_user_dns:
        return(mark_job_invalid(ad=ad, jobid=jobid, reason="pending requests contains no user DNs"))

    # Get all users with idle jobs
    pending_users = {}
    for user, idle in idle_users.iteritems():
        if idle != 0:
            pending_users[user] = idle

    # If no pending user jobs, mark job invalid
    if not pending_users:
        return(mark_job_invalid(ad=ad, jobid=jobid, reason="no pending user jobs found"))

    # Determine which user to assign to the pilot
    # Priority: user with most idle jobs
    pending_user = sorted(pending_users, key=pending_users.get, reverse=True)[0]
    logger.debug("Pending users:\n%s", json.dumps(pending_users))
    logger.debug("Job=%s selected user to run job name=%s idle=%s", jobid, pending_user, pending_users[pending_user])

    # If the DN can't be found in the pending request JSON, job is invalid
    pending_user_dn = pending_user_dns.get(pending_user)
    if not pending_user_dn:
        return(mark_job_invalid(ad=ad, jobid=jobid, reason="unable to find pending user DN"))

    # The idle user selected is a CERN username, we need to map the associated DN to find local user
    local_grid_map = get_local_grid_map(dn=pending_user_dn, grid_mapfile=config["grid_mapfile"])
    if not local_grid_map:
        return(mark_job_invalid(ad=ad, jobid=jobid, reason="unable to get local gridmap information for DN='%s'" % pending_user_dn))
    new_owner = local_grid_map["username"]

    # Set USER_DN environment variable to new owner's DN
    if not ad["environment"] or ad["environment"] == "":
        new_environment = "USER_DN='%s'" % local_grid_map["dn"]
    else:
        new_environment = ad["environment"] + " USER_DN='%s'" % local_grid_map["dn"]

    # Get location of spooled files and change ownership
    #if "Iwd" in ad.keys():
    #    iwd = ad["Iwd"]
    #    if os.path.isdir(iwd):
    #        _pwd = pwd.getpwnam(new_owner)
    #        _uid = _pwd.pw_uid
    #        _gid = _pwd.pw_gid
    #        logger.debug("Modify permissions for Job=%s Set uid=%s gid=%s Iwd=%s", jobid, _uid, _gid, iwd)
    #        chown_wrapper_cmd = [
    #            os.path.join(os.path.dirname(os.path.realpath(__file__)), "chown_iwd"), str(_uid), str(_gid), iwd
    #        ]
    #        chown_wrapper_exit_code = subprocess.call(chown_wrapper_cmd)
    #        if chown_wrapper_exit_code != 0:
    #            return(mark_job_invalid(ad=ad, jobid=jobid, reason="chown wrapper failed with exit code %s" % chown_wrapper_exit_code))

    # Hack to replace arguments with values we can use
    #if "Arguments" in ad.keys():
    #    job_arguments = ad["Arguments"]
    #    if "-param_GLIDEIN_Glexec_Use OPTIONAL" in job_arguments:
    #        new_job_arguments = job_arguments.replace("-param_GLIDEIN_Glexec_Use OPTIONAL", "-param_GLIDEIN_Glexec_Use NEVER")
    #        logger.info("Update Job=%s set Arguments='%s'", jobid, new_job_arguments)
    #        ad["Arguments"] = new_job_arguments

    # Define remote_cerequirements to pass to submit script

    # Set new ad values
    logger.info("Update Job=%s set Owner=%s", jobid, new_owner)
    logger.info("Update Job=%s set Environment=\"%s\"", jobid, new_environment)
    ad["owner"] = new_owner
    ad["environment"] = new_environment

    #logger.debug("Route Ad:\n%s", route_ad.__str__())
    #logger.debug("Class Ad:\n%s", ad.printOld())
    print ad.printOld()
    return(SUCCESS)
def bootstrap(): print("Entering TaskManagerBootstrap with args: %s" % sys.argv) command = sys.argv[1] if command == "POSTJOB": return PostJob.PostJob().execute(*sys.argv[2:]) elif command == "PREJOB": return PreJob.PreJob().execute(*sys.argv[2:]) elif command == "PREDAG": return PreDAG.PreDAG().execute(*sys.argv[2:]) infile, outfile = sys.argv[2:] adfile = os.environ["_CONDOR_JOB_AD"] print("Parsing classad") with open(adfile, "r") as fd: ad = classad.parseOld(fd) print("..done") in_args = [] if infile != "None": with open(infile, "r") as fd: in_args = pickle.load(fd) config = Configuration.Configuration() config.section_("Services") config.Services.DBSUrl = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter/' ad['tm_taskname'] = ad.eval("CRAB_Workflow") ad['tm_split_algo'] = ad.eval("CRAB_SplitAlgo") ad['tm_dbs_url'] = ad.eval("CRAB_DBSURL") ad['tm_input_dataset'] = ad.eval("DESIRED_CMSDataset") ad['tm_outfiles'] = HTCondorUtils.unquote( ad.eval("CRAB_AdditionalOutputFiles")) ad['tm_tfile_outfiles'] = HTCondorUtils.unquote( ad.eval("CRAB_TFileOutputFiles")) ad['tm_edm_outfiles'] = HTCondorUtils.unquote( ad.eval("CRAB_EDMOutputFiles")) ad['tm_site_whitelist'] = HTCondorUtils.unquote( ad.eval("CRAB_SiteWhitelist")) ad['tm_site_blacklist'] = HTCondorUtils.unquote( ad.eval("CRAB_SiteBlacklist")) ad['tm_job_type'] = 'Analysis' print("TaskManager got this raw ad") print(ad) pure_ad = {} for key in ad: try: pure_ad[key] = ad.eval(key) if isinstance(pure_ad[key], classad.Value): del pure_ad[key] if isinstance(pure_ad[key], list): pure_ad[key] = [i.eval() for i in pure_ad[key]] except: pass ad = pure_ad ad['CRAB_AlgoArgs'] = json.loads(ad["CRAB_AlgoArgs"]) ad['tm_split_args'] = ad["CRAB_AlgoArgs"] ad['tarball_location'] = os.environ.get('CRAB_TARBALL_LOCATION', '') print("TaskManagerBootstrap got this ad:") pprint.pprint(ad) results = task.execute(in_args, task=ad).result print(results) with open(outfile, "w") as fd: pickle.dump(results, fd) return 0
def getLastHistory(self, cluster):
    fd = os.popen("condor_history -match 1 -l %d" % cluster)
    ad = classad.parseOld(fd.read()[:-1])
    self.assertFalse(fd.close())
    return ad
def get_task_ad(self):
    self.task_ad = {}
    try:
        self.task_ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD']))
    except Exception:
        print traceback.format_exc()
def bootstrap(): print("Entering TaskManagerBootstrap with args: %s" % sys.argv) command = sys.argv[1] if command == "POSTJOB": return PostJob.PostJob().execute(*sys.argv[2:]) elif command == "PREJOB": return PreJob.PreJob().execute(*sys.argv[2:]) elif command == "PREDAG": return PreDAG.PreDAG().execute(*sys.argv[2:]) infile, outfile = sys.argv[2:] adfile = os.environ["_CONDOR_JOB_AD"] print("Parsing classad") with open(adfile, "r") as fd: ad = classad.parseOld(fd) print("..done") in_args = [] if infile != "None": with open(infile, "r") as fd: in_args = pickle.load(fd) config = Configuration.Configuration() config.section_("Services") config.Services.DBSUrl = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter/' ad['tm_taskname'] = ad.eval("CRAB_Workflow") ad['tm_split_algo'] = ad.eval("CRAB_SplitAlgo") ad['tm_dbs_url'] = ad.eval("CRAB_DBSURL") ad['tm_input_dataset'] = ad.eval("DESIRED_CMSDataset") ad['tm_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_AdditionalOutputFiles")) ad['tm_tfile_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_TFileOutputFiles")) ad['tm_edm_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_EDMOutputFiles")) ad['tm_site_whitelist'] = HTCondorUtils.unquote(ad.eval("CRAB_SiteWhitelist")) ad['tm_site_blacklist'] = HTCondorUtils.unquote(ad.eval("CRAB_SiteBlacklist")) ad['tm_job_type'] = 'Analysis' print("TaskManager got this raw ad") print(ad) pure_ad = {} for key in ad: try: pure_ad[key] = ad.eval(key) if isinstance(pure_ad[key], classad.Value): del pure_ad[key] if isinstance(pure_ad[key], list): pure_ad[key] = [i.eval() for i in pure_ad[key]] except: pass ad = pure_ad ad['CRAB_AlgoArgs'] = json.loads(ad["CRAB_AlgoArgs"]) ad['tm_split_args'] = ad["CRAB_AlgoArgs"] ad['tarball_location'] = os.environ.get('CRAB_TARBALL_LOCATION', '') print("TaskManagerBootstrap got this ad:") pprint.pprint(ad) results = task.execute(in_args, task=ad).result print(results) with open(outfile, "w") as fd: pickle.dump(results, fd) return 0
def test_old_classad(self): ad = classad.parseOld(open("tests/test.old.ad")) contents = open("tests/test.old.ad").read() self.assertEqual(ad.printOld(), contents)
def main(): """ Need a doc string here. """ setupLog() if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists(os.environ["_CONDOR_JOB_AD"]): printLog("Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist") sys.exit(0) printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" % os.environ['_CONDOR_JOB_AD']) with open(os.environ['_CONDOR_JOB_AD']) as fd: ad = classad.parseOld(fd) printLog("Parsed ad: %s" % ad) makeWebDir(ad) printLog("Webdir has been set up. Uploading the webdir URL to the REST") retries = 0 exitCode = 1 while retries < 3 and exitCode != 0: exitCode = updateWebDir(ad) if exitCode != 0: time.sleep(retries * 20) retries += 1 if exitCode != 0: printLog("Exiting AdjustSites because the webdir upload failed three times.") sys.exit(1) printLog("Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode) saveProxiedWebdir(ad) printLog("Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions") clearAutomaticBlacklist() resubmitJobIds = [] if 'CRAB_ResubmitList' in ad: resubmitJobIds = ad['CRAB_ResubmitList'] try: resubmitJobIds = set(resubmitJobIds) resubmitJobIds = [str(i) for i in resubmitJobIds] except TypeError: resubmitJobIds = True # Hold and release processing and tail DAGs here so that modifications # to the submission and log files will be picked up. schedd = htcondor.Schedd() tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote(ad.get("CRAB_ReqName")) if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic': printLog("Holding processing and tail DAGs") schedd.edit(tailconst, "HoldKillSig", 'SIGKILL') schedd.act(htcondor.JobAction.Hold, tailconst) if resubmitJobIds: adjustedJobIds = [] filenames = getGlob(ad, "RunJobs.dag.nodes.log", "RunJobs[1-9]*.subdag.nodes.log") for fn in filenames: if hasattr(htcondor, 'lock'): # While dagman is not running at this point, the schedd may be writing events to this # file; hence, we only edit the file while holding an appropriate lock. # Note this lock method didn't exist until 8.1.6; prior to this, we simply # run dangerously. with htcondor.lock(open(fn, 'a'), htcondor.LockType.WriteLock): adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn)) else: adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn)) ## Adjust the maximum allowed number of retries only for the job ids for which ## the POST script exit status was adjusted. Why only for these job ids and not ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as ## a general rule means "all failed job ids", we don't have a way to know if a ## job is in failed status or not just from the RunJobs.dag file, while job ids ## in adjustedJobIds correspond only to failed jobs. adjustMaxRetries(adjustedJobIds, ad) if 'CRAB_SiteAdUpdate' in ad: newSiteAd = ad['CRAB_SiteAdUpdate'] with open("site.ad") as fd: siteAd = classad.parse(fd) siteAd.update(newSiteAd) with open("site.ad", "w") as fd: fd.write(str(siteAd)) if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic': printLog("Releasing processing and tail DAGs") schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1') schedd.act(htcondor.JobAction.Release, tailconst) printLog("Exiting AdjustSite")
def bootstrap(): print "Entering TaskManagerBootstrap with args: %s" % sys.argv command = sys.argv[1] if command == "POSTJOB": return PostJob.PostJob().execute(*sys.argv[2:]) elif command == "PREJOB": return PreJob.PreJob().execute(*sys.argv[2:]) elif command == "FINAL": return Final.Final().execute(*sys.argv[2:]) elif command == "ASO": return ASO.async_stageout(*sys.argv[2:]) infile, outfile = sys.argv[2:] adfile = os.environ["_CONDOR_JOB_AD"] print "Parsing classad" with open(adfile, "r") as fd: ad = classad.parseOld(fd) print "..done" in_args = [] if infile != "None": with open(infile, "r") as fd: in_args = pickle.load(fd) config = Configuration.Configuration() config.section_("Services") config.Services.DBSUrl = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter/' ad['tm_taskname'] = ad.eval("CRAB_Workflow") ad['tm_split_algo'] = ad.eval("CRAB_SplitAlgo") ad['tm_dbs_url'] = ad.eval("CRAB_DBSURL") ad['tm_input_dataset'] = ad.eval("CRAB_InputData") ad['tm_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_AdditionalOutputFiles")) ad['tm_tfile_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_TFileOutputFiles")) ad['tm_edm_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_EDMOutputFiles")) ad['tm_site_whitelist'] = HTCondorUtils.unquote(ad.eval("CRAB_SiteWhitelist")) ad['tm_site_blacklist'] = HTCondorUtils.unquote(ad.eval("CRAB_SiteBlacklist")) ad['tm_job_type'] = 'Analysis' print "TaskManager got this raw ad" print ad pure_ad = {} for key in ad: try: pure_ad[key] = ad.eval(key) if isinstance(pure_ad[key], classad.Value): del pure_ad[key] if isinstance(pure_ad[key], types.ListType): pure_ad[key] = [i.eval() for i in pure_ad[key]] except: pass ad = pure_ad ad['CRAB_AlgoArgs'] = json.loads(ad["CRAB_AlgoArgs"]) ad['tm_split_args'] = ad["CRAB_AlgoArgs"] ad['tarball_location'] = os.environ.get('CRAB_TARBALL_LOCATION', '') print "TaskManagerBootstrap got this ad:" pprint.pprint(ad) if command == "DBS": task = DBSDataDiscovery.DBSDataDiscovery(config) elif command == "SPLIT": task = Splitter.Splitter(config) print "Got this result from the splitter" pprint.pprint(task) results = task.execute(in_args, task=ad).result if command == "SPLIT": results = DagmanCreator.create_subdag(results, task=ad) print results with open(outfile, "w") as fd: pickle.dump(results, fd) return 0
def bootstrap(): print "Entering TaskManagerBootstrap with args: %s" % sys.argv command = sys.argv[1] if command == "POSTJOB": return PostJob.PostJob().execute(*sys.argv[2:]) elif command == "PREJOB": return PreJob.PreJob().execute(*sys.argv[2:]) elif command == "FINAL": return Final.Final().execute(*sys.argv[2:]) elif command == "ASO": return ASO.async_stageout(*sys.argv[2:]) infile, outfile = sys.argv[2:] adfile = os.environ["_CONDOR_JOB_AD"] print "Parsing classad" with open(adfile, "r") as fd: ad = classad.parseOld(fd) print "..done" in_args = [] if infile != "None": with open(infile, "r") as fd: in_args = pickle.load(fd) config = Configuration.Configuration() config.section_("Services") config.Services.DBSUrl = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter/' ad['tm_taskname'] = ad.eval("CRAB_Workflow") ad['tm_split_algo'] = ad.eval("CRAB_SplitAlgo") ad['tm_dbs_url'] = ad.eval("CRAB_DBSUrl") ad['tm_input_dataset'] = ad.eval("CRAB_InputData") ad['tm_outfiles'] = HTCondorUtils.unquote( ad.eval("CRAB_AdditionalOutputFiles")) ad['tm_tfile_outfiles'] = HTCondorUtils.unquote( ad.eval("CRAB_TFileOutputFiles")) ad['tm_edm_outfiles'] = HTCondorUtils.unquote( ad.eval("CRAB_EDMOutputFiles")) ad['tm_site_whitelist'] = HTCondorUtils.unquote( ad.eval("CRAB_SiteWhitelist")) ad['tm_site_blacklist'] = HTCondorUtils.unquote( ad.eval("CRAB_SiteBlacklist")) ad['tm_job_type'] = 'Analysis' print "TaskManager got this raw ad" print ad pure_ad = {} for key in ad: try: pure_ad[key] = ad.eval(key) if isinstance(pure_ad[key], classad.Value): del pure_ad[key] if isinstance(pure_ad[key], types.ListType): pure_ad[key] = [i.eval() for i in pure_ad[key]] except: pass ad = pure_ad ad['CRAB_AlgoArgs'] = json.loads(ad["CRAB_AlgoArgs"]) ad['tm_split_args'] = ad["CRAB_AlgoArgs"] ad['tarball_location'] = os.environ.get('CRAB_TARBALL_LOCATION', '') print "TaskManagerBootstrap got this ad:" pprint.pprint(ad) if command == "DBS": task = DBSDataDiscovery.DBSDataDiscovery(config) elif command == "SPLIT": task = Splitter.Splitter(config) print "Got this result from the splitter" pprint.pprint(task) results = task.execute(in_args, task=ad).result if command == "SPLIT": results = DagmanCreator.create_subdag(results, task=ad) print results with open(outfile, "w") as fd: pickle.dump(results, fd) return 0
def main(): """ Need a doc string here. """ printLog("Starting AdjustSites") with open(os.environ['_CONDOR_JOB_AD']) as fd: ad = classad.parseOld(fd) printLog("Parsed ad: %s" % ad) makeWebDir(ad) printLog("Webdir has been set up. Uploading the webdir URL to the REST") retries = 0 exitCode = 1 while retries < 3 and exitCode != 0: exitCode = updateWebDir(ad) if exitCode != 0: time.sleep(retries * 20) retries += 1 printLog( "Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode) saveProxiedWebdir(ad) printLog( "Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions" ) clearAutomaticBlacklist() resubmitJobIds = [] if 'CRAB_ResubmitList' in ad: resubmitJobIds = ad['CRAB_ResubmitList'] try: resubmitJobIds = set(resubmitJobIds) resubmitJobIds = [str(i) for i in resubmitJobIds] except TypeError: resubmitJobIds = True if resubmitJobIds: adjustedJobIds = [] if hasattr(htcondor, 'lock'): # While dagman is not running at this point, the schedd may be writing events to this # file; hence, we only edit the file while holding an appropriate lock. # Note this lock method didn't exist until 8.1.6; prior to this, we simply # run dangerously. with htcondor.lock(open("RunJobs.dag.nodes.log", 'a'), htcondor.LockType.WriteLock): adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds) else: adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds) ## Adjust the maximum allowed number of retries only for the job ids for which ## the POST script exit status was adjusted. Why only for these job ids and not ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as ## a general rule means "all failed job ids", we don't have a way to know if a ## job is in failed status or not just from the RunJobs.dag file, while job ids ## in adjustedJobIds correspond only to failed jobs. adjustMaxRetries(adjustedJobIds, ad) if 'CRAB_SiteAdUpdate' in ad: newSiteAd = ad['CRAB_SiteAdUpdate'] with open("site.ad") as fd: siteAd = classad.parse(fd) siteAd.update(newSiteAd) with open("site.ad", "w") as fd: fd.write(str(siteAd)) printLog("Exiting AdjustSite")
def main(): """ Need a doc string here. """ setupLog() if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists( os.environ["_CONDOR_JOB_AD"]): printLog( "Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist" ) sys.exit(0) printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" % os.environ['_CONDOR_JOB_AD']) with open(os.environ['_CONDOR_JOB_AD']) as fd: ad = classad.parseOld(fd) printLog("Parsed ad: %s" % ad) makeWebDir(ad) printLog("Webdir has been set up. Uploading the webdir URL to the REST") retries = 0 exitCode = 1 while retries < 3 and exitCode != 0: exitCode = updateWebDir(ad) if exitCode != 0: time.sleep(retries * 20) retries += 1 if exitCode != 0: printLog( "Exiting AdjustSites because the webdir upload failed three times." ) sys.exit(1) printLog( "Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode) saveProxiedWebdir(ad) printLog( "Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions" ) clearAutomaticBlacklist() resubmitJobIds = [] if 'CRAB_ResubmitList' in ad: resubmitJobIds = ad['CRAB_ResubmitList'] try: resubmitJobIds = set(resubmitJobIds) resubmitJobIds = [str(i) for i in resubmitJobIds] except TypeError: resubmitJobIds = True # Hold and release processing and tail DAGs here so that modifications # to the submission and log files will be picked up. schedd = htcondor.Schedd() tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote( ad.get("CRAB_ReqName")) if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic': printLog("Holding processing and tail DAGs") schedd.edit(tailconst, "HoldKillSig", 'SIGKILL') schedd.act(htcondor.JobAction.Hold, tailconst) if resubmitJobIds: adjustedJobIds = [] filenames = getGlob(ad, "RunJobs.dag.nodes.log", "RunJobs[1-9]*.subdag.nodes.log") for fn in filenames: if hasattr(htcondor, 'lock'): # While dagman is not running at this point, the schedd may be writing events to this # file; hence, we only edit the file while holding an appropriate lock. # Note this lock method didn't exist until 8.1.6; prior to this, we simply # run dangerously. with htcondor.lock(open(fn, 'a'), htcondor.LockType.WriteLock): adjustedJobIds.extend( adjustPostScriptExitStatus(resubmitJobIds, fn)) else: adjustedJobIds.extend( adjustPostScriptExitStatus(resubmitJobIds, fn)) ## Adjust the maximum allowed number of retries only for the job ids for which ## the POST script exit status was adjusted. Why only for these job ids and not ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as ## a general rule means "all failed job ids", we don't have a way to know if a ## job is in failed status or not just from the RunJobs.dag file, while job ids ## in adjustedJobIds correspond only to failed jobs. adjustMaxRetries(adjustedJobIds, ad) if 'CRAB_SiteAdUpdate' in ad: newSiteAd = ad['CRAB_SiteAdUpdate'] with open("site.ad") as fd: siteAd = classad.parse(fd) siteAd.update(newSiteAd) with open("site.ad", "w") as fd: fd.write(str(siteAd)) if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic': printLog("Releasing processing and tail DAGs") schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1') schedd.act(htcondor.JobAction.Release, tailconst) printLog("Exiting AdjustSite")
def getStatus():
    base_dir = os.environ.get("_CONDOR_SCRATCH_DIR", os.getcwd())
    fd = open(os.path.join(base_dir, ".machine.ad"))
    machineAd = classad.parseOld(fd)
    return htcondor.Collector().query(htcondor.AdTypes.Startd, "Name =?= %s" % machineAd.lookup("Name").__str__())[0]
try: fp = open(".pilot.ad") st = os.fstat(fp.fileno()) ad["AD_FOUND"] = classad.ExprTree("true") if launch_time - st.st_mtime < 600: ad["AD_FRESH"] = classad.ExprTree("true") else: print "Pilot ad too old" except IOError, oe: if oe.errno == errno.ENOENT: print "No pilot ad available" else: raise if not fp: return ad pilot_ad = classad.parseOld(fp) for key in pilot_ad: if key not in ad: ad[key] = pilot_ad.lookup(key) return ad def main(): ad = getAd() global chirp_verb for attr in ad.keys(): val = ad.lookup(attr) attr = "PILOT_" + attr if chirp(attr, val) and chirp_verb == "set_job_attr_delayed": chirp_verb = "set_job_attr" retval = chirp(attr, val)
def main(): """ Need a doc string here. """ printLog("Starting AdjustSites") with open(os.environ["_CONDOR_JOB_AD"]) as fd: ad = classad.parseOld(fd) printLog("Parsed ad: %s" % ad) makeWebDir(ad) printLog("Webdir has been set up. Uploading the webdir URL to the REST") retries = 0 exitCode = 1 while retries < 3 and exitCode != 0: exitCode = updateWebDir(ad) if exitCode != 0: time.sleep(retries * 20) retries += 1 printLog("Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode) saveProxiedWebdir(ad) printLog( "Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions" ) clearAutomaticBlacklist() resubmitJobIds = [] if "CRAB_ResubmitList" in ad: resubmitJobIds = ad["CRAB_ResubmitList"] try: resubmitJobIds = set(resubmitJobIds) resubmitJobIds = [str(i) for i in resubmitJobIds] except TypeError: resubmitJobIds = True if resubmitJobIds: adjustedJobIds = [] if hasattr(htcondor, "lock"): # While dagman is not running at this point, the schedd may be writing events to this # file; hence, we only edit the file while holding an appropriate lock. # Note this lock method didn't exist until 8.1.6; prior to this, we simply # run dangerously. with htcondor.lock(open("RunJobs.dag.nodes.log", "a"), htcondor.LockType.WriteLock): adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds) else: adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds) ## Adjust the maximum allowed number of retries only for the job ids for which ## the POST script exit status was adjusted. Why only for these job ids and not ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as ## a general rule means "all failed job ids", we don't have a way to know if a ## job is in failed status or not just from the RunJobs.dag file, while job ids ## in adjustedJobIds correspond only to failed jobs. adjustMaxRetries(adjustedJobIds, ad) if "CRAB_SiteAdUpdate" in ad: newSiteAd = ad["CRAB_SiteAdUpdate"] with open("site.ad") as fd: siteAd = classad.parse(fd) siteAd.update(newSiteAd) with open("site.ad", "w") as fd: fd.write(str(siteAd)) printLog("Exiting AdjustSite")