def __init__(
    self,
    job_queue,
    machine_ad: str = machine_ad_defaults,
    job_ad: str = job_ad_defaults,
    pre_job_rank: str = "0",
    interval: float = 60,
    autocluster: bool = False,
):
    """
    Initializes the CondorClassadJobScheduler

    :param job_queue: queue of jobs that are scheduled in the following simulation
    :param machine_ad: ClassAd that is used with every drone
    :param job_ad: ClassAd that is used with every job
    :param pre_job_rank: ClassAd attribute that all drones are sorted by
    :param interval: time between scheduling cycles
    :param autocluster: could be used to decide whether to use autoclusters
    """
    self._stream_queue = job_queue
    self._drones: RankedClusters[Drone] = RankedNonClusters(
        quantization=quantization_defaults, ranking=parse(pre_job_rank)
    )
    self.interval = interval
    self.job_queue = JobQueue()
    self._collecting = True
    self._processing = Resources(jobs=0)  # temporary solution
    self._wrapped_classads = WeakKeyDictionary()
    self._machine_classad = parse(machine_ad)
    self._job_classad = parse(job_ad)
def testNetworkAccounting(self):
    jobqueue_log_dir = os.path.join(os.getcwd(), "tests_tmp/spool")
    filelist = [f for f in os.listdir(jobqueue_log_dir) if f.startswith("job_queue.log")]
    for f in filelist:
        print "Removing", f
        os.remove(os.path.join(jobqueue_log_dir, f))
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "lark_test_2.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    ad = classad.parse(open("tests/lark_submit_2.ad"))
    ads = []
    cluster = schedd.submit(ad, 1, False, ads)
    for i in range(60):
        ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus"])
        job_ad = ads[0]
        if job_ad["JobStatus"] == 4:
            break
        if i % 2 == 0:
            schedd.reschedule()
        time.sleep(1)
    ads = schedd.query("ClusterId == %d" % cluster, [])
    print ads[0]
    self.assertTrue("NetworkIncoming" in ads[0].keys() and ads[0]["NetworkIncoming"] > 0)
def test_load_classad_from_file(self):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        ad = classad.parse(open("tests/test.ad"))
    self.assertEqual(ad["foo"], "bar")
    self.assertEqual(ad["baz"], classad.Value.Undefined)
    self.assertRaises(KeyError, ad.__getitem__, "bar")
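# Illustration only (not part of the test suite): the same lookups shown on an
# in-memory ad built from a new-format ClassAd string, mirroring what the
# file-based test above checks. The ad contents are an assumption reconstructed
# from the assertions: "foo" is the string "bar", "baz" is explicitly undefined,
# and there is no "bar" attribute (so item access raises KeyError).
import classad

example_ad = classad.ClassAd('[ foo = "bar"; baz = undefined ]')
assert example_ad["foo"] == "bar"
assert example_ad["baz"] == classad.Value.Undefined
try:
    example_ad["bar"]          # missing attribute
except KeyError:
    pass                       # expected, as in assertRaises(KeyError, ...)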
def redo_sites(self, new_submit_file, id, automatic_blacklist):
    if os.path.exists("site.ad.json"):
        with open("site.ad.json") as fd:
            site_info = json.load(fd)
            group = site_info[str(id)]
            available = set(site_info['groups'][str(group)])
    else:
        with open("site.ad") as fd:
            site_ad = classad.parse(fd)
        available = set(site_ad['Job%d' % id])
    blacklist = set(self.task_ad['CRAB_SiteBlacklist'])
    blacklist.update(automatic_blacklist)
    whitelist = set(self.task_ad['CRAB_SiteWhitelist'])
    if 'CRAB_SiteResubmitWhitelist' in self.task_ad:
        whitelist.update(self.task_ad['CRAB_SiteResubmitWhitelist'])
    if 'CRAB_SiteResubmitBlacklist' in self.task_ad:
        blacklist.update(self.task_ad['CRAB_SiteResubmitBlacklist'])
    if whitelist:
        available &= whitelist
    # Never blacklist something on the whitelist
    available -= (blacklist - whitelist)
    new_submit_file = '+DESIRED_SITES="%s"\n%s' % (",".join(available), new_submit_file)
    return new_submit_file
def testNetworkPolicyNAT(self):
    jobqueue_log_dir = os.path.join(os.getcwd(), "tests_tmp/spool")
    filelist = [f for f in os.listdir(jobqueue_log_dir) if f.startswith("job_queue.log")]
    for f in filelist:
        os.remove(os.path.join(jobqueue_log_dir, f))
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "lark_test_1.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    coll = htcondor.Collector()
    ad = classad.parse(open("tests/lark_submit_1.ad"))
    ads = []
    cluster = schedd.submit(ad, 1, False, ads)
    #print ads[0]
    for i in range(60):
        #ads = coll.query(htcondor.AdTypes.Startd, "true", ["LarkNetworkType"])
        #ads = coll.query("true", ["LarkNetworkType"])
        ads = schedd.query("ClusterId == %d" % cluster, ["LarkNetworkType"])
        print ads
        if len(ads) != 0:
            if "LarkNetworkType" in ads[0].keys():
                break
        time.sleep(1)
    #machine_ad = classad.parseOld(open(output_file, "r"))
    self.assertTrue(len(ads) == 1)
    self.assertTrue("LarkNetworkType" in ads[0].keys())
    self.assertEquals(ads[0]["LarkNetworkType"], "nat")
def tryUpdate(self, shouldwork=True, wantio=None, wantupdate=None, prefix="NonChirp"):
    output_file = os.path.join(htcondor_tests.testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    ad = classad.parse(open("tests/delayed_submit.ad"))
    ad["Arguments"] = "--type=update --prefix=%s --shouldwork=%s" % (prefix, str(shouldwork))
    if wantio == True:
        ad["WantIOProxy"] = True
    elif wantio == False:
        ad["WantIOProxy"] = False
    if wantupdate == True:
        ad["WantRemoteUpdates"] = True
    elif wantupdate == False:
        ad["WantRemoteUpdates"] = False
    cluster = self.runJob(ad)
    result_ad = self.getLastHistory(cluster)
    self.assertTrue("ExitCode" in result_ad)
    self.assertEqual(result_ad["ExitCode"], 0)
    last_line = open(output_file).readlines()[-1]
    self.assertEqual(last_line, "SUCCESS\n")
    if shouldwork:
        attr = "%sFoo" % prefix
        self.assertTrue(attr in result_ad)
        self.assertEqual(result_ad[attr], 2)
def get_schedulers(filename):
    list_schedulers = []
    tmpfile_1sched = open_file(filename)
    sched = classad.parse(tmpfile_1sched)
    list_schedulers.append(sched)
    tmpfile_1sched.close()
    return list_schedulers
def testScheddNonblockingQuery(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    ad = classad.parse(open("tests/submit.ad"))
    ads = []
    cluster = schedd.submit(ad, 10, False, ads)
    for i in range(60):
        ads = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"])
        ads2 = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"])
        ctrs = [0, 0]
        iters = [(ads, 0), (ads2, 1)]
        while iters:
            for it, pos in iters:
                try:
                    it.next()
                    ctrs[pos] += 1
                except StopIteration:
                    iters.remove((it, pos))
        print ctrs
        if ctrs[0] == 0:
            break
        if i % 2 == 0:
            schedd.reschedule()
        time.sleep(1)
    self.assertEquals(open(output_file).read(), "hello world\n")
def testScheddNonblockingQueryCount(self):
    os.environ["_condor_SCHEDD_DEBUG"] = "D_FULLDEBUG|D_NETWORK"
    self.launch_daemons(["SCHEDD"])
    schedd = htcondor.Schedd()
    submit_ad = classad.parse(open("tests/submit_large.ad"))
    schedd.act(htcondor.JobAction.Remove, "true")
    time.sleep(1)
    # Wait for the removed jobs to actually leave the queue before starting.
    ads = schedd.query("true")
    while ads:
        time.sleep(.2)
        ads = schedd.query("true")
        #print ads
    for i in range(1, 60):
        print "Testing querying %d jobs in queue." % i
        schedd.submit(submit_ad, i, True, ads)
        ads = schedd.query("true", ["ClusterID", "ProcID"])
        ads2 = list(schedd.xquery("true", ["ClusterID", "ProcID", "a1", "a2", "a3", "a4"]))
        #print ads
        #print ads2
        self.assertNotEqual(ads2[0].lookup("ProcID"), classad.Value.Undefined)
        for ad in ads:
            found_ad = False
            for ad2 in ads2:
                if ad2["ProcID"] == ad["ProcID"] and ad2["ClusterID"] == ad["ClusterID"]:
                    found_ad = True
                    break
            self.assertTrue(found_ad, msg="Ad %s missing from xquery results: %s" % (ad, ads2))
        self.assertEquals(len(ads), i, msg="Old query protocol gives incorrect number of results (expected %d, got %d)" % (i, len(ads)))
        self.assertEquals(len(ads2), i, msg="New query protocol gives incorrect number of results (expected %d, got %d)" % (i, len(ads2)))
        schedd.act(htcondor.JobAction.Remove, "true")
        while ads:
            time.sleep(.2)
            ads = schedd.query("true")
def testScheddSubmitSpool(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    ad = classad.parse(open("tests/submit.ad"))
    result_ads = []
    cluster = schedd.submit(ad, 1, True, result_ads)
    #print result_ads[0]
    schedd.spool(result_ads)
    for i in range(60):
        ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus"])
        #print ads
        self.assertEquals(len(ads), 1)
        if ads[0]["JobStatus"] == 4:
            break
        if i % 5 == 0:
            schedd.reschedule()
        time.sleep(1)
    schedd.retrieve("ClusterId == %d" % cluster)
    #print "Final status:", schedd.query("ClusterId == %d" % cluster)[0]
    schedd.act(htcondor.JobAction.Remove, ["%d.0" % cluster])
    ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus"])
    self.assertEquals(len(ads), 0)
    self.assertEquals(open(output_file).read(), "hello world\n")
def get_worker_nodes(filename):
    worker_nodes = []
    tmpfile_wn = open_file(filename)
    wn = classad.parse(tmpfile_wn)
    worker_nodes.append(wn)
    tmpfile_wn.close()
    return worker_nodes
def classad_parse(inputstr):
    """Parse string into a classad.

    Uses classad.parseOne if available (HTCondor 8.3+),
    and classad.parse otherwise (HTCondor 8.2, deprecated in 8.3).
    """
    if hasattr(classad, 'parseOne'):
        return classad.parseOne(inputstr)
    else:
        return classad.parse(inputstr)
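# A minimal usage sketch for the compatibility helper above. The ad text is an
# illustrative new-format ClassAd string, not taken from the original code;
# both classad.parseOne (8.3+) and the deprecated classad.parse accept a string,
# so the helper behaves the same on either binding version.
import classad

ad = classad_parse('[ Cmd = "/bin/echo"; Arguments = "hello world" ]')
print(ad["Cmd"])        # -> /bin/echo
print(ad["Arguments"])  # -> hello world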
def testNegotiate(self):
    #htcondor.param['TOOL_DEBUG'] = 'D_FULLDEBUG'
    #os.environ['_condor_SCHEDD_DEBUG'] = 'D_FULLDEBUG, D_NETWORK'
    #htcondor.enable_debug()
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD"])
    output_file = os.path.join(testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    schedd.act(htcondor.JobAction.Remove, 'true')
    ad = classad.parse(open("tests/submit.ad"))
    ads = []
    cluster = schedd.submit(ad, 1, False, ads)
    # Get claim for startd
    claim_ads = []
    for i in range(10):
        startd_ads = htcondor.Collector().locateAll(htcondor.DaemonTypes.Startd)
        private_ads = htcondor.Collector().query(htcondor.AdTypes.StartdPrivate)
        if (len(startd_ads) != htcondor.param['NUM_CPUS']) or (len(private_ads) != htcondor.param['NUM_CPUS']):
            time.sleep(1)
            continue
        break
    self.assertEquals(len(startd_ads), len(private_ads))
    self.assertEquals(len(startd_ads), htcondor.param['NUM_CPUS'])
    for ad in htcondor.Collector().locateAll(htcondor.DaemonTypes.Startd):
        for pvt_ad in private_ads:
            if pvt_ad.get('Name') == ad['Name']:
                ad['ClaimId'] = pvt_ad['Capability']
                claim_ads.append(ad)
    self.assertEquals(len(claim_ads), len(startd_ads))
    claim = claim_ads[0]
    me = "%s@%s" % (pwd.getpwuid(os.geteuid()).pw_name, htcondor.param['UID_DOMAIN'])
    with schedd.negotiate(me) as session:
        requests = list(session)
        self.assertEquals(len(requests), 1)
        request = requests[0]
        self.assertTrue(request.symmetricMatch(claim))
        session.sendClaim(claim['ClaimId'], claim, request)
    for i in range(60):
        ads = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"])
        ads = list(ads)
        if len(ads) == 0:
            break
        time.sleep(1)
    self.assertEquals(open(output_file).read(), "hello world\n")
def get_jobs_scheduled_attributes(maxrange):
    list_jobs_scheduled_attr = []
    # gather scheduled jobs from respective file job$n.txt
    # files are generated by generate_job_sched.py
    for i in range(0, maxrange):
        filename = 'test-files/job' + str(i) + '.txt'
        tmpfile_1line = open_file(filename)
        tmpobj = classad.parse(tmpfile_1line)
        list_jobs_scheduled_attr.append(tmpobj)
        tmpfile_1line.close()
    return list_jobs_scheduled_attr
def main():
    """
    Parse the job ad, set up the web directory and upload its URL, clear the
    automatic site blacklist, and apply resubmission and site.ad updates.
    """
    ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD']))
    makeWebDir(ad)

    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries*20)
        retries += 1

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True

    if resubmitJobIds:
        adjustedJobIds = []
        if hasattr(htcondor, 'lock'):
            # While dagman is not running at this point, the schedd may be writing events to this
            # file; hence, we only edit the file while holding an appropriate lock.
            # Note this lock method didn't exist until 8.1.6; prior to this, we simply
            # run dangerously.
            with htcondor.lock(open("RunJobs.dag.nodes.log", 'a'), htcondor.LockType.WriteLock) as lock:
                adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        else:
            adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))
def testScheddNonblockingQueryRemove(self):
    os.environ["_condor_SCHEDD_DEBUG"] = "D_FULLDEBUG|D_NETWORK"
    self.launch_daemons(["SCHEDD"])
    schedd = htcondor.Schedd()
    submit_ad = classad.parse(open("tests/submit.ad"))
    ads = []
    cluster = schedd.submit(submit_ad, 300, False, ads)
    ads = schedd.xquery("ClusterId == %d" % cluster)
    print str(datetime.datetime.now())
    print str(datetime.datetime.now())
    schedd.act(htcondor.JobAction.Remove, "ClusterId == %d" % cluster)
    time.sleep(3)
    print str(datetime.datetime.now())
    print len(list(ads))
    print str(datetime.datetime.now())
def parseSiteAd(self, fp, task_ad, nodes):
    site_ad = classad.parse(fp)
    blacklist = set(task_ad['CRAB_SiteBlacklist'])
    whitelist = set(task_ad['CRAB_SiteWhitelist'])
    for key, val in site_ad.items():
        m = self.job_name_re.match(key)
        if not m:
            continue
        nodeid = m.groups()[0]
        sites = set(val.eval())
        if whitelist:
            sites &= whitelist
        # Never blacklist something on the whitelist
        sites -= (blacklist - whitelist)
        info = nodes.setdefault(nodeid, {})
        info['AvailableSites'] = list([i.eval() for i in sites])
def tryIO(self, shouldwork=True, wantio=None):
    open(os.path.join(htcondor_tests.testdir, "test_chirp_io"), "w").write("hello world")
    output_file = os.path.join(htcondor_tests.testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    ad = classad.parse(open("tests/delayed_submit.ad"))
    ad["Arguments"] = "--type=io --shouldwork=%s" % str(shouldwork)
    if wantio == True:
        ad["WantIOProxy"] = True
    elif wantio == False:
        ad["WantIOProxy"] = False
    cluster = self.runJob(ad)
    result_ad = self.getLastHistory(cluster)
    self.assertTrue("ExitCode" in result_ad)
    self.assertEquals(result_ad["ExitCode"], 0)
    last_line = open(output_file).readlines()[-1]
    self.assertEquals(last_line, "SUCCESS\n")
def tryIO(self, shouldwork=True, wantio=None):
    open(os.path.join(htcondor_tests.testdir, "test_chirp_io"), "w").write("hello world")
    output_file = os.path.join(htcondor_tests.testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    ad = classad.parse(open("tests/delayed_submit.ad"))
    ad["Arguments"] = "--type=io --shouldwork=%s" % str(shouldwork)
    # wantio is tri-state: True/False set the attribute explicitly, None leaves it unset.
    if wantio is True:
        ad["WantIOProxy"] = True
    elif wantio is False:
        ad["WantIOProxy"] = False
    cluster = self.runJob(ad)
    result_ad = self.getLastHistory(cluster)
    self.assertTrue("ExitCode" in result_ad)
    self.assertEqual(result_ad["ExitCode"], 0)
    last_line = open(output_file).readlines()[-1]
    self.assertEqual(last_line, "SUCCESS\n")
def testDelayedUpdateDOS(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(htcondor_tests.testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    ad = classad.parse(open("tests/delayed_submit.ad"))
    ad["Arguments"] = "--type=delayeddos"
    cluster = self.runJob(ad)
    result_ad = self.getLastHistory(cluster)
    self.assertTrue("ExitCode" in result_ad)
    self.assertEquals(result_ad["ExitCode"], 0)
    last_line = open(output_file).readlines()[-1]
    self.assertEquals(last_line, "SUCCESS\n")
    self.assertTrue("ChirpFoo" in result_ad)
    self.assertEquals(result_ad["ChirpFoo"], "0" * 990)
    self.assertFalse("ChirpBar" in result_ad)
    for i in range(1, 50):
        self.assertTrue(("ChirpFoo%d" % i) in result_ad)
    self.assertFalse("ChirpFoo50" in result_ad)
def testDelayedUpdateDOS(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(htcondor_tests.testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    ad = classad.parse(open("tests/delayed_submit.ad"))
    ad["Arguments"] = "--type=delayeddos"
    cluster = self.runJob(ad)
    result_ad = self.getLastHistory(cluster)
    self.assertTrue("ExitCode" in result_ad)
    self.assertEqual(result_ad["ExitCode"], 0)
    last_line = open(output_file).readlines()[-1]
    self.assertEqual(last_line, "SUCCESS\n")
    self.assertTrue("ChirpFoo" in result_ad)
    self.assertEqual(result_ad["ChirpFoo"], "0" * 990)
    self.assertFalse("ChirpBar" in result_ad)
    for i in range(1, 50):
        self.assertTrue(("ChirpFoo%d" % i) in result_ad)
    self.assertFalse("ChirpFoo50" in result_ad)
def testScheddSubmit(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    ad = classad.parse(open("tests/submit.ad"))
    ads = []
    cluster = schedd.submit(ad, 1, False, ads)
    #print ads[0]
    for i in range(60):
        ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus"])
        #print ads
        if len(ads) == 0:
            break
        if i % 2 == 0:
            schedd.reschedule()
        time.sleep(1)
    self.assertEquals(open(output_file).read(), "hello world\n")
def tryDelayedUpdate(self, prefix="Chirp", shouldwork=True, wantupdate=None):
    output_file = os.path.join(htcondor_tests.testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    ad = classad.parse(open("tests/delayed_submit.ad"))
    ad["Arguments"] = "--prefix=%s --type=delayed --shouldwork=%s" % (prefix, str(shouldwork))
    if wantupdate == True:
        ad["WantDelayedUpdates"] = True
    elif wantupdate == False:
        ad["WantDelayedUpdates"] = False
    cluster = self.runJob(ad)
    result_ad = self.getLastHistory(cluster)
    attr = "%sFoo" % prefix
    self.assertTrue("ExitCode" in result_ad)
    self.assertEquals(result_ad["ExitCode"], 0)
    last_line = open(output_file).readlines()[-1]
    self.assertEquals(last_line, "SUCCESS\n")
    if shouldwork:
        self.assertTrue(attr in result_ad)
        self.assertEquals(result_ad[attr], 2)
def parseSiteAd(self, fp, task_ad, nodes):
    site_ad = classad.parse(fp)
    blacklist = set(task_ad['CRAB_SiteBlacklist'])
    whitelist = set(task_ad['CRAB_SiteWhitelist'])
    if 'CRAB_SiteResubmitWhitelist' in task_ad:
        whitelist.update(task_ad['CRAB_SiteResubmitWhitelist'])
    if 'CRAB_SiteResubmitBlacklist' in task_ad:
        blacklist.update(task_ad['CRAB_SiteResubmitBlacklist'])
    for key, val in site_ad.items():
        m = self.job_name_re.match(key)
        if not m:
            continue
        nodeid = m.groups()[0]
        sites = set(val.eval())
        if whitelist:
            sites &= whitelist
        # Never blacklist something on the whitelist
        sites -= (blacklist - whitelist)
        info = nodes.setdefault(nodeid, {})
        info['AvailableSites'] = list([i.eval() for i in sites])
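# A plain-Python illustration (not from the original code) of the white/black
# list set arithmetic used above: the whitelist restricts the candidate sites,
# and blacklisted sites are removed unless they are also whitelisted. The site
# names are made up for the example.
sites = {"T2_US_Nebraska", "T2_DE_DESY", "T2_IT_Bari", "T1_US_FNAL"}
whitelist = {"T2_US_Nebraska", "T2_DE_DESY", "T2_IT_Bari"}
blacklist = {"T2_IT_Bari", "T1_US_FNAL"}

if whitelist:
    sites &= whitelist
# Never blacklist something on the whitelist:
# T2_IT_Bari is blacklisted but whitelisted, so it survives.
sites -= (blacklist - whitelist)
assert sites == {"T2_US_Nebraska", "T2_DE_DESY", "T2_IT_Bari"}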
def testNetworkPolicyOVS(self):
    jobqueue_log_dir = os.path.join(os.getcwd(), "tests_tmp/spool")
    if os.path.exists(jobqueue_log_dir):
        filelist = [f for f in os.listdir(jobqueue_log_dir) if f.startswith("job_queue.log")]
        for f in filelist:
            print "Removing", f
            os.remove(os.path.join(jobqueue_log_dir, f))
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "lark_test_4.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    fd = os.popen("curl www.google.com")
    content = fd.read()
    schedd = htcondor.Schedd()
    ad = classad.parse(open("tests/lark_submit_4.ad"))
    ads = []
    cluster = schedd.submit(ad, 1, False, ads)
    for i in range(60):
        ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus"])
        print ads
        if len(ads) == 0:
            break
        time.sleep(1)
    self.assertEquals(open(output_file).read(), content)
def redo_sites(self, new_submit_text, crab_retry, use_resubmit_info):
    """
    Re-define the set of sites where the job can run on by taking into account
    any site-white-list and site-black-list.
    """
    ## If there is an automatic site blacklist, add it to the Job.<job_id>.submit
    ## content.
    automatic_siteblacklist = self.calculate_blacklist()
    if automatic_siteblacklist:
        self.task_ad['CRAB_SiteAutomaticBlacklist'] = automatic_siteblacklist
        new_submit_text += '+CRAB_SiteAutomaticBlacklist = %s\n' % str(self.task_ad.lookup('CRAB_SiteAutomaticBlacklist'))
    ## Get the site black- and whitelists either from the task ad or from
    ## self.resubmit_info.
    siteblacklist = set()
    sitewhitelist = set()
    if not use_resubmit_info:
        if 'CRAB_SiteBlacklist' in self.task_ad:
            siteblacklist = set(self.task_ad['CRAB_SiteBlacklist'])
        if 'CRAB_SiteWhitelist' in self.task_ad:
            sitewhitelist = set(self.task_ad['CRAB_SiteWhitelist'])
    else:
        inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
        while inkey not in self.resubmit_info and int(inkey) > 0:
            inkey = str(int(inkey) - 1)
        siteblacklist = set(self.resubmit_info[inkey].get('site_blacklist', []))
        sitewhitelist = set(self.resubmit_info[inkey].get('site_whitelist', []))
    ## Save the current site black- and whitelists in self.resubmit_info for the
    ## current job retry number.
    outkey = str(crab_retry)
    if outkey not in self.resubmit_info:
        self.resubmit_info[outkey] = {}
    self.resubmit_info[outkey]['site_blacklist'] = list(siteblacklist)
    self.resubmit_info[outkey]['site_whitelist'] = list(sitewhitelist)
    ## Add the current site black- and whitelists to the Job.<job_id>.submit
    ## content.
    if siteblacklist:
        new_submit_text += '+CRAB_SiteBlacklist = {"%s"}\n' % ('", "'.join(siteblacklist))
    else:
        new_submit_text += '+CRAB_SiteBlacklist = {}\n'
    if sitewhitelist:
        new_submit_text += '+CRAB_SiteWhitelist = {"%s"}\n' % ('", "'.join(sitewhitelist))
    else:
        new_submit_text += '+CRAB_SiteWhitelist = {}\n'
    ## Get the list of available sites (the sites where this job could run).
    if os.path.exists("site.ad.json"):
        with open("site.ad.json") as fd:
            site_info = json.load(fd)
        group = site_info[str(self.job_id)]
        available = set(site_info['group_sites'][str(group)])
        datasites = set(site_info['group_datasites'][str(group)])
    else:
        with open("site.ad") as fd:
            site_ad = classad.parse(fd)
        available = set(site_ad['Job%d' % (self.job_id)])
    ## Take the intersection between the available sites and the site whitelist.
    ## This is the new set of available sites.
    if sitewhitelist:
        available &= sitewhitelist
    ## Remove from the available sites the ones that are in the site blacklist,
    ## unless they are also in the site whitelist (i.e. never blacklist something
    ## on the whitelist).
    siteblacklist.update(automatic_siteblacklist)
    available -= (siteblacklist - sitewhitelist)
    ## Add DESIRED_SITES to the Job.<job_id>.submit content.
    new_submit_text = '+DESIRED_SITES="%s"\n%s' % (",".join(available), new_submit_text)
    new_submit_text = '+DESIRED_CMSDataLocations="%s"\n%s' % (",".join(datasites), new_submit_text)
    return new_submit_text
def parsePoolAd(self, fp):
    pool_ad = classad.parse(fp)
def test_load_classad_from_file(self):
    ad = classad.parse(open("tests/test.ad"))
    self.assertEqual(ad["foo"], "bar")
    self.assertEqual(ad["baz"], classad.Value.Undefined)
    self.assertRaises(KeyError, ad.__getitem__, "bar")
def testTransaction(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "test.out")
    log_file = os.path.join(testdir, "test.log")
    if os.path.exists(output_file):
        os.unlink(output_file)
    if os.path.exists(log_file):
        os.unlink(log_file)
    schedd = htcondor.Schedd()
    ad = classad.parse(open("tests/submit_sleep.ad"))
    result_ads = []
    cluster = schedd.submit(ad, 1, True, result_ads)
    with schedd.transaction() as txn:
        schedd.edit(["%d.0" % cluster], 'foo', classad.Literal(1))
        schedd.edit(["%d.0" % cluster], 'bar', classad.Literal(2))
    ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus", 'foo', 'bar'])
    self.assertEquals(len(ads), 1)
    self.assertEquals(ads[0]['foo'], 1)
    self.assertEquals(ads[0]['bar'], 2)
    with schedd.transaction() as txn:
        schedd.edit(["%d.0" % cluster], 'baz', classad.Literal(3))
        with schedd.transaction(htcondor.TransactionFlags.NonDurable | htcondor.TransactionFlags.ShouldLog, True) as txn:
            schedd.edit(["%d.0" % cluster], 'foo', classad.Literal(4))
            schedd.edit(["%d.0" % cluster], 'bar', classad.Literal(5))
    ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus", 'foo', 'bar', 'baz'])
    self.assertEquals(len(ads), 1)
    self.assertEquals(ads[0]['foo'], 4)
    self.assertEquals(ads[0]['bar'], 5)
    self.assertEquals(ads[0]['baz'], 3)
    try:
        with schedd.transaction() as txn:
            schedd.edit(["%d.0" % cluster], 'foo', classad.Literal(6))
            schedd.edit(["%d.0" % cluster], 'bar', classad.Literal(7))
            raise Exception("force abort")
    except:
        exctype, e = sys.exc_info()[:2]
        if not issubclass(exctype, Exception):
            raise
        self.assertEquals(str(e), "force abort")
    ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus", 'foo', 'bar'])
    self.assertEquals(len(ads), 1)
    self.assertEquals(ads[0]['foo'], 4)
    self.assertEquals(ads[0]['bar'], 5)
    try:
        with schedd.transaction() as txn:
            schedd.edit(["%d.0" % cluster], 'baz', classad.Literal(8))
            with schedd.transaction(htcondor.TransactionFlags.NonDurable | htcondor.TransactionFlags.ShouldLog, True) as txn:
                schedd.edit(["%d.0" % cluster], 'foo', classad.Literal(9))
                schedd.edit(["%d.0" % cluster], 'bar', classad.Literal(10))
                raise Exception("force abort")
    except:
        exctype, e = sys.exc_info()[:2]
        if not issubclass(exctype, Exception):
            raise
        self.assertEquals(str(e), "force abort")
    ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus", 'foo', 'bar', 'baz'])
    self.assertEquals(len(ads), 1)
    self.assertEquals(ads[0]['foo'], 4)
    self.assertEquals(ads[0]['bar'], 5)
    self.assertEquals(ads[0]['baz'], 3)
    schedd.act(htcondor.JobAction.Remove, ["%d.0" % cluster])
    ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus"])
    self.assertEquals(len(ads), 0)
    if resubmitJobIds:
        if hasattr(htcondor, 'lock'):
            # While dagman is not running at this point, the schedd may be writing events to this
            # file; hence, we only edit the file while holding an appropriate lock.
            # Note this lock method didn't exist until 8.1.6; prior to this, we simply
            # run dangerously.
            with htcondor.lock(open("RunJobs.dag.nodes.log", 'a'), htcondor.LockType.WriteLock) as lock:
                adjustPostScriptExitStatus(resubmitJobIds)
        else:
            adjustPostScriptExitStatus(resubmitJobIds)
        adjustMaxRetries(resubmitJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    dagJobId = '%d.%d' % (ad['ClusterId'], ad['ProcId'])
    ad['foo'] = []
    try:
        ## Is CRAB_ResubmitList the attribute we want to edit ?
        ## Or is it CRAB_SiteAdUpdate ?
        htcondor.Schedd().edit([dagJobId], 'CRAB_ResubmitList', ad['foo'])
    except RuntimeError, reerror:
        print "ERROR: %s" % str(reerror)


if __name__ == '__main__':
    main()
def main():
    """
    Parse the job ad, set up the web directory and upload its URL, clear the
    automatic site blacklist, and handle resubmission bookkeeping and site.ad
    updates for the task.
    """
    printLog("Starting AdjustSites")

    with open(os.environ["_CONDOR_JOB_AD"]) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)
    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries * 20)
        retries += 1
    printLog("Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode)

    saveProxiedWebdir(ad)
    printLog("Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions")

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if "CRAB_ResubmitList" in ad:
        resubmitJobIds = ad["CRAB_ResubmitList"]
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True

    if resubmitJobIds:
        adjustedJobIds = []
        if hasattr(htcondor, "lock"):
            # While dagman is not running at this point, the schedd may be writing events to this
            # file; hence, we only edit the file while holding an appropriate lock.
            # Note this lock method didn't exist until 8.1.6; prior to this, we simply
            # run dangerously.
            with htcondor.lock(open("RunJobs.dag.nodes.log", "a"), htcondor.LockType.WriteLock):
                adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        else:
            adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if "CRAB_SiteAdUpdate" in ad:
        newSiteAd = ad["CRAB_SiteAdUpdate"]
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    printLog("Exiting AdjustSite")
def main():
    """
    Parse the job ad, set up the web directory and upload its URL, clear the
    automatic site blacklist, and handle resubmission bookkeeping and site.ad
    updates for the task.
    """
    setupLog()

    if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists(os.environ["_CONDOR_JOB_AD"]):
        printLog("Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist")
        sys.exit(0)

    printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" % os.environ['_CONDOR_JOB_AD'])

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)
    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries * 20)
        retries += 1
    if exitCode != 0:
        printLog("Exiting AdjustSites because the webdir upload failed three times.")
        sys.exit(1)
    printLog("Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode)

    saveProxiedWebdir(ad)
    printLog("Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions")

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True

    # Hold and release processing and tail DAGs here so that modifications
    # to the submission and log files will be picked up.
    schedd = htcondor.Schedd()
    tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote(ad.get("CRAB_ReqName"))
    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Holding processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGKILL')
        schedd.act(htcondor.JobAction.Hold, tailconst)

    if resubmitJobIds:
        adjustedJobIds = []
        filenames = getGlob(ad, "RunJobs.dag.nodes.log", "RunJobs[1-9]*.subdag.nodes.log")
        for fn in filenames:
            if hasattr(htcondor, 'lock'):
                # While dagman is not running at this point, the schedd may be writing events to this
                # file; hence, we only edit the file while holding an appropriate lock.
                # Note this lock method didn't exist until 8.1.6; prior to this, we simply
                # run dangerously.
                with htcondor.lock(open(fn, 'a'), htcondor.LockType.WriteLock):
                    adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn))
            else:
                adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn))
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Releasing processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1')
        schedd.act(htcondor.JobAction.Release, tailconst)

    printLog("Exiting AdjustSite")
def redo_sites(self, new_submit_text, crab_retry, use_resubmit_info):
    """
    Re-define the set of sites where the job can run on by taking into account
    any site-white-list and site-black-list.
    """
    ## If there is an automatic site blacklist, add it to the Job.<job_id>.submit
    ## content.
    automatic_siteblacklist = self.calculate_blacklist()
    if automatic_siteblacklist:
        self.task_ad['CRAB_SiteAutomaticBlacklist'] = automatic_siteblacklist
        new_submit_text += '+CRAB_SiteAutomaticBlacklist = %s\n' % str(self.task_ad.lookup('CRAB_SiteAutomaticBlacklist'))
    ## Get the site black- and whitelists either from the task ad or from
    ## self.resubmit_info.
    siteblacklist = set()
    sitewhitelist = set()
    if not use_resubmit_info:
        if 'CRAB_SiteBlacklist' in self.task_ad:
            siteblacklist = set(self.task_ad['CRAB_SiteBlacklist'])
        if 'CRAB_SiteWhitelist' in self.task_ad:
            sitewhitelist = set(self.task_ad['CRAB_SiteWhitelist'])
    else:
        inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
        while inkey not in self.resubmit_info and int(inkey) > 0:
            inkey = str(int(inkey) - 1)
        siteblacklist = set(self.resubmit_info[inkey].get('site_blacklist', []))
        sitewhitelist = set(self.resubmit_info[inkey].get('site_whitelist', []))
    ## Save the current site black- and whitelists in self.resubmit_info for the
    ## current job retry number.
    outkey = str(crab_retry)
    if outkey not in self.resubmit_info:
        self.resubmit_info[outkey] = {}
    self.resubmit_info[outkey]['site_blacklist'] = list(siteblacklist)
    self.resubmit_info[outkey]['site_whitelist'] = list(sitewhitelist)
    ## Add the current site black- and whitelists to the Job.<job_id>.submit
    ## content.
    if siteblacklist:
        new_submit_text += '+CRAB_SiteBlacklist = {"%s"}\n' % ('", "'.join(siteblacklist))
    else:
        new_submit_text += '+CRAB_SiteBlacklist = {}\n'
    if sitewhitelist:
        new_submit_text += '+CRAB_SiteWhitelist = {"%s"}\n' % ('", "'.join(sitewhitelist))
    else:
        new_submit_text += '+CRAB_SiteWhitelist = {}\n'
    ## Get the list of available sites (the sites where this job could run).
    if os.path.exists("site.ad.json"):
        with open("site.ad.json") as fd:
            site_info = json.load(fd)
        group = site_info[self.job_id]
        available = set(site_info['group_sites'][str(group)])
        datasites = set(site_info['group_datasites'][str(group)])
    else:
        with open("site.ad") as fd:
            site_ad = classad.parse(fd)
        available = set(site_ad['Job%s' % (self.job_id)])
    ## Take the intersection between the available sites and the site whitelist.
    ## This is the new set of available sites.
    if sitewhitelist:
        available &= sitewhitelist
    ## Remove from the available sites the ones that are in the site blacklist,
    ## unless they are also in the site whitelist (i.e. never blacklist something
    ## on the whitelist).
    siteblacklist.update(automatic_siteblacklist)
    available -= (siteblacklist - sitewhitelist)
    if not available:
        self.logger.error("Can not submit since DESIRED_Sites list is empty")
        self.prejob_exit_code = 1
        sys.exit(self.prejob_exit_code)
    ## Add DESIRED_SITES to the Job.<job_id>.submit content.
    new_submit_text = '+DESIRED_SITES="%s"\n%s' % (",".join(available), new_submit_text)
    new_submit_text = '+DESIRED_CMSDataLocations="%s"\n%s' % (",".join(datasites), new_submit_text)
    return new_submit_text
    if resubmit:
        if hasattr(htcondor, 'lock'):
            # While dagman is not running at this point, the schedd may be writing events to this
            # file; hence, we only edit the file while holding an appropriate lock.
            # Note this lock method didn't exist until 8.1.6; prior to this, we simply
            # run dangerously.
            with htcondor.lock(open("RunJobs.dag.nodes.log", "a"), htcondor.LockType.WriteLock) as lock:
                adjustPost(resubmit)
        else:
            adjustPost(resubmit)
        resubmitDag("RunJobs.dag", resubmit)

    if 'CRAB_SiteAdUpdate' in ad:
        new_site_ad = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            site_ad = classad.parse(fd)
        site_ad.update(new_site_ad)
        with open("site.ad", "w") as fd:
            fd.write(str(site_ad))

    id = '%d.%d' % (ad['ClusterId'], ad['ProcId'])
    ad['foo'] = []
    try:
        htcondor.Schedd().edit([id], 'CRAB_ResubmitList', ad['foo'])
    except RuntimeError, reerror:
        print "ERROR: %s" % str(reerror)


if __name__ == '__main__':
    main()