def __init__(self, argv):
    try:
        self.classad = classad.parseOne(open(os.environ['_CONDOR_JOB_AD']))
    except Exception as e:
        log.critical("Unable to open classad from environment variable _CONDOR_JOB_AD: {0}".format(e))
        sys.exit(1)
    try:
        self.machine_ad = classad.parseOne(open(os.environ['_CONDOR_MACHINE_AD']))
    except Exception as e:
        log.critical("Unable to open machine ad from environment variable _CONDOR_MACHINE_AD: {0}".format(e))
        sys.exit(1)
    self.cmd_orig = argv[1:]
    self.cmd = ' '.join(self.cmd_orig)
    self.app = self.classad['HMDCApplicationName']
    self.use_xpra = self.classad['HMDCUseXpra']
    self.localjobdir = self.classad['LocalJobDir'].eval()
    self.app_log = "{0}/{1}.out.txt".format(self.localjobdir, self.app)
    self.__BASENAME__ = os.path.basename(__file__)
    # The machine ad reports Memory in MiB; convert to bytes.
    self.memory_bytes = int(self.machine_ad['Memory']) * 1024 * 1024
def run(self):
    '''Main running function for a process watching a particular condor job.
    Creates its own logfile, watches for changes and then exits.'''
    observer = Observer()
    observer.schedule(self, self.__watchdir, recursive=True)
    files = 0
    file_space = 0
    job_ad = classad.parseOne(open(self.__watchdir + "/.job.ad", "r"))
    jobdate = datetime.datetime.fromtimestamp(
        int(job_ad['JobStartDate'])).strftime('%Y-%m-%d %H:%M:%S')
    try:
        logname = ''.join([LOG_DIR, job_ad['Owner'], ".", job_ad['UidDomain'], ".",
                           str(job_ad['QDate']), ".", str(job_ad['ClusterId']), ".",
                           str(job_ad['ProcId']), ".log"])
        logfile = open(logname, "wb")
    except IOError:
        sys.stderr.write("Problem creating logfile {0}".format(logname))
        return
    logwriter = csv.writer(logfile)
    logwriter.writerow([job_ad['User'], jobdate])
    observer.start()
    while not self.__exit.is_set():
        time.sleep(1)
        for item in self.stat_monitors.copy():
            try:
                file_space += os.path.getsize(item)
            except OSError:
                pass  # File has been deleted during our loop
            files += 1
        logwriter.writerow([int(time.time()), files, file_space])
        files = 0
        file_space = 0
    logfile.close()
def main(job_classad=classad.parseOne(sys.stdin.read())):
    jobid = lambda ad: "{0}.{1}".format(str(ad['ClusterId']), str(ad['ProcId']))
    is_interactive = lambda ad: ad['HMDCInteractive']
    try:
        if not is_interactive(job_classad):
            return 0
    except:
        return 0
    # If the job isn't currently running (JobStatus == 2), we don't care.
    if job_classad['JobStatus'] != 2:
        log.info('Job is no longer running, exiting.')
        return 0
    log.info('Job {0}: Running.'.format(jobid(job_classad)))
    try:
        is_job_idle = HMDCCondor()._collector.query(
            htcondor.AdTypes.Any,
            'JobId =?= "{0}"'.format(jobid(job_classad)),
            ['JobCpuIsIdle'])[0]['JobCpuIsIdle'].eval()
        log.info('Job {0}: Idle? {1}'.format(jobid(job_classad), is_job_idle))
    except:
        log.info('Job {0}: Unable to evaluate JobCpuIsIdle'.format(jobid(job_classad)))
        return 0
    return check_if_preempt(job_classad,
                            update_job(jobid(job_classad), is_job_idle, job_classad))
def main(args):
    os.environ["HOME"] = HOME_DIR
    os.environ["USER"] = PARROT_USER
    # Parrot variables
    os.environ["PARROT_ALLOW_SWITCHING_CVMFS_REPOSITORIES"] = "yes"
    os.environ["LD_LIBRARY_PATH"] = (os.getenv("LD_LIBRARY_PATH", "") +
                                     ":/usr/local/cctools/lib")
    os.environ["PATH"] = (os.getenv("PATH", "") + ":/usr/local/sbin:" +
                          "/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:" +
                          "/usr/local/cctools/bin")
    job_ad = classad.parseOne(open(".job.ad", "r"))
    pkg_dir = PKG_HOST_DIR + job_ad['parrotRun']
    parrot_str = ("parrot_package_run -p " + pkg_dir + " " +
                  os.getcwd() + "/condor_exec.exe " + " ".join(args))
    if os.path.isdir(pkg_dir):
        retval = call(parrot_str, shell=True)
    else:
        sys.stderr.write("Invalid package specified\n")
        sys.exit(1)
    if retval != 0:
        sys.stderr.write("An error occurred with parrot_package_run\n")
    else:
        os.remove(MOUNTLIST_NAME)  # Don't want condor to transfer the mountlist on exit
def classad_to_dict(text):
    ret = {}
    c = classad.parseOne(text)
    for k in c.keys():
        try:
            ret[k.lower()] = c.eval(k)
        except TypeError:
            ret[k.lower()] = c[k]
    return ret
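
# Hedged usage sketch for classad_to_dict above; the ad text here is made up.
# Evaluated values come back as plain Python types, keyed by lowercased
# attribute names.
example_ad_text = 'Foo = 1\nBar = "baz"\nProd = 2 * 3'
print(classad_to_dict(example_ad_text))  # -> {'foo': 1, 'bar': 'baz', 'prod': 6}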
def classad_parse(inputstr):
    """Parse a string into a classad.

    Uses classad.parseOne if available (HTCondor 8.3+), and classad.parse
    otherwise (HTCondor 8.2; deprecated in 8.3).
    """
    if hasattr(classad, 'parseOne'):
        return classad.parseOne(inputstr)
    else:
        return classad.parse(inputstr)
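
# Hedged usage sketch for the compatibility wrapper above: on either binding
# generation the call returns a single ClassAd with the same contents.
compat_ad = classad_parse("foo = 1\nbar = 2")
assert compat_ad["foo"] == 1 and compat_ad["bar"] == 2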
def test_parse_one(self):
    ad = classad.parseOne("foo = 1\nbar = 2")
    self.one_ad_verify(ad)
    ad = classad.parseOne("[foo = 1; bar = 2]")
    self.one_ad_verify(ad)
    ad = classad.parseOne("foo = 1", classad.Parser.New)
    self.assertEqual(len(ad), 0)
    self.one_ad_verify(classad.parseOne("foo = 1\nbar = 2\n"))
    self.one_ad_verify(classad.parseOne("foo = 1\nbar = 1\n\nbar = 2\n"))
    ad = classad.parseOne("[foo = 1]", classad.Parser.Old)
    self.assertEqual(len(ad), 0)
    self.one_ad_verify(classad.parseOne("[foo = 1; bar = 1;] [bar = 2]"))
    self.one_ad_verify(classad.parseOne("-------\nfoo = 1\nbar = 2\n\n"))
def main():
    try:
        ad = is_xpra_job(classad.parseOne(sys.stdin.read()))
    except:
        return 0
    if ad is False:
        # No ad is available in this branch, so log without the ClusterId.
        log.info("Job has HMDCUseXpra == False. No clean-up required.")
        return 0
    try:
        return remove_dir(ad['LocalJobDir'].eval(), ad['ClusterId'])
    except Exception as e:
        log.critical("Encountered exception while removing LocalJobDir: {0}".format(e))
        return 0
def _wait_for_ready(self, timeout=120):
    daemons = self._daemons()
    master_log_path = self._master_log

    logger.debug("Starting up daemons for {}, waiting for: {}".format(
        self, " ".join(sorted(daemons))))

    start = time.time()
    while time.time() - start < timeout:
        time_to_give_up = int(timeout - (time.time() - start))

        # If the master log does not exist yet, we can't use condor_who.
        if not master_log_path.exists():
            logger.debug(
                "MASTER_LOG at {} does not yet exist for {}, retrying in 1 second (giving up in {} seconds)."
                .format(master_log_path, self, time_to_give_up))
            time.sleep(1)
            continue

        who = self.run_command(
            shlex.split(
                "condor_who -wait:10 'IsReady && STARTD_State =?= \"Ready\"'"),
        )
        if who.stdout.strip() == "":
            logger.debug(
                "condor_who stdout was unexpectedly blank for {}, retrying in 1 second (giving up in {} seconds). condor_who stderr:\n{}"
                .format(self, time_to_give_up, who.stderr))
            time.sleep(1)
            continue

        who_ad = classad.parseOne(who.stdout)

        if (who_ad.get("IsReady")
                and who_ad.get("STARTD_State") == "Ready"
                and all(who_ad.get(d) == "Alive" for d in daemons)):
            self.state = PersonalPoolState.READY
            return self

        logger.debug(
            "{} is waiting for daemons to be ready (giving up in {} seconds)"
            .format(self, time_to_give_up))

    raise TimeoutError("Standup for {} failed".format(self))
def testScheddSubmitMany(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    ad = classad.parseOne(open("tests/submit.ad"))
    ads = []
    cluster = schedd.submit(ad, 10, False, ads)
    #print ads[0]
    for i in range(60):
        ads = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"])
        ads = list(ads)
        #print ads
        if len(ads) == 0:
            break
        if i % 2 == 0:
            schedd.reschedule()
        time.sleep(1)
    self.assertEqual(open(output_file).read(), "hello world\n")
def testScheddQueryPoll(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    ad = classad.parseOne(open("submit.ad"))
    ads = []
    cluster = schedd.submit(ad, 10, False, ads)
    for i in range(60):
        ads_iter = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"], name="query1")
        ads_iter2 = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"], name="query2")
        ads = []
        for query in htcondor.poll([ads_iter, ads_iter2]):
            self.assertTrue(query.tag() in ["query1", "query2"])
            ads += query.nextAdsNonBlocking()
        if len(ads) == 0:
            break
        if i % 2 == 0:
            schedd.reschedule()
def who(self) -> classad.ClassAd:
    """
    Return the result of ``condor_who -quick``, as a :class:`classad.ClassAd`.
    If ``condor_who -quick`` fails, or the output can't be parsed into a
    sensible who ad, this method returns an empty ad.
    """
    who = self.run_command(["condor_who", "-quick"])
    try:
        parsed = classad.parseOne(who.stdout)

        # If there's no MASTER key in the parsed ad, it indicates that we
        # actually got the special post-shutdown message from condor_who
        # and should act like there's nothing there.
        if "MASTER" not in parsed:
            return classad.ClassAd()

        return parsed
    except Exception:
        return classad.ClassAd()
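
# Hedged usage sketch for who() above; `pool` stands in for an instance of the
# class that defines the method. An empty ad means condor_who reported nothing
# (daemons down or output unparsable), so callers can just test for MASTER.
master_ad = pool.who()
if "MASTER" in master_ad:
    print("condor_master is up:", master_ad["MASTER"])
else:
    print("pool does not appear to be running")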
def testScheddSubmitMany2(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    ad = classad.parseOne(open("submit.ad"))
    ads = []
    cluster = schedd.submitMany(ad, [({'foo': 1}, 5), ({'foo': 2}, 5)], False, ads)
    for i in range(60):
        ads = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus", 'ProcId', 'foo'])
        ads = list(ads)
        for ad in ads:
            if ad['ProcId'] < 5:
                self.assertEqual(ad['foo'], 1)
            else:
                self.assertEqual(ad['foo'], 2)
        if len(ads) == 0:
            break
        if i % 2 == 0:
            schedd.reschedule()
        time.sleep(1)
    self.assertEqual(open(output_file).read(), "hello world\n")
def testScheddQueryPoll(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    ad = classad.parseOne(open("tests/submit.ad"))
    ads = []
    cluster = schedd.submit(ad, 10, False, ads)
    for i in range(60):
        ads_iter = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"], name="query1")
        ads_iter2 = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"], name="query2")
        ads = []
        for query in htcondor.poll([ads_iter, ads_iter2]):
            self.assertTrue(query.tag() in ["query1", "query2"])
            ads += query.nextAdsNonBlocking()
        #print ads
        if len(ads) == 0:
            break
        if i % 2 == 0:
            schedd.reschedule()
def read_from_file(filename):
    """Read condor classads from file.

    A generator that yields condor job dicts.

    Args:
        filename (str): filename to read
    """
    with (gzip.open(filename) if filename.endswith('.gz') else open(filename)) as f:
        entry = ''
        for line in f.readlines():
            if line.startswith('***'):
                try:
                    c = classad.parseOne(entry)
                    yield classad_to_dict(c)
                    entry = ''
                except:
                    entry = ''
            else:
                entry += line + '\n'
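
# Hedged usage sketch for read_from_file above; "history_dump.gz" is a
# hypothetical file of classads separated by lines starting with "***".
# Keys are lowercased because the generator goes through classad_to_dict.
for job in read_from_file("history_dump.gz"):
    print(job.get("globaljobid"), job.get("jobstatus"))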
def testScheddSubmitMany2(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    ad = classad.parseOne(open("tests/submit.ad"))
    ads = []
    cluster = schedd.submitMany(ad, [({'foo': 1}, 5), ({'foo': 2}, 5)], False, ads)
    #print ads[0]
    for i in range(60):
        ads = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus", 'ProcId', 'foo'])
        ads = list(ads)
        #print ads
        for ad in ads:
            if ad['ProcId'] < 5:
                self.assertEquals(ad['foo'], 1)
            else:
                self.assertEquals(ad['foo'], 2)
        if len(ads) == 0:
            break
        if i % 2 == 0:
            schedd.reschedule()
        time.sleep(1)
    self.assertEquals(open(output_file).read(), "hello world\n")
def testScheddSubmitSpool(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    ad = classad.parseOne(open("submit.ad"))
    result_ads = []
    cluster = schedd.submit(ad, 1, True, result_ads)
    schedd.spool(result_ads)
    for i in range(60):
        ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus"])
        self.assertEqual(len(ads), 1)
        if ads[0]["JobStatus"] == 4:
            break
        if i % 5 == 0:
            schedd.reschedule()
        time.sleep(1)
    schedd.retrieve("ClusterId == %d" % cluster)
    schedd.act(htcondor.JobAction.Remove, ["%d.0" % cluster])
    ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus"])
    self.assertEqual(len(ads), 0)
    self.assertEqual(open(output_file).read(), "hello world\n")
# so we catch all exceptions, try to write to the outfile if we can,
# and always exit -1 on error.
#
# Exiting -1 without an outfile thus means one of two things:
#   1. Couldn't parse arguments.
#   2. Couldn't open outfile for writing.
try:
    args = parse_args()
except Exception:
    sys.exit(-1)

try:
    try:
        scratch_dir = Path.cwd()
        job_ad = classad.parseOne((scratch_dir / ".job.ad").read_text())
        out, err = scratch_dir / job_ad["Out"], scratch_dir / job_ad["Err"]
        with out.open(mode="a") as out_file, err.open(mode="a") as err_file:
            with contextlib.redirect_stdout(out_file), contextlib.redirect_stderr(err_file):
                print("\n------ TRANSFER PLUGIN OUTPUT ------\n")
                print("\n------ TRANSFER PLUGIN ERROR ------\n", file=sys.stderr)
                main(args)
    except FileNotFoundError:
        main(args)
except Exception as e:
    tb = traceback.format_exc().replace("\n", " ")
    write_dict_to_file_as_ad(
        {
from datetime import datetime

# Import classad
__BASENAME__ = os.path.basename(__file__)

# Quick and dirty debug function, please replace.
def debug(will_debug, _fd, message):
    if will_debug:
        dt = datetime.utcnow().strftime("%Y%m%d %s")
        _fd.write("[{0}] {1}\n".format(dt, message))
        return True
    else:
        return False

job_classad = classad.parseOne(sys.stdin.read())
home = pwd.getpwnam(pwd.getpwuid(os.getuid())[0]).pw_dir

# 'HMDCNewSubmit'
try:
    hmdc_new_submit = job_classad['HMDCNewSubmit']
    hmdc_interactive_job = job_classad['HMDCInteractive']
except:
    sys.exit(0)

if hmdc_new_submit == False or hmdc_interactive_job == False:
    sys.exit(0)

# Should we debug this hook?
try:
def testTransaction(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "test.out")
    log_file = os.path.join(testdir, "test.log")
    if os.path.exists(output_file):
        os.unlink(output_file)
    if os.path.exists(log_file):
        os.unlink(log_file)
    schedd = htcondor.Schedd()
    ad = classad.parseOne(open("submit_sleep.ad"))
    result_ads = []
    cluster = schedd.submit(ad, 1, True, result_ads)
    with schedd.transaction() as txn:
        schedd.edit(["%d.0" % cluster], 'foo', classad.Literal(1))
        schedd.edit(["%d.0" % cluster], 'bar', classad.Literal(2))
    ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus", 'foo', 'bar'])
    self.assertEqual(len(ads), 1)
    self.assertEqual(ads[0]['foo'], 1)
    self.assertEqual(ads[0]['bar'], 2)
    with schedd.transaction() as txn:
        schedd.edit(["%d.0" % cluster], 'baz', classad.Literal(3))
        with schedd.transaction(htcondor.TransactionFlags.NonDurable | htcondor.TransactionFlags.ShouldLog, True) as txn:
            schedd.edit(["%d.0" % cluster], 'foo', classad.Literal(4))
            schedd.edit(["%d.0" % cluster], 'bar', classad.Literal(5))
    ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus", 'foo', 'bar', 'baz'])
    self.assertEqual(len(ads), 1)
    self.assertEqual(ads[0]['foo'], 4)
    self.assertEqual(ads[0]['bar'], 5)
    self.assertEqual(ads[0]['baz'], 3)
    try:
        with schedd.transaction() as txn:
            schedd.edit(["%d.0" % cluster], 'foo', classad.Literal(6))
            schedd.edit(["%d.0" % cluster], 'bar', classad.Literal(7))
            raise Exception("force abort")
    except:
        exctype, e = sys.exc_info()[:2]
        if not issubclass(exctype, Exception):
            raise
        self.assertEqual(str(e), "force abort")
    ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus", 'foo', 'bar'])
    self.assertEqual(len(ads), 1)
    self.assertEqual(ads[0]['foo'], 4)
    self.assertEqual(ads[0]['bar'], 5)
    try:
        with schedd.transaction() as txn:
            schedd.edit(["%d.0" % cluster], 'baz', classad.Literal(8))
            with schedd.transaction(htcondor.TransactionFlags.NonDurable | htcondor.TransactionFlags.ShouldLog, True) as txn:
                schedd.edit(["%d.0" % cluster], 'foo', classad.Literal(9))
                schedd.edit(["%d.0" % cluster], 'bar', classad.Literal(10))
                raise Exception("force abort")
    except:
        exctype, e = sys.exc_info()[:2]
        if not issubclass(exctype, Exception):
            raise
        self.assertEqual(str(e), "force abort")
    ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus", 'foo', 'bar', 'baz'])
    self.assertEqual(len(ads), 1)
    self.assertEqual(ads[0]['foo'], 4)
    self.assertEqual(ads[0]['bar'], 5)
    self.assertEqual(ads[0]['baz'], 3)
    # A removed job may persist in the queue for a short time, but its
    # JobStatus will be 3 (REMOVED).
    schedd.act(htcondor.JobAction.Remove, ["%d.0" % cluster])
    ads = schedd.query("ClusterId == %d && JobStatus != 3" % cluster, ["JobStatus"])
    self.assertEqual(len(ads), 0)
def __init__(self, result):
    self.status = result[0]
    self.ad = classad.parseOne(result[1])
def run_next_task():
    parsed_ad = classad.parseOne(ad)
    self.progress_bar_window.start_task(
        "Attaching to job {0}".format(parsed_ad['HMDCApplicationName']))
    self.dispatcher = RCEGraphicalTaskDispatcher('attach_app', self.rceapps, self.jobid, ad)
    self.dispatcher.start()
def test_parse_one_ad_from_file_like_object(ad_file):
    ad = classad.parseOne(ad_file.open(mode="r"))
    assert ad["foo"] == "wiz"
"_id": id }}) + "\n" body += ad + "\n" print es.bulk(body=body)['took'] for fname in sys.argv[1:]: print "Processing file", fname fp = os.popen("condor_history -file %s -l" % fname) count = 0 ad = '' ads = [] for line in fp.xreadlines(): if line == "\n": job_ad = classad.parseOne(ad) if not job_ad: continue #print job_ad json_ad = convert_to_json(job_ad) #print es.index(index=idx, doc_type="job", body=json_ad, id=job_ad["GlobalJobId"]) ads.append((job_ad["GlobalJobId"], json_ad)) count += 1 ad = '' if len(ads) == 100: post_ads(ads) ads = [] ad += line if ad: job_ad = classad.parseOne(line) if 'GlobalJobId' in job_ad:
def test_load_classad_from_file_v2(self):
    ad = classad.parseOne(open("tests/test.ad"))
    self.assertEqual(ad["foo"], "bar")
    self.assertEqual(ad["baz"], classad.Value.Undefined)
    self.assertRaises(KeyError, ad.__getitem__, "bar")
def redo_sites(self, new_submit_text, crab_retry, use_resubmit_info):
    """
    Re-define the set of sites where the job can run on by taking into account
    any site-white-list and site-black-list.
    """
    ## If there is an automatic site blacklist, add it to the Job.<job_id>.submit
    ## content.
    automatic_siteblacklist = self.calculate_blacklist()
    if automatic_siteblacklist:
        self.task_ad['CRAB_SiteAutomaticBlacklist'] = automatic_siteblacklist
        new_submit_text += '+CRAB_SiteAutomaticBlacklist = %s\n' % str(
            self.task_ad.lookup('CRAB_SiteAutomaticBlacklist'))
    ## Get the site black- and whitelists either from the task ad or from
    ## self.resubmit_info.
    siteblacklist = set()
    sitewhitelist = set()
    if not use_resubmit_info:
        if 'CRAB_SiteBlacklist' in self.task_ad:
            siteblacklist = set(self.task_ad['CRAB_SiteBlacklist'])
        if 'CRAB_SiteWhitelist' in self.task_ad:
            sitewhitelist = set(self.task_ad['CRAB_SiteWhitelist'])
    else:
        inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
        while inkey not in self.resubmit_info and int(inkey) > 0:
            inkey = str(int(inkey) - 1)
        siteblacklist = set(self.resubmit_info[inkey].get('site_blacklist', []))
        sitewhitelist = set(self.resubmit_info[inkey].get('site_whitelist', []))
    ## Save the current site black- and whitelists in self.resubmit_info for the
    ## current job retry number.
    outkey = str(crab_retry)
    if outkey not in self.resubmit_info:
        self.resubmit_info[outkey] = {}
    self.resubmit_info[outkey]['site_blacklist'] = list(siteblacklist)
    self.resubmit_info[outkey]['site_whitelist'] = list(sitewhitelist)
    ## Add the current site black- and whitelists to the Job.<job_id>.submit
    ## content.
    if siteblacklist:
        new_submit_text += '+CRAB_SiteBlacklist = {"%s"}\n' % ('", "'.join(siteblacklist))
    else:
        new_submit_text += '+CRAB_SiteBlacklist = {}\n'
    if sitewhitelist:
        new_submit_text += '+CRAB_SiteWhitelist = {"%s"}\n' % ('", "'.join(sitewhitelist))
    else:
        new_submit_text += '+CRAB_SiteWhitelist = {}\n'
    ## Get the list of available sites (the sites where this job could run).
    if os.path.exists("site.ad.json"):
        with open("site.ad.json") as fd:
            site_info = json.load(fd)
        group = site_info[self.job_id]
        available = set(site_info['group_sites'][str(group)])
        datasites = set(site_info['group_datasites'][str(group)])
    else:
        with open("site.ad") as fd:
            site_ad = classad.parseOne(fd)
        available = set(site_ad['Job%s' % (self.job_id)])
    ## Take the intersection between the available sites and the site whitelist.
    ## This is the new set of available sites.
    if sitewhitelist:
        available &= sitewhitelist
    ## Remove from the available sites the ones that are in the site blacklist,
    ## unless they are also in the site whitelist (i.e. never blacklist something
    ## on the whitelist).
    siteblacklist.update(automatic_siteblacklist)
    available -= (siteblacklist - sitewhitelist)
    if not available:
        self.logger.error("Can not submit since DESIRED_Sites list is empty")
        self.prejob_exit_code = 1
        sys.exit(self.prejob_exit_code)
    ## Add DESIRED_SITES to the Job.<job_id>.submit content.
    new_submit_text = '+DESIRED_SITES="%s"\n%s' % (",".join(available), new_submit_text)
    new_submit_text = '+DESIRED_CMSDataLocations="%s"\n%s' % (",".join(datasites), new_submit_text)
    return new_submit_text
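
# Hedged illustration of the site-list set algebra used in redo_sites above,
# with made-up site names: the whitelist narrows the available sites, and the
# blacklist removes sites unless they are also whitelisted.
available = {"T2_US_A", "T2_US_B", "T2_DE_C"}
sitewhitelist = {"T2_US_A", "T2_DE_C"}
siteblacklist = {"T2_DE_C", "T2_US_B"}
available &= sitewhitelist                     # {"T2_US_A", "T2_DE_C"}
available -= (siteblacklist - sitewhitelist)   # never blacklist a whitelisted site
print(sorted(available))                       # ['T2_DE_C', 'T2_US_A']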
def test_parse_one_ad_from_string(ad_string):
    ad = classad.parseOne(ad_string)
    assert ad["foo"] == "wiz"
body = ''
for id, ad in ads:
    body += json.dumps({"index": {"_index": idx, "_type": "job", "_id": id}}) + "\n"
    body += ad + "\n"
print es.bulk(body=body)['took']

for fname in sys.argv[1:]:
    print "Processing file", fname
    fp = os.popen("condor_history -file %s -l" % fname)
    count = 0
    ad = ''
    ads = []
    for line in fp.xreadlines():
        if line == "\n":
            job_ad = classad.parseOne(ad)
            if not job_ad:
                continue
            #print job_ad
            json_ad = convert_to_json(job_ad)
            #print es.index(index=idx, doc_type="job", body=json_ad, id=job_ad["GlobalJobId"])
            ads.append((job_ad["GlobalJobId"], json_ad))
            count += 1
            ad = ''
            if len(ads) == 100:
                post_ads(ads)
                ads = []
        ad += line
    if ad:
        job_ad = classad.parseOne(line)
        if 'GlobalJobId' in job_ad:
def executeInternal(self, *args):
    """The executeInternal method returns 4 if the "completion" threshold is
    not reached, 0 otherwise."""
    self.stage = args[0]
    self.completion = int(args[1])
    self.prefix = args[2]

    self.setupLog()

    self.statusCacheInfo = {}  # Will be filled with the status from the status cache
    self.readJobStatus()
    completed = set(self.completedJobs(stage=self.stage))
    if len(completed) < self.completion:
        return 4

    self.readProcessedJobs()
    unprocessed = completed - self.processedJobs
    estimates = copy.copy(unprocessed)
    self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))
    if self.stage == 'tail' and len(estimates - set(self.failedJobs)) == 0:
        estimates = set(self.completedJobs(stage='processing', processFailed=False))
        self.logger.info("jobs remaining to process: %s", ", ".join(sorted(unprocessed)))

    # The TaskWorker saves some files that now we are gonna read
    with open('datadiscovery.pkl', 'rb') as fd:
        dataset = pickle.load(fd)  # Output from the discovery process
    with open('taskinformation.pkl', 'rb') as fd:
        task = pickle.load(fd)  # A dictionary containing information about the task as in the Oracle DB
    with open('taskworkerconfig.pkl', 'rb') as fd:
        config = pickle.load(fd)  # Task worker configuration

    # need to use user proxy as credential for talking with cmsweb
    config.TaskWorker.cmscert = os.environ.get('X509_USER_PROXY')
    config.TaskWorker.cmskey = os.environ.get('X509_USER_PROXY')
    config.TaskWorker.envForCMSWEB = newX509env(
        X509_USER_CERT=config.TaskWorker.cmscert,
        X509_USER_KEY=config.TaskWorker.cmskey)

    # need to get username from classAd to setup for Rucio access
    task_ad = classad.parseOne(open(os.environ['_CONDOR_JOB_AD']))
    username = task_ad['CRAB_UserHN']
    config.Services.Rucio_account = username

    # need the global black list
    config.TaskWorker.scratchDir = './scratchdir'
    if not os.path.exists(config.TaskWorker.scratchDir):
        os.makedirs(config.TaskWorker.scratchDir)
    from TaskWorker.Actions.Recurring.BanDestinationSites import CRAB3BanDestinationSites
    banSites = CRAB3BanDestinationSites(config, self.logger)
    with config.TaskWorker.envForCMSWEB:
        banSites.execute()

    # Read the automatic_splitting/throughputs/0-N files where the PJ
    # saved the EventThroughput
    # (report['steps']['cmsRun']['performance']['cpu']['EventThroughput'])
    # and the average size of the output per event
    sumEventsThr = 0
    sumEventsSize = 0
    count = 0
    for jid in estimates:
        if jid in self.failedJobs:
            continue
        fn = "automatic_splitting/throughputs/{0}".format(jid)
        with open(fn) as fd:
            throughput, eventsize = json.load(fd)
            sumEventsThr += throughput
            sumEventsSize += eventsize
            count += 1
    eventsThr = sumEventsThr / count
    eventsSize = sumEventsSize / count

    self.logger.info("average throughput for %s jobs: %s evt/s", count, eventsThr)
    self.logger.info("average eventsize for %s jobs: %s bytes", count, eventsSize)

    maxSize = getattr(config.TaskWorker, 'automaticOutputSizeMaximum', 5 * 1000**3)
    maxEvents = (maxSize / eventsSize) if eventsSize > 0 else 0

    runtime = task['tm_split_args'].get('minutes_per_job', -1)
    if self.stage == "processing":
        # Build in a 33% error margin in the runtime to not create too
        # many tails. This essentially moves the peak to lower
        # runtimes and cuts off less of the job distribution tail.
        target = int(0.75 * runtime)
    elif self.stage == 'tail':
        target = int(max(
            getattr(config.TaskWorker, 'automaticTailRuntimeMinimumMins', 45),
            getattr(config.TaskWorker, 'automaticTailRuntimeFraction', 0.2) * runtime))
    # `target` is in minutes, `eventsThr` is in events/second!
    events = int(target * eventsThr * 60)
    if events > maxEvents and maxEvents > 0:
        self.logger.info("reduced the target event count from %s to %s to obey output size",
                         events, maxEvents)
        events = int(maxEvents)
    splitTask = dict(task)
    splitTask['tm_split_algo'] = 'EventAwareLumiBased'
    splitTask['tm_split_args']['events_per_job'] = events

    if self.stage == 'tail' and not self.adjustLumisForCompletion(splitTask, unprocessed):
        self.logger.info("nothing to process for completion")
        self.saveProcessedJobs(unprocessed)
        return 0

    # Disable retries for processing: every lumi is attempted to be
    # processed once in processing, thrice in the tails -> four times.
    # That should be enough "retries"
    #
    # See note in DagmanCreator about getting this from the Task DB
    if self.stage == "processing":
        config.TaskWorker.numAutomJobRetries = 0

    try:
        splitter = Splitter(config, crabserver=None)
        split_result = splitter.execute(dataset, task=splitTask)
        self.logger.info("Splitting results:")
        for g in split_result.result[0]:
            msg = "Created jobgroup with length {0}".format(len(g.getJobs()))
            self.logger.info(msg)
    except TaskWorkerException as e:
        retmsg = "Splitting failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1

    try:
        parent = self.prefix if self.stage == 'tail' else None
        rucioClient = getNativeRucioClient(config=config, logger=self.logger)
        creator = DagmanCreator(config, crabserver=None, rucioClient=rucioClient)
        with config.TaskWorker.envForCMSWEB:
            creator.createSubdag(split_result.result, task=task, parent=parent, stage=self.stage)
        self.submitSubdag('RunJobs{0}.subdag'.format(self.prefix),
                          getattr(config.TaskWorker, 'maxIdle', MAX_IDLE_JOBS),
                          getattr(config.TaskWorker, 'maxPost', MAX_POST_JOBS),
                          self.stage)
    except TaskWorkerException as e:
        retmsg = "DAG creation failed with:\n{0}".format(e)
        self.logger.error(retmsg)
        # self.set_dashboard_state('FAILED')
        return 1

    self.saveProcessedJobs(unprocessed)
    return 0
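
# Hedged worked example of the target-event arithmetic in executeInternal
# above, with made-up numbers: target is in minutes, eventsThr in events/second.
eventsThr = 2.5                        # average events/second from the probe jobs
runtime = 480                          # minutes_per_job requested by the user
target = int(0.75 * runtime)           # processing stage scales the runtime by 0.75 -> 360
events = int(target * eventsThr * 60)  # 360 min * 60 s/min * 2.5 evt/s = 54000 events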
def main(args):
    transfers = [
        pickle.load(f.open("rb")) for f in Path(TRANSFER_PLUGIN_CACHE).iterdir()
    ]

    print(f"Found {len(transfers)} URL transfers to process.\n")
    if len(transfers) == 0:
        print("Nothing to do!")
        write_dict_to_file_as_ad(
            {
                "TransferSuccess": True,
                "TransferFileName": "",
                "TransferUrl": "",
            },
            args["outfile"],
        )
        return

    builtin_plugins = htcondor.param["FILETRANSFER_PLUGINS"].split(", ")
    available_methods = {
        plugin: classad.parseOne(
            subprocess.run([plugin, "-classad"],
                           stdout=subprocess.PIPE).stdout.decode("utf-8"))
        ["SupportedMethods"].split(",")
        for plugin in reversed(builtin_plugins)
    }
    print("Available plugins and methods (in search order):")
    for k, v in available_methods.items():
        print(f"{k} => {v}")
    print()

    deferred_transfers = []
    for output_file, destination in transfers:
        protocol = determine_protocol(destination)
        plugin = find_first_plugin(available_methods, protocol)
        print(f"Will transfer {output_file} to {destination} using protocol {protocol} implemented by plugin {plugin}")
        deferred_transfers.append(
            DeferredTransfer(output_file=output_file, destination=destination, plugin=plugin))

    # TODO: group transfers by plugin
    working = Path(USER_URL_TRANSFER_WORKING)
    working.mkdir(parents=True, exist_ok=True)

    for transfer in deferred_transfers:
        infile = working / f"{transfer.id}.in"
        outfile = working / f"{transfer.id}.out"
        infile.write_text(
            str(classad.ClassAd({
                "LocalFileName": str(transfer.output_file),
                "Url": transfer.destination,
            })))

        cmd = [
            transfer.plugin,
            "-infile", str(infile),
            "-outfile", str(outfile),
            "-upload",
        ]
        print(f"Invoking {' '.join(cmd)}")
        run_plugin = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

        if run_plugin.returncode != 0:
            print(f"Plugin {transfer.plugin} failed! Its return code was {run_plugin.returncode}")
            print("Captured stdout:")
            print(run_plugin.stdout.decode())
            print("Captured stderr:")
            print(run_plugin.stderr.decode())
            outfile.rename(Path(args["outfile"]))
            sys.exit(-1)

        print(f"Transferred {transfer.output_file} to {transfer.destination} successfully!")

    write_dict_to_file_as_ad(
        {
            "TransferSuccess": True,
            "TransferFileName": "",
            "TransferUrl": "",
        },
        args["outfile"],
    )
def main(): """ Need a doc string here. """ setupLog() if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists( os.environ["_CONDOR_JOB_AD"]): printLog( "Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist" ) sys.exit(0) printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" % os.environ['_CONDOR_JOB_AD']) with open(os.environ['_CONDOR_JOB_AD']) as fd: ad = classad.parseOne(fd) printLog("Parsed ad: %s" % ad) # instantiate a server object to talk with crabserver host = ad['CRAB_RestHost'] dbInstance = ad['CRAB_DbInstance'] cert = ad['X509UserProxy'] crabserver = CRABRest(host, cert, cert, retry=3, userAgent='CRABSchedd') crabserver.setDbInstance(dbInstance) checkTaskInfo(crabserver, ad) # is this the first time this script runs for this task ? (it runs at each resubmit as well !) if not os.path.exists('WEB_DIR'): makeWebDir(ad) printLog( "Webdir has been set up. Uploading the webdir URL to the REST") retries = 0 exitCode = 1 maxRetries = 3 while retries < maxRetries and exitCode != 0: exitCode = uploadWebDir(crabserver, ad) if exitCode != 0: time.sleep(retries * 20) retries += 1 if exitCode != 0: printLog( "Exiting AdjustSites because the webdir upload failed %d times." % maxRetries) sys.exit(1) printLog( "Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode) saveProxiedWebdir(crabserver, ad) printLog("Proxied webdir saved") printLog( "Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions" ) clearAutomaticBlacklist() resubmitJobIds = [] if 'CRAB_ResubmitList' in ad: resubmitJobIds = ad['CRAB_ResubmitList'] try: resubmitJobIds = set(resubmitJobIds) resubmitJobIds = [str(i) for i in resubmitJobIds] except TypeError: resubmitJobIds = True # Hold and release processing and tail DAGs here so that modifications # to the submission and log files will be picked up. schedd = htcondor.Schedd() tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote( ad.get("CRAB_ReqName")) if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic': printLog("Holding processing and tail DAGs") schedd.edit(tailconst, "HoldKillSig", 'SIGKILL') schedd.act(htcondor.JobAction.Hold, tailconst) if resubmitJobIds: adjustedJobIds = [] filenames = getGlob(ad, "RunJobs.dag.nodes.log", "RunJobs[1-9]*.subdag.nodes.log") for fn in filenames: if hasattr(htcondor, 'lock'): # While dagman is not running at this point, the schedd may be writing events to this # file; hence, we only edit the file while holding an appropriate lock. # Note this lock method didn't exist until 8.1.6; prior to this, we simply # run dangerously. with htcondor.lock(open(fn, 'a'), htcondor.LockType.WriteLock): adjustedJobIds.extend( adjustPostScriptExitStatus(resubmitJobIds, fn)) else: adjustedJobIds.extend( adjustPostScriptExitStatus(resubmitJobIds, fn)) ## Adjust the maximum allowed number of retries only for the job ids for which ## the POST script exit status was adjusted. Why only for these job ids and not ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as ## a general rule means "all failed job ids", we don't have a way to know if a ## job is in failed status or not just from the RunJobs.dag file, while job ids ## in adjustedJobIds correspond only to failed jobs. 
adjustMaxRetries(adjustedJobIds, ad) if 'CRAB_SiteAdUpdate' in ad: newSiteAd = ad['CRAB_SiteAdUpdate'] with open("site.ad") as fd: siteAd = classad.parseOne(fd) siteAd.update(newSiteAd) with open("site.ad", "w") as fd: fd.write(str(siteAd)) if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic': printLog("Releasing processing and tail DAGs") schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1') schedd.act(htcondor.JobAction.Release, tailconst) printLog("Exiting AdjustSite")