def main():
    """
    Prepare the task directory before the DAG is (re)submitted.

    Steps, in order:
      1. Parse the job ClassAd from the file named by $_CONDOR_JOB_AD.
      2. Call makeWebDir() and then updateWebDir(), trying the latter up to
         three times until it returns 0.
      3. Clear the automatic blacklist.
      4. If the ad contains CRAB_ResubmitList, adjust the POST script exit
         status of those jobs (holding a write lock on
         RunJobs.dag.nodes.log when htcondor provides one) and call
         adjustMaxRetries() for the job ids that were adjusted.
      5. If the ad contains CRAB_SiteAdUpdate, merge it into site.ad.
    """
    # Use a context manager so the job ad file handle is closed right away
    # instead of lingering until garbage collection.
    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOld(fd)

    makeWebDir(ad)

    maxRetries = 3
    exitCode = 1
    for retries in range(maxRetries):
        exitCode = updateWebDir(ad)
        if exitCode == 0:
            break
        # Linear back-off (0s, then 20s) before the next attempt. Nothing
        # follows the last attempt, so do not sleep after it.
        if retries < maxRetries - 1:
            time.sleep(retries * 20)

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            # De-duplicate and normalise the job ids to strings.
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            # The value could not be turned into a set (e.g. it is not
            # iterable); True is used as the "all failed job ids" marker.
            resubmitJobIds = True

    if resubmitJobIds:
        if hasattr(htcondor, 'lock'):
            # While dagman is not running at this point, the schedd may be writing events to this
            # file; hence, we only edit the file while holding an appropriate lock.
            # Note this lock method didn't exist until 8.1.6; prior to this, we simply
            # run dangerously.
            # The log file is opened in its own context manager so the handle
            # is closed once the lock has been released.
            with open("RunJobs.dag.nodes.log", 'a') as logFd:
                with htcondor.lock(logFd, htcondor.LockType.WriteLock):
                    adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        else:
            adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        # Merge the update carried by the job ad into the on-disk site ad.
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))
try: htcondor.Schedd().edit([id], 'CRAB_ResubmitList', ad['foo']) except RuntimeError, reerror: print "ERROR: %s" % str(reerror) # To do this right, we ought to look up how many existing retries were done # and adjust the retry account according to that. if resubmit != True: resubmit = [str(i) for i in resubmit] if resubmit: if hasattr(htcondor, 'lock'): # While dagman is not running at this point, the schedd may be writing events to this # file; hence, we only edit the file while holding an appropriate lock. # Note this lock method didn't exist until 8.1.6; prior to this, we simply # run dangerously. with htcondor.lock(open("RunJobs.dag.nodes.log", "a"), htcondor.LockType.WriteLock) as lock: adjustPost(resubmit) else: adjustPost(resubmit) resubmitDag("RunJobs.dag", resubmit) if 'CRAB_SiteAdUpdate' in ad: new_site_ad = ad['CRAB_SiteAdUpdate'] with open("site.ad") as fd: site_ad = classad.parse(fd) site_ad.update(new_site_ad) with open("site.ad", "w") as fd: fd.write(str(site_ad)) id = '%d.%d' % (ad['ClusterId'], ad['ProcId']) ad['foo'] = [] try:
def main():
    """
    Prepare the task directory before the DAG is (re)submitted.

    Steps, in order:
      1. Set up logging and exit with status 0 if $_CONDOR_JOB_AD is unset
         or points to a file that does not exist.
      2. Parse the job ClassAd and build a CRABRest client from the
         CRAB_RestHost, CRAB_DbInstance and X509UserProxy attributes.
      3. Call checkTaskInfo().
      4. If no WEB_DIR exists yet, create the web directory, upload its URL
         (up to three attempts; exit with status 1 if all fail) and save the
         proxied webdir.
      5. Clear the automatic blacklist.
      6. If the ad contains CRAB_ResubmitList, adjust the POST script exit
         status of those jobs in every DAG nodes log and call
         adjustMaxRetries() for the job ids that were adjusted. For tasks
         with CRAB_SplitAlgo == 'Automatic' the tail DAGs are held before
         and released after these modifications.
      7. If the ad contains CRAB_SiteAdUpdate, merge it into site.ad.
    """
    setupLog()

    if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists(os.environ["_CONDOR_JOB_AD"]):
        printLog("Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist")
        sys.exit(0)

    printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" % os.environ['_CONDOR_JOB_AD'])

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOne(fd)
    printLog("Parsed ad: %s" % ad)

    # instantiate a server object to talk with crabserver
    host = ad['CRAB_RestHost']
    dbInstance = ad['CRAB_DbInstance']
    cert = ad['X509UserProxy']
    crabserver = CRABRest(host, cert, cert, retry=3, userAgent='CRABSchedd')
    crabserver.setDbInstance(dbInstance)

    checkTaskInfo(crabserver, ad)

    # is this the first time this script runs for this task ? (it runs at each resubmit as well !)
    if not os.path.exists('WEB_DIR'):
        makeWebDir(ad)
        printLog("Webdir has been set up. Uploading the webdir URL to the REST")

        maxRetries = 3
        exitCode = 1
        for retries in range(maxRetries):
            exitCode = uploadWebDir(crabserver, ad)
            if exitCode == 0:
                break
            # Linear back-off (0s, then 20s) before the next attempt. After
            # the last attempt we exit right below, so do not sleep.
            if retries < maxRetries - 1:
                time.sleep(retries * 20)
        if exitCode != 0:
            printLog("Exiting AdjustSites because the webdir upload failed %d times." % maxRetries)
            sys.exit(1)
        printLog("Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode)

        saveProxiedWebdir(crabserver, ad)
        printLog("Proxied webdir saved")

    printLog("Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions")

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            # De-duplicate and normalise the job ids to strings.
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            # The value could not be turned into a set (e.g. it is not
            # iterable); True is used as the "all failed job ids" marker.
            resubmitJobIds = True

    # Hold and release processing and tail DAGs here so that modifications
    # to the submission and log files will be picked up.
    schedd = htcondor.Schedd()
    tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote(ad.get("CRAB_ReqName"))
    # Evaluated once: the same condition guards both the hold and the release.
    holdTailDags = bool(resubmitJobIds) and ad.get('CRAB_SplitAlgo') == 'Automatic'
    # NOTE(review): if anything between the hold and the release raises, the
    # DAGs stay held -- confirm whether that is the intended failure mode.
    if holdTailDags:
        printLog("Holding processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGKILL')
        schedd.act(htcondor.JobAction.Hold, tailconst)

    if resubmitJobIds:
        adjustedJobIds = []
        filenames = getGlob(ad, "RunJobs.dag.nodes.log", "RunJobs[1-9]*.subdag.nodes.log")
        for fn in filenames:
            if hasattr(htcondor, 'lock'):
                # While dagman is not running at this point, the schedd may be writing events to this
                # file; hence, we only edit the file while holding an appropriate lock.
                # Note this lock method didn't exist until 8.1.6; prior to this, we simply
                # run dangerously.
                # The log file is opened in its own context manager so one
                # handle is not leaked per log file.
                with open(fn, 'a') as logFd:
                    with htcondor.lock(logFd, htcondor.LockType.WriteLock):
                        adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn))
            else:
                adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn))
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        # Merge the update carried by the job ad into the on-disk site ad.
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parseOne(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    if holdTailDags:
        printLog("Releasing processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1')
        schedd.act(htcondor.JobAction.Release, tailconst)

    printLog("Exiting AdjustSite")
def main():
    """
    Prepare the task directory before the DAG is (re)submitted.

    Steps, in order:
      1. Parse the job ClassAd from the file named by $_CONDOR_JOB_AD.
      2. Call makeWebDir() and then updateWebDir(), trying the latter up to
         three times until it returns 0, then save the proxied webdir.
      3. Clear the automatic blacklist.
      4. If the ad contains CRAB_ResubmitList, adjust the POST script exit
         status of those jobs (holding a write lock on
         RunJobs.dag.nodes.log when htcondor provides one) and call
         adjustMaxRetries() for the job ids that were adjusted.
      5. If the ad contains CRAB_SiteAdUpdate, merge it into site.ad.
    """
    printLog("Starting AdjustSites")

    with open(os.environ["_CONDOR_JOB_AD"]) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)
    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

    maxRetries = 3
    exitCode = 1
    for retries in range(maxRetries):
        exitCode = updateWebDir(ad)
        if exitCode == 0:
            break
        # Linear back-off (0s, then 20s) before the next attempt. Nothing
        # follows the last attempt, so do not sleep after it.
        if retries < maxRetries - 1:
            time.sleep(retries * 20)
    # Note: this message is emitted whether or not the upload succeeded;
    # the exit code it carries tells the two cases apart.
    printLog("Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode)

    saveProxiedWebdir(ad)
    printLog("Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions")

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if "CRAB_ResubmitList" in ad:
        resubmitJobIds = ad["CRAB_ResubmitList"]
        try:
            # De-duplicate and normalise the job ids to strings.
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            # The value could not be turned into a set (e.g. it is not
            # iterable); True is used as the "all failed job ids" marker.
            resubmitJobIds = True

    if resubmitJobIds:
        if hasattr(htcondor, "lock"):
            # While dagman is not running at this point, the schedd may be writing events to this
            # file; hence, we only edit the file while holding an appropriate lock.
            # Note this lock method didn't exist until 8.1.6; prior to this, we simply
            # run dangerously.
            # The log file is opened in its own context manager so the handle
            # is closed once the lock has been released.
            with open("RunJobs.dag.nodes.log", "a") as logFd:
                with htcondor.lock(logFd, htcondor.LockType.WriteLock):
                    adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        else:
            adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if "CRAB_SiteAdUpdate" in ad:
        # Merge the update carried by the job ad into the on-disk site ad.
        newSiteAd = ad["CRAB_SiteAdUpdate"]
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    printLog("Exiting AdjustSite")
def main():
    """
    Prepare the task directory before the DAG is (re)submitted.

    Steps, in order:
      1. Set up logging and exit with status 0 if $_CONDOR_JOB_AD is unset
         or points to a file that does not exist.
      2. Parse the job ClassAd.
      3. Create the web directory, upload its URL (up to three attempts;
         exit with status 1 if all fail) and save the proxied webdir.
      4. Clear the automatic blacklist.
      5. If the ad contains CRAB_ResubmitList, adjust the POST script exit
         status of those jobs in every DAG nodes log and call
         adjustMaxRetries() for the job ids that were adjusted. For tasks
         with CRAB_SplitAlgo == 'Automatic' the tail DAGs are held before
         and released after these modifications.
      6. If the ad contains CRAB_SiteAdUpdate, merge it into site.ad.
    """
    setupLog()

    if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists(os.environ["_CONDOR_JOB_AD"]):
        printLog("Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist")
        sys.exit(0)

    printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" % os.environ['_CONDOR_JOB_AD'])

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)
    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

    maxRetries = 3
    exitCode = 1
    for retries in range(maxRetries):
        exitCode = updateWebDir(ad)
        if exitCode == 0:
            break
        # Linear back-off (0s, then 20s) before the next attempt. After the
        # last attempt we exit right below, so do not sleep.
        if retries < maxRetries - 1:
            time.sleep(retries * 20)
    if exitCode != 0:
        printLog("Exiting AdjustSites because the webdir upload failed three times.")
        sys.exit(1)
    printLog("Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode)

    saveProxiedWebdir(ad)
    printLog("Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions")

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            # De-duplicate and normalise the job ids to strings.
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            # The value could not be turned into a set (e.g. it is not
            # iterable); True is used as the "all failed job ids" marker.
            resubmitJobIds = True

    # Hold and release processing and tail DAGs here so that modifications
    # to the submission and log files will be picked up.
    schedd = htcondor.Schedd()
    tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote(ad.get("CRAB_ReqName"))
    # Evaluated once: the same condition guards both the hold and the release.
    holdTailDags = bool(resubmitJobIds) and ad.get('CRAB_SplitAlgo') == 'Automatic'
    # NOTE(review): if anything between the hold and the release raises, the
    # DAGs stay held -- confirm whether that is the intended failure mode.
    if holdTailDags:
        printLog("Holding processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGKILL')
        schedd.act(htcondor.JobAction.Hold, tailconst)

    if resubmitJobIds:
        adjustedJobIds = []
        filenames = getGlob(ad, "RunJobs.dag.nodes.log", "RunJobs[1-9]*.subdag.nodes.log")
        for fn in filenames:
            if hasattr(htcondor, 'lock'):
                # While dagman is not running at this point, the schedd may be writing events to this
                # file; hence, we only edit the file while holding an appropriate lock.
                # Note this lock method didn't exist until 8.1.6; prior to this, we simply
                # run dangerously.
                # The log file is opened in its own context manager so one
                # handle is not leaked per log file.
                with open(fn, 'a') as logFd:
                    with htcondor.lock(logFd, htcondor.LockType.WriteLock):
                        adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn))
            else:
                adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn))
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        # Merge the update carried by the job ad into the on-disk site ad.
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    if holdTailDags:
        printLog("Releasing processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1')
        schedd.act(htcondor.JobAction.Release, tailconst)

    printLog("Exiting AdjustSite")
def main():
    """
    Prepare the task directory before the DAG is (re)submitted.

    Steps, in order:
      1. Parse the job ClassAd from the file named by $_CONDOR_JOB_AD.
      2. Call makeWebDir() and then updateWebDir(), trying the latter up to
         three times until it returns 0, then save the proxied webdir.
      3. Clear the automatic blacklist.
      4. If the ad contains CRAB_ResubmitList, adjust the POST script exit
         status of those jobs (holding a write lock on
         RunJobs.dag.nodes.log when htcondor provides one) and call
         adjustMaxRetries() for the job ids that were adjusted.
      5. If the ad contains CRAB_SiteAdUpdate, merge it into site.ad.
    """
    printLog("Starting AdjustSites")

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)
    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

    maxRetries = 3
    exitCode = 1
    for retries in range(maxRetries):
        exitCode = updateWebDir(ad)
        if exitCode == 0:
            break
        # Linear back-off (0s, then 20s) before the next attempt. Nothing
        # follows the last attempt, so do not sleep after it.
        if retries < maxRetries - 1:
            time.sleep(retries * 20)
    # Note: this message is emitted whether or not the upload succeeded;
    # the exit code it carries tells the two cases apart.
    printLog("Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode)

    saveProxiedWebdir(ad)
    printLog("Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions")

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            # De-duplicate and normalise the job ids to strings.
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            # The value could not be turned into a set (e.g. it is not
            # iterable); True is used as the "all failed job ids" marker.
            resubmitJobIds = True

    if resubmitJobIds:
        if hasattr(htcondor, 'lock'):
            # While dagman is not running at this point, the schedd may be writing events to this
            # file; hence, we only edit the file while holding an appropriate lock.
            # Note this lock method didn't exist until 8.1.6; prior to this, we simply
            # run dangerously.
            # The log file is opened in its own context manager so the handle
            # is closed once the lock has been released.
            with open("RunJobs.dag.nodes.log", 'a') as logFd:
                with htcondor.lock(logFd, htcondor.LockType.WriteLock):
                    adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        else:
            adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        # Merge the update carried by the job ad into the on-disk site ad.
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    printLog("Exiting AdjustSite")