Example No. 1
    def __init__(
        self,
        job_queue,
        machine_ad: str = machine_ad_defaults,
        job_ad: str = job_ad_defaults,
        pre_job_rank: str = "0",
        interval: float = 60,
        autocluster: bool = False,
    ):
        """
        Initializes the CondorClassadJobScheduler

        :param job_queue: queue of jobs that are scheduled in the following simulation
        :param machine_ad: ClassAd that is used with every drone
        :param job_ad: ClassAd that is used with every job
        :param pre_job_rank: ClassAd attribute that all drones are sorted by
        :param interval: time between scheduling cycles
        :param autocluster: whether to use autoclusters (the flag is accepted but not yet used here)
        """
        self._stream_queue = job_queue
        self._drones: RankedClusters[Drone] = RankedNonClusters(
            quantization=quantization_defaults, ranking=parse(pre_job_rank)
        )
        self.interval = interval
        self.job_queue = JobQueue()
        self._collecting = True
        self._processing = Resources(jobs=0)

        # temporary solution
        self._wrapped_classads = WeakKeyDictionary()
        self._machine_classad = parse(machine_ad)
        self._job_classad = parse(job_ad)
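
The constructor above parses ClassAd templates for drones and jobs plus a ranking expression for drone ordering. A minimal instantiation sketch (hedged: `simulation_job_queue` is a hypothetical stand-in, and the default ClassAds come from `machine_ad_defaults`/`job_ad_defaults` in the surrounding module):

    # Hypothetical usage; the job queue object comes from the surrounding
    # simulation framework and is an assumption here.
    scheduler = CondorClassadJobScheduler(
        job_queue=simulation_job_queue,
        pre_job_rank="0",    # ClassAd expression used to order drones
        interval=30.0,       # run a scheduling cycle every 30 time units
        autocluster=False,
    )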
Example No. 2
 def testNetworkAccounting(self):
     jobqueue_log_dir = os.path.join(os.getcwd(), "tests_tmp/spool")
     filelist = [f for f in os.listdir(jobqueue_log_dir) if f.startswith("job_queue.log")]
     for f in filelist:
         print "Removing", f
         os.remove(os.path.join(jobqueue_log_dir, f))
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(testdir, "lark_test_2.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     schedd = htcondor.Schedd()
     ad = classad.parse(open("tests/lark_submit_2.ad"))
     ads = []
     cluster = schedd.submit(ad, 1, False, ads)
     for i in range(60):
         ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus"])
         job_ad = ads[0]
         if job_ad["JobStatus"] == 4:
             break
         if i % 2 == 0:
             schedd.reschedule()
         time.sleep(1)
     ads = schedd.query("ClusterId == %d" % cluster, [])
     print ads[0]
     self.assertTrue("NetworkIncoming" in ads[0].keys() and ads[0]["NetworkIncoming"] > 0)
Example No. 3
 def test_load_classad_from_file(self):
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         ad = classad.parse(open("tests/test.ad"))
     self.assertEqual(ad["foo"], "bar")
     self.assertEqual(ad["baz"], classad.Value.Undefined)
     self.assertRaises(KeyError, ad.__getitem__, "bar")
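
The fixture file itself is not shown; a plausible `tests/test.ad` in new-ClassAd syntax, inferred from the assertions (the exact contents are an assumption):

    [
        foo = "bar";
        baz = undefined;
    ]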
Example No. 4
    def redo_sites(self, new_submit_file, id, automatic_blacklist):

        if os.path.exists("site.ad.json"):
            with open("site.ad.json") as fd:
                site_info = json.load(fd)
            group = site_info[str(id)]
            available = set(site_info['groups'][str(group)])
        else:
            with open("site.ad") as fd:
                site_ad = classad.parse(fd)
            available = set(site_ad['Job%d' % id])

        blacklist = set(self.task_ad['CRAB_SiteBlacklist'])
        blacklist.update(automatic_blacklist)
        whitelist = set(self.task_ad['CRAB_SiteWhitelist'])
        if 'CRAB_SiteResubmitWhitelist' in self.task_ad:
            whitelist.update(self.task_ad['CRAB_SiteResubmitWhitelist'])
        if 'CRAB_SiteResubmitBlacklist' in self.task_ad:
            blacklist.update(self.task_ad['CRAB_SiteResubmitBlacklist'])

        if whitelist:
            available &= whitelist
        # Never blacklist something on the whitelist
        available -= (blacklist - whitelist)

        new_submit_file = '+DESIRED_SITES="%s"\n%s' % (",".join(available),
                                                       new_submit_file)
        return new_submit_file
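
The two set operations above implement a "whitelist wins" policy. A small worked example with invented site names:

    available = {"T2_US_A", "T2_US_B", "T2_US_C"}
    whitelist = {"T2_US_A", "T2_US_B"}
    blacklist = {"T2_US_B", "T2_US_C"}

    available &= whitelist                # {"T2_US_A", "T2_US_B"}
    available -= (blacklist - whitelist)  # only "T2_US_C" is subtracted
    # "T2_US_B" survives despite being blacklisted, because it is whitelisted.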
Example No. 5
 def testNetworkPolicyNAT(self):
     jobqueue_log_dir = os.path.join(os.getcwd(), "tests_tmp/spool")
     filelist = [f for f in os.listdir(jobqueue_log_dir) if f.startswith("job_queue.log")]
     for f in filelist:
         os.remove(os.path.join(jobqueue_log_dir, f))
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(testdir, "lark_test_1.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     schedd = htcondor.Schedd()
     coll = htcondor.Collector()
     ad = classad.parse(open("tests/lark_submit_1.ad"))
     ads = []
     cluster = schedd.submit(ad, 1, False, ads)
     #print ads[0]
     for i in range(60):
         #ads = coll.query(htcondor.AdTypes.Startd, "true", ["LarkNetworkType"])
         #ads = coll.query("true", ["LarkNetworkType"])
         ads = schedd.query("ClusterId == %d" % cluster, ["LarkNetworkType"])
         print ads
         if len(ads) != 0:
             if "LarkNetworkType" in ads[0].keys():
                 break
         time.sleep(1)
     #machine_ad = classad.parseOld(open(output_file, "r"))
     self.assertTrue(len(ads) == 1)
     self.assertTrue("LarkNetworkType" in ads[0].keys())
     self.assertEquals(ads[0]["LarkNetworkType"], "nat")
Example No. 6
 def tryUpdate(self,
               shouldwork=True,
               wantio=None,
               wantupdate=None,
               prefix="NonChirp"):
     output_file = os.path.join(htcondor_tests.testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     ad = classad.parse(open("tests/delayed_submit.ad"))
     ad["Arguments"] = "--type=update --prefix=%s --shouldwork=%s" % (
         prefix, str(shouldwork))
     if wantio == True:
         ad["WantIOProxy"] = True
     elif wantio == False:
         ad["WantIOProxy"] = False
     if wantupdate == True:
         ad["WantRemoteUpdates"] = True
     elif wantupdate == False:
         ad["WantRemoteUpdates"] = False
     cluster = self.runJob(ad)
     result_ad = self.getLastHistory(cluster)
     self.assertTrue("ExitCode" in result_ad)
     self.assertEqual(result_ad["ExitCode"], 0)
     last_line = open(output_file).readlines()[-1]
     self.assertEqual(last_line, "SUCCESS\n")
     if shouldwork:
         attr = "%sFoo" % prefix
         self.assertTrue(attr in result_ad)
         self.assertEqual(result_ad[attr], 2)
Example No. 7
def get_schedulers(filename):
    list_schedulers = []
    tmpfile_1sched = open_file(filename)
    sched = classad.parse(tmpfile_1sched)
    list_schedulers.append(sched)
    tmpfile_1sched.close()
    return list_schedulers
Example No. 8
 def testScheddNonblockingQuery(self):
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     schedd = htcondor.Schedd()
     ad = classad.parse(open("tests/submit.ad"))
     ads = []
     cluster = schedd.submit(ad, 10, False, ads)
     for i in range(60):
         ads = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"])
         ads2 = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"])
         ctrs = [0, 0]
         iters = [(ads, 0), (ads2, 1)]
         while iters:
             for it, pos in iters:
                 try:
                     it.next()
                     ctrs[pos] += 1
                 except StopIteration:
                     iters.remove((it, pos))
         print ctrs
         if ctrs[0] == 0:
             break
         if i % 2 == 0:
             schedd.reschedule()
         time.sleep(1)
     self.assertEquals(open(output_file).read(), "hello world\n")
Example No. 9
 def testScheddNonblockingQueryCount(self):
     os.environ["_condor_SCHEDD_DEBUG"] = "D_FULLDEBUG|D_NETWORK"
     self.launch_daemons(["SCHEDD"])
     schedd = htcondor.Schedd()
     submit_ad = classad.parse(open("tests/submit_large.ad"))
     schedd.act(htcondor.JobAction.Remove, "true")
      time.sleep(1)
      ads = schedd.query("true")  # poll until the removed jobs have left the queue
      while ads:
          time.sleep(.2)
          ads = schedd.query("true")
     #print ads
     for i in range(1, 60):
         print "Testing querying %d jobs in queue." % i
         schedd.submit(submit_ad, i, True, ads)
         ads = schedd.query("true", ["ClusterID", "ProcID"])
         ads2 = list(schedd.xquery("true", ["ClusterID", "ProcID", "a1", "a2", "a3", "a4"]))
         #print ads
         #print ads2
         self.assertNotEqual(ads2[0].lookup("ProcID"), classad.Value.Undefined)
         for ad in ads:
             found_ad = False
             for ad2 in ads2:
                 if ad2["ProcID"] == ad["ProcID"] and ad2["ClusterID"] == ad["ClusterID"]:
                     found_ad = True
                     break
             self.assertTrue(found_ad, msg="Ad %s missing from xquery results: %s" % (ad, ads2))
         self.assertEquals(len(ads), i, msg="Old query protocol gives incorrect number of results (expected %d, got %d)" % (i, len(ads)))
         self.assertEquals(len(ads2), i, msg="New query protocol gives incorrect number of results (expected %d, got %d)" % (i, len(ads2)))
         schedd.act(htcondor.JobAction.Remove, "true")
         while ads:
             time.sleep(.2)
             ads = schedd.query("true")
Example No. 10
 def testScheddSubmitSpool(self):
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     schedd = htcondor.Schedd()
     ad = classad.parse(open("tests/submit.ad"))
     result_ads = []
     cluster = schedd.submit(ad, 1, True, result_ads)
     #print result_ads[0]
     schedd.spool(result_ads)
     for i in range(60):
         ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus"])
         #print ads
         self.assertEquals(len(ads), 1)
         if ads[0]["JobStatus"] == 4:
             break
         if i % 5 == 0:
             schedd.reschedule()
         time.sleep(1)
     schedd.retrieve("ClusterId == %d" % cluster)
     #print "Final status:", schedd.query("ClusterId == %d" % cluster)[0];
     schedd.act(htcondor.JobAction.Remove, ["%d.0" % cluster])
     ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus"])
     self.assertEquals(len(ads), 0)
      self.assertEquals(open(output_file).read(), "hello world\n")
Example No. 11
def get_worker_nodes(filename):
    worker_nodes = []
    tmpfile_wn = open_file(filename)
    wn = classad.parse(tmpfile_wn)
    worker_nodes.append(wn)
    tmpfile_wn.close()
    return worker_nodes
Example No. 12
def classad_parse(inputstr):
    """Parse string into a classad.

    Uses classad.parseOne if available (HTCondor 8.3+), and
    classad.parse otherwise (HTCondor 8.2, deprecated in 8.3).

    """
    if hasattr(classad, 'parseOne'):
        return classad.parseOne(inputstr)
    else:
        return classad.parse(inputstr)
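
A usage sketch for the compatibility wrapper (the ad text is illustrative):

    import classad

    ad_text = '[ foo = "bar"; RequestCpus = 1 ]'
    ad = classad_parse(ad_text)   # parseOne on 8.3+, parse on 8.2
    assert ad["foo"] == "bar"
    assert ad["RequestCpus"] == 1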
Example No. 13
    def testNegotiate(self):
        #htcondor.param['TOOL_DEBUG'] = 'D_FULLDEBUG'
        #os.environ['_condor_SCHEDD_DEBUG'] = 'D_FULLDEBUG, D_NETWORK'
        #htcondor.enable_debug()

        self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD"])
        output_file = os.path.join(testdir, "test.out")
        if os.path.exists(output_file):
            os.unlink(output_file)
        schedd = htcondor.Schedd()

        schedd.act(htcondor.JobAction.Remove, 'true')
        ad = classad.parse(open("tests/submit.ad"))
        ads = []
        cluster = schedd.submit(ad, 1, False, ads)

        # Get claim for startd
        claim_ads = []
        for i in range(10):
            startd_ads = htcondor.Collector().locateAll(
                htcondor.DaemonTypes.Startd)
            private_ads = htcondor.Collector().query(
                htcondor.AdTypes.StartdPrivate)
            if (len(startd_ads) != htcondor.param['NUM_CPUS']) or (
                    len(private_ads) != htcondor.param['NUM_CPUS']):
                time.sleep(1)
                continue
            break
        self.assertEquals(len(startd_ads), len(private_ads))
        self.assertEquals(len(startd_ads), htcondor.param['NUM_CPUS'])
        for ad in htcondor.Collector().locateAll(htcondor.DaemonTypes.Startd):
            for pvt_ad in private_ads:
                if pvt_ad.get('Name') == ad['Name']:
                    ad['ClaimId'] = pvt_ad['Capability']
                    claim_ads.append(ad)
        self.assertEquals(len(claim_ads), len(startd_ads))
        claim = claim_ads[0]

        me = "%s@%s" % (pwd.getpwuid(
            os.geteuid()).pw_name, htcondor.param['UID_DOMAIN'])
        with schedd.negotiate(me) as session:
            requests = list(session)
            self.assertEquals(len(requests), 1)
            request = requests[0]
            self.assertTrue(request.symmetricMatch(claim))
            session.sendClaim(claim['ClaimId'], claim, request)

        for i in range(60):
            ads = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"])
            ads = list(ads)
            if len(ads) == 0:
                break
            time.sleep(1)
        self.assertEquals(open(output_file).read(), "hello world\n")
Example No. 14
def get_jobs_scheduled_attributes(maxrange):
    list_jobs_scheduled_attr = []
    # gather scheduled jobs from their respective job<n>.txt files,
    # which are generated by generate_job_sched.py
    for i in range(0, maxrange):
        filename = 'test-files/job' + str(i) + '.txt'
        tmpfile_1line = open_file(filename)
        tmpobj = classad.parse(tmpfile_1line)
        list_jobs_scheduled_attr.append(tmpobj)
        tmpfile_1line.close()
    return list_jobs_scheduled_attr
Example No. 15
def main():
    """
    Parse the DAG job ad, set up the webdir, clear the automatic blacklist,
    adjust any resubmitted jobs, and apply a site.ad update if one was requested.
    """
    ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD']))
    makeWebDir(ad)

    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries*20)
        retries += 1

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True
    if resubmitJobIds:
        adjustedJobIds = []
        if hasattr(htcondor, 'lock'):
            # While dagman is not running at this point, the schedd may be writing events to this
            # file; hence, we only edit the file while holding an appropriate lock.
            # Note this lock method didn't exist until 8.1.6; prior to this, we simply
            # run dangerously.
            with htcondor.lock(open("RunJobs.dag.nodes.log", 'a'), htcondor.LockType.WriteLock) as lock:
                adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        else:
            adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))
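
`classad.parseOld` reads the old one-attribute-per-line syntax that HTCondor writes to the `_CONDOR_JOB_AD` file, as opposed to the bracketed new-ClassAd syntax handled by `classad.parse`. A sketch of that old format (attribute values invented):

    ClusterId = 1234
    ProcId = 0
    CRAB_ReqName = "example_task"
    CRAB_ResubmitList = { 1, 3, 7 }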
Example No. 16
 def testScheddNonblockingQueryRemove(self):
     os.environ["_condor_SCHEDD_DEBUG"] = "D_FULLDEBUG|D_NETWORK"
     self.launch_daemons(["SCHEDD"])
     schedd = htcondor.Schedd()
     submit_ad = classad.parse(open("tests/submit.ad"))
     ads = []
     cluster = schedd.submit(submit_ad, 300, False, ads)
     ads = schedd.xquery("ClusterId == %d" % cluster)
     print str(datetime.datetime.now())
     print str(datetime.datetime.now())
     schedd.act(htcondor.JobAction.Remove, "ClusterId == %d" % cluster)
     time.sleep(3)
     print str(datetime.datetime.now())
     print len(list(ads))
     print str(datetime.datetime.now())
Example No. 17
 def parseSiteAd(self, fp, task_ad, nodes):
     site_ad = classad.parse(fp)
     blacklist = set(task_ad['CRAB_SiteBlacklist'])
     whitelist = set(task_ad['CRAB_SiteWhitelist'])
     for key, val in site_ad.items():
         m = self.job_name_re.match(key)
         if not m:
             continue
         nodeid = m.groups()[0]
         sites = set(val.eval())
         if whitelist:
             sites &= whitelist
         # Never blacklist something on the whitelist
         sites -= (blacklist-whitelist)
         info = nodes.setdefault(nodeid, {})
         info['AvailableSites'] = list([i.eval() for i in sites])
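
`parseSiteAd` expects an ad whose `Job<n>` attributes map node ids to ClassAd lists of candidate sites, which is why each value needs `.eval()` before it can become a Python set. A plausible `site.ad` sketch (site names invented):

    [
        Job1 = { "T2_CH_CERN", "T2_US_Nebraska" };
        Job2 = { "T2_US_Nebraska" };
    ]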
Example No. 18
 def tryIO(self, shouldwork=True, wantio=None):
     open(os.path.join(htcondor_tests.testdir, "test_chirp_io"), "w").write("hello world")
     output_file = os.path.join(htcondor_tests.testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     ad = classad.parse(open("tests/delayed_submit.ad"))
     ad["Arguments"] = "--type=io --shouldwork=%s" % str(shouldwork)
     if wantio == True:
         ad["WantIOProxy"] = True
     elif wantio == False:
         ad["WantIOProxy"] = False
     cluster = self.runJob(ad)
     result_ad = self.getLastHistory(cluster)
     self.assertTrue("ExitCode" in result_ad)
     self.assertEquals(result_ad["ExitCode"], 0)
     last_line = open(output_file).readlines()[-1]
     self.assertEquals(last_line, "SUCCESS\n")
Example No. 19
 def tryIO(self, shouldwork=True, wantio=None):
     open(os.path.join(htcondor_tests.testdir, "test_chirp_io"), "w").write("hello world")
     output_file = os.path.join(htcondor_tests.testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     ad = classad.parse(open("tests/delayed_submit.ad"))
     ad["Arguments"] = "--type=io --shouldwork=%s" % str(shouldwork)
      if wantio == True:  # tri-state: None leaves WantIOProxy unset
          ad["WantIOProxy"] = True
      elif wantio == False:
          ad["WantIOProxy"] = False
     cluster = self.runJob(ad)
     result_ad = self.getLastHistory(cluster)
     self.assertTrue("ExitCode" in result_ad)
     self.assertEqual(result_ad["ExitCode"], 0)
     last_line = open(output_file).readlines()[-1]
     self.assertEqual(last_line, "SUCCESS\n")
Example No. 20
 def testDelayedUpdateDOS(self):
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(htcondor_tests.testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     ad = classad.parse(open("tests/delayed_submit.ad"))
     ad["Arguments"] = "--type=delayeddos"
     cluster = self.runJob(ad)
     result_ad = self.getLastHistory(cluster)
     self.assertTrue("ExitCode" in result_ad)
     self.assertEquals(result_ad["ExitCode"], 0)
     last_line = open(output_file).readlines()[-1]
     self.assertEquals(last_line, "SUCCESS\n")
     self.assertTrue("ChirpFoo" in result_ad)
     self.assertEquals(result_ad["ChirpFoo"], "0" * 990)
     self.assertFalse("ChirpBar" in result_ad)
     for i in range(1, 50):
         self.assertTrue(("ChirpFoo%d" % i) in result_ad)
     self.assertFalse("ChirpFoo50" in result_ad)
Example No. 21
 def testDelayedUpdateDOS(self):
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(htcondor_tests.testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     ad = classad.parse(open("tests/delayed_submit.ad"))
     ad["Arguments"] = "--type=delayeddos"
     cluster = self.runJob(ad)
     result_ad = self.getLastHistory(cluster)
     self.assertTrue("ExitCode" in result_ad)
     self.assertEqual(result_ad["ExitCode"], 0)
     last_line = open(output_file).readlines()[-1]
     self.assertEqual(last_line, "SUCCESS\n")
     self.assertTrue("ChirpFoo" in result_ad)
     self.assertEqual(result_ad["ChirpFoo"], "0" * 990)
     self.assertFalse("ChirpBar" in result_ad)
     for i in range(1, 50):
         self.assertTrue(("ChirpFoo%d" % i) in result_ad)
     self.assertFalse("ChirpFoo50" in result_ad)
Example No. 22
 def testScheddSubmit(self):
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     schedd = htcondor.Schedd()
     ad = classad.parse(open("tests/submit.ad"))
     ads = []
     cluster = schedd.submit(ad, 1, False, ads)
     #print ads[0]
     for i in range(60):
         ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus"])
         #print ads
         if len(ads) == 0:
             break
         if i % 2 == 0:
             schedd.reschedule()
         time.sleep(1)
      self.assertEquals(open(output_file).read(), "hello world\n")
Example No. 23
 def tryDelayedUpdate(self, prefix="Chirp", shouldwork=True, wantupdate=None):
     output_file = os.path.join(htcondor_tests.testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     ad = classad.parse(open("tests/delayed_submit.ad"))
     ad["Arguments"] = "--prefix=%s --type=delayed --shouldwork=%s" % (prefix, str(shouldwork))
     if wantupdate == True:
         ad["WantDelayedUpdates"] = True
     elif wantupdate == False:
         ad["WantDelayedUpdates"] = False
     cluster = self.runJob(ad)
     result_ad = self.getLastHistory(cluster)
     attr = "%sFoo" % prefix
     self.assertTrue("ExitCode" in result_ad)
     self.assertEquals(result_ad["ExitCode"], 0)
     last_line = open(output_file).readlines()[-1]
     self.assertEquals(last_line, "SUCCESS\n")
     if shouldwork:
         self.assertTrue(attr in result_ad)
         self.assertEquals(result_ad[attr], 2)
Example No. 24
    def parseSiteAd(self, fp, task_ad, nodes):
        site_ad = classad.parse(fp)

        blacklist = set(task_ad['CRAB_SiteBlacklist'])
        whitelist = set(task_ad['CRAB_SiteWhitelist'])
        if 'CRAB_SiteResubmitWhitelist' in task_ad:
            whitelist.update(task_ad['CRAB_SiteResubmitWhitelist'])
        if 'CRAB_SiteResubmitBlacklist' in task_ad:
            blacklist.update(task_ad['CRAB_SiteResubmitBlacklist'])

        for key, val in site_ad.items():
            m = self.job_name_re.match(key)
            if not m:
                continue
            nodeid = m.groups()[0]
            sites = set(val.eval())
            if whitelist:
                sites &= whitelist
            # Never blacklist something on the whitelist
            sites -= (blacklist-whitelist)

            info = nodes.setdefault(nodeid, {})
            info['AvailableSites'] = list([i.eval() for i in sites])
Example No. 25
 def testNetworkPolicyOVS(self):
     jobqueue_log_dir = os.path.join(os.getcwd(), "tests_tmp/spool")
      if os.path.exists(jobqueue_log_dir):
         filelist = [f for f in os.listdir(jobqueue_log_dir) if f.startswith("job_queue.log")]
         for f in filelist:
             print "Removing", f
             os.remove(os.path.join(jobqueue_log_dir, f))
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(testdir, "lark_test_4.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     fd = os.popen("curl www.google.com")
     content = fd.read()
     schedd = htcondor.Schedd()
     ad = classad.parse(open("tests/lark_submit_4.ad"))
     ads = []
     cluster = schedd.submit(ad, 1, False, ads)
     for i in range(60):
         ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus"])
         print ads
         if len(ads) == 0:
             break
         time.sleep(1)
     self.assertEquals(open(output_file).read(), content)
Example No. 26
 def redo_sites(self, new_submit_text, crab_retry, use_resubmit_info):
     """
     Re-define the set of sites where the job can run on by taking into account
     any site-white-list and site-black-list.
     """
     ## If there is an automatic site blacklist, add it to the Job.<job_id>.submit
     ## content.
     automatic_siteblacklist = self.calculate_blacklist()
     if automatic_siteblacklist:
         self.task_ad[
             'CRAB_SiteAutomaticBlacklist'] = automatic_siteblacklist
         new_submit_text += '+CRAB_SiteAutomaticBlacklist = %s\n' % str(
             self.task_ad.lookup('CRAB_SiteAutomaticBlacklist'))
     ## Get the site black- and whitelists either from the task ad or from
     ## self.resubmit_info.
     siteblacklist = set()
     sitewhitelist = set()
     if not use_resubmit_info:
         if 'CRAB_SiteBlacklist' in self.task_ad:
             siteblacklist = set(self.task_ad['CRAB_SiteBlacklist'])
         if 'CRAB_SiteWhitelist' in self.task_ad:
             sitewhitelist = set(self.task_ad['CRAB_SiteWhitelist'])
     else:
         inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
         while inkey not in self.resubmit_info and int(inkey) > 0:
             inkey = str(int(inkey) - 1)
         siteblacklist = set(self.resubmit_info[inkey].get(
             'site_blacklist', []))
         sitewhitelist = set(self.resubmit_info[inkey].get(
             'site_whitelist', []))
     ## Save the current site black- and whitelists in self.resubmit_info for the
     ## current job retry number.
     outkey = str(crab_retry)
     if outkey not in self.resubmit_info:
         self.resubmit_info[outkey] = {}
     self.resubmit_info[outkey]['site_blacklist'] = list(siteblacklist)
     self.resubmit_info[outkey]['site_whitelist'] = list(sitewhitelist)
     ## Add the current site black- and whitelists to the Job.<job_id>.submit
     ## content.
     if siteblacklist:
         new_submit_text += '+CRAB_SiteBlacklist = {"%s"}\n' % (
             '", "'.join(siteblacklist))
     else:
         new_submit_text += '+CRAB_SiteBlacklist = {}\n'
     if sitewhitelist:
         new_submit_text += '+CRAB_SiteWhitelist = {"%s"}\n' % (
             '", "'.join(sitewhitelist))
     else:
         new_submit_text += '+CRAB_SiteWhitelist = {}\n'
     ## Get the list of available sites (the sites where this job could run).
     if os.path.exists("site.ad.json"):
         with open("site.ad.json") as fd:
             site_info = json.load(fd)
         group = site_info[str(self.job_id)]
         available = set(site_info['group_sites'][str(group)])
         datasites = set(site_info['group_datasites'][str(group)])
     else:
         with open("site.ad") as fd:
             site_ad = classad.parse(fd)
         available = set(site_ad['Job%d' % (self.job_id)])
     ## Take the intersection between the available sites and the site whitelist.
     ## This is the new set of available sites.
     if sitewhitelist:
         available &= sitewhitelist
     ## Remove from the available sites the ones that are in the site blacklist,
     ## unless they are also in the site whitelist (i.e. never blacklist something
     ## on the whitelist).
     siteblacklist.update(automatic_siteblacklist)
     available -= (siteblacklist - sitewhitelist)
     ## Add DESIRED_SITES to the Job.<job_id>.submit content.
     new_submit_text = '+DESIRED_SITES="%s"\n%s' % (",".join(available),
                                                    new_submit_text)
     new_submit_text = '+DESIRED_CMSDataLocations="%s"\n%s' % (
         ",".join(datasites), new_submit_text)
     return new_submit_text
Example No. 27
 def parsePoolAd(self, fp):
     pool_ad = classad.parse(fp)
Example No. 28
 def test_load_classad_from_file(self):
     ad = classad.parse(open("tests/test.ad"))
     self.assertEqual(ad["foo"], "bar")
     self.assertEqual(ad["baz"], classad.Value.Undefined)
     self.assertRaises(KeyError, ad.__getitem__, "bar")
Example No. 29
    def testTransaction(self):
        self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
        output_file = os.path.join(testdir, "test.out")
        log_file = os.path.join(testdir, "test.log")
        if os.path.exists(output_file):
            os.unlink(output_file)
        if os.path.exists(log_file):
            os.unlink(log_file)
        schedd = htcondor.Schedd()
        ad = classad.parse(open("tests/submit_sleep.ad"))
        result_ads = []
        cluster = schedd.submit(ad, 1, True, result_ads)

        with schedd.transaction() as txn:
            schedd.edit(["%d.0" % cluster], 'foo', classad.Literal(1))
            schedd.edit(["%d.0" % cluster], 'bar', classad.Literal(2))
        ads = schedd.query("ClusterId == %d" % cluster,
                           ["JobStatus", 'foo', 'bar'])
        self.assertEquals(len(ads), 1)
        self.assertEquals(ads[0]['foo'], 1)
        self.assertEquals(ads[0]['bar'], 2)

        with schedd.transaction() as txn:
            schedd.edit(["%d.0" % cluster], 'baz', classad.Literal(3))
            with schedd.transaction(
                    htcondor.TransactionFlags.NonDurable
                    | htcondor.TransactionFlags.ShouldLog, True) as txn:
                schedd.edit(["%d.0" % cluster], 'foo', classad.Literal(4))
                schedd.edit(["%d.0" % cluster], 'bar', classad.Literal(5))
        ads = schedd.query("ClusterId == %d" % cluster,
                           ["JobStatus", 'foo', 'bar', 'baz'])
        self.assertEquals(len(ads), 1)
        self.assertEquals(ads[0]['foo'], 4)
        self.assertEquals(ads[0]['bar'], 5)
        self.assertEquals(ads[0]['baz'], 3)

        try:
            with schedd.transaction() as txn:
                schedd.edit(["%d.0" % cluster], 'foo', classad.Literal(6))
                schedd.edit(["%d.0" % cluster], 'bar', classad.Literal(7))
                raise Exception("force abort")
        except:
            exctype, e = sys.exc_info()[:2]
            if not issubclass(exctype, Exception):
                raise
            self.assertEquals(str(e), "force abort")
        ads = schedd.query("ClusterId == %d" % cluster,
                           ["JobStatus", 'foo', 'bar'])
        self.assertEquals(len(ads), 1)
        self.assertEquals(ads[0]['foo'], 4)
        self.assertEquals(ads[0]['bar'], 5)

        try:
            with schedd.transaction() as txn:
                schedd.edit(["%d.0" % cluster], 'baz', classad.Literal(8))
                with schedd.transaction(
                        htcondor.TransactionFlags.NonDurable
                        | htcondor.TransactionFlags.ShouldLog, True) as txn:
                    schedd.edit(["%d.0" % cluster], 'foo', classad.Literal(9))
                    schedd.edit(["%d.0" % cluster], 'bar', classad.Literal(10))
                raise Exception("force abort")
        except:
            exctype, e = sys.exc_info()[:2]
            if not issubclass(exctype, Exception):
                raise
            self.assertEquals(str(e), "force abort")
        ads = schedd.query("ClusterId == %d" % cluster,
                           ["JobStatus", 'foo', 'bar', 'baz'])
        self.assertEquals(len(ads), 1)
        self.assertEquals(ads[0]['foo'], 4)
        self.assertEquals(ads[0]['bar'], 5)
        self.assertEquals(ads[0]['baz'], 3)

        schedd.act(htcondor.JobAction.Remove, ["%d.0" % cluster])
        ads = schedd.query("ClusterId == %d" % cluster, ["JobStatus"])
        self.assertEquals(len(ads), 0)
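
The test exercises committed, nested, and aborted transactions. A minimal sketch of the semantics it relies on (assumes a running schedd and an existing job 1.0):

    import classad
    import htcondor

    schedd = htcondor.Schedd()
    try:
        with schedd.transaction() as txn:   # open a queue-management transaction
            schedd.edit(["1.0"], "foo", classad.Literal(1))
            raise RuntimeError("force abort")   # leaving via an exception...
    except RuntimeError:
        pass                                    # ...rolls the edit back
    # Exiting the with-block normally would have committed the edit instead.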
Example No. 30
    if resubmitJobIds:
        if hasattr(htcondor, 'lock'):
            # While dagman is not running at this point, the schedd may be writing events to this
            # file; hence, we only edit the file while holding an appropriate lock.
            # Note this lock method didn't exist until 8.1.6; prior to this, we simply
            # run dangerously.
            with htcondor.lock(open("RunJobs.dag.nodes.log", 'a'), htcondor.LockType.WriteLock) as lock:
                adjustPostScriptExitStatus(resubmitJobIds)
        else:
            adjustPostScriptExitStatus(resubmitJobIds)
        adjustMaxRetries(resubmitJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))
        dagJobId = '%d.%d' % (ad['ClusterId'], ad['ProcId'])
        ad['foo'] = []
        try:
            ## Is CRAB_ResubmitList the attribute we want to edit ?
            ## Or is it CRAB_SiteAdUpdate ?
            htcondor.Schedd().edit([dagJobId], 'CRAB_ResubmitList', ad['foo'])
        except RuntimeError, reerror:
            print "ERROR: %s" % str(reerror)


if __name__ == '__main__':
    main()
Example No. 31
def main():
    """
    Set up and upload the webdir, save the proxied webdir, clear the automatic
    blacklist, adjust any resubmitted jobs, and apply a site.ad update if one
    was requested.
    """
    printLog("Starting AdjustSites")

    with open(os.environ["_CONDOR_JOB_AD"]) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)

    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries * 20)
        retries += 1

    printLog("Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode)

    saveProxiedWebdir(ad)

    printLog(
        "Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions"
    )

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if "CRAB_ResubmitList" in ad:
        resubmitJobIds = ad["CRAB_ResubmitList"]
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True
    if resubmitJobIds:
        adjustedJobIds = []
        if hasattr(htcondor, "lock"):
            # While dagman is not running at this point, the schedd may be writing events to this
            # file; hence, we only edit the file while holding an appropriate lock.
            # Note this lock method didn't exist until 8.1.6; prior to this, we simply
            # run dangerously.
            with htcondor.lock(open("RunJobs.dag.nodes.log", "a"), htcondor.LockType.WriteLock):
                adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        else:
            adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if "CRAB_SiteAdUpdate" in ad:
        newSiteAd = ad["CRAB_SiteAdUpdate"]
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    printLog("Exiting AdjustSite")
Example No. 32
def main():
    """
    Set up the webdir, adjust any resubmitted jobs (holding and releasing the
    processing and tail DAGs as needed), and apply a site.ad update if one was
    requested.
    """
    setupLog()

    if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists(os.environ["_CONDOR_JOB_AD"]):
        printLog("Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist")
        sys.exit(0)

    printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" % os.environ['_CONDOR_JOB_AD'])

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)

    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries * 20)
        retries += 1

    if exitCode != 0:
        printLog("Exiting AdjustSites because the webdir upload failed three times.")
        sys.exit(1)

    printLog("Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode)

    saveProxiedWebdir(ad)

    printLog("Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions")

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True

    # Hold and release processing and tail DAGs here so that modifications
    # to the submission and log files will be picked up.
    schedd = htcondor.Schedd()
    tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote(ad.get("CRAB_ReqName"))
    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Holding processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGKILL')
        schedd.act(htcondor.JobAction.Hold, tailconst)

    if resubmitJobIds:
        adjustedJobIds = []
        filenames = getGlob(ad, "RunJobs.dag.nodes.log", "RunJobs[1-9]*.subdag.nodes.log")
        for fn in filenames:
            if hasattr(htcondor, 'lock'):
                # While dagman is not running at this point, the schedd may be writing events to this
                # file; hence, we only edit the file while holding an appropriate lock.
                # Note this lock method didn't exist until 8.1.6; prior to this, we simply
                # run dangerously.
                with htcondor.lock(open(fn, 'a'), htcondor.LockType.WriteLock):
                    adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn))
            else:
                adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn))
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Releasing processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1')
        schedd.act(htcondor.JobAction.Release, tailconst)

    printLog("Exiting AdjustSite")
Example No. 33
 def redo_sites(self, new_submit_text, crab_retry, use_resubmit_info):
     """
     Re-define the set of sites where the job can run on by taking into account
     any site-white-list and site-black-list.
     """
     ## If there is an automatic site blacklist, add it to the Job.<job_id>.submit
     ## content.
     automatic_siteblacklist = self.calculate_blacklist()
     if automatic_siteblacklist:
         self.task_ad['CRAB_SiteAutomaticBlacklist'] = automatic_siteblacklist
         new_submit_text += '+CRAB_SiteAutomaticBlacklist = %s\n' % str(self.task_ad.lookup('CRAB_SiteAutomaticBlacklist'))
     ## Get the site black- and whitelists either from the task ad or from
     ## self.resubmit_info.
     siteblacklist = set()
     sitewhitelist = set()
     if not use_resubmit_info:
         if 'CRAB_SiteBlacklist' in self.task_ad:
             siteblacklist = set(self.task_ad['CRAB_SiteBlacklist'])
         if 'CRAB_SiteWhitelist' in self.task_ad:
             sitewhitelist = set(self.task_ad['CRAB_SiteWhitelist'])
     else:
         inkey = str(crab_retry) if crab_retry == 0 else str(crab_retry - 1)
         while inkey not in self.resubmit_info and int(inkey) > 0:
              inkey = str(int(inkey) - 1)
         siteblacklist = set(self.resubmit_info[inkey].get('site_blacklist', []))
         sitewhitelist = set(self.resubmit_info[inkey].get('site_whitelist', []))
     ## Save the current site black- and whitelists in self.resubmit_info for the
     ## current job retry number.
     outkey = str(crab_retry)
     if outkey not in self.resubmit_info:
         self.resubmit_info[outkey] = {}
     self.resubmit_info[outkey]['site_blacklist'] = list(siteblacklist)
     self.resubmit_info[outkey]['site_whitelist'] = list(sitewhitelist)
     ## Add the current site black- and whitelists to the Job.<job_id>.submit
     ## content.
     if siteblacklist:
         new_submit_text += '+CRAB_SiteBlacklist = {"%s"}\n' % ('", "'.join(siteblacklist))
     else:
         new_submit_text += '+CRAB_SiteBlacklist = {}\n'
     if sitewhitelist:
         new_submit_text += '+CRAB_SiteWhitelist = {"%s"}\n' % ('", "'.join(sitewhitelist))
     else:
         new_submit_text += '+CRAB_SiteWhitelist = {}\n'
     ## Get the list of available sites (the sites where this job could run).
     if os.path.exists("site.ad.json"):
         with open("site.ad.json") as fd:
             site_info = json.load(fd)
         group = site_info[self.job_id]
         available = set(site_info['group_sites'][str(group)])
         datasites = set(site_info['group_datasites'][str(group)])
     else:
         with open("site.ad") as fd:
             site_ad = classad.parse(fd)
         available = set(site_ad['Job%s' % (self.job_id)])
     ## Take the intersection between the available sites and the site whitelist.
     ## This is the new set of available sites.
     if sitewhitelist:
         available &= sitewhitelist
     ## Remove from the available sites the ones that are in the site blacklist,
     ## unless they are also in the site whitelist (i.e. never blacklist something
     ## on the whitelist).
     siteblacklist.update(automatic_siteblacklist)
     available -= (siteblacklist - sitewhitelist)
      if not available:
          self.logger.error("Cannot submit because the DESIRED_SITES list is empty")
          self.prejob_exit_code = 1
          sys.exit(self.prejob_exit_code)
     ## Add DESIRED_SITES to the Job.<job_id>.submit content.
     new_submit_text = '+DESIRED_SITES="%s"\n%s' % (",".join(available), new_submit_text)
     new_submit_text = '+DESIRED_CMSDataLocations="%s"\n%s' % (",".join(datasites), new_submit_text)
     return new_submit_text
Ejemplo n.º 46
0
def main():
    """
    Need a doc string here.
    """
    printLog("Starting AdjustSites")

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)

    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries * 20)
        retries += 1

    printLog(
        "Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir"
        % exitCode)

    saveProxiedWebdir(ad)

    printLog(
        "Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions"
    )

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            ## Not a list of job ids: CRAB_ResubmitList was the literal true,
            ## which means "resubmit all failed jobs".
            resubmitJobIds = True
    if resubmitJobIds:
        adjustedJobIds = []
        if hasattr(htcondor, 'lock'):
            # Even though dagman is not running at this point, the schedd may
            # still be writing events to this file; hence, we only edit the
            # file while holding an appropriate lock. This lock method was
            # added in HTCondor 8.1.6; with older versions we simply run
            # without locking.
            with htcondor.lock(open("RunJobs.dag.nodes.log", 'a'),
                               htcondor.LockType.WriteLock):
                adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        else:
            adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    printLog("Exiting AdjustSite")
Example No. 47
    if resubmit:
        if hasattr(htcondor, 'lock'):
            # Even though dagman is not running at this point, the schedd may
            # still be writing events to this file; hence, we only edit the
            # file while holding an appropriate lock. This lock method was
            # added in HTCondor 8.1.6; with older versions we simply run
            # without locking.
            with htcondor.lock(open("RunJobs.dag.nodes.log", "a"),
                               htcondor.LockType.WriteLock):
                adjustPost(resubmit)
        else:
            adjustPost(resubmit)
        resubmitDag("RunJobs.dag", resubmit)

    if 'CRAB_SiteAdUpdate' in ad:
        new_site_ad = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            site_ad = classad.parse(fd)
        site_ad.update(new_site_ad)
        with open("site.ad", "w") as fd:
            fd.write(str(site_ad))
        id = '%d.%d' % (ad['ClusterId'], ad['ProcId'])
        # Assigning an empty Python list to a scratch attribute yields an
        # empty ClassAd list expression, which is then used to clear
        # CRAB_ResubmitList in the schedd.
        ad['foo'] = []
        try:
            htcondor.Schedd().edit([id], 'CRAB_ResubmitList', ad['foo'])
        except RuntimeError as reerror:
            print "ERROR: %s" % str(reerror)


if __name__ == '__main__':
    main()
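
Since the hasattr(htcondor, 'lock') dance recurs throughout these snippets, it
could be factored into a small context manager; a minimal sketch (the helper
name and structure are my own, not CRAB's):

import contextlib
import htcondor

@contextlib.contextmanager
def maybe_locked(filename):
    """Yield under htcondor's write lock when the bindings provide one
    (HTCondor >= 8.1.6); otherwise yield without locking."""
    if hasattr(htcondor, 'lock'):
        with htcondor.lock(open(filename, 'a'), htcondor.LockType.WriteLock):
            yield
    else:
        yield

# Usage:
#     with maybe_locked("RunJobs.dag.nodes.log"):
#         adjustPost(resubmit)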
Example No. 48
def main():
    """
    Need a doc string here.
    """
    setupLog()

    if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists(
            os.environ["_CONDOR_JOB_AD"]):
        printLog(
            "Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist"
        )
        sys.exit(0)

    printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" %
             os.environ['_CONDOR_JOB_AD'])

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)

    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

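    # Try the upload up to three times, sleeping retries * 20 seconds after
    # each failure (i.e. 0s before the second attempt, 20s before the third).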
    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries * 20)
        retries += 1

    if exitCode != 0:
        printLog(
            "Exiting AdjustSites because the webdir upload failed three times."
        )
        sys.exit(1)

    printLog(
        "Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir"
        % exitCode)

    saveProxiedWebdir(ad)

    printLog(
        "Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions"
    )

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            ## Not a list of job ids: CRAB_ResubmitList was the literal true,
            ## which means "resubmit all failed jobs".
            resubmitJobIds = True

    # Hold and release processing and tail DAGs here so that modifications
    # to the submission and log files will be picked up.
    schedd = htcondor.Schedd()
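    # classad.quote() safely embeds the task name as a quoted string literal
    # inside the constraint expression.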
    tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote(
        ad.get("CRAB_ReqName"))
    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Holding processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGKILL')
        schedd.act(htcondor.JobAction.Hold, tailconst)

    if resubmitJobIds:
        adjustedJobIds = []
        filenames = getGlob(ad, "RunJobs.dag.nodes.log",
                            "RunJobs[1-9]*.subdag.nodes.log")
        for fn in filenames:
            if hasattr(htcondor, 'lock'):
                # Even though dagman is not running at this point, the schedd may
                # still be writing events to this file; hence, we only edit the
                # file while holding an appropriate lock. This lock method was
                # added in HTCondor 8.1.6; with older versions we simply run
                # without locking.
                with htcondor.lock(open(fn, 'a'), htcondor.LockType.WriteLock):
                    adjustedJobIds.extend(
                        adjustPostScriptExitStatus(resubmitJobIds, fn))
            else:
                adjustedJobIds.extend(
                    adjustPostScriptExitStatus(resubmitJobIds, fn))
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Releasing processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1')
        schedd.act(htcondor.JobAction.Release, tailconst)

    printLog("Exiting AdjustSite")
Example No. 49
    if resubmit:
        if hasattr(htcondor, 'lock'):
            # Even though dagman is not running at this point, the schedd may
            # still be writing events to this file; hence, we only edit the
            # file while holding an appropriate lock. This lock method was
            # added in HTCondor 8.1.6; with older versions we simply run
            # without locking.
            with htcondor.lock(open("RunJobs.dag.nodes.log", "a"),
                               htcondor.LockType.WriteLock):
                adjustPost(resubmit)
        else:
            adjustPost(resubmit)
        resubmitDag("RunJobs.dag", resubmit)

    if 'CRAB_SiteAdUpdate' in ad:
        new_site_ad = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            site_ad = classad.parse(fd)
        site_ad.update(new_site_ad)
        with open("site.ad", "w") as fd:
            fd.write(str(site_ad))
        id = '%d.%d' % (ad['ClusterId'], ad['ProcId'])
        # Assigning an empty Python list to a scratch attribute yields an
        # empty ClassAd list expression, which is then used to clear
        # CRAB_ResubmitList in the schedd.
        ad['foo'] = []
        try:
            htcondor.Schedd().edit([id], 'CRAB_ResubmitList', ad['foo'])
        except RuntimeError as reerror:
            print "ERROR: %s" % str(reerror)

if __name__ == '__main__':
    main()