Example #1
 def test_warnings(self):
     with warnings.catch_warnings(record=True) as w:
         warnings.simplefilter("always")
         classad.parseOld("foo = 1\nbar = 2")
         self.assertEqual(len(w), 1)
         self.assertTrue(issubclass(w[-1].category, DeprecationWarning))
         self.assertTrue("deprecated" in str(w[-1].message))
Example #3
def main():
    """
    Need a doc string here.
    """
    ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD']))
    makeWebDir(ad)

    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries*20)
        retries += 1

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = set(ad['CRAB_ResubmitList'])
        dagJobId = '%d.%d' % (ad['ClusterId'], ad['ProcId'])
        ad['foo'] = []
        try:
            htcondor.Schedd().edit([dagJobId], 'CRAB_ResubmitList', ad['foo'])
        except RuntimeError, reerror:
            print "ERROR: %s" % str(reerror)
Example #4
def main():
    ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD']))
    make_webdir(ad)
    make_job_submit(ad)

    retries = 0
    exit_code = 1
    while retries < 3 and exit_code != 0:
        exit_code = updatewebdir(ad)
        if exit_code != 0:
            time.sleep(retries * 20)
        retries += 1

    clear_automatic_blacklist(ad)

    blacklist = set()
    if 'CRAB_SiteBlacklist' in ad:
        blacklist = set(ad['CRAB_SiteBlacklist'])

    whitelist = set()
    if 'CRAB_SiteWhitelist' in ad:
        whitelist = set(ad['CRAB_SiteWhitelist'])

    resubmit = []
    if 'CRAB_ResubmitList' in ad:
        resubmit = set(ad['CRAB_ResubmitList'])
        id = '%d.%d' % (ad['ClusterId'], ad['ProcId'])
        ad['foo'] = []
        try:
            htcondor.Schedd().edit([id], 'CRAB_ResubmitList', ad['foo'])
        except RuntimeError, reerror:
            print "ERROR: %s" % str(reerror)
Example #5
    def get_job_ad_from_condor_q(self):
        """
        Need a doc string here.
        """
        if self.dag_clusterid == -1:
            return

        shutil.copy("job_log", "job_log.%s" % str(self.dag_jobid))

        p = subprocess.Popen([
            "condor_q", "-debug", "-l", "-userlog",
            "job_log.%s" % str(self.dag_jobid),
            str(self.dag_jobid)
        ],
                             stdout=subprocess.PIPE,
                             stderr=sys.stderr)
        output, _ = p.communicate()
        status = p.returncode

        try:
            os.unlink("job_log.%s" % str(self.dag_jobid))
        except:
            pass

        if status:
            raise FatalError("Failed to query condor user log:\n%s" % output)

        for text_ad in output.split("\n\n"):
            try:
                ad = classad.parseOld(text_ad)
            except SyntaxError:
                continue
            if ad:
                self.ads.append(ad)
        self.ad = self.ads[-1]
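Splitting the condor_q -l output on blank lines and calling parseOld() per block works, but newer bindings can iterate all ads in one pass; a sketch, assuming classad.parseAds() is available (bindings 8.6+):

import classad

# Two old-format ads separated by a blank line, as condor_q -l prints them.
output = 'JobStatus = 4\nClusterId = 1\n\nJobStatus = 2\nClusterId = 2\n'
ads = list(classad.parseAds(output, parser=classad.Parser.Old))
assert len(ads) == 2 and ads[1]['ClusterId'] == 2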
Example #6
 def get_job_ad_from_file(self):
     """
     Need a doc string here
     """
     self.ads.append(self.ad)
     if self.dag_retry == 0:
         msg = "This is job retry number 0. Will not try to search and load previous job ads."
         self.logger.info(msg)
         return
     for dag_retry in range(self.dag_retry):
         job_ad_file = os.path.join(".", "finished_jobs",
                                    "job.%s.%d" % (self.job_id, dag_retry))
         if os.path.isfile(job_ad_file):
             try:
                 with open(job_ad_file) as fd:
                     ad = classad.parseOld(fd)
             except Exception:
                 msg = "Unable to parse classads from file %s. Continuing." % (
                     job_ad_file)
                 self.logger.warning(msg)
                 continue
             if ad:
                 self.ads.append(ad)
         else:
             msg = "File %s does not exist. Continuing." % (job_ad_file)
             self.logger.warning(msg)
Example #7
    def get_job_ad(self):
        try:
            cluster = int(self.cluster.split(".")[0])
            if cluster == -1:
                return
        except ValueError:
            pass

        shutil.copy("job_log", "job_log.%s" % str(self.cluster))

        p = subprocess.Popen(["condor_q", "-debug", "-l", "-userlog", "job_log.%s" % str(self.cluster), str(self.cluster)], stdout=subprocess.PIPE, stderr=sys.stderr)
        output, _ = p.communicate()
        status = p.returncode

        try:
            os.unlink("job_log.%s" % str(self.cluster))
        except:
            pass

        if status:
            raise FatalError("Failed to query condor user log:\n%s" % output)
        self.ads = []
        for text_ad in output.split("\n\n"):
            try:
                ad = classad.parseOld(text_ad)
            except SyntaxError:
                continue
            if ad:
                self.ads.append(ad)
        self.ad = self.ads[-1]
        if 'JOBGLIDEIN_CMSSite' in self.ad:
            self.site = self.ad['JOBGLIDEIN_CMSSite']
Example #8
    def get_job_ad_from_condor_q(self):
        """
        Need a doc string here.
        """
        if self.dag_clusterid == -1:
            return

        shutil.copy("job_log", "job_log.%s" % str(self.dag_jobid))

        p = subprocess.Popen(
            ["condor_q", "-debug", "-l", "-userlog", "job_log.%s" % str(self.dag_jobid), str(self.dag_jobid)],
            stdout=subprocess.PIPE,
            stderr=sys.stderr,
        )
        output, _ = p.communicate()
        status = p.returncode

        try:
            os.unlink("job_log.%s" % str(self.dag_jobid))
        except:
            pass

        if status:
            raise FatalError("Failed to query condor user log:\n%s" % output)

        for text_ad in output.split("\n\n"):
            try:
                ad = classad.parseOld(text_ad)
            except SyntaxError:
                continue
            if ad:
                self.ads.append(ad)
        self.ad = self.ads[-1]
Example #10
 def get_task_ad(self):
     """
     Need a doc string here.
     """
     self.task_ad = {}
     try:
         self.task_ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD']))
     except:
         msg = "Got exception while trying to parse the job ad."
         self.logger.exception(msg)
Example #11
 def get_task_ad(self):
     """
     Need a doc string here.
     """
     self.task_ad = {}
     try:
         self.logger.info("Loading classads from: %s" %
                          os.environ['_CONDOR_JOB_AD'])
         self.task_ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD']))
     except:
         msg = "Got exception while trying to parse the job ad."
         self.logger.exception(msg)
Example #12
 def test_old_classad(self):
     ad = classad.parseOld(open("tests/test.old.ad"))
     contents = open("tests/test.old.ad").read()
     keys = []
     for line in contents.splitlines():
         info = line.split(" = ")
         if len(info) != 2:
             continue
         self.assertTrue(info[0] in ad)
         self.assertEqual(ad.lookup(info[0]).__repr__(), info[1])
         keys.append(info[0])
     for key in ad:
         self.assertTrue(key in keys)
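The test above relies on lookup() returning the unevaluated ExprTree, whose repr() matches the right-hand side text of each `attr = value` line. A short sketch of that distinction, with assumed ad contents:

import classad

# Sketch: lookup() keeps the raw expression, eval() forces evaluation.
ad = classad.parseOld('expr = 2 + 2\nname = "foo"')
print(repr(ad.lookup("expr")))   # the raw expression: 2 + 2
print(ad.eval("expr"))           # the evaluated value: 4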
Example #13
 def get_task_ad(self):
     """
     Need a doc string here.
     """
     self.task_ad = {}
     try:
         self.logger.info("Loading classads from: %s" % os.environ['_CONDOR_JOB_AD'])
         self.task_ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD']))
         self.logger.info(os.listdir('.'))
         self.logger.info(str(self.task_ad))
     except:
         msg = "Got exception while trying to parse the job ad."
         self.logger.exception(msg)
Example #15
def main():
    """
    Need a doc string here.
    """
    ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD']))
    makeWebDir(ad)

    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries*20)
        retries += 1

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True
    if resubmitJobIds:
        adjustedJobIds = []
        if hasattr(htcondor, 'lock'):
            # While dagman is not running at this point, the schedd may be writing events to this
            # file; hence, we only edit the file while holding an appropriate lock.
            # Note this lock method didn't exist until 8.1.6; prior to this, we simply
            # run dangerously.
            with htcondor.lock(open("RunJobs.dag.nodes.log", 'a'), htcondor.LockType.WriteLock) as lock:
                adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        else:
            adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))
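Example #15 guards the log edit with hasattr(htcondor, 'lock') because the lock context manager only appeared in bindings 8.1.6. The pattern can be factored into a helper; a sketch under the same assumption (the helper name is hypothetical):

import htcondor

def with_optional_lock(filename, action):
    # Hold a write lock while editing if the bindings provide htcondor.lock
    # (8.1.6+); otherwise run unlocked, as the original comment describes.
    if hasattr(htcondor, "lock"):
        with htcondor.lock(open(filename, "a"), htcondor.LockType.WriteLock):
            return action()
    return action()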
Example #16
    def parseHistoryFile(self, historyFile):
        xml_out = {}
        with open(historyFile) as fd:
            job_ad = classad.parseOld(fd)
            for key in job_ad.keys():
                temp = ''
                try:
                    temp = str(int(job_ad[key])) # force boolean values to be converted to integer
                except:
                    temp = job_ad[key]
                    if key not in ['GlobalJobId', 'TaskType', 'Owner', 'CRAB_ReqName', 'CRAB_JobSW', 'CRAB_AsyncDest', 'MATCH_EXP_JOB_GLIDEIN_Entry_Name', 'MATCH_EXP_JOB_GLIDEIN_CMSSite']:
                        continue
                xml_out[key] = temp

        if len(xml_out) > 0:
            self.xmlBuffer = xml_out
            self.totalParsed += 1
            return 1

        return 0
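The int() call in Example #16 is a coercion trick: boolean and numeric attribute values convert cleanly, while string attributes raise and fall through to the except branch, where only the whitelisted keys are kept. A sketch of that behaviour, with assumed ad contents:

import classad

ad = classad.parseOld('IsOK = true\nJobStatus = 4\nOwner = "alice"')
print(int(ad['IsOK']))       # 1: boolean forced to integer
print(int(ad['JobStatus']))  # 4: already an integer
# int(ad['Owner']) raises ValueError, taking the except path above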
Example #17
    def get_job_ad(self):
        try:
            cluster = int(self.cluster.split(".")[0])
            if cluster == -1:
                return
        except ValueError:
            pass

        shutil.copy("job_log", "job_log.%s" % str(self.cluster))

        p = subprocess.Popen([
            "condor_q", "-debug", "-l", "-userlog",
            "job_log.%s" % str(self.cluster),
            str(self.cluster)
        ],
                             stdout=subprocess.PIPE,
                             stderr=sys.stderr)
        output, _ = p.communicate()
        status = p.returncode

        try:
            os.unlink("job_log.%s" % str(self.cluster))
        except:
            pass

        if status:
            raise FatalError("Failed to query condor user log:\n%s" % output)
        self.ads = []
        for text_ad in output.split("\n\n"):
            try:
                ad = classad.parseOld(text_ad)
            except SyntaxError:
                continue
            if ad:
                self.ads.append(ad)
        self.ad = self.ads[-1]
        if 'JOBGLIDEIN_CMSSite' in self.ad:
            self.site = self.ad['JOBGLIDEIN_CMSSite']
Example #18
 def get_job_ad_from_file(self):
     """
     Need a doc string here
     """
     self.ads = []
     self.ads.append(self.ad)
     if self.crab_retry == 0:
         print 'Job is retry num 0. Will not try to search and load previous job ads.'
         return
     for crab_retry in range(1, int(self.crab_retry + 1)):
         job_ad_file = "./finished_jobs/job.%d.%d" % (self.job_id, crab_retry)
         if os.path.isfile(job_ad_file):
             with open(job_ad_file, "r") as fd:
                 text_ad = fd.read()
             try:
                 ad = classad.parseOld(text_ad)
             except SyntaxError:
                 print 'Unable to parse classads from file %s' % job_ad_file
                 continue
             if ad:
                 self.ads.append(ad)
         else:
             print 'File %s does not exist. Continuing' % job_ad_file
Example #19
 def get_job_ad_from_file(self):
     """
     Need a doc string here
     """
     self.ads.append(self.ad)
     if self.dag_retry == 0:
         msg = "This is job retry number 0. Will not try to search and load previous job ads."
         self.logger.info(msg)
         return
     for dag_retry in range(self.dag_retry):
         job_ad_file = os.path.join(".", "finished_jobs", "job.%d.%d" % (self.job_id, dag_retry))
         if os.path.isfile(job_ad_file):
             try:
                 with open(job_ad_file) as fd:
                     ad = classad.parseOld(fd)
             except Exception:
                 msg = "Unable to parse classads from file %s. Continuing." % (job_ad_file)
                 self.logger.warning(msg)
                 continue
             if ad:
                 self.ads.append(ad)
         else:
             msg = "File %s does not exist. Continuing." % (job_ad_file)
             self.logger.warning(msg)
Example #20
def main():
    opts = parse_opts()
    config = get_config()
    setup_log(level=config["log_level"], logfile=config["log_file"], syslog_facility=config["syslog_facility"], debug=opts.debug)

    route_ad = classad.ClassAd(sys.stdin.readline())
    #logger.debug("Route Ad: %s", route_ad.__str__())
    separator_line = sys.stdin.readline()
    try:
        assert separator_line == "------\n"
    except AssertionError:
        logger.error("Separator line was not second line of STDIN")
        return(FAILURE)
    try:
        ad = classad.parseOld(sys.stdin)
    except SyntaxError:
        logger.error("Unable to parse classad")
        return(FAILURE)
    # try:
    #     ad = classad.parse(input_classad)
    # except SyntaxError:
    #     try:
    #         ad = classad.parseOld(input_classad)
    #     except SyntaxError:
    #         logger.error("Unable to parse classad")
    #         return(FAILURE)

    # Set some variables based on incoming job ad
    jobid = "%s.%s" % (ad["ClusterId"], ad["ProcId"])

    # Perform transformations normally done by condor when a hook is not used
    # The version that fixes this is not yet defined so comparing against 9.9.9
    condor_version = classad.version()
    if StrictVersion(condor_version) < StrictVersion('9.9.9'):
        vanillaToGrid(ad, route_ad)

    # Test if job is a pilot
    #if "x509UserProxyFirstFQAN" in ad and "/local/Role=pilot" in ad.eval("x509UserProxyFirstFQAN"):
    if "x509UserProxyFirstFQAN" in ad and "/Role=pilot" in ad.eval("x509UserProxyFirstFQAN"):
        logger.debug("Job=%s x509UserProxyFirstFQAN='%s' is a pilot", jobid, ad["x509UserProxyFirstFQAN"])
        pilot_job = True
    else:
        logger.debug("Job=%s x509UserProxyFirstFQAN='%s' is not a pilot", jobid, ad.get("x509UserProxyFirstFQAN", "None"))
        pilot_job = False

    # TEST
    #if ad["Owner"] == "treydock":
    #    logger.error("Job=%s Invalid. Reason='TEST', setting JobStatus=5.", jobid)
    #    ad["JobStatus"] = 5
    #    ad["SITELocalUser"] = False
    #    ad["HoldReason"] = "Job invalid - TEST"
    #    print ad.printOld()
    #    return(SUCCESS)
    # END TEST

    # If not a pilot then return unmodified ad
    if not pilot_job:
        logger.debug("Job=%s is not a pilot job, returning ad", jobid)
        print ad.printOld()
        return(SUCCESS)

    # If owner or route are in ignore_users or ignore_routes then return unmodified ad
    if config["ignore_users"] and ad["owner"] in config["ignore_users"]:
        logger.debug("Job=%s Owner=%s is in ignore_users list, returning ad", jobid, ad["owner"])
        print ad.printOld()
        return(SUCCESS)
    if config["ignore_routes"] and route_ad["name"] in config["ignore_routes"]:
        logger.debug("Job=%s Route=%s is in ignore_routes list, returning ad", jobid, route_ad["name"])
        print ad.printOld()
        return(SUCCESS)

    # Get pending requests data
    pending_requests = get_pending_requests(data_file=config["user_requests_json"])

    # If unable to determine pending requests, mark job invalid
    if not pending_requests or "idle" not in pending_requests or "users" not in pending_requests:
        return(mark_job_invalid(ad=ad, jobid=jobid, reason="pending requests missing required data"))
    # If no idle users defined, mark job invalid
    idle_users = pending_requests["idle"]
    if not idle_users:
        return(mark_job_invalid(ad=ad, jobid=jobid, reason="pending requests contains no idle users"))
    # If no idle user DNs, mark job invalid
    pending_user_dns = pending_requests["users"]
    if not pending_user_dns:
        return(mark_job_invalid(ad=ad, jobid=jobid, reason="pending requests contains no user DNs"))

    # Get all users with idle jobs
    pending_users = {}
    for user, idle in idle_users.iteritems():
        if idle != 0:
            pending_users[user] = idle

    # If no pending user jobs, mark job invalid
    if not pending_users:
        return(mark_job_invalid(ad=ad, jobid=jobid, reason="no pending user jobs found"))

    # Determine which user to assign to the pilot
    # Priority: user with most idle jobs
    pending_user = sorted(pending_users, key=pending_users.get, reverse=True)[0]
    logger.debug("Pending users:\n%s", json.dumps(pending_users))
    logger.debug("Job=%s selected user to run job name=%s idle=%s", jobid, pending_user, pending_users[pending_user])

    # If the DN can't be found in the pending request JSON, job is invalid
    pending_user_dn = pending_user_dns.get(pending_user)
    if not pending_user_dn:
        return(mark_job_invalid(ad=ad, jobid=jobid, reason="unable to find pending user DN"))

    # The idle user selected is a CERN username, we need to map the associated DN to find local user
    local_grid_map = get_local_grid_map(dn=pending_user_dn, grid_mapfile=config["grid_mapfile"])
    if not local_grid_map:
        return(mark_job_invalid(ad=ad, jobid=jobid, reason="unable to get local gridmap information for DN='%s'" % pending_user_dn))
    new_owner = local_grid_map["username"]

    # Set USER_DN environment variable to new owner's DN
    if not ad["environment"] or ad["environment"] == "":
        new_environment = "USER_DN='%s'" % local_grid_map["dn"]
    else:
        new_environment = ad["environment"] + " USER_DN='%s'" % local_grid_map["dn"]

    # Get location of spooled files and change ownership
    #if "Iwd" in ad.keys():
    #    iwd = ad["Iwd"]
    #    if os.path.isdir(iwd):
    #        _pwd = pwd.getpwnam(new_owner)
    #        _uid = _pwd.pw_uid
    #        _gid = _pwd.pw_gid
    #        logger.debug("Modify permissions for Job=%s Set uid=%s gid=%s Iwd=%s", jobid, _uid, _gid, iwd)
    #        chown_wrapper_cmd = [
    #            os.path.join(os.path.dirname(os.path.realpath(__file__)), "chown_iwd"), str(_uid), str(_gid), iwd
    #        ]
    #        chown_wrapper_exit_code = subprocess.call(chown_wrapper_cmd)
    #        if chown_wrapper_exit_code != 0:
    #            return(mark_job_invalid(ad=ad, jobid=jobid, reason="chown wrapper failed with exit code %s" % chown_wrapper_exit_code))

    # Hack to replace arguments with values we can use
    #if "Arguments" in ad.keys():
    #    job_arguments = ad["Arguments"]
    #    if "-param_GLIDEIN_Glexec_Use OPTIONAL" in job_arguments:
    #        new_job_arguments = job_arguments.replace("-param_GLIDEIN_Glexec_Use OPTIONAL", "-param_GLIDEIN_Glexec_Use NEVER")
    #        logger.info("Update Job=%s set Arguments='%s'", jobid, new_job_arguments)
    #        ad["Arguments"] = new_job_arguments

    # Define remote_cerequirements to pass to submit script

    # Set new ad values
    logger.info("Update Job=%s set Owner=%s", jobid, new_owner)
    logger.info("Update Job=%s set Environment=\"%s\"", jobid, new_environment)
    ad["owner"] = new_owner
    ad["environment"] = new_environment

    #logger.debug("Route Ad:\n%s", route_ad.__str__())
    #logger.debug("Class Ad:\n%s", ad.printOld())
    print ad.printOld()
    return(SUCCESS)
Example #21
def bootstrap():
    print("Entering TaskManagerBootstrap with args: %s" % sys.argv)
    command = sys.argv[1]
    if command == "POSTJOB":
        return PostJob.PostJob().execute(*sys.argv[2:])
    elif command == "PREJOB":
        return PreJob.PreJob().execute(*sys.argv[2:])
    elif command == "PREDAG":
        return PreDAG.PreDAG().execute(*sys.argv[2:])

    infile, outfile = sys.argv[2:]

    adfile = os.environ["_CONDOR_JOB_AD"]
    print("Parsing classad")
    with open(adfile, "r") as fd:
        ad = classad.parseOld(fd)
    print("..done")
    in_args = []
    if infile != "None":
        with open(infile, "r") as fd:
            in_args = pickle.load(fd)

    config = Configuration.Configuration()
    config.section_("Services")
    config.Services.DBSUrl = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter/'

    ad['tm_taskname'] = ad.eval("CRAB_Workflow")
    ad['tm_split_algo'] = ad.eval("CRAB_SplitAlgo")
    ad['tm_dbs_url'] = ad.eval("CRAB_DBSURL")
    ad['tm_input_dataset'] = ad.eval("DESIRED_CMSDataset")
    ad['tm_outfiles'] = HTCondorUtils.unquote(
        ad.eval("CRAB_AdditionalOutputFiles"))
    ad['tm_tfile_outfiles'] = HTCondorUtils.unquote(
        ad.eval("CRAB_TFileOutputFiles"))
    ad['tm_edm_outfiles'] = HTCondorUtils.unquote(
        ad.eval("CRAB_EDMOutputFiles"))
    ad['tm_site_whitelist'] = HTCondorUtils.unquote(
        ad.eval("CRAB_SiteWhitelist"))
    ad['tm_site_blacklist'] = HTCondorUtils.unquote(
        ad.eval("CRAB_SiteBlacklist"))
    ad['tm_job_type'] = 'Analysis'
    print("TaskManager got this raw ad")
    print(ad)
    pure_ad = {}
    for key in ad:
        try:
            pure_ad[key] = ad.eval(key)
            if isinstance(pure_ad[key], classad.Value):
                del pure_ad[key]
            if isinstance(pure_ad[key], list):
                pure_ad[key] = [i.eval() for i in pure_ad[key]]
        except:
            pass
    ad = pure_ad
    ad['CRAB_AlgoArgs'] = json.loads(ad["CRAB_AlgoArgs"])
    ad['tm_split_args'] = ad["CRAB_AlgoArgs"]
    ad['tarball_location'] = os.environ.get('CRAB_TARBALL_LOCATION', '')
    print("TaskManagerBootstrap got this ad:")
    pprint.pprint(ad)

    results = task.execute(in_args, task=ad).result

    print(results)
    with open(outfile, "w") as fd:
        pickle.dump(results, fd)

    return 0
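bootstrap() above distinguishes ad[key], which yields the stored ExprTree, from ad.eval(key), which forces evaluation to a plain Python value (and lets the code drop attributes that evaluate to classad.Value such as Undefined). A minimal sketch of that difference:

import classad

ad = classad.ClassAd()
ad["x"] = classad.ExprTree("1 + 1")
print(ad["x"])        # the unevaluated expression: 1 + 1
print(ad.eval("x"))   # the evaluated Python value: 2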
Example #22
def main():
    opts = parse_opts()
    config = get_config()
    setup_log(level=config["log_level"],
              logfile=config["log_file"],
              syslog_facility=config["syslog_facility"],
              debug=opts.debug)

    route_ad = classad.ClassAd(sys.stdin.readline())
    #logger.debug("Route Ad: %s", route_ad.__str__())
    separator_line = sys.stdin.readline()
    try:
        assert separator_line == "------\n"
    except AssertionError:
        logger.error("Separator line was not second line of STDIN")
        return (FAILURE)
    try:
        ad = classad.parseOld(sys.stdin)
    except SyntaxError:
        logger.error("Unable to parse classad")
        return (FAILURE)
    # try:
    #     ad = classad.parse(input_classad)
    # except SyntaxError:
    #     try:
    #         ad = classad.parseOld(input_classad)
    #     except SyntaxError:
    #         logger.error("Unable to parse classad")
    #         return(FAILURE)

    # Set some variables based on incoming job ad
    jobid = "%s.%s" % (ad["ClusterId"], ad["ProcId"])

    # Perform transformations normally done by condor when a hook is not used
    # The version that fixes this is not yet defined so comparing against 9.9.9
    condor_version = classad.version()
    if StrictVersion(condor_version) < StrictVersion('9.9.9'):
        vanillaToGrid(ad, route_ad)

    # Test if job is a pilot
    #if "x509UserProxyFirstFQAN" in ad and "/local/Role=pilot" in ad.eval("x509UserProxyFirstFQAN"):
    if "x509UserProxyFirstFQAN" in ad and "/Role=pilot" in ad.eval(
            "x509UserProxyFirstFQAN"):
        logger.debug("Job=%s x509UserProxyFirstFQAN='%s' is a pilot", jobid,
                     ad["x509UserProxyFirstFQAN"])
        pilot_job = True
    else:
        logger.debug("Job=%s x509UserProxyFirstFQAN='%s' is not a pilot",
                     jobid, ad.get("x509UserProxyFirstFQAN", "None"))
        pilot_job = False

    # TEST
    #if ad["Owner"] == "treydock":
    #    logger.error("Job=%s Invalid. Reason='TEST', setting JobStatus=5.", jobid)
    #    ad["JobStatus"] = 5
    #    ad["SITELocalUser"] = False
    #    ad["HoldReason"] = "Job invalid - TEST"
    #    print ad.printOld()
    #    return(SUCCESS)
    # END TEST

    # If not a pilot then return unmodified ad
    if not pilot_job:
        logger.debug("Job=%s is not a pilot job, returning ad", jobid)
        print ad.printOld()
        return (SUCCESS)

    # If owner or route are in ignore_users or ignore_routes then return unmodified ad
    if config["ignore_users"] and ad["owner"] in config["ignore_users"]:
        logger.debug("Job=%s Owner=%s is in ignore_users list, returning ad",
                     jobid, ad["owner"])
        print ad.printOld()
        return (SUCCESS)
    if config["ignore_routes"] and route_ad["name"] in config["ignore_routes"]:
        logger.debug("Job=%s Route=%s is in ignore_routes list, returning ad",
                     jobid, route_ad["name"])
        print ad.printOld()
        return (SUCCESS)

    # Get pending requests data
    pending_requests = get_pending_requests(
        data_file=config["user_requests_json"])

    # If unable to determine pending requests, mark job invalid
    if not pending_requests or "idle" not in pending_requests or "users" not in pending_requests:
        return (mark_job_invalid(
            ad=ad,
            jobid=jobid,
            reason="pending requests missing required data"))
    # If no idle users defined, mark job invalid
    idle_users = pending_requests["idle"]
    if not idle_users:
        return (mark_job_invalid(
            ad=ad,
            jobid=jobid,
            reason="pending requests contains no idle users"))
    # If no idle user DNs, mark job invalid
    pending_user_dns = pending_requests["users"]
    if not pending_user_dns:
        return (mark_job_invalid(
            ad=ad, jobid=jobid,
            reason="pending requests contains no user DNs"))

    # Get all users with idle jobs
    pending_users = {}
    for user, idle in idle_users.iteritems():
        if idle != 0:
            pending_users[user] = idle

    # If no pending user jobs, mark job invalid
    if not pending_users:
        return (mark_job_invalid(ad=ad,
                                 jobid=jobid,
                                 reason="no pending user jobs found"))

    # Determine which user to assign to the pilot
    # Priority: user with most idle jobs
    pending_user = sorted(pending_users, key=pending_users.get,
                          reverse=True)[0]
    logger.debug("Pending users:\n%s", json.dumps(pending_users))
    logger.debug("Job=%s selected user to run job name=%s idle=%s", jobid,
                 pending_user, pending_users[pending_user])

    # If the DN can't be found in the pending request JSON, job is invalid
    pending_user_dn = pending_user_dns.get(pending_user)
    if not pending_user_dn:
        return (mark_job_invalid(ad=ad,
                                 jobid=jobid,
                                 reason="unable to find pending user DN"))

    # The idle user selected is a CERN username, we need to map the associated DN to find local user
    local_grid_map = get_local_grid_map(dn=pending_user_dn,
                                        grid_mapfile=config["grid_mapfile"])
    if not local_grid_map:
        return (mark_job_invalid(
            ad=ad,
            jobid=jobid,
            reason="unable to get local gridmap information for DN='%s'" %
            pending_user_dn))
    new_owner = local_grid_map["username"]

    # Set USER_DN environment variable to new owner's DN
    if not ad["environment"] or ad["environment"] == "":
        new_environment = "USER_DN='%s'" % local_grid_map["dn"]
    else:
        new_environment = ad[
            "environment"] + " USER_DN='%s'" % local_grid_map["dn"]

    # Get location of spooled files and change ownership
    #if "Iwd" in ad.keys():
    #    iwd = ad["Iwd"]
    #    if os.path.isdir(iwd):
    #        _pwd = pwd.getpwnam(new_owner)
    #        _uid = _pwd.pw_uid
    #        _gid = _pwd.pw_gid
    #        logger.debug("Modify permissions for Job=%s Set uid=%s gid=%s Iwd=%s", jobid, _uid, _gid, iwd)
    #        chown_wrapper_cmd = [
    #            os.path.join(os.path.dirname(os.path.realpath(__file__)), "chown_iwd"), str(_uid), str(_gid), iwd
    #        ]
    #        chown_wrapper_exit_code = subprocess.call(chown_wrapper_cmd)
    #        if chown_wrapper_exit_code != 0:
    #            return(mark_job_invalid(ad=ad, jobid=jobid, reason="chown wrapper failed with exit code %s" % chown_wrapper_exit_code))

    # Hack to replace arguments with values we can use
    #if "Arguments" in ad.keys():
    #    job_arguments = ad["Arguments"]
    #    if "-param_GLIDEIN_Glexec_Use OPTIONAL" in job_arguments:
    #        new_job_arguments = job_arguments.replace("-param_GLIDEIN_Glexec_Use OPTIONAL", "-param_GLIDEIN_Glexec_Use NEVER")
    #        logger.info("Update Job=%s set Arguments='%s'", jobid, new_job_arguments)
    #        ad["Arguments"] = new_job_arguments

    # Define remote_cerequirements to pass to submit script

    # Set new ad values
    logger.info("Update Job=%s set Owner=%s", jobid, new_owner)
    logger.info("Update Job=%s set Environment=\"%s\"", jobid, new_environment)
    ad["owner"] = new_owner
    ad["environment"] = new_environment

    #logger.debug("Route Ad:\n%s", route_ad.__str__())
    #logger.debug("Class Ad:\n%s", ad.printOld())
    print ad.printOld()
    return (SUCCESS)
Example #23
 def getLastHistory(self, cluster):
     fd = os.popen("condor_history -match 1 -l %d" % cluster)
     ad = classad.parseOld(fd.read()[:-1])
     self.assertFalse(fd.close())
     return ad
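getLastHistory() shells out with os.popen and strips the trailing newline by hand; a sketch of the same condor_history query via subprocess (same flags, hypothetical helper name):

import subprocess
import classad

def get_last_history(cluster):
    # -match 1 limits output to one job; -l prints the full (long) ad.
    out = subprocess.check_output(
        ["condor_history", "-match", "1", "-l", str(cluster)])
    return classad.parseOld(out.strip())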
Example #24
 def get_task_ad(self):
     self.task_ad = {}
     try:
         self.task_ad = classad.parseOld(open(os.environ['_CONDOR_JOB_AD']))
     except Exception:
         print traceback.format_exc()
Example #25
def bootstrap():
    print("Entering TaskManagerBootstrap with args: %s" % sys.argv)
    command = sys.argv[1]
    if command == "POSTJOB":
        return PostJob.PostJob().execute(*sys.argv[2:])
    elif command == "PREJOB":
        return PreJob.PreJob().execute(*sys.argv[2:])
    elif command == "PREDAG":
        return PreDAG.PreDAG().execute(*sys.argv[2:])

    infile, outfile = sys.argv[2:]

    adfile = os.environ["_CONDOR_JOB_AD"]
    print("Parsing classad")
    with open(adfile, "r") as fd:
        ad = classad.parseOld(fd)
    print("..done")
    in_args = []
    if infile != "None":
        with open(infile, "r") as fd:
            in_args = pickle.load(fd)

    config = Configuration.Configuration()
    config.section_("Services")
    config.Services.DBSUrl = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter/'
    
    ad['tm_taskname'] = ad.eval("CRAB_Workflow")
    ad['tm_split_algo'] = ad.eval("CRAB_SplitAlgo")
    ad['tm_dbs_url'] = ad.eval("CRAB_DBSURL")
    ad['tm_input_dataset'] = ad.eval("DESIRED_CMSDataset")
    ad['tm_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_AdditionalOutputFiles"))
    ad['tm_tfile_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_TFileOutputFiles"))
    ad['tm_edm_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_EDMOutputFiles"))
    ad['tm_site_whitelist'] = HTCondorUtils.unquote(ad.eval("CRAB_SiteWhitelist"))
    ad['tm_site_blacklist'] = HTCondorUtils.unquote(ad.eval("CRAB_SiteBlacklist"))
    ad['tm_job_type'] = 'Analysis'
    print("TaskManager got this raw ad")
    print(ad)
    pure_ad = {}
    for key in ad:
        try:
            pure_ad[key] = ad.eval(key)
            if isinstance(pure_ad[key], classad.Value):
                del pure_ad[key]
            if isinstance(pure_ad[key], list):
                pure_ad[key] = [i.eval() for i in pure_ad[key]]
        except:
            pass
    ad = pure_ad
    ad['CRAB_AlgoArgs'] = json.loads(ad["CRAB_AlgoArgs"])
    ad['tm_split_args'] = ad["CRAB_AlgoArgs"]
    ad['tarball_location'] = os.environ.get('CRAB_TARBALL_LOCATION', '')
    print("TaskManagerBootstrap got this ad:")
    pprint.pprint(ad)

    results = task.execute(in_args, task=ad).result

    print(results)
    with open(outfile, "w") as fd:
        pickle.dump(results, fd)

    return 0
Example #26
 def test_old_classad(self):
     ad = classad.parseOld(open("tests/test.old.ad"))
     contents = open("tests/test.old.ad").read()
     self.assertEqual(ad.printOld(), contents)
Example #28
def main():
    """
    Need a doc string here.
    """
    setupLog()

    if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists(os.environ["_CONDOR_JOB_AD"]):
        printLog("Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist")
        sys.exit(0)

    printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" % os.environ['_CONDOR_JOB_AD'])

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)

    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries * 20)
        retries += 1

    if exitCode != 0:
        printLog("Exiting AdjustSites because the webdir upload failed three times.")
        sys.exit(1)

    printLog("Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode)

    saveProxiedWebdir(ad)

    printLog("Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions")

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True

    # Hold and release processing and tail DAGs here so that modifications
    # to the submission and log files will be picked up.
    schedd = htcondor.Schedd()
    tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote(ad.get("CRAB_ReqName"))
    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Holding processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGKILL')
        schedd.act(htcondor.JobAction.Hold, tailconst)

    if resubmitJobIds:
        adjustedJobIds = []
        filenames = getGlob(ad, "RunJobs.dag.nodes.log", "RunJobs[1-9]*.subdag.nodes.log")
        for fn in filenames:
            if hasattr(htcondor, 'lock'):
                # While dagman is not running at this point, the schedd may be writing events to this
                # file; hence, we only edit the file while holding an appropriate lock.
                # Note this lock method didn't exist until 8.1.6; prior to this, we simply
                # run dangerously.
                with htcondor.lock(open(fn, 'a'), htcondor.LockType.WriteLock):
                    adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn))
            else:
                adjustedJobIds.extend(adjustPostScriptExitStatus(resubmitJobIds, fn))
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Releasing processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1')
        schedd.act(htcondor.JobAction.Release, tailconst)

    printLog("Exiting AdjustSite")
Example #29
 def getLastHistory(self, cluster):
     fd = os.popen("condor_history -match 1 -l %d" % cluster)
     ad = classad.parseOld(fd.read()[:-1])
     self.assertFalse(fd.close())
     return ad
Example #30
def bootstrap():
    print "Entering TaskManagerBootstrap with args: %s" % sys.argv
    command = sys.argv[1]
    if command == "POSTJOB":
        return PostJob.PostJob().execute(*sys.argv[2:])
    elif command == "PREJOB":
        return PreJob.PreJob().execute(*sys.argv[2:])
    elif command == "FINAL":
        return Final.Final().execute(*sys.argv[2:])
    elif command == "ASO":
        return ASO.async_stageout(*sys.argv[2:])

    infile, outfile = sys.argv[2:]

    adfile = os.environ["_CONDOR_JOB_AD"]
    print "Parsing classad"
    with open(adfile, "r") as fd:
        ad = classad.parseOld(fd)
    print "..done"
    in_args = []
    if infile != "None":
        with open(infile, "r") as fd:
            in_args = pickle.load(fd)

    config = Configuration.Configuration()
    config.section_("Services")
    config.Services.DBSUrl = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter/'
    
    ad['tm_taskname'] = ad.eval("CRAB_Workflow")
    ad['tm_split_algo'] = ad.eval("CRAB_SplitAlgo")
    ad['tm_dbs_url'] = ad.eval("CRAB_DBSURL")
    ad['tm_input_dataset'] = ad.eval("CRAB_InputData")
    ad['tm_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_AdditionalOutputFiles"))
    ad['tm_tfile_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_TFileOutputFiles"))
    ad['tm_edm_outfiles'] = HTCondorUtils.unquote(ad.eval("CRAB_EDMOutputFiles"))
    ad['tm_site_whitelist'] = HTCondorUtils.unquote(ad.eval("CRAB_SiteWhitelist"))
    ad['tm_site_blacklist'] = HTCondorUtils.unquote(ad.eval("CRAB_SiteBlacklist"))
    ad['tm_job_type'] = 'Analysis'
    print "TaskManager got this raw ad"
    print ad
    pure_ad = {}
    for key in ad:
        try:
            pure_ad[key] = ad.eval(key)
            if isinstance(pure_ad[key], classad.Value):
                del pure_ad[key]
            if isinstance(pure_ad[key], types.ListType):
                pure_ad[key] = [i.eval() for i in pure_ad[key]]
        except:
            pass
    ad = pure_ad
    ad['CRAB_AlgoArgs'] = json.loads(ad["CRAB_AlgoArgs"])
    ad['tm_split_args'] = ad["CRAB_AlgoArgs"]
    ad['tarball_location'] = os.environ.get('CRAB_TARBALL_LOCATION', '')
    print "TaskManagerBootstrap got this ad:"
    pprint.pprint(ad)
    if command == "DBS":
        task = DBSDataDiscovery.DBSDataDiscovery(config)
    elif command == "SPLIT":
        task = Splitter.Splitter(config)
        print "Got this result from the splitter"
        pprint.pprint(task)
    results = task.execute(in_args, task=ad).result
    if command == "SPLIT":
        results = DagmanCreator.create_subdag(results, task=ad)

    print results
    with open(outfile, "w") as fd:
        pickle.dump(results, fd)

    return 0
Example #31
def bootstrap():
    print "Entering TaskManagerBootstrap with args: %s" % sys.argv
    command = sys.argv[1]
    if command == "POSTJOB":
        return PostJob.PostJob().execute(*sys.argv[2:])
    elif command == "PREJOB":
        return PreJob.PreJob().execute(*sys.argv[2:])
    elif command == "FINAL":
        return Final.Final().execute(*sys.argv[2:])
    elif command == "ASO":
        return ASO.async_stageout(*sys.argv[2:])

    infile, outfile = sys.argv[2:]

    adfile = os.environ["_CONDOR_JOB_AD"]
    print "Parsing classad"
    with open(adfile, "r") as fd:
        ad = classad.parseOld(fd)
    print "..done"
    in_args = []
    if infile != "None":
        with open(infile, "r") as fd:
            in_args = pickle.load(fd)

    config = Configuration.Configuration()
    config.section_("Services")
    config.Services.DBSUrl = 'https://cmsweb.cern.ch/dbs/prod/phys03/DBSWriter/'

    ad['tm_taskname'] = ad.eval("CRAB_Workflow")
    ad['tm_split_algo'] = ad.eval("CRAB_SplitAlgo")
    ad['tm_dbs_url'] = ad.eval("CRAB_DBSUrl")
    ad['tm_input_dataset'] = ad.eval("CRAB_InputData")
    ad['tm_outfiles'] = HTCondorUtils.unquote(
        ad.eval("CRAB_AdditionalOutputFiles"))
    ad['tm_tfile_outfiles'] = HTCondorUtils.unquote(
        ad.eval("CRAB_TFileOutputFiles"))
    ad['tm_edm_outfiles'] = HTCondorUtils.unquote(
        ad.eval("CRAB_EDMOutputFiles"))
    ad['tm_site_whitelist'] = HTCondorUtils.unquote(
        ad.eval("CRAB_SiteWhitelist"))
    ad['tm_site_blacklist'] = HTCondorUtils.unquote(
        ad.eval("CRAB_SiteBlacklist"))
    ad['tm_job_type'] = 'Analysis'
    print "TaskManager got this raw ad"
    print ad
    pure_ad = {}
    for key in ad:
        try:
            pure_ad[key] = ad.eval(key)
            if isinstance(pure_ad[key], classad.Value):
                del pure_ad[key]
            if isinstance(pure_ad[key], types.ListType):
                pure_ad[key] = [i.eval() for i in pure_ad[key]]
        except:
            pass
    ad = pure_ad
    ad['CRAB_AlgoArgs'] = json.loads(ad["CRAB_AlgoArgs"])
    ad['tm_split_args'] = ad["CRAB_AlgoArgs"]
    ad['tarball_location'] = os.environ.get('CRAB_TARBALL_LOCATION', '')
    print "TaskManagerBootstrap got this ad:"
    pprint.pprint(ad)
    if command == "DBS":
        task = DBSDataDiscovery.DBSDataDiscovery(config)
    elif command == "SPLIT":
        task = Splitter.Splitter(config)
        print "Got this result from the splitter"
        pprint.pprint(task)
    results = task.execute(in_args, task=ad).result
    if command == "SPLIT":
        results = DagmanCreator.create_subdag(results, task=ad)

    print results
    with open(outfile, "w") as fd:
        pickle.dump(results, fd)

    return 0
Example #32
def main():
    """
    Need a doc string here.
    """
    printLog("Starting AdjustSites")

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)

    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries * 20)
        retries += 1

    printLog(
        "Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir"
        % exitCode)

    saveProxiedWebdir(ad)

    printLog(
        "Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions"
    )

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True
    if resubmitJobIds:
        adjustedJobIds = []
        if hasattr(htcondor, 'lock'):
            # While dagman is not running at this point, the schedd may be writing events to this
            # file; hence, we only edit the file while holding an appropriate lock.
            # Note this lock method didn't exist until 8.1.6; prior to this, we simply
            # run dangerously.
            with htcondor.lock(open("RunJobs.dag.nodes.log", 'a'),
                               htcondor.LockType.WriteLock):
                adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        else:
            adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    printLog("Exiting AdjustSite")
Example #33
def main():
    """
    Need a doc string here.
    """
    setupLog()

    if '_CONDOR_JOB_AD' not in os.environ or not os.path.exists(
            os.environ["_CONDOR_JOB_AD"]):
        printLog(
            "Exiting AdjustSites since _CONDOR_JOB_AD is not in the environment or does not exist"
        )
        sys.exit(0)

    printLog("Starting AdjustSites with _CONDOR_JOB_AD=%s" %
             os.environ['_CONDOR_JOB_AD'])

    with open(os.environ['_CONDOR_JOB_AD']) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)

    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries * 20)
        retries += 1

    if exitCode != 0:
        printLog(
            "Exiting AdjustSites because the webdir upload failed three times."
        )
        sys.exit(1)

    printLog(
        "Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir"
        % exitCode)

    saveProxiedWebdir(ad)

    printLog(
        "Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions"
    )

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if 'CRAB_ResubmitList' in ad:
        resubmitJobIds = ad['CRAB_ResubmitList']
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True

    # Hold and release processing and tail DAGs here so that modifications
    # to the submission and log files will be picked up.
    schedd = htcondor.Schedd()
    tailconst = "TaskType =?= \"TAIL\" && CRAB_ReqName =?= %s" % classad.quote(
        ad.get("CRAB_ReqName"))
    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Holding processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGKILL')
        schedd.act(htcondor.JobAction.Hold, tailconst)

    if resubmitJobIds:
        adjustedJobIds = []
        filenames = getGlob(ad, "RunJobs.dag.nodes.log",
                            "RunJobs[1-9]*.subdag.nodes.log")
        for fn in filenames:
            if hasattr(htcondor, 'lock'):
                # While dagman is not running at this point, the schedd may be writing events to this
                # file; hence, we only edit the file while holding an appropriate lock.
                # Note this lock method didn't exist until 8.1.6; prior to this, we simply
                # run dangerously.
                with htcondor.lock(open(fn, 'a'), htcondor.LockType.WriteLock):
                    adjustedJobIds.extend(
                        adjustPostScriptExitStatus(resubmitJobIds, fn))
            else:
                adjustedJobIds.extend(
                    adjustPostScriptExitStatus(resubmitJobIds, fn))
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if 'CRAB_SiteAdUpdate' in ad:
        newSiteAd = ad['CRAB_SiteAdUpdate']
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    if resubmitJobIds and ad.get('CRAB_SplitAlgo') == 'Automatic':
        printLog("Releasing processing and tail DAGs")
        schedd.edit(tailconst, "HoldKillSig", 'SIGUSR1')
        schedd.act(htcondor.JobAction.Release, tailconst)

    printLog("Exiting AdjustSite")
Example #34
def getStatus():
    base_dir = os.environ.get("_CONDOR_SCRATCH_DIR", os.getcwd())
    fd = open(os.path.join(base_dir, ".machine.ad"))
    machineAd = classad.parseOld(fd)
    return htcondor.Collector().query(htcondor.AdTypes.Startd, "Name =?= %s" % machineAd.lookup("Name").__str__())[0]
Example #35
def getAd():
    # NOTE: the opening lines of this snippet were lost in extraction; the
    # assignments below are an assumed reconstruction based on later use.
    launch_time = time.time()
    ad = classad.ClassAd()
    fp = None
    try:
        fp = open(".pilot.ad")
        st = os.fstat(fp.fileno())
        ad["AD_FOUND"] = classad.ExprTree("true")
        if launch_time - st.st_mtime < 600:
            ad["AD_FRESH"] = classad.ExprTree("true")
        else:
            print "Pilot ad too old"
    except IOError, oe:
        if oe.errno == errno.ENOENT:
            print "No pilot ad available"
        else:
            raise
    if not fp:
        return ad
    pilot_ad = classad.parseOld(fp)
    for key in pilot_ad:
        if key not in ad:
            ad[key] = pilot_ad.lookup(key)
    return ad


def main():
    ad = getAd()
    global chirp_verb
    for attr in ad.keys():
        val = ad.lookup(attr)
        attr = "PILOT_" + attr
        if chirp(attr, val) and chirp_verb == "set_job_attr_delayed":
            chirp_verb = "set_job_attr"
            retval = chirp(attr, val)
Example #36
def main():
    """
    Need a doc string here.
    """
    printLog("Starting AdjustSites")

    with open(os.environ["_CONDOR_JOB_AD"]) as fd:
        ad = classad.parseOld(fd)
    printLog("Parsed ad: %s" % ad)

    makeWebDir(ad)

    printLog("Webdir has been set up. Uploading the webdir URL to the REST")

    retries = 0
    exitCode = 1
    while retries < 3 and exitCode != 0:
        exitCode = updateWebDir(ad)
        if exitCode != 0:
            time.sleep(retries * 20)
        retries += 1

    printLog("Webdir URL has been uploaded, exit code is %s. Setting the classad for the proxied webdir" % exitCode)

    saveProxiedWebdir(ad)

    printLog(
        "Proxied webdir saved. Clearing the automatic blacklist and handling RunJobs.dag.nodes.log for resubmissions"
    )

    clearAutomaticBlacklist()

    resubmitJobIds = []
    if "CRAB_ResubmitList" in ad:
        resubmitJobIds = ad["CRAB_ResubmitList"]
        try:
            resubmitJobIds = set(resubmitJobIds)
            resubmitJobIds = [str(i) for i in resubmitJobIds]
        except TypeError:
            resubmitJobIds = True
    if resubmitJobIds:
        adjustedJobIds = []
        if hasattr(htcondor, "lock"):
            # While dagman is not running at this point, the schedd may be writing events to this
            # file; hence, we only edit the file while holding an appropriate lock.
            # Note this lock method didn't exist until 8.1.6; prior to this, we simply
            # run dangerously.
            with htcondor.lock(open("RunJobs.dag.nodes.log", "a"), htcondor.LockType.WriteLock):
                adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        else:
            adjustedJobIds = adjustPostScriptExitStatus(resubmitJobIds)
        ## Adjust the maximum allowed number of retries only for the job ids for which
        ## the POST script exit status was adjusted. Why only for these job ids and not
        ## for all job ids in resubmitJobIds? Because if resubmitJobIds = True, which as
        ## a general rule means "all failed job ids", we don't have a way to know if a
        ## job is in failed status or not just from the RunJobs.dag file, while job ids
        ## in adjustedJobIds correspond only to failed jobs.
        adjustMaxRetries(adjustedJobIds, ad)

    if "CRAB_SiteAdUpdate" in ad:
        newSiteAd = ad["CRAB_SiteAdUpdate"]
        with open("site.ad") as fd:
            siteAd = classad.parse(fd)
        siteAd.update(newSiteAd)
        with open("site.ad", "w") as fd:
            fd.write(str(siteAd))

    printLog("Exiting AdjustSite")