Example #1
def get_ads_from_schedd(schedd_ad, args):
    schedd = htcondor.Schedd(schedd_ad)
    selection = args.selection or 'true'
    history_iter = schedd.history(selection, [], args.n_docs_to_query)

    job_ads = [j for j in history_iter]

    pckfile = os.path.join(args.filename)
    # If the file already exists, append the ads
    if os.path.isfile(pckfile):
        with open(pckfile, 'rb') as pfile:
            job_ads.extend(pickle.load(pfile))

    with open(pckfile, 'wb') as pfile:
        pickle.dump(job_ads, pfile, pickle.HIGHEST_PROTOCOL)

    print("   ...done, wrote %d docs to %s" % (len(job_ads), pckfile))
    return True
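A brief driver sketch for the function above (hypothetical; it assumes an argparse-style args object exposing the selection, n_docs_to_query, and filename attributes read by get_ads_from_schedd, and a pool collector reachable from the local host):

import argparse

import htcondor

parser = argparse.ArgumentParser()
parser.add_argument("--selection", default="true")        # history constraint
parser.add_argument("--n_docs_to_query", type=int, default=100)
parser.add_argument("--filename", default="history.pkl")
args = parser.parse_args()

# Run the dump for every schedd advertised in the pool.
for schedd_ad in htcondor.Collector().locateAll(htcondor.DaemonTypes.Schedd):
    get_ads_from_schedd(schedd_ad, args)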
Example #2
 def fetch_job_ads(self, query, attributes=list()):
     """
     Get CRAB schedulers from HTCondor
     """
     query = str(query)
     ads = list()
     schedulers = self.collector.locateAll(htcondor.DaemonTypes.Schedd)
     for scheduler in schedulers:
         try:
             schedd = htcondor.Schedd(scheduler)
             returned_ads = schedd.query(query, attributes)
             for ad in returned_ads:
                 ads.append(ad)
         except Exception as e:
             self.logger.warning(
                 'CRAB query failed for\nquery: %s\nattributes: %s\n    Reason:\n    %s',
                 str(query), str(attributes), str(e))
     return ads
Example #3
def error_before_hold():
    map = htmap.map(lambda x: 1 / x, [0, 1])

    schedd = htcondor.Schedd()
    cluster_id = map._cluster_ids[0]
    schedd.act(htcondor.JobAction.Hold,
               f"(ClusterID == {cluster_id}) && (ProcID == 1)")

    map.wait(holds_ok=True, errors_ok=True)

    assert map.component_statuses == [
        htmap.ComponentStatus.ERRORED,
        htmap.ComponentStatus.HELD,
    ]

    yield map

    map.remove()
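In the original test suite this generator presumably acts as a pytest fixture; a hedged sketch of how a test might consume it (the decorator registration and test name are illustrative):

import pytest

error_before_hold = pytest.fixture(error_before_hold)

def test_held_component_is_reported(error_before_hold):
    # The second component was held by schedd.act() in the fixture above.
    assert error_before_hold.component_statuses[1] == htmap.ComponentStatus.HELD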
Example #4
 def testScheddSubmitMany(self):
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     schedd = htcondor.Schedd()
     ad = classad.parseOne(open("submit.ad"))
     ads = []
     cluster = schedd.submit(ad, 10, False, ads)
     for i in range(60):
         ads = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"])
         ads = list(ads)
         if len(ads) == 0:
             break
         if i % 2 == 0:
             schedd.reschedule()
         time.sleep(1)
     self.assertEqual(open(output_file).read(), "hello world\n")
Example #5
def submit_outer_dag(
    working_dir: Path,
    source_dir: Path,
    dest_dir: Path,
    requirements: Optional[str] = None,
    unique_id: Optional[str] = None,
    test_mode: bool = False,
):

    # Only import htcondor.dags submit-side
    import htcondor.dags as dags

    working_dir = working_dir.resolve()
    dest_dir = dest_dir.resolve()

    working_dir.mkdir(parents=True, exist_ok=True)
    dest_dir.mkdir(parents=True, exist_ok=True)

    transfer_manifest_path = dest_dir / "transfer_manifest.txt"

    outer_dag = make_outer_dag(
        dest_dir,
        requirements,
        source_dir,
        test_mode,
        transfer_manifest_path,
        unique_id,
        working_dir,
    )

    if requirements:
        (working_dir / "requirements.txt").write_text(requirements)

    outer_dag_file = dags.write_dag(outer_dag,
                                    dag_dir=working_dir,
                                    dag_file_name="outer.dag")

    dag_args = {'force': 1}
    sub = htcondor.Submit.from_dag(str(outer_dag_file), dag_args)

    with change_dir(working_dir):
        schedd = htcondor.Schedd()
        with schedd.transaction() as txn:
            return sub.queue(txn)
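A minimal invocation sketch (the paths, requirements expression, and unique_id are illustrative):

from pathlib import Path

dag_job_id = submit_outer_dag(
    working_dir=Path("/scratch/xfer-work"),
    source_dir=Path("/data/source"),
    dest_dir=Path("/data/dest"),
    requirements='(Machine =!= "busy.example.org")',
    unique_id="xfer-2021-01",
    test_mode=False,
)
print("outer DAG submitted as cluster", dag_job_id)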
Example #6
 def renew_session(self, retry=3, init=False):
     # Make logger
     tmpLog = core_utils.make_logger(
         baseLogger,
         'submissionHost={0}'.format(self.submissionHost),
         method_name='CondorClient.renew_session')
     # Clear security session if not initialization
     if not init:
         tmpLog.info('Renew condor session')
         self.secman.invalidateAllSessions()
     # Recreate collector and schedd object
     i_try = 1
     while i_try <= retry:
         try:
             tmpLog.info('Try {0}'.format(i_try))
             if self.condor_pool:
                 self.collector = htcondor.Collector(self.condor_pool)
             else:
                 self.collector = htcondor.Collector()
             if self.condor_schedd:
                 self.scheddAd = self.collector.locate(
                     htcondor.DaemonTypes.Schedd, self.condor_schedd)
             else:
                 self.scheddAd = self.collector.locate(
                     htcondor.DaemonTypes.Schedd)
             self.schedd = htcondor.Schedd(self.scheddAd)
             tmpLog.info('Success')
             break
         except Exception as e:
             tmpLog.warning(
                 'Recreate condor collector and schedd failed: {0}'.format(
                     e))
             if i_try < retry:
                 tmpLog.warning('Failed. Retry...')
             else:
                 tmpLog.warning(
                     'Retry {0} times. Still failed. Skipped'.format(i_try))
                 return False
             i_try += 1
             self.secman.invalidateAllSessions()
             time.sleep(3)
     # Sleep
     time.sleep(3)
     return True
Example #7
def get_out_err_files(dagmanid,
                      exitcode=None,
                      schedd=None,
                      user=getuser(),
                      maxjobs=0):
    """Get the paths of the output and error files for nodes in a given DAG

    Parameters
    ----------
    dagmanid : `int`
        the ClusterId of the DAG
    exitcode : `int`, optional
        return only nodes with this exitcode, or return all nodes if
        `None`
    schedd : `htcondor.Schedd`, optional
        the open connection to the scheduler
    user : `str`, optional
        the name of the user who submitted the DAG, defaults to you
    maxjobs : `int`, optional
        maximum number of condor history records to return, defaults
        to `0` meaning 'all'

    Returns
    -------
    filedict : `dict`
        a `dict` of `(nodeid, [files])` pairs
    """
    if schedd is None:
        schedd = htcondor.Schedd()
    history = list(
        schedd.history('DAGManJobId==%d && Owner=="%s"' % (dagmanid, user),
                       ['ExitCode', 'Out', 'Err', 'ClusterId'], maxjobs))
    out = {}
    for node in history:
        try:
            ec = node['ExitCode']
        except KeyError:
            warnings.warn("Failed to get ExitCode for node %r" % node)
            continue
        if exitcode is not None and ec != exitcode:
            continue
        out[node['ClusterId']] = [node['Out'], node['Err']]
    return out
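A short usage sketch (the DAGMan cluster ID is illustrative):

# Fetch stdout/stderr paths for all nodes of DAG 12345 that exited with code 1;
# pass exitcode=None (the default) to get every node instead.
failed = get_out_err_files(12345, exitcode=1)
for cluster_id, (out_path, err_path) in failed.items():
    print(cluster_id, out_path, err_path)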
Example #8
def get_job_duration_history(classad,
                             value,
                             user=getuser(),
                             maxjobs=0,
                             schedd=None):
    """Return the durations of history condor jobs

    This method uses the Python bindings for `htcondor`, which seem
    to have network transfer limits; do not use it for large job counts
    (>2000). Instead use `get_job_duration_history_shell`, which calls
    `condor_history` in the shell.

    Parameters
    ----------
    classad : `str`
        name of classad providing unique identifier for job type
    value :
        value of classad
    user : `str`, optional
        name of submitting user
    maxjobs : `int`, optional
        maximum number of matches to return
    schedd : `htcondor.Schedd`, optional
        the open connection to the scheduler

    Returns
    -------
    times, durations : `tuple` of `numpy.ndarray`
        two arrays with the job end time and durations of each matched
        condor process
    """
    if schedd is None:
        schedd = htcondor.Schedd()
    if isinstance(value, str):
        value = '"%s"' % value
    history = list(
        schedd.history('%s==%s && Owner=="%s"' % (classad, value, user),
                       ['EnteredCurrentStatus', 'JobStartDate'], maxjobs))
    times = numpy.zeros(len(history))
    jobdur = numpy.zeros(len(history))
    for i, h in enumerate(history):
        times[i] = (to_gps(datetime.fromtimestamp(h['EnteredCurrentStatus'])) +
                    time.timezone)
        jobdur[i] = h['EnteredCurrentStatus'] - h['JobStartDate']
    return times, jobdur
Example #9
    def __init__(self):

        aCTProcess.__init__(self)

        self.schedd = htcondor.Schedd()

        self.condorjobstatemap = {
            0: 'Undefined',  # used before real state is known
            1: 'Idle',
            2: 'Running',
            3: 'Removed',
            4: 'Completed',
            5: 'Held',
            6: 'Transferring Output',
            7: 'Suspended'
        }

        # store the last checkJobs time to avoid overloading of GIIS
        self.checktime = time.time()
Example #10
    def killWorkflowJobs(self, workflow):
        """
        _killWorkflowJobs_

        Kill all the jobs belonging to a specific workflow.
        """
        logging.info("Going to remove all the jobs for workflow %s", workflow)

        schedd = htcondor.Schedd()

        try:
            schedd.act(htcondor.JobAction.Remove,
                       "WMAgent_RequestName == %s" % classad.quote(workflow))
        except RuntimeError:
            logging.warn(
                "Error while killing jobs on the schedd: WMAgent_RequestName=%s",
                workflow)

        return
Example #11
    def kill(self, jobs, raiseEx=False):
        """
        _kill_

        Kill a list of jobs based on the WMBS job names.
        Kill can happen for schedd running on localhost... TBC.
        """
        logging.info("Killing %i jobs from the queue", len(jobs))

        schedd = htcondor.Schedd()
        gridIds = [job['gridid'] for job in jobs]
        try:
            schedd.act(htcondor.JobAction.Remove, gridIds)
        except RuntimeError:
            logging.warn("Error while killing jobs on the schedd: %s", gridIds)
            if raiseEx:
                raise

        return
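As the two kill variants above illustrate, Schedd.act() takes either a constraint expression or an explicit list of job IDs; a condensed sketch of both forms (workflow name and job IDs are illustrative):

import classad
import htcondor

schedd = htcondor.Schedd()

# By constraint (quote user-supplied strings to keep the expression valid).
schedd.act(htcondor.JobAction.Remove,
           "WMAgent_RequestName == %s" % classad.quote("some_workflow"))

# By a list of "cluster.proc" job IDs.
schedd.act(htcondor.JobAction.Remove, ["1234.0", "1234.1"])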
Example #12
 def getJobs(self, timestamp):
     data = []
     query = 'TaskType =?= "ROOT" && JobStatus =?= 2 && QDate < %d' % (timestamp)
     attributes = ["CRAB_InputData", "QDate", "CRAB_UserHN", "CRAB_JobCount", "DAG_NodesQueued"]
     schedulers = self.locateAll(htcondor.DaemonTypes.Schedd)
     for scheduler in schedulers:
         # query all schedulers, if error retry up to 3 times
         for attempt in range(3):
             try:
                 schedd = htcondor.Schedd(scheduler)
                 jobs = schedd.query(query, attributes)
                 for job in jobs:
                     data.append(job)
             except IOError as e:
                 # remember the error so the for-else below can report it
                 last_error = e
                 continue
             else:
                 break
         else:
             self.error(last_error)
Example #13
def condor_submit_process(mp_queue, host, jdl_map_list):
    """
    Function for new process to submit condor
    """
    # initialization
    errStr = ''
    batchIDs_list = []
    # parse schedd and pool name
    condor_schedd, condor_pool = None, None
    if host in ('LOCAL', 'None'):
        tmpLog.debug('submissionHost is {0}, treated as local schedd. Skipped'.format(host))
    else:
        try:
            condor_schedd, condor_pool = host.split(',')[0:2]
        except ValueError:
            tmpLog.error('Invalid submissionHost: {0} . Skipped'.format(host))
    # get schedd
    try:
        if condor_pool:
            collector = htcondor.Collector(condor_pool)
        else:
            collector = htcondor.Collector()
        if condor_schedd:
            scheddAd = collector.locate(htcondor.DaemonTypes.Schedd, condor_schedd)
        else:
            scheddAd = collector.locate(htcondor.DaemonTypes.Schedd)
        schedd = htcondor.Schedd(scheddAd)
    except Exception as e:
        errStr = 'create condor collector and schedd failed; {0}: {1}'.format(e.__class__.__name__, e)
    else:
        submit_obj = htcondor.Submit()
        try:
            with schedd.transaction() as txn:
                # TODO: Currently spool is not supported in htcondor.Submit ...
                submit_result = submit_obj.queue_with_itemdata(txn, 1, iter(jdl_map_list))
                clusterid = submit_result.cluster()
                first_proc = submit_result.first_proc()
                num_proc = submit_result.num_procs()
                batchIDs_list.extend(['{0}.{1}'.format(clusterid, procid)
                                        for procid in range(first_proc, first_proc + num_proc)])
        except RuntimeError as e:
            errStr = 'submission failed; {0}: {1}'.format(e.__class__.__name__, e)
    mp_queue.put((batchIDs_list, errStr))
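A hedged invocation sketch for the worker above (the submit attributes and host string are illustrative; tmpLog is assumed to be a module-level logger in the original source):

import multiprocessing

# Each dict supplies the per-proc submit attributes consumed as itemdata.
jdl_map_list = [{"executable": "/bin/sleep", "arguments": "60"}]

mp_queue = multiprocessing.Queue()
worker = multiprocessing.Process(
    target=condor_submit_process,
    args=(mp_queue, "schedd.example.org,pool.example.org", jdl_map_list))
worker.start()
batchIDs_list, errStr = mp_queue.get()   # e.g. (['1234.0'], '')
worker.join()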
Example #14
def condorCleanup():
    # Retrieve all jobs from condor schedd
    schedd = condor.Schedd()
    jobs = schedd.xquery('true', [
        'WMAgent_RequestName', 'JobStatus', 'WMAgent_JobID', 'ServerTime',
        'EnteredCurrentStatus'
    ])

    # timeout keyed by condor status
    timeout = {
        1: 3.1 * 24 * 3600,  # Idle/Pending --> 3.1 days
        2: 2.1 * 24 * 3600,  # Running --> 2.1 days
        5: 0.1 * 24 * 3600
    }  # Held --> 0.1 days

    listJobsToRemove = []
    jobsRemovedInfo = []
    for job in jobs:
        if job['JobStatus'] not in (1, 2, 5):
            continue
        timeThisStatus = job['ServerTime'] - job['EnteredCurrentStatus']
        if timeThisStatus > timeout[job['JobStatus']]:
            listJobsToRemove.append(job['WMAgent_JobID'])
            jobsRemovedInfo.append(job)

    if jobsRemovedInfo:
        with open('jobs_removed_script.txt', 'w') as f:
            for line in jobsRemovedInfo:
                f.write(str(line) + "\n")

    print "Number of jobs to be removed from condor: %s" % len(
        listJobsToRemove)

    ad = classad.ClassAd()
    while len(listJobsToRemove) > 0:
        ad['foo'] = listJobsToRemove[:100]
        listJobsToRemove = listJobsToRemove[100:]
        jobsConstraint = "member(WMAgent_JobID, %s)" % ad.lookup(
            "foo").__repr__()
        out = schedd.act(condor.JobAction.Remove, jobsConstraint)
        #print "Outcome: %s" % str(out)
    return
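The removal constraint above embeds a Python list into a ClassAd expression via member(); the same trick in isolation (job IDs are illustrative):

import classad
import htcondor

wmagent_ids = [101, 102, 103]              # WMAgent_JobID values to drop
ad = classad.ClassAd()
ad["foo"] = wmagent_ids                    # serialised as a ClassAd list literal
constraint = "member(WMAgent_JobID, %s)" % repr(ad.lookup("foo"))
htcondor.Schedd().act(htcondor.JobAction.Remove, constraint)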
Example #15
def saveProxiedWebdir(crabserver, ad):
    """ The function queries the REST interface to get the proxied webdir and sets
        a classad so that we report this to the dashboard instead of the regular URL.

        The webdir (if it exists) is written to a file named 'webdir' so that
        prejobs can read it and report to dashboard. If the proxied URL does not exist
        (i.e.: schedd not at CERN), we report the usual webdir.

        See https://github.com/dmwm/CRABServer/issues/4883
    """
    # Get the proxied webdir from the REST interface
    task = ad['CRAB_ReqName']
    webDir_adName = 'CRAB_WebDirURL'
    ad[webDir_adName] = ad['CRAB_localWebDirURL']
    proxied_webDir = getProxiedWebDir(crabserver=crabserver,
                                      task=task,
                                      logFunction=printLog)
    if proxied_webDir:  # Prefer the proxied webDir to the non-proxied one
        ad[webDir_adName] = str(proxied_webDir)

    if ad[webDir_adName]:
        # This condor_edit is required because in the REST interface we look for the webdir if the DB upload failed (or in general if we use the "old logic")
        # See https://github.com/dmwm/CRABServer/blob/3.3.1507.rc8/src/python/CRABInterface/HTCondorDataWorkflow.py#L398
        dagJobId = '%d.%d' % (ad['ClusterId'], ad['ProcId'])
        try:
            htcondor.Schedd().edit([dagJobId], webDir_adName,
                                   '{0}'.format(ad.lookup(webDir_adName)))
        except RuntimeError as reerror:
            printLog(str(reerror))

        # We need to use a file to communicate this to the prejob. I tried to read the corresponding ClassAd from the preJob like:
        # htcondor.Schedd().xquery(requirements="ClusterId == %d && ProcId == %d" % (self.task_ad['ClusterId'], self.task_ad['ProcId']), projection=[webDir_adName]).next().get(webDir_adName)
        # but it is too heavy of an operation with HTCondor v8.8.3
        with open("webdir", "w") as fd:
            fd.write(ad[webDir_adName])
    else:
        printLog(
            "Cannot get proxied webdir from the server. Maybe the schedd does not have one in the REST configuration?"
        )
        return 1

    return 0
Example #16
def process_schedd(starttime, schedd_ad):
    my_start = time.time()
    print "Querying %s for jobs." % schedd_ad["Name"]
    buffered_ads = {}
    schedd = htcondor.Schedd(schedd_ad)
    if time.time() - starttime > TIMEOUT_MINS*60:
        print "Crawler has been running for more than %d minutes; exiting." % TIMEOUT_MINS
        return
    count = 0
    total_upload = 0
    try:
        es = htcondor_es.es.get_server_handle()
        query_iter = schedd.xquery()
        json_ad = '{}'
        for job_ad in query_iter:
            #print "Processing ad %s." % job_ad.get("GlobalJobId")
            json_ad = htcondor_es.convert_to_json.convert_to_json(job_ad)
            if not json_ad:
                continue
            idx = htcondor_es.es.get_index(job_ad["QDate"])
            ad_list = buffered_ads.setdefault(idx, [])
            ad_list.append((job_ad["GlobalJobId"], json_ad))
            if len(ad_list) == 250:
                st = time.time()
                htcondor_es.es.post_ads(es, idx, ad_list)
                total_upload += time.time() - st
                buffered_ads[idx] = []
            #print es.index(index=idx, doc_type="job", body=json_ad, id=job_ad["GlobalJobId"])
            count += 1
            if time.time() - starttime > TIMEOUT_MINS*60:
                print "Crawler has been running for more than %d minutes; exiting." % TIMEOUT_MINS
                break
        #print "Sample ad for", job_ad["GlobalJobId"]
        #json_ad = json.loads(json_ad)
        #keys = json_ad.keys()
        #keys.sort()
        #for key in keys:
        #    print key, "=", json_ad[key]
    except RuntimeError:
        print("Failed to query schedd for jobs:", schedd_ad["Name"])
    except Exception as e:
        print("Failure when processing schedd query:", str(e))
Example #17
def query(cluster_ids):
    schedd = htcondor.Schedd()
    attr_list = ["ClusterId", "ProcId", "JobStatus", "EnteredCurrentStatus", "ExitCode", "RemoveReason"]
    status = []

    for cluster_id in cluster_ids:
        query = schedd.query(
                    constraint='ClusterId=?={}'.format(cluster_id),
                    attr_list=attr_list)
        if query:
            for query_item in query:
                cluster_dict = get_dict(query_item, attr_list)
                status.append(cluster_dict)
        else:
            condor_it = schedd.history('ClusterId == {}'.format(cluster_id), attr_list, match=1)
            for query_item in condor_it:
                if query_item:
                    cluster_dict = get_dict(query_item, attr_list)
                    status.append(cluster_dict)
    print(json.dumps(status))
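get_dict is not shown above; a plausible (hypothetical) helper that projects the requested attributes out of a job ClassAd might look like:

def get_dict(job_ad, attr_list):
    # Keep only the attributes that are actually present on the ad.
    return {attr: job_ad.get(attr) for attr in attr_list
            if job_ad.get(attr) is not None}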
Example #18
def update_jobs_list(condor_collector, hostname):
    """ Queries the schedds to generate a list of preemptable jobs"""
    logger = logging.getLogger('condor_efficient_defrag')

    condor_schedd = htcondor.Schedd(condor_collector.locate(htcondor.DaemonTypes.Schedd, hostname))

    jobs = []
    pjobs = []

    try:
        jobs = condor_schedd.query('isPreemptable =?= True', ["GlobalJobId"])
    except ValueError:
        logger.error("Caught ValueError - could not connect to schedd on %s, skipping reading jobs in queue.", hostname)
    except IOError:
        logger.error("Caught IOError - could not connect to schedd on %s, skipping reading jobs in queue.", hostname)

    for job in jobs:
        pjobs.append(job["GlobalJobId"])

    return pjobs
Example #19
 def testScheddQueryPoll(self):
     self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
     output_file = os.path.join(testdir, "test.out")
     if os.path.exists(output_file):
         os.unlink(output_file)
     schedd = htcondor.Schedd()
     ad = classad.parseOne(open("submit.ad"))
     ads = []
     cluster = schedd.submit(ad, 10, False, ads)
     for i in range(60):
         ads_iter = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"], name="query1")
         ads_iter2 = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"], name="query2")
         ads = []
         for query in htcondor.poll([ads_iter, ads_iter2]):
             self.assertTrue(query.tag() in ["query1", "query2"])
             ads += query.nextAdsNonBlocking()
         if len(ads) == 0:
             break
         if i % 2 == 0:
             schedd.reschedule()
Example #20
def get_pool_resource_utilization_gpu(pool,
                                      retry_delay=30,
                                      max_retries=4,
                                      schedd_constraint=True):
    coll = htcondor.Collector(pool)
    retries = 0
    while retries < max_retries:
        try:
            schedd_ads = coll.query(htcondor.AdTypes.Schedd, schedd_constraint)
        except Exception:
            logger.warning(
                "trouble getting pool {0} schedds, retrying in {1}s.".format(
                    pool, retry_delay))
            retries += 1
            schedd_ads = None
            time.sleep(retry_delay)
        else:
            break

    if schedd_ads is None:
        logger.error(
            "trouble getting pool {0} schedds, giving up.".format(pool))
        return {}

    memory_usage = 0
    disk_usage = 0
    for ad in schedd_ads:
        try:
            schedd = htcondor.Schedd(ad)
            results = schedd.query('jobstatus==2',
                                   ['ResidentSetSize_RAW', 'DiskUsage_RAW'])
        except Exception as e:
            logger.error(e)
        else:
            for r in results:
                memory_usage += r.get('ResidentSetSize_RAW', 0)
                disk_usage += r.get('DiskUsage_RAW', 0)
    return {
        "MemoryUsage": memory_usage / 1024,
        "DiskUsage": disk_usage,
    }
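A brief usage sketch (the pool name is illustrative; the returned dict aggregates ResidentSetSize_RAW and DiskUsage_RAW over all running jobs):

usage = get_pool_resource_utilization_gpu("cm.example.org",
                                          retry_delay=10, max_retries=2)
print(usage)   # {'MemoryUsage': ..., 'DiskUsage': ...}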
Example #21
def condor_submit(outdir, filename, channel):
    ##Condor class
    job = htcondor.Submit()
    schedd = htcondor.Schedd()

    skimfilename = "_".join(filename.split("/"))[:-5] + "_Skim.root"

    ##Condor configuration
    job["executable"] = "{}/src/ChargedHiggs/nanoAOD_processing/batch/condor_script.sh".format(
        os.environ["CMSSW_BASE"])
    job["arguments"] = " ".join([filename, channel])
    job["universe"] = "vanilla"

    job["should_transfer_files"] = "YES"
    job["transfer_input_files"] = ",".join([
        os.environ["CMSSW_BASE"] + "/src/ChargedHiggs",
        os.environ["CMSSW_BASE"] + "/src/x509"
    ])

    job["log"] = outdir + "/log/job_$(Cluster).log"
    job["output"] = outdir + "/log/job_$(Cluster).out"
    job["error"] = outdir + "/log/job_$(Cluster).err"

    #job["+RequestRuntime"]    = "{}".format(60*60*12)
    job["when_to_transfer_output"] = "ON_EXIT"
    job["transfer_output_remaps"] = '"' + '{filename} = {outdir}/{filename}'.format(
        filename=skimfilename, outdir=outdir) + '"'

    ##Aggressively submit your jobs
    def submit(schedd, job):
        with schedd.transaction() as txn:
            job.queue(txn)
            print("Submit job for file {}".format(filename))

    while True:
        try:
            submit(schedd, job)
            break

        except Exception:
            pass
Example #22
def condor_submit(MHc, Mh, fragment, lhefile):
    job = htcondor.Submit()
    schedd = htcondor.Schedd()

    name = lhefile.split("/")[-1][:-4]

    outdir = "/nfs/dust/cms/user/{}/Signal/Hc+hTol4b_MHc{}_Mh{}/Samples/".format(os.environ["USER"], MHc, Mh)
    os.system("mkdir -p {}".format(outdir)) 
    os.system("mkdir -p {}/log".format(outdir)) 

    job["executable"] = "{}/src/ChargedHiggs/MCproduction/batch/produceMC.sh".format(os.environ["CMSSW_BASE"])
    job["arguments"] = " ".join([fragment.split("/")[-1], lhefile.split("/")[-1], name])
    job["universe"]       = "vanilla"

    job["should_transfer_files"] = "YES"
    job["transfer_input_files"]       = ",".join([fragment, lhefile, os.environ["CMSSW_BASE"] + "/src/x509", os.environ["HOME"] + "/.dasmaps/"])

    job["log"]                    = "{}/log/job_$(Cluster).log".format(outdir)
    job["output"]                    = "{}/log/job_$(Cluster).out".format(outdir)
    job["error"]                    = "{}/log/job_$(Cluster).err".format(outdir)

    job["when_to_transfer_output"] = "ON_EXIT"
    job["transfer_output_remaps"] = '"' + '{filename}_MINIAOD.root = {outdir}/{filename}_MINIAOD.root; {filename}_NANOAOD.root = {outdir}/{filename}_NANOAOD.root '.format(filename=name, outdir=outdir) + '"'

    job["on_exit_hold"] = "(ExitBySignal == True) || (ExitCode != 0)"  
    job["periodic_release"] =  "(NumJobStarts < 5) && ((CurrentTime - EnteredCurrentStatus) > 60)"

    job["+RequestRuntime"]    = "{}".format(60*60*12)

    def submit(schedd, job):
        with schedd.transaction() as txn:
            job.queue(txn)
          
    while True:
        try:
            submit(schedd, job)
            print("Submit job for file {}".format(lhefile))
            break

        except Exception:
            pass
Example #23
 def __init__(self, scheddwrap=None):
     """
     :param HTCondorRemoteScheddWrapper scheddwrap: [optional] when provided, 
         the current object is built on it to contact a remote schedd.
         Otherwise, a local schedd is assumed.
     """
     self.log = logging.getLogger('htcondorschedd')
     self.log.addHandler(logging.NullHandler())
     if scheddwrap:
         self.schedd = scheddwrap.schedd
         self.address = scheddwrap.address
     else:
         self.address = None
         try:
             self.schedd = htcondor.Schedd()
         except Exception as ex:
             self.log.critical('Unable to instantiate a Schedd object')
             raise ScheddNotReachable()
     # Lock object to serialize the submission and query calls
     self.lock = threading.Lock()
     self.log.debug('HTCondorSchedd object initialized')
Example #24
    def kill(self, jobs):
        """
        _kill_

        Kill a list of jobs based on the WMBS job names.
        Kill can happen for schedd running on localhost... TBC.
        """
        sd = condor.Schedd()
        ad = classad.ClassAd()
        listJobIds = [job['jobid'] for job in jobs]
        ad['foo'] = listJobIds
        logging.info("Killing %i jobs from the queue", len(listJobIds))
        jobsConstraint = "member(WMAgent_JobID, %s)" % ad.lookup(
            "foo").__repr__()
        try:
            sd.act(condor.JobAction.Remove, jobsConstraint)
        except RuntimeError:
            logging.warn("Error while killing jobs on the schedd: %s",
                         listJobIds)

        return
Example #25
    def Submit(self, wait=True):

        # Submit the job defined by submit_args
        Utils.TLog("Submitting job with arguments: " + str(self._job_args))
        schedd = htcondor.Schedd()
        submit = htcondor.Submit(self._job_args)
        try:
            with schedd.transaction() as txn:
                self._cluster_id = submit.queue(txn)
        except Exception:
            print("Job submission failed due to an unknown error")
            return JOB_FAILURE

        Utils.TLog("Job running on cluster " + str(self._cluster_id))

        # Wait until job has finished running?
        if wait is True:
            self.WaitForFinish()

        # If we got this far, we assume the job succeeded.
        return JOB_SUCCESS
Example #26
def submit(config_files, batch_directory, batch_log_dir, run_script):
    logger.info("Will submit {0} jobs".format(len(config_files)))
    schedd = htcondor.Schedd()
    results = []

    job_cfgs = []
    for i, cfg in enumerate(config_files):
        job_cfgs.append(
            __create_job_cfg(i, cfg, batch_directory, batch_log_dir,
                             run_script))

    if 'cern.ch' in socket.gethostname():
        results = list(
            _submit_via_command_line(job_cfgs, config_files, batch_directory))
    else:
        with schedd.transaction() as txn:
            for job_cfg, cfg in zip(job_cfgs, config_files):
                result = __submit_one(txn, job_cfg, cfg)
                results.append(result)

    return results
Example #27
    def idle_jobs(self, owners=[], exclude_owners=[]):
        if self.test:
            return [{1: 1}, {}]
        qinfo = []
        for schedd_ad in \
                htcondor.Collector().locateAll(htcondor.DaemonTypes.Schedd):
            schedd = htcondor.Schedd(schedd_ad)
            qinfo += schedd.xquery(projection=['RequestCpus', 'Owner'],
                                   requirements='JobStatus == 1')

        full_idle_jobs = {}
        selected_idle_jobs = {}
        for q in qinfo:
            core = int(q.get('RequestCpus'))
            owner = q.get('Owner')
            if core not in full_idle_jobs:
                full_idle_jobs[core] = 0
            full_idle_jobs[core] += 1
            if len(owners) == 0 and len(exclude_owners) == 0:
                continue
            if len(owners) > 0:
                is_owner = 0
                for o in owners:
                    if owner.startswith(o):
                        is_owner = 1
                        break
                if is_owner == 0:
                    continue
            if len(exclude_owners) > 0:
                is_owner = 1
                for o in exclude_owners:
                    if owner.startswith(o):
                        is_owner = 0
                        break
                if is_owner == 0:
                    continue
            if core not in selected_idle_jobs:
                selected_idle_jobs[core] = 0
            selected_idle_jobs[core] += 1
        return [full_idle_jobs, selected_idle_jobs]
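A usage sketch, assuming an instance of the surrounding class is available as pool_status (the owner prefix is illustrative):

full_idle, selected_idle = pool_status.idle_jobs(owners=["cmsprd"])
# Both dicts are keyed by RequestCpus; values are counts of idle jobs.
print(full_idle, selected_idle)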
Example #28
def job_command_consumer(testrun=False):
    job_commands_key = config.job_commands_key
    sleep_interval = config.command_sleep_interval

    while (True):
        try:
            redis_con = setup_redis_connection()
            command_string = redis_con.lpop(job_commands_key)
            if command_string is not None:
                command_dict = json.loads(command_string)
                command = command_dict["command"]
                if command == "set_job_hold":
                    job_id = command_dict["job_id"]
                    #if you don't supply it in a list format it seems to update all jobs
                    logging.info("Holding %s" % job_id)
                    s = htcondor.Schedd()
                    s.edit([
                        job_id,
                    ], "JobStatus", "5")
                    if (testrun):
                        return True

            else:
                logging.info(
                    "No command in redis list, beginning sleep interval...")
                if (testrun):
                    return False
                time.sleep(sleep_interval)

        except Exception as e:
            logging.error(
                "Failure connecting to redis or executing condor command, beginning sleep interval..."
            )
            logging.error(e)
            if (testrun):
                return False
            time.sleep(sleep_interval)

        except (SystemExit, KeyboardInterrupt):
            return
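The consumer above expects each queued command to be a JSON object with command and job_id keys; a hedged producer sketch (the Redis key and job ID are illustrative, and the key must match config.job_commands_key):

import json

import redis

redis_con = redis.Redis()
redis_con.rpush("job_commands",
                json.dumps({"command": "set_job_hold", "job_id": "1234.0"}))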
Example #29
def submit_DAG2AD(self, dagfile):
    DAGMAN = "/usr/bin/condor_dagman"
    dag = dagfile
    schedd = htcondor.Schedd()
    ad = classad.ClassAd({
        "JobUniverse": 7,
        "Cmd": DAGMAN,
        "Arguments": "-f -l . -Lockfile %s.lock -AutoRescue 1 -DoRescueFrom 0 "
                     "-Dag %s -Suppress_notification -CsdVersion '%s' -Force -Dagman %s"
                     % (dag, dag, htcondor.version(), DAGMAN),
        "Env": "_CONDOR_MAX_DAGMAN_LOG=0;_CONDOR_DAGMAN_LOG=%s.dagman.out;"
               "_CONDOR_SCHEDD_DAEMON_AD_FILE=%s;_CONDOR_SCHEDD_ADDRESS_FILE=%s"
               % (dag, htcondor.param["SCHEDD_DAEMON_AD_FILE"], htcondor.param["SCHEDD_ADDRESS_FILE"]),
        "EnvDelim": ";",
        "Out": "%s.lib.out" % dag,
        "Err": "%s.lib.err" % dag,
        "ShouldTransferFiles": "IF_NEEDED",
        "UserLog": os.path.abspath("%s.dagman.log" % dag),
        "KillSig": "SIGTERM",
        "RemoveKillSig": "SIGUSR1",
        # "OtherJobRemoveRequirements": classad.ExprTree('eval(strcat("DAGManJobId == ", ClusterId))'),
        "OnExitRemove": classad.ExprTree('( ExitSignal =?= 11 || ( ExitCode =!= undefined && ExitCode >= 0 && ExitCode <= 2 ) )'),
        "FileSystemDomain": htcondor.param['FILESYSTEM_DOMAIN'],
        "Requirements": classad.ExprTree('true || false'),
    })
    cluster = schedd.submit(ad)
    print("Submitted as cluster %d" % cluster)
Example #30
    def Submit(self, wait=False):

        # Submit the job defined by submit_args
        Utils.TLog("Submitting job with arguments: " + str(self._job_args))
        schedd = htcondor.Schedd()
        submit = htcondor.Submit(self._job_args)
        try:
            with schedd.transaction() as txn:
                self._cluster_id = submit.queue(txn)
        except Exception:
            print("Job submission failed due to an unknown error")
            return JOB_FAILURE

        Utils.TLog("Job running on cluster " + str(self._cluster_id))

        # Wait until job has finished running?
        if wait is True:
            submit_result = self.WaitForFinish()
            return submit_result

        # If we aren't waiting for finish, return None
        return None