def getCollectors(pool, pool1, main=False):
    """Get both collectors"""
    if main:
        coll = htcondor.Collector(pool.split(":")[0])
        coll1 = htcondor.Collector(pool1.split(":")[0])
        return coll, coll1
    else:
        coll = htcondor.Collector(pool)
        coll1 = htcondor.Collector(pool1)
        return coll, coll1
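# A minimal usage sketch for getCollectors above; the hostnames are
# illustrative placeholders, not values from the original source. With
# main=True only the host part before the first ":" is used, dropping any
# explicit port from the pool address.
import htcondor

main_coll, backup_coll = getCollectors(
    "collector.example.org:9618", "backup.example.org:9618", main=True)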
def __getcollector(self):
    self.log.debug('starting')
    if self.hostname:
        address = _address(self.hostname, self.port)
        collector = htcondor.Collector(address)
        self.log.debug('got remote collector')
    else:
        collector = htcondor.Collector()
        self.log.debug('got local collector')
    self.__validate_collector(collector)
    return collector
def testNegotiate(self):
    #htcondor.param['TOOL_DEBUG'] = 'D_FULLDEBUG'
    #os.environ['_condor_SCHEDD_DEBUG'] = 'D_FULLDEBUG, D_NETWORK'
    #htcondor.enable_debug()
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD"])
    output_file = os.path.join(testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    schedd.act(htcondor.JobAction.Remove, 'true')
    ad = classad.parseOne(open("tests/submit.ad"))
    ads = []
    cluster = schedd.submit(ad, 1, False, ads)
    # Get claim for startd
    claim_ads = []
    for i in range(10):
        startd_ads = htcondor.Collector().locateAll(htcondor.DaemonTypes.Startd)
        private_ads = htcondor.Collector().query(htcondor.AdTypes.StartdPrivate)
        if (len(startd_ads) != htcondor.param['NUM_CPUS']) or \
                (len(private_ads) != htcondor.param['NUM_CPUS']):
            time.sleep(1)
            continue
        break
    self.assertEqual(len(startd_ads), len(private_ads))
    self.assertEqual(len(startd_ads), htcondor.param['NUM_CPUS'])
    for ad in htcondor.Collector().locateAll(htcondor.DaemonTypes.Startd):
        for pvt_ad in private_ads:
            if pvt_ad.get('Name') == ad['Name']:
                ad['ClaimId'] = pvt_ad['Capability']
                claim_ads.append(ad)
    self.assertEqual(len(claim_ads), len(startd_ads))
    claim = claim_ads[0]
    me = "%s@%s" % (pwd.getpwuid(os.geteuid()).pw_name,
                    htcondor.param['UID_DOMAIN'])
    with schedd.negotiate(me) as session:
        requests = list(session)
        self.assertEqual(len(requests), 1)
        request = requests[0]
        self.assertTrue(request.symmetricMatch(claim))
        session.sendClaim(claim['ClaimId'], claim, request)
    for i in range(60):
        ads = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"])
        ads = list(ads)
        if len(ads) == 0:
            break
        time.sleep(1)
    self.assertEqual(open(output_file).read(), "hello world\n")
def collector(self):
    """
    The :class:`htcondor.Collector` for the personal pool's collector.
    """
    with self.use_config():
        # This odd construction ensures that the Collector we return
        # doesn't just point to "the local collector" - that could be
        # overridden by changing CONDOR_CONFIG after the Collector
        # was initialized. Locating first keeps it stable.
        return htcondor.Collector(
            htcondor.Collector().locate(htcondor.DaemonTypes.Collector))
def __init__(self, collector=None):
    """Helper class to query HTCondor via python bindings."""
    if collector is None:
        self.collector = htcondor.Collector()
    else:
        self.collector = htcondor.Collector(collector)
    """Central collector."""
    self.schedds = [
        htcondor.Schedd(classAd)
        for classAd in self.collector.query(htcondor.AdTypes.Schedd)
    ]
    """List of schedd objects, retrieved from collector."""
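# Usage sketch for the helper above; the class name CondorQueryHelper is an
# assumption, since the snippet only shows __init__. Iterates over every
# schedd advertised to the collector and counts its idle jobs.
helper = CondorQueryHelper()  # local collector by default
for schedd in helper.schedds:
    idle_jobs = schedd.query('JobStatus == 1', ['ClusterId'])
    print(len(idle_jobs))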
def waitRemoteDaemon(self, dtype, dname, pool=None, timeout=5):
    if pool:
        coll = htcondor.Collector(pool)
    else:
        coll = htcondor.Collector()
    for i in range(timeout):
        try:
            return coll.locate(dtype, dname)
        except Exception:
            pass
        time.sleep(1)
    return coll.locate(dtype, dname)
def getSchedd(scheddurl, coll=""):
    if len(scheddurl) > 0:
        try:
            if len(coll) > 0:
                coll = htcondor.Collector(coll)
            else:
                coll = htcondor.Collector()  # defaults to local
            scheddAd = coll.locate(htcondor.DaemonTypes.Schedd, scheddurl)
            schedd = htcondor.Schedd(scheddAd)
        except Exception:
            print("Warning: could not locate schedd " + scheddurl)
            return None
    else:
        schedd = htcondor.Schedd()  # defaults to local
    return schedd
def condor_submit_process(mp_queue, host, jdl_map_list):
    """
    Function for new process to submit condor
    """
    # initialization
    errStr = ''
    batchIDs_list = []
    # parse schedd and pool name
    condor_schedd, condor_pool = None, None
    if host in ('LOCAL', 'None'):
        tmpLog.debug('submissionHost is {0}, treated as local schedd. Skipped'.format(host))
    else:
        try:
            condor_schedd, condor_pool = host.split(',')[0:2]
        except ValueError:
            tmpLog.error('Invalid submissionHost: {0} . Skipped'.format(host))
    # get schedd
    try:
        if condor_pool:
            collector = htcondor.Collector(condor_pool)
        else:
            collector = htcondor.Collector()
        if condor_schedd:
            scheddAd = collector.locate(htcondor.DaemonTypes.Schedd, condor_schedd)
        else:
            scheddAd = collector.locate(htcondor.DaemonTypes.Schedd)
        schedd = htcondor.Schedd(scheddAd)
    except Exception as e:
        errStr = 'create condor collector and schedd failed; {0}: {1}'.format(
            e.__class__.__name__, e)
    else:
        submit_obj = htcondor.Submit()
        try:
            with schedd.transaction() as txn:
                # TODO: Currently spool is not supported in htcondor.Submit ...
                submit_result = submit_obj.queue_with_itemdata(txn, 1, iter(jdl_map_list))
                clusterid = submit_result.cluster()
                first_proc = submit_result.first_proc()
                num_proc = submit_result.num_procs()
                batchIDs_list.extend([
                    '{0}.{1}'.format(clusterid, procid)
                    for procid in range(first_proc, first_proc + num_proc)
                ])
        except RuntimeError as e:
            errStr = 'submission failed; {0}: {1}'.format(e.__class__.__name__, e)
    mp_queue.put((batchIDs_list, errStr))
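# Sketch of driving condor_submit_process from a parent process, matching the
# "<schedd>,<pool>" host format parsed above; the host string and JDL dict are
# illustrative assumptions, and tmpLog is presumed defined at module level.
import multiprocessing

mp_queue = multiprocessing.Queue()
proc = multiprocessing.Process(
    target=condor_submit_process,
    args=(mp_queue, 'schedd.example.org,pool.example.org',
          [{'executable': '/bin/sleep', 'arguments': '60'}]))
proc.start()
batchIDs_list, errStr = mp_queue.get()  # e.g. (['1234.0'], '')
proc.join()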
def fetch_using_bindings(self, constraint=None, format_list=None):
    """Fetch the condor_q results using htcondor-python bindings

    Args:
        constraint (str): Constraints to be applied to the query
        format_list (list): Classad attr & type. [(attr1, 'i'), ('attr2', 's')]

    Returns (dict): Dict containing the results
    """
    global disk_cache
    results_dict = {}  # defined here in case of exception
    constraint = bindings_friendly_constraint(constraint)
    attrs = bindings_friendly_attrs(format_list)
    self.security_obj.save_state()
    try:
        self.security_obj.enforce_requests()
        htcondor_full_reload()
        if self.pool_name:
            collector = htcondor.Collector(str(self.pool_name))
        else:
            collector = htcondor.Collector()
        if self.schedd_name is None:
            schedd = htcondor.Schedd()
        else:
            schedd_ad = disk_cache.get(self.schedd_name + '.locate')
            if schedd_ad is None:
                schedd_ad = collector.locate(htcondor.DaemonTypes.Schedd,
                                             self.schedd_name)
                disk_cache.save(self.schedd_name + '.locate', schedd_ad)
            schedd = htcondor.Schedd(schedd_ad)
        results = schedd.query(constraint, attrs)
        results_dict = list2dict(results, self.group_attribute)
    except Exception as ex:
        s = self.schedd_name if self.schedd_name is not None else 'default'
        p = self.pool_name if self.pool_name is not None else 'default'
        err_str = 'Error querying schedd %s in pool %s using python bindings: %s' % (s, p, ex)
        # Python 3 form of the old "raise PBError(err_str), None, tb" re-raise
        raise PBError(err_str).with_traceback(sys.exc_info()[2])
    finally:
        self.security_obj.restore_state()
    return results_dict
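# Usage sketch for fetch_using_bindings; "monitor" stands in for whatever
# instance this method is bound to. format_list pairs each classad attribute
# with a one-letter type code ('i' for int, 's' for str), as the docstring
# shows; the attribute names below are illustrative.
results = monitor.fetch_using_bindings(
    constraint='JobStatus == 2',
    format_list=[('ClusterId', 'i'), ('Owner', 's')])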
def collector(self):
    """
    Returns a context manager that provides the
    :class:`htcondor.Collector` for the personal pool's collector.
    """
    with self.use_config():
        yield htcondor.Collector()
def getScheddObj(self, name):
    """
    Return a tuple (schedd, address) containing an object representing the
    remote schedd and its corresponding address.
    Still required for OLD tasks. Remove it later TODO
    """
    info = name.split("_")
    if len(info) > 3:
        name = info[2]
    else:
        name = self.getSchedd()
    if name == "localhost":
        schedd = htcondor.Schedd()
        with open(htcondor.param['SCHEDD_ADDRESS_FILE']) as fd:
            address = fd.read().split("\n")[0]
    else:
        info = name.split(":")
        pool = "localhost"
        if len(info) == 3:
            pool = info[1]
        htcondor.param['COLLECTOR_HOST'] = self.getCollector(pool)
        coll = htcondor.Collector()
        schedds = coll.query(htcondor.AdTypes.Schedd,
                             'regexp(%s, Name)' % HTCondorUtils.quote(info[0]))
        self.scheddAd = ""
        if not schedds:
            self.scheddAd = self.getCachedCollectorOutput(info[0])
        else:
            self.cacheCollectorOutput(info[0], schedds[0])
            self.scheddAd = self.getCachedCollectorOutput(info[0])
        address = self.scheddAd['MyAddress']
        schedd = htcondor.Schedd(self.scheddAd)
    return schedd, address
def get_pool_status(pool, retry_delay=30, max_retries=4,
                    schedd_constraint=True, negotiator_constraint=True):
    coll = htcondor.Collector(pool)
    if callable(schedd_constraint):
        schedd_constraint = schedd_constraint(coll)
    if callable(negotiator_constraint):
        negotiator_constraint = negotiator_constraint(coll)
    daemons = {
        "schedds": htcondor.DaemonTypes.Schedd,
        "collectors": htcondor.DaemonTypes.Collector,
        "negotiators": htcondor.DaemonTypes.Negotiator,
    }
    data = {
        "schema": "daemon.name.measurement",
        "metrics": {},
    }
    for daemon_type, daemon in daemons.items():
        retries = 0
        while retries < max_retries:
            try:
                if daemon_type == "schedds":
                    ads = coll.query(htcondor.AdTypes.Schedd, schedd_constraint)
                elif daemon_type == 'negotiators':
                    ads = coll.query(htcondor.AdTypes.Negotiator, negotiator_constraint)
                else:
                    ads = coll.locateAll(daemon)
            except Exception as e:
                logger.warning(
                    "trouble getting pool {0} {1} status, retrying in {2}s: {3}".format(
                        pool, daemon_type, retry_delay, e))
                ads = None
                retries += 1
                time.sleep(retry_delay)
            else:
                break
        if ads is None:
            logger.error("trouble getting pool {0} {1} status, giving up.".format(
                pool, daemon_type))
        else:
            for ad in ads:
                # quick hack to skip schedds starting up on worker nodes
                if ad['Name'].startswith('fnpc'):
                    logger.info('skipping worker node {}'.format(ad['Name']))
                    continue
                for k in ad:
                    if type(ad[k]) in [int, float]:
                        metric = ".".join([
                            daemon_type,
                            ad["Name"].replace(".", "_").replace("@", "-").replace(" ", "_"),
                            k,
                        ])
                        data["metrics"][metric] = ad[k]
    return [data]
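# Sketch of the callable-constraint hook in get_pool_status: a callable
# schedd_constraint receives the Collector, so the constraint expression can
# be built at query time. The pool name and constraint are illustrative.
def busy_schedds(coll):
    # coll is available for live lookups; here we just return a static expression
    return 'TotalRunningJobs > 0'

data = get_pool_status('pool.example.org', schedd_constraint=busy_schedds)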
def testLocate(self):
    self.launch_daemons(["COLLECTOR"])
    coll = htcondor.Collector()
    coll_ad = coll.locate(htcondor.DaemonTypes.Collector)
    self.assertTrue("MyAddress" in coll_ad)
    self.assertEqual(coll_ad["Name"].split(":")[-1], os.environ["_condor_PORT"])
def __init__(self, resthost, jsonDoc, logger=None):
    if not logger:
        self.logger = logging.getLogger(__name__)
        handler = logging.StreamHandler(sys.stdout)
        formatter = logging.Formatter("%(asctime)s:%(levelname)s:%(module)s %(message)s")
        handler.setFormatter(formatter)
        self.logger.addHandler(handler)
        self.logger.setLevel(logging.DEBUG)
    else:
        self.logger = logger
    self.jsonDoc = jsonDoc
    self.resthost = resthost
    self.pool = ''
    self.schedds = []
    self.resthost = "cmsweb.cern.ch:8443"  # NOTE: overrides the resthost argument assigned above
    self.crabserver = CRABRest(hostname=resthost,
                               localcert='/data/certs/servicecert.pem',
                               localkey='/data/certs/servicekey.pem',
                               retry=10,
                               userAgent='CRABTaskWorker')
    self.crabserver.setDbInstance(dbInstance='prod')
    # use child collector on port 9620 to get schedd attributes
    collName = "cmsgwms-collector-global.cern.ch:9620,cmsgwms-collector-global.fnal.gov:9620"
    self.coll = htcondor.Collector(collName)
def get_schedds(args):
    """
    Return a list of schedd ads representing all the schedds in the pool.
    """
    collectors = args.collectors
    if collectors:
        collectors = collectors.split(",")
    else:
        collectors = []
        logging.warning("The list of Collectors to query is empty")
    schedd_ads = {}
    for host in collectors:
        coll = htcondor.Collector(host)
        try:
            schedds = coll.locateAll(htcondor.DaemonTypes.Schedd)
        except IOError:
            logging.exception(f"Error while getting Schedds from Collector {host}")
            continue
        for schedd in schedds:
            if args.schedds and schedd["Name"] not in args.schedds.split(","):
                continue
            schedd["MyPool"] = host
            try:
                schedd_ads[schedd["Name"]] = schedd
            except KeyError:
                pass
    schedd_ads = list(schedd_ads.values())
    random.shuffle(schedd_ads)
    return schedd_ads
def get_factory_version(node_name):
    htcondor.reload_config()
    collector = htcondor.Collector(node_name)
    adtype = htcondor.AdTypes.Any
    constraint = 'MyType == "glidefactoryglobal"'
    results = collector.query(adtype, constraint, ['GlideinWMSVersion'])
    return results[0]['GlideinWMSVersion']
def test_logging(self):
    # Submitting some sleep jobs
    job = {"executable": "/bin/sleep",
           "arguments": "5m",
           "request_memory": "1024"}
    sub = htcondor.Submit(job)
    schedd = htcondor.Schedd()
    with schedd.transaction() as txn:
        sub.queue(txn, 1)
    # Waiting for the glideins to start
    time.sleep(60)
    coll = htcondor.Collector()
    startd = coll.locateAll(htcondor.DaemonTypes.Startd)[0]
    url = startd['PRESIGNED_GET_URL']
    log_filename = 'logfile.tar.gz'
    # urllib.URLopener was removed in Python 3; urlretrieve is the replacement
    urllib.request.urlretrieve(url, log_filename)
    with tarfile.open(log_filename, 'r:gz') as tar:
        tar.extractall()
    logdir = glob.glob('log.*')[0]
    self.assertTrue(os.path.exists(os.path.join(logdir, 'MasterLog')),
                    msg='Failed to download logfile: {}'.format(url))
def getScheddObj(self, name):
    """
    Return a tuple (schedd, address) containing an object representing the
    remote schedd and its corresponding address.

    If address is None, then we are using the BossAir plugin. Otherwise,
    the schedd object is of type htcondor.Schedd.
    """
    if not self.getRemoteCondor():
        if name == "localhost":
            schedd = htcondor.Schedd()
            with open(htcondor.param['SCHEDD_ADDRESS_FILE']) as fd:
                address = fd.read().split("\n")[0]
        else:
            info = name.split(":")
            pool = "localhost"
            if len(info) == 2:
                pool = info[1]
            coll = htcondor.Collector(self.getCollector(pool))
            scheddAd = coll.locate(htcondor.DaemonTypes.Schedd, info[0])
            address = scheddAd['MyAddress']
            schedd = htcondor.Schedd(scheddAd)
        return schedd, address
    else:
        return RemoteCondorPlugin.RemoteCondorPlugin(self.config, logger=self.logger), None
def get_schedd_ads(self):
    import htcondor
    self.get_collector_node_addresses()
    for node in self.collector_node_addresses:
        collector = htcondor.Collector(node)
        try:
            self.schedd_ads = collector.query(
                htcondor.AdTypes.Schedd,
                projection=['Name', 'MyAddress', 'MaxJobsRunning',
                            'ShadowsRunning', 'RecentDaemonCoreDutyCycle',
                            'TotalIdleJobs'],
                constraint=self.schedd_constraints)
            if self.schedd_ads:
                # As soon as schedd_ads are found in one collector node, use those.
                # This may not be the correct choice for some batch systems.
                break
        except Exception as e:
            logger.debug('Failed querying %s: %s', node, e)
            continue
    else:
        logger.error('Failed to collect any schedds from %s',
                     self.collector_node_addresses)
        raise RuntimeError
    logger.debug('Found schedd ads %s', self.schedd_ads)
    return self.schedd_ads
def main():
    overview_running = {}
    overview_pending = {}
    overview_other = {}
    overview_running48 = {}
    overview_numjobstart = {}
    overview_removereason = {}
    jobs_48 = {}
    jobs_maxwall = {}
    jobs_numjobstart = {}
    jobs_removereason = {}
    # global pool collector
    coll = htcondor.Collector(global_pool)
    schedd_ads = coll.query(htcondor.AdTypes.Schedd,
                            'CMSGWMS_Type=?="prodschedd"',
                            ['Name', 'MyAddress', 'ScheddIpAddr'])
    # all schedds
    for ad in schedd_ads:
        if ad["Name"] not in schedds:
            continue
        print("getting jobs from %s" % ad["Name"])
        # fill the overview
        get_overview(overview_running, overview_pending, overview_other,
                     overview_running48, overview_numjobstart,
                     overview_removereason, jobs_48, jobs_maxwall,
                     jobs_numjobstart, jobs_removereason, ad)
    print_results(overview_running, overview_pending, overview_running48,
                  overview_numjobstart, overview_removereason, jobs_48,
                  jobs_maxwall, jobs_numjobstart, jobs_removereason)
def get_pool_resource_utilization(pool, retry_delay=30, max_retries=4,
                                  schedd_constraint=True):
    coll = htcondor.Collector(pool)
    retries = 0
    while retries < max_retries:
        try:
            schedd_ads = coll.query(htcondor.AdTypes.Schedd, schedd_constraint)
        except Exception:
            logger.warning("trouble getting pool {0} schedds, retrying in {1}s.".format(
                pool, retry_delay))
            retries += 1
            schedd_ads = None
            time.sleep(retry_delay)
        else:
            break
    if schedd_ads is None:
        logger.error("trouble getting pool {0} schedds, giving up.".format(pool))
        return {}
    memory_usage = 0
    disk_usage = 0
    for ad in schedd_ads:
        try:
            schedd = htcondor.Schedd(ad)
            results = schedd.query('jobstatus==2',
                                   ['ResidentSetSize_RAW', 'DiskUsage_RAW'])
        except Exception as e:
            logger.error(e)
        else:
            for r in results:
                memory_usage += r.get('ResidentSetSize_RAW', 0)
                disk_usage += r.get('DiskUsage_RAW', 0)
    return {
        "MemoryUsage": memory_usage // 1024,  # integer division, as in the original Python 2 code
        "DiskUsage": disk_usage,
    }
def get_schedd(pool=None, schedd_name=None):
    if schedd_name:
        collector = htcondor.Collector(pool)
        return htcondor.Schedd(
            collector.locate(htcondor.DaemonTypes.Schedd, schedd_name))
    else:
        return htcondor.Schedd()
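# Usage sketch for get_schedd: with no arguments it falls back to the local
# schedd; the pool and schedd names below are illustrative placeholders.
local_schedd = get_schedd()
remote_schedd = get_schedd(pool='pool.example.org',
                           schedd_name='schedd.example.org')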
def main():
    coll = htcondor.Collector()
    slots = coll.query(htcondor.AdTypes.Startd,
                       projection=["Name", "Activity", "State"])
    expected_machines = {
        "syr-compute-c0", "syr-compute-c1",
        "uc-compute-c0", "uc-compute-c1",
        "ucsd-compute-c0", "ucsd-compute-c1",
        "unl-compute-c0", "unl-compute-c1",
    }
    current_machines = {
        s["Name"] for s in slots
        if s["State"] == "Unclaimed" and s["Activity"] == "Idle"
    }
    if current_machines != expected_machines:
        print("Expected machines not found")
        return 1
    print("Expected machines found")
    slots.sort(key=lambda s: s["Name"])
    for s in slots:
        print(repr(s))
    return 0
def test_startd_checks(self):
    startd_resources = ['PYGLIDEIN_RESOURCE_GPU',
                        'PYGLIDEIN_RESOURCE_CVMFS',
                        'PYGLIDEIN_RESOURCE_GRIDFTP']
    startd_metrics = ['PYGLIDEIN_METRIC_TIME_PER_PHOTON']
    coll = htcondor.Collector()
    startds = coll.locateAll(htcondor.DaemonTypes.Startd)
    if len(startds) == 0:
        # Submitting some sleep jobs
        job = {"executable": "/bin/sleep",
               "arguments": "5m",
               "request_memory": "1024"}
        sub = htcondor.Submit(job)
        schedd = htcondor.Schedd()
        with schedd.transaction() as txn:
            sub.queue(txn, 1)
        # Waiting for the glideins to start
        time.sleep(60)
        startds = coll.locateAll(htcondor.DaemonTypes.Startd)
    # The original indexed [0] only on the re-query branch, leaving a list in
    # the other branch; index once here so both paths yield a single ad.
    startd = startds[0]
    for resource in startd_resources:
        self.assertTrue(startd.get(resource, False),
                        msg='{} does not exist or equals False'.format(resource))
    for metric in startd_metrics:
        self.assertTrue(startd.get(metric, 0) > 0,
                        msg='{} does not exist or equals 0'.format(metric))
def __init__(self, pool="localhost"):
    self.pool = pool
    self.collector = htcondor.Collector(pool)
    self.bins = [(300, 'recent'),
                 (3600, 'one_hour'),
                 (3600 * 4, 'four_hours'),
                 (3600 * 8, 'eight_hours'),
                 (3600 * 24, 'one_day'),
                 (3600 * 24 * 2, 'two_days'),
                 (3600 * 24 * 7, 'one_week')]
def isScheddOverloaded(self):
    """
    Check whether the job limit is reached in the local schedd.
    The condition is checked with the following logic:
    ( ShadowsRunning > 9.700000000000000E-01 * MAX_RUNNING_JOBS )
    || ( RecentDaemonCoreDutyCycle > 9.800000000000000E-01 )
    """
    try:
        scheddAd = self.coll.locate(htcondor.DaemonTypes.Schedd)
        q = self.coll.query(htcondor.AdTypes.Schedd,
                            'Name == "%s"' % scheddAd['Name'],
                            projection=['CurbMatchmaking'])[0]
        isOverloaded = q['CurbMatchmaking'].eval()
        return isOverloaded
    except Exception:
        # if there is an error, try to recreate the collector instance
        logging.info("Recreating Collector instance due to query error...")
        self.coll = htcondor.Collector()
        try:
            scheddAd = self.coll.locate(htcondor.DaemonTypes.Schedd)
            q = self.coll.query(htcondor.AdTypes.Schedd,
                                'Name == "%s"' % scheddAd['Name'],
                                projection=['CurbMatchmaking'])[0]
            isOverloaded = q['CurbMatchmaking'].eval()
        except Exception as ex:
            msg = "Condor failed to fetch schedd attributes."
            msg += " Error message: %s" % str(ex)
            logging.exception(msg)
            # since it failed, assume it's overloaded
            isOverloaded = True
        return isOverloaded
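# Standalone sketch of the CurbMatchmaking probe used above, against the
# local collector and without the retry/recreate logic; assumes a reachable
# local pool.
import htcondor

coll = htcondor.Collector()
schedd_ad = coll.locate(htcondor.DaemonTypes.Schedd)
ad = coll.query(htcondor.AdTypes.Schedd,
                'Name == "%s"' % schedd_ad['Name'],
                projection=['CurbMatchmaking'])[0]
print(bool(ad['CurbMatchmaking'].eval()))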
def read_from_collector(address, history=False, constraint='true', projection=[]):
    """Connect to condor collectors and schedds to pull job ads directly.

    A generator that yields condor job dicts.

    Args:
        address (str): address of collector
        history (bool): read history (True) or active queue (default: False)
    """
    import htcondor
    coll = htcondor.Collector(address)
    schedd_ads = coll.locateAll(htcondor.DaemonTypes.Schedd)
    for schedd_ad in schedd_ads:
        logging.info('getting job ads from %s', schedd_ad['Name'])
        schedd = htcondor.Schedd(schedd_ad)
        try:
            i = 0
            if history:
                start_dt = datetime.now() - timedelta(minutes=10)
                start_stamp = time.mktime(start_dt.timetuple())
                gen = schedd.history(
                    '(EnteredCurrentStatus >= {0}) && ({1})'.format(start_stamp, constraint),
                    projection, 10000)
            else:
                gen = schedd.query(constraint, projection)
            for i, entry in enumerate(gen):
                yield classad_to_dict(entry)
            logging.info('got %d entries', i)
        except Exception:
            logging.info('%s failed', schedd_ad['Name'], exc_info=True)
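# Usage sketch for the read_from_collector generator: stream a few attributes
# of running jobs from every schedd behind a collector (the address is an
# illustrative placeholder).
for job in read_from_collector('collector.example.org',
                               constraint='JobStatus == 2',
                               projection=['ClusterId', 'Owner']):
    print(job.get('ClusterId'), job.get('Owner'))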
def getScheddObj(self, name):
    """
    Return a tuple (schedd, address) containing an object representing the
    remote schedd and its corresponding address.
    """
    info = name.split("_")
    if len(info) > 3:
        name = info[2]
    else:
        name = self.getSchedd()
    if name == "localhost":
        schedd = htcondor.Schedd()
        with open(htcondor.param['SCHEDD_ADDRESS_FILE']) as fd:
            address = fd.read().split("\n")[0]
    else:
        info = name.split(":")
        pool = "localhost"
        if len(info) == 3:
            pool = info[1]
        htcondor.param['COLLECTOR_HOST'] = self.getCollector(pool)
        coll = htcondor.Collector()
        schedds = coll.query(htcondor.AdTypes.Schedd,
                             'regexp(%s, Name)' % HTCondorUtils.quote(info[0]))
        if not schedds:
            raise Exception("Unable to locate schedd %s" % info[0])
        self.scheddAd = schedds[0]
        address = self.scheddAd['MyAddress']
        schedd = htcondor.Schedd(self.scheddAd)
    return schedd, address
def get_condor_daemons(daemon_type):
    collector = htcondor.Collector()
    try:
        daemons = collector.locateAll(daemon_type)
    except Exception:
        daemons = []
    return daemons
def peek(params, parsers):
    """Peek into the crystal ball to see the future."""
    coll = htcondor.Collector()
    # Ignore dynamic slots, which are the ephemeral children of partitionable
    # slots, and thus noise. Partitionable slot definitions remain unaltered
    # by the process of dynamic slot creation.
    try:
        content = coll.query(htcondor.AdTypes.Startd,
                             constraint='SlotType != "Dynamic"',
                             projection=QUERY_DATA)
    except htcondor.HTCondorLocateError as e:
        LOGGER.error(
            str(e) + "\n You seem to run HTCrystalBall on a system that has no htcondor pool.\n"
            "For information about htcondor pools, you can go to\n"
            "https://htcondor.readthedocs.io/en/latest/admin-manual/introduction-admin-manual.html")
        sys.exit(0)
    examine.prepare(cpu=params.cpu, gpu=params.gpu, ram=params.ram,
                    disk=params.disk, jobs=params.jobs,
                    job_duration=params.time, maxnodes=params.maxnodes,
                    verbose=params.verbose, content=content)
    sys.exit(0)