def get_ads_from_schedd(schedd_ad, args):
    schedd = htcondor.Schedd(schedd_ad)
    selection = args.selection or 'true'
    history_iter = schedd.history(selection, [], args.n_docs_to_query)
    job_ads = [j for j in history_iter]
    pckfile = os.path.join(args.filename)
    # If the file already exists, append the ads it already contains
    if os.path.isfile(pckfile):
        with open(pckfile, 'rb') as pfile:
            job_ads.extend(pickle.load(pfile))
    with open(pckfile, 'wb') as pfile:
        pickle.dump(job_ads, pfile, pickle.HIGHEST_PROTOCOL)
    print(" ...done, wrote %d docs to %s" % (len(job_ads), pckfile))
    return True
def fetch_job_ads(self, query, attributes=list()):
    """Query every CRAB schedd in the pool for job ads matching `query`."""
    query = str(query)
    ads = list()
    schedulers = self.collector.locateAll(htcondor.DaemonTypes.Schedd)
    for scheduler in schedulers:
        try:
            schedd = htcondor.Schedd(scheduler)
            returned_ads = schedd.query(query, attributes)
            for ad in returned_ads:
                ads.append(ad)
        except Exception as e:
            self.logger.warning(
                'CRAB query failed for\nquery: %s\nattributes: %s\n Reason:\n %s',
                str(query), str(attributes), str(e))
    return ads
def error_before_hold():
    map = htmap.map(lambda x: 1 / x, [0, 1])
    schedd = htcondor.Schedd()
    cluster_id = map._cluster_ids[0]
    schedd.act(htcondor.JobAction.Hold, f"(ClusterID == {cluster_id}) && (ProcID == 1)")
    map.wait(holds_ok=True, errors_ok=True)

    assert map.component_statuses == [
        htmap.ComponentStatus.ERRORED,
        htmap.ComponentStatus.HELD,
    ]

    yield map

    map.remove()
def testScheddSubmitMany(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    ad = classad.parseOne(open("submit.ad"))
    ads = []
    cluster = schedd.submit(ad, 10, False, ads)
    for i in range(60):
        ads = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"])
        ads = list(ads)
        if len(ads) == 0:
            break
        if i % 2 == 0:
            schedd.reschedule()
        time.sleep(1)
    self.assertEqual(open(output_file).read(), "hello world\n")
def submit_outer_dag(
    working_dir: Path,
    source_dir: Path,
    dest_dir: Path,
    requirements: Optional[str] = None,
    unique_id: Optional[str] = None,
    test_mode: bool = False,
):
    # Only import htcondor.dags submit-side
    import htcondor.dags as dags

    working_dir = working_dir.resolve()
    dest_dir = dest_dir.resolve()

    working_dir.mkdir(parents=True, exist_ok=True)
    dest_dir.mkdir(parents=True, exist_ok=True)

    transfer_manifest_path = dest_dir / "transfer_manifest.txt"

    outer_dag = make_outer_dag(
        dest_dir,
        requirements,
        source_dir,
        test_mode,
        transfer_manifest_path,
        unique_id,
        working_dir,
    )

    if requirements:
        (working_dir / "requirements.txt").write_text(requirements)

    outer_dag_file = dags.write_dag(outer_dag, dag_dir=working_dir, dag_file_name="outer.dag")

    dag_args = {'force': 1}
    sub = htcondor.Submit.from_dag(str(outer_dag_file), dag_args)

    with change_dir(working_dir):
        schedd = htcondor.Schedd()
        with schedd.transaction() as txn:
            return sub.queue(txn)
def renew_session(self, retry=3, init=False):
    # Make logger
    tmpLog = core_utils.make_logger(
        baseLogger, 'submissionHost={0}'.format(self.submissionHost),
        method_name='CondorClient.renew_session')
    # Clear security session if not initialization
    if not init:
        tmpLog.info('Renew condor session')
        self.secman.invalidateAllSessions()
    # Recreate collector and schedd object
    i_try = 1
    while i_try <= retry:
        try:
            tmpLog.info('Try {0}'.format(i_try))
            if self.condor_pool:
                self.collector = htcondor.Collector(self.condor_pool)
            else:
                self.collector = htcondor.Collector()
            if self.condor_schedd:
                self.scheddAd = self.collector.locate(
                    htcondor.DaemonTypes.Schedd, self.condor_schedd)
            else:
                self.scheddAd = self.collector.locate(htcondor.DaemonTypes.Schedd)
            self.schedd = htcondor.Schedd(self.scheddAd)
            tmpLog.info('Success')
            break
        except Exception as e:
            tmpLog.warning('Recreate condor collector and schedd failed: {0}'.format(e))
            if i_try < retry:
                tmpLog.warning('Failed. Retry...')
            else:
                tmpLog.warning('Retry {0} times. Still failed. Skipped'.format(i_try))
                return False
            i_try += 1
            self.secman.invalidateAllSessions()
            time.sleep(3)
    # Sleep
    time.sleep(3)
    return True
def get_out_err_files(dagmanid, exitcode=None, schedd=None, user=getuser(),
                      maxjobs=0):
    """Get the paths of the output and error files for nodes in a given DAG

    Parameters
    ----------
    dagmanid : `int`
        the ClusterId of the DAG
    exitcode : `int`, optional
        return only nodes with this exitcode, or return all nodes if `None`
    schedd : `htcondor.Schedd`, optional
        the open connection to the scheduler
    user : `str`, optional
        the name of the user who submitted the DAG, defaults to you
    maxjobs : `int`, optional
        maximum number of condor history records to return, defaults to `0`
        meaning 'all'

    Returns
    -------
    filedict : `dict`
        a `dict` of `(nodeid, [files])` pairs
    """
    if schedd is None:
        schedd = htcondor.Schedd()
    history = list(schedd.history(
        'DAGManJobId==%d && Owner=="%s"' % (dagmanid, user),
        ['ExitCode', 'Out', 'Err', 'ClusterId'], maxjobs))
    out = {}
    for node in history:
        try:
            ec = node['ExitCode']
        except KeyError:
            warnings.warn("Failed to get ExitCode for node %r" % node)
            continue
        if exitcode is not None and ec != exitcode:
            continue
        out[node['ClusterId']] = [node['Out'], node['Err']]
    return out
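# Hedged usage sketch for get_out_err_files (not part of the original source):
# the DAGMan ClusterId 12345 is a placeholder; collect the stdout/stderr paths
# of nodes that exited with code 1.
def _example_get_out_err_files():
    failed = get_out_err_files(12345, exitcode=1)
    for nodeid, (outfile, errfile) in failed.items():
        print(nodeid, outfile, errfile)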
def get_job_duration_history(classad, value, user=getuser(), maxjobs=0,
                             schedd=None):
    """Return the durations of history condor jobs

    This method uses the python bindings for `htcondor`, which seem to have
    network transfer limits; do not use it for large job numbers (>2000),
    instead use `get_job_duration_history_shell`, which calls `condor_history`
    in the shell.

    Parameters
    ----------
    classad : `str`
        name of classad providing unique identifier for job type
    value :
        value of the classad
    user : `str`, optional
        name of submitting user
    maxjobs : `int`, optional
        maximum number of matches to return
    schedd : `htcondor.Schedd`, optional
        the open connection to the scheduler

    Returns
    -------
    times, durations : `tuple` of `numpy.ndarray`
        two arrays with the job end time and durations of each matched
        condor process
    """
    if schedd is None:
        schedd = htcondor.Schedd()
    if isinstance(value, str):
        value = '"%s"' % value
    history = list(schedd.history(
        '%s==%s && Owner=="%s"' % (classad, value, user),
        ['EnteredCurrentStatus', 'JobStartDate'], maxjobs))
    times = numpy.zeros(len(history))
    jobdur = numpy.zeros(len(history))
    for i, h in enumerate(history):
        times[i] = (to_gps(datetime.fromtimestamp(h['EnteredCurrentStatus'])) +
                    time.timezone)
        jobdur[i] = h['EnteredCurrentStatus'] - h['JobStartDate']
    return times, jobdur
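# Hedged usage sketch for get_job_duration_history (not part of the original
# source): the classad name 'JobBatchName', its value, and maxjobs are
# placeholders for illustration.
def _example_get_job_duration_history():
    times, durations = get_job_duration_history('JobBatchName', 'my-analysis',
                                                maxjobs=500)
    if len(durations):
        print("mean duration over %d jobs: %.1f s" % (len(durations),
                                                      durations.mean()))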
def __init__(self):
    aCTProcess.__init__(self)
    self.schedd = htcondor.Schedd()
    self.condorjobstatemap = {
        0: 'Undefined',  # used before real state is known
        1: 'Idle',
        2: 'Running',
        3: 'Removed',
        4: 'Completed',
        5: 'Held',
        6: 'Transferring Output',
        7: 'Suspended',
    }
    # store the last checkJobs time to avoid overloading of GIIS
    self.checktime = time.time()
def killWorkflowJobs(self, workflow):
    """
    _killWorkflowJobs_

    Kill all the jobs belonging to a specific workflow.
    """
    logging.info("Going to remove all the jobs for workflow %s", workflow)
    schedd = htcondor.Schedd()
    try:
        schedd.act(htcondor.JobAction.Remove,
                   "WMAgent_RequestName == %s" % classad.quote(workflow))
    except RuntimeError:
        logging.warn("Error while killing jobs on the schedd: WMAgent_RequestName=%s",
                     workflow)
    return
def kill(self, jobs, raiseEx=False):
    """
    _kill_

    Kill a list of jobs based on the WMBS job names.

    Kill can happen for schedd running on localhost... TBC.
    """
    logging.info("Killing %i jobs from the queue", len(jobs))
    schedd = htcondor.Schedd()
    gridIds = [job['gridid'] for job in jobs]
    try:
        schedd.act(htcondor.JobAction.Remove, gridIds)
    except RuntimeError:
        logging.warn("Error while killing jobs on the schedd: %s", gridIds)
        if raiseEx:
            raise
    return
def getJobs(self, timestamp):
    data = []
    query = 'TaskType =?= "ROOT" && JobStatus =?= 2 && QDate < %d' % timestamp
    attributes = ["CRAB_InputData", "QDate", "CRAB_UserHN", "CRAB_JobCount", "DAG_NodesQueued"]
    schedulers = self.locateAll(htcondor.DaemonTypes.Schedd)
    for scheduler in schedulers:
        # query all schedulers, if error retry up to 3 times
        last_error = None
        for attempt in range(3):
            try:
                schedd = htcondor.Schedd(scheduler)
                jobs = schedd.query(query, attributes)
                for job in jobs:
                    data.append(job)
            except IOError as e:
                last_error = e
                continue
            else:
                break
        else:
            # all retries failed for this scheduler
            self.error(last_error)
    return data
def condor_submit_process(mp_queue, host, jdl_map_list):
    """
    Function for new process to submit condor
    """
    # initialization
    errStr = ''
    batchIDs_list = []
    # parse schedd and pool name
    condor_schedd, condor_pool = None, None
    if host in ('LOCAL', 'None'):
        tmpLog.debug('submissionHost is {0}, treated as local schedd. Skipped'.format(host))
    else:
        try:
            condor_schedd, condor_pool = host.split(',')[0:2]
        except ValueError:
            tmpLog.error('Invalid submissionHost: {0} . Skipped'.format(host))
    # get schedd
    try:
        if condor_pool:
            collector = htcondor.Collector(condor_pool)
        else:
            collector = htcondor.Collector()
        if condor_schedd:
            scheddAd = collector.locate(htcondor.DaemonTypes.Schedd, condor_schedd)
        else:
            scheddAd = collector.locate(htcondor.DaemonTypes.Schedd)
        schedd = htcondor.Schedd(scheddAd)
    except Exception as e:
        errStr = 'create condor collector and schedd failed; {0}: {1}'.format(e.__class__.__name__, e)
    else:
        submit_obj = htcondor.Submit()
        try:
            with schedd.transaction() as txn:
                # TODO: Currently spool is not supported in htcondor.Submit ...
                submit_result = submit_obj.queue_with_itemdata(txn, 1, iter(jdl_map_list))
                clusterid = submit_result.cluster()
                first_proc = submit_result.first_proc()
                num_proc = submit_result.num_procs()
                batchIDs_list.extend(['{0}.{1}'.format(clusterid, procid)
                                      for procid in range(first_proc, first_proc + num_proc)])
        except RuntimeError as e:
            errStr = 'submission failed; {0}: {1}'.format(e.__class__.__name__, e)
    mp_queue.put((batchIDs_list, errStr))
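# Hedged usage sketch for condor_submit_process (not part of the original
# source): run the submission in a child process and read the batch IDs back
# from the queue. The submission host string and the JDL attribute dict are
# placeholders only.
def _example_condor_submit_process():
    import multiprocessing
    mp_queue = multiprocessing.Queue()
    host = 'myschedd.example.org,mypool.example.org:9618'
    jdl_map_list = [{'executable': '/bin/sleep', 'arguments': '60'}]
    proc = multiprocessing.Process(target=condor_submit_process,
                                   args=(mp_queue, host, jdl_map_list))
    proc.start()
    batchIDs_list, errStr = mp_queue.get()
    proc.join()
    print(batchIDs_list, errStr)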
def condorCleanup():
    # Retrieve all jobs from condor schedd
    schedd = condor.Schedd()
    jobs = schedd.xquery('true', ['WMAgent_RequestName', 'JobStatus', 'WMAgent_JobID',
                                  'ServerTime', 'EnteredCurrentStatus'])

    # timeout keyed by condor status
    timeout = {1: 3.1 * 24 * 3600,  # Idle/Pending --> 3.1 days
               2: 2.1 * 24 * 3600,  # Running      --> 2.1 days
               5: 0.1 * 24 * 3600}  # Held         --> 0.1 days

    listJobsToRemove = []
    jobsRemovedInfo = []
    for job in jobs:
        if job['JobStatus'] not in (1, 2, 5):
            continue
        timeThisStatus = job['ServerTime'] - job['EnteredCurrentStatus']
        if timeThisStatus > timeout[job['JobStatus']]:
            listJobsToRemove.append(job['WMAgent_JobID'])
            jobsRemovedInfo.append(job)

    if jobsRemovedInfo:
        with open('jobs_removed_script.txt', 'w') as f:
            for line in jobsRemovedInfo:
                f.write(str(line))

    print("Number of jobs to be removed from condor: %s" % len(listJobsToRemove))

    ad = classad.ClassAd()
    while len(listJobsToRemove) > 0:
        # remove jobs in batches of 100
        ad['foo'] = listJobsToRemove[:100]
        listJobsToRemove = listJobsToRemove[100:]
        jobsConstraint = "member(WMAgent_JobID, %s)" % ad.lookup("foo").__repr__()
        out = schedd.act(condor.JobAction.Remove, jobsConstraint)
        #print("Outcome: %s" % str(out))
    return
def saveProxiedWebdir(crabserver, ad):
    """ The function queries the REST interface to get the proxied webdir and
        sets a classad so that we report this to the dashboard instead of the
        regular URL.

        The webdir (if it exists) is written to a file named 'webdir' so that
        prejobs can read it and report to dashboard.

        If the proxied URL does not exist (i.e.: schedd not at CERN), we report
        the usual webdir.

        See https://github.com/dmwm/CRABServer/issues/4883
    """
    # Get the proxied webdir from the REST interface
    task = ad['CRAB_ReqName']
    webDir_adName = 'CRAB_WebDirURL'
    ad[webDir_adName] = ad['CRAB_localWebDirURL']
    proxied_webDir = getProxiedWebDir(crabserver=crabserver, task=task, logFunction=printLog)
    if proxied_webDir:
        # Prefer the proxied webDir to the non-proxied one
        ad[webDir_adName] = str(proxied_webDir)

    if ad[webDir_adName]:
        # This condor_edit is required because in the REST interface we look for the webdir
        # if the DB upload failed (or in general if we use the "old logic")
        # See https://github.com/dmwm/CRABServer/blob/3.3.1507.rc8/src/python/CRABInterface/HTCondorDataWorkflow.py#L398
        dagJobId = '%d.%d' % (ad['ClusterId'], ad['ProcId'])
        try:
            htcondor.Schedd().edit([dagJobId], webDir_adName,
                                   '{0}'.format(ad.lookup(webDir_adName)))
        except RuntimeError as reerror:
            printLog(str(reerror))

        # We need to use a file to communicate this to the prejob. I tried to read the
        # corresponding ClassAd from the preJob like:
        # htcondor.Schedd().xquery(requirements="ClusterId == %d && ProcId == %d" % (self.task_ad['ClusterId'], self.task_ad['ProcId']), projection=[webDir_adName]).next().get(webDir_adName)
        # but it is too heavy of an operation with HTCondor v8.8.3
        with open("webdir", "w") as fd:
            fd.write(ad[webDir_adName])
    else:
        printLog("Cannot get proxied webdir from the server. Maybe the schedd "
                 "does not have one in the REST configuration?")
        return 1

    return 0
def process_schedd(starttime, schedd_ad):
    my_start = time.time()
    print("Querying %s for jobs." % schedd_ad["Name"])
    buffered_ads = {}
    schedd = htcondor.Schedd(schedd_ad)
    if time.time() - starttime > TIMEOUT_MINS * 60:
        print("Crawler has been running for more than %d minutes; exiting." % TIMEOUT_MINS)
        return
    count = 0
    total_upload = 0
    try:
        es = htcondor_es.es.get_server_handle()
        query_iter = schedd.xquery()
        json_ad = '{}'
        for job_ad in query_iter:
            #print("Processing ad %s." % job_ad.get("GlobalJobId"))
            json_ad = htcondor_es.convert_to_json.convert_to_json(job_ad)
            if not json_ad:
                continue
            idx = htcondor_es.es.get_index(job_ad["QDate"])
            ad_list = buffered_ads.setdefault(idx, [])
            ad_list.append((job_ad["GlobalJobId"], json_ad))
            if len(ad_list) == 250:
                st = time.time()
                htcondor_es.es.post_ads(es, idx, ad_list)
                total_upload += time.time() - st
                buffered_ads[idx] = []
            #print(es.index(index=idx, doc_type="job", body=json_ad, id=job_ad["GlobalJobId"]))
            count += 1
            if time.time() - starttime > TIMEOUT_MINS * 60:
                print("Crawler has been running for more than %d minutes; exiting." % TIMEOUT_MINS)
                break
        #print("Sample ad for", job_ad["GlobalJobId"])
        #json_ad = json.loads(json_ad)
        #keys = sorted(json_ad.keys())
        #for key in keys:
        #    print(key, "=", json_ad[key])
    except RuntimeError:
        print("Failed to query schedd for jobs:", schedd_ad["Name"])
    except Exception as e:
        print("Failure when processing schedd query:", str(e))
def query(cluster_ids):
    schedd = htcondor.Schedd()
    attr_list = ["ClusterId", "ProcId", "JobStatus", "EnteredCurrentStatus",
                 "ExitCode", "RemoveReason"]
    status = []
    for cluster_id in cluster_ids:
        query = schedd.query(
            constraint='ClusterId=?={}'.format(cluster_id), attr_list=attr_list)
        if query:
            for query_item in query:
                cluster_dict = get_dict(query_item, attr_list)
                status.append(cluster_dict)
        else:
            condor_it = schedd.history('ClusterId == {}'.format(cluster_id),
                                       attr_list, match=1)
            for query_item in condor_it:
                if query_item:
                    cluster_dict = get_dict(query_item, attr_list)
                    status.append(cluster_dict)
    print(json.dumps(status))
def update_jobs_list(condor_collector, hostname):
    """ Queries the schedds to generate a list of preemptable jobs """
    logger = logging.getLogger('condor_efficient_defrag')
    condor_schedd = htcondor.Schedd(
        condor_collector.locate(htcondor.DaemonTypes.Schedd, hostname))
    jobs = []
    pjobs = []
    try:
        jobs = condor_schedd.query('isPreemptable =?= True', ["GlobalJobId"])
    except ValueError:
        logger.error("Caught ValueError - could not connect to schedd on %s, "
                     "skipping reading jobs in queue.", hostname)
    except IOError:
        logger.error("Caught IOError - could not connect to schedd on %s, "
                     "skipping reading jobs in queue.", hostname)
    for job in jobs:
        pjobs.append(job["GlobalJobId"])
    return pjobs
def testScheddQueryPoll(self):
    self.launch_daemons(["SCHEDD", "COLLECTOR", "STARTD", "NEGOTIATOR"])
    output_file = os.path.join(testdir, "test.out")
    if os.path.exists(output_file):
        os.unlink(output_file)
    schedd = htcondor.Schedd()
    ad = classad.parseOne(open("submit.ad"))
    ads = []
    cluster = schedd.submit(ad, 10, False, ads)
    for i in range(60):
        ads_iter = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"], name="query1")
        ads_iter2 = schedd.xquery("ClusterId == %d" % cluster, ["JobStatus"], name="query2")
        ads = []
        for query in htcondor.poll([ads_iter, ads_iter2]):
            self.assertTrue(query.tag() in ["query1", "query2"])
            ads += query.nextAdsNonBlocking()
        if len(ads) == 0:
            break
        if i % 2 == 0:
            schedd.reschedule()
def get_pool_resource_utilization_gpu(pool, retry_delay=30, max_retries=4,
                                      schedd_constraint=True):
    coll = htcondor.Collector(pool)
    retries = 0
    while retries < max_retries:
        try:
            schedd_ads = coll.query(htcondor.AdTypes.Schedd, schedd_constraint)
        except Exception:
            logger.warning(
                "trouble getting pool {0} schedds, retrying in {1}s.".format(
                    pool, retry_delay))
            retries += 1
            schedd_ads = None
            time.sleep(retry_delay)
        else:
            break

    if schedd_ads is None:
        logger.error("trouble getting pool {0} schedds, giving up.".format(pool))
        return {}

    memory_usage = 0
    disk_usage = 0
    for ad in schedd_ads:
        try:
            schedd = htcondor.Schedd(ad)
            results = schedd.query('jobstatus==2',
                                   ['ResidentSetSize_RAW', 'DiskUsage_RAW'])
        except Exception as e:
            logger.error(e)
        else:
            for r in results:
                memory_usage += r.get('ResidentSetSize_RAW', 0)
                disk_usage += r.get('DiskUsage_RAW', 0)

    return {
        "MemoryUsage": memory_usage / 1024,
        "DiskUsage": disk_usage,
    }
def condor_submit(outdir, filename, channel):
    ## Condor submit description and schedd handle
    job = htcondor.Submit()
    schedd = htcondor.Schedd()

    skimfilename = "_".join(filename.split("/"))[:-5] + "_Skim.root"

    ## Condor configuration
    job["executable"] = "{}/src/ChargedHiggs/nanoAOD_processing/batch/condor_script.sh".format(os.environ["CMSSW_BASE"])
    job["arguments"] = " ".join([filename, channel])
    job["universe"] = "vanilla"
    job["should_transfer_files"] = "YES"
    job["transfer_input_files"] = ",".join([
        os.environ["CMSSW_BASE"] + "/src/ChargedHiggs",
        os.environ["CMSSW_BASE"] + "/src/x509",
    ])
    job["log"] = outdir + "/log/job_$(Cluster).log"
    job["output"] = outdir + "/log/job_$(Cluster).out"
    job["error"] = outdir + "/log/job_$(Cluster).err"
    #job["+RequestRuntime"] = "{}".format(60*60*12)
    job["when_to_transfer_output"] = "ON_EXIT"
    job["transfer_output_remaps"] = '"' + '{filename} = {outdir}/{filename}'.format(
        filename=skimfilename, outdir=outdir) + '"'

    ## Aggressively retry the submission until it succeeds
    def submit(schedd, job):
        with schedd.transaction() as txn:
            job.queue(txn)
            print("Submit job for file {}".format(filename))

    while True:
        try:
            submit(schedd, job)
            break
        except Exception:
            pass
def condor_submit(MHc, Mh, fragment, lhefile):
    job = htcondor.Submit()
    schedd = htcondor.Schedd()

    name = lhefile.split("/")[-1][:-4]
    outdir = "/nfs/dust/cms/user/{}/Signal/Hc+hTol4b_MHc{}_Mh{}/Samples/".format(os.environ["USER"], MHc, Mh)
    os.system("mkdir -p {}".format(outdir))
    os.system("mkdir -p {}/log".format(outdir))

    job["executable"] = "{}/src/ChargedHiggs/MCproduction/batch/produceMC.sh".format(os.environ["CMSSW_BASE"])
    job["arguments"] = " ".join([fragment.split("/")[-1], lhefile.split("/")[-1], name])
    job["universe"] = "vanilla"
    job["should_transfer_files"] = "YES"
    job["transfer_input_files"] = ",".join([fragment, lhefile,
                                            os.environ["CMSSW_BASE"] + "/src/x509",
                                            os.environ["HOME"] + "/.dasmaps/"])
    job["log"] = "{}/log/job_$(Cluster).log".format(outdir)
    job["output"] = "{}/log/job_$(Cluster).out".format(outdir)
    job["error"] = "{}/log/job_$(Cluster).err".format(outdir)
    job["when_to_transfer_output"] = "ON_EXIT"
    job["transfer_output_remaps"] = '"' + '{filename}_MINIAOD.root = {outdir}/{filename}_MINIAOD.root; {filename}_NANOAOD.root = {outdir}/{filename}_NANOAOD.root '.format(filename=name, outdir=outdir) + '"'
    job["on_exit_hold"] = "(ExitBySignal == True) || (ExitCode != 0)"
    job["periodic_release"] = "(NumJobStarts < 5) && ((CurrentTime - EnteredCurrentStatus) > 60)"
    job["+RequestRuntime"] = "{}".format(60*60*12)

    def submit(schedd, job):
        with schedd.transaction() as txn:
            job.queue(txn)

    while True:
        try:
            submit(schedd, job)
            print("Submit job for file {}".format(lhefile))
            break
        except Exception:
            pass
def __init__(self, scheddwrap=None):
    """
    :param HTCondorRemoteScheddWrapper scheddwrap: [optional]
        when provided, the current object is built on it to contact
        a remote schedd. Otherwise, a local schedd is assumed.
    """
    self.log = logging.getLogger('htcondorschedd')
    self.log.addHandler(logging.NullHandler())

    if scheddwrap:
        self.schedd = scheddwrap.schedd
        self.address = scheddwrap.address
    else:
        self.address = None
        try:
            self.schedd = htcondor.Schedd()
        except Exception as ex:
            self.log.critical('Unable to instantiate a Schedd object')
            raise ScheddNotReachable()

    # Lock object to serialize the submission and query calls
    self.lock = threading.Lock()
    self.log.debug('HTCondorSchedd object initialized')
def kill(self, jobs):
    """
    _kill_

    Kill a list of jobs based on the WMBS job names.

    Kill can happen for schedd running on localhost... TBC.
    """
    sd = condor.Schedd()
    ad = classad.ClassAd()
    listJobIds = [job['jobid'] for job in jobs]
    ad['foo'] = listJobIds
    logging.info("Killing %i jobs from the queue", len(listJobIds))
    jobsConstraint = "member(WMAgent_JobID, %s)" % ad.lookup("foo").__repr__()
    try:
        sd.act(condor.JobAction.Remove, jobsConstraint)
    except RuntimeError:
        logging.warn("Error while killing jobs on the schedd: %s", listJobIds)
    return
def Submit(self, wait=True):
    # Submit the job defined by submit_args
    Utils.TLog("Submitting job with arguments: " + str(self._job_args))
    schedd = htcondor.Schedd()
    submit = htcondor.Submit(self._job_args)
    try:
        with schedd.transaction() as txn:
            self._cluster_id = submit.queue(txn)
    except Exception:
        print("Job submission failed due to an unknown error")
        return JOB_FAILURE
    Utils.TLog("Job running on cluster " + str(self._cluster_id))

    # Wait until job has finished running?
    if wait is True:
        self.WaitForFinish()

    # If we got this far, we assume the job succeeded.
    return JOB_SUCCESS
def submit(config_files, batch_directory, batch_log_dir, run_script):
    logger.info("Will submit {0} jobs".format(len(config_files)))
    schedd = htcondor.Schedd()
    results = []
    job_cfgs = []
    for i, cfg in enumerate(config_files):
        job_cfgs.append(
            __create_job_cfg(i, cfg, batch_directory, batch_log_dir, run_script))

    if 'cern.ch' in socket.gethostname():
        results = list(
            _submit_via_command_line(job_cfgs, config_files, batch_directory))
    else:
        with schedd.transaction() as txn:
            for job_cfg, cfg in zip(job_cfgs, config_files):
                result = __submit_one(txn, job_cfgs, cfg)
                results.append(result)
    return results
def idle_jobs(self, owners=[], exclude_owners=[]):
    if self.test:
        return [{1: 1}, {}]
    qinfo = []
    for schedd_ad in \
            htcondor.Collector().locateAll(htcondor.DaemonTypes.Schedd):
        schedd = htcondor.Schedd(schedd_ad)
        qinfo += schedd.xquery(projection=['RequestCpus', 'Owner'],
                               requirements='JobStatus=1')
    full_idle_jobs = {}
    selected_idle_jobs = {}
    for q in qinfo:
        core = int(q.get('RequestCpus'))
        owner = q.get('Owner')
        if core not in full_idle_jobs:
            full_idle_jobs[core] = 0
        full_idle_jobs[core] += 1
        if len(owners) == 0 and len(exclude_owners) == 0:
            continue
        if len(owners) > 0:
            is_owner = 0
            for o in owners:
                if owner.startswith(o):
                    is_owner = 1
                    break
            if is_owner == 0:
                continue
        if len(exclude_owners) > 0:
            is_owner = 1
            for o in exclude_owners:
                if owner.startswith(o):
                    is_owner = 0
                    break
            if is_owner == 0:
                continue
        if core not in selected_idle_jobs:
            selected_idle_jobs[core] = 0
        selected_idle_jobs[core] += 1
    return [full_idle_jobs, selected_idle_jobs]
def job_command_consumer(testrun=False):
    job_commands_key = config.job_commands_key
    sleep_interval = config.command_sleep_interval
    while True:
        try:
            redis_con = setup_redis_connection()
            command_string = redis_con.lpop(job_commands_key)
            if command_string is not None:
                command_dict = json.loads(command_string)
                command = command_dict["command"]
                if command == "set_job_hold":
                    job_id = command_dict["job_id"]
                    # if you don't supply it in a list format it seems to update all jobs
                    logging.info("Holding %s" % job_id)
                    s = htcondor.Schedd()
                    s.edit([job_id, ], "JobStatus", "5")
                if testrun:
                    return True
            else:
                logging.info("No command in redis list, beginning sleep interval...")
                if testrun:
                    return False
                time.sleep(sleep_interval)
        except Exception as e:
            logging.error("Failure connecting to redis or executing condor command, "
                          "beginning sleep interval...")
            logging.error(e)
            if testrun:
                return False
            time.sleep(sleep_interval)
        except (SystemExit, KeyboardInterrupt):
            return
def submit_DAG2AD(self, dagfile):
    DAGMAN = "/usr/bin/condor_dagman"
    dag = dagfile
    schedd = htcondor.Schedd()
    ad = classad.ClassAd({
        "JobUniverse": 7,
        "Cmd": DAGMAN,
        "Arguments": "-f -l . -Lockfile %s.lock -AutoRescue 1 -DoRescueFrom 0 "
                     "-Dag %s -Suppress_notification -CsdVersion '%s' -Force -Dagman %s"
                     % (dag, dag, htcondor.version(), DAGMAN),
        "Env": "_CONDOR_MAX_DAGMAN_LOG=0;_CONDOR_DAGMAN_LOG=%s.dagman.out;"
               "_CONDOR_SCHEDD_DAEMON_AD_FILE=%s;_CONDOR_SCHEDD_ADDRESS_FILE=%s"
               % (dag, htcondor.param["SCHEDD_DAEMON_AD_FILE"],
                  htcondor.param["SCHEDD_ADDRESS_FILE"]),
        "EnvDelim": ";",
        "Out": "%s.lib.out" % dag,
        "Err": "%s.lib.err" % dag,
        "ShouldTransferFiles": "IF_NEEDED",
        "UserLog": os.path.abspath("%s.dagman.log" % dag),
        "KillSig": "SIGTERM",
        "RemoveKillSig": "SIGUSR1",
        #"OtherJobRemoveRequirements": classad.ExprTree('eval(strcat("DAGManJobId == ", ClusterId))'),
        "OnExitRemove": classad.ExprTree('( ExitSignal =?= 11 || ( ExitCode =!= undefined && ExitCode >= 0 && ExitCode <= 2 ) )'),
        "FileSystemDomain": htcondor.param['FILESYSTEM_DOMAIN'],
        "Requirements": classad.ExprTree('true || false'),
    })
    cluster = schedd.submit(ad)
    print("Submitted as cluster %d" % cluster)
def Submit(self, wait=False):
    # Submit the job defined by submit_args
    Utils.TLog("Submitting job with arguments: " + str(self._job_args))
    schedd = htcondor.Schedd()
    submit = htcondor.Submit(self._job_args)
    try:
        with schedd.transaction() as txn:
            self._cluster_id = submit.queue(txn)
    except Exception:
        print("Job submission failed due to an unknown error")
        return JOB_FAILURE
    Utils.TLog("Job running on cluster " + str(self._cluster_id))

    # Wait until job has finished running?
    if wait is True:
        submit_result = self.WaitForFinish()
        return submit_result

    # If we aren't waiting for finish, return None
    return None