def submit(self, sdpdata, options=None):
    """Submit the worker job for ``sdpdata`` via ``condor_submit``.

    The submit description runs ``clusterstarter.sh`` with ``worker.py`` and
    the sdp file name as arguments. Entries found under
    ``sdpdata.dict['cernworld']['cluster']`` are merged into the description.

    Args:
        sdpdata: object exposing ``filename`` and ``dict``.
        options: accepted for interface compatibility; not used here.

    Returns:
        The part before the first ``.`` of the first token printed by
        ``condor_submit -terse`` (expected to be the submission id).
    """
    submissiondict = {
        "executable": self.bindir + "clusterstarter.sh",
        "arguments": self.bindir + "worker.py -w cern " + sdpdata.filename,
        "log": self._hide(sdpdata.filename + '.condorlog'),
        "output": self._hide(sdpdata.filename + '.out'),
        "error": self._hide(sdpdata.filename + '.err'),
    }

    # A missing 'cernworld' section is treated like a missing 'cluster' one.
    cluster_settings = (sdpdata.dict.get('cernworld') or {}).get('cluster')
    if cluster_settings is not None:
        # Work on a copy so the caller's configuration is left untouched.
        overrides = dict(cluster_settings)
        # add "+" because that's what htcondor can stomach
        for key in ('MaxRuntime', 'JobFlavour'):
            if overrides.get(key):
                overrides['+' + key] = overrides.pop(key)
        submissiondict.update(overrides)

    op = sp.check_output(['condor_submit', '-terse'],
                         input=str(htc.Submit(submissiondict)),
                         encoding='utf-8')
    # expected output: XXXXX.0 - XXXXX.0 with XXXXX the submissionid
    return op.split()[0].split('.')[0]
def Submit(self, count=1):
    """Queue ``count`` procs of the job described by ``self._job_args``.

    Returns ``JOB_FAILURE`` when queueing raises, ``None`` otherwise. On
    success ``_cluster_id``, ``_count`` and ``_jel`` are set on the instance.
    """
    # ClassAds and the submit language are case-insensitive, so normalising
    # the keys is simpler than doing case-insensitive lookups.
    self._job_args = {key.lower(): value for key, value in self._job_args.items()}

    # Use the job's event log file name, inserting a default if none is given.
    event_log = self._job_args.setdefault("log", "test-{0}.log".format(os.getpid()))
    self._log = os.path.abspath(event_log)

    Utils.TLog("Submitting job with arguments: " + str(self._job_args))
    if self._schedd is None:
        self._schedd = htcondor.Schedd()

    description = htcondor.Submit(self._job_args)
    try:
        with self._schedd.transaction() as txn:
            self._cluster_id = description.queue(txn, count)
            self._count = count
    except Exception as e:
        print("Job submission failed for an unknown error: " + str(e))
        return JOB_FAILURE

    Utils.TLog("Job submitted succeeded with cluster ID " + str(self._cluster_id))

    # self._log is probably not needed beyond this, but may be handy for
    # log messages later.
    self._jel = JobEventLog(self._log)
    return None
def submit_dask_workers(schedd, n_workers=1):
    """Queue ``n_workers`` dask worker jobs on ``schedd``.

    Returns the list that ``queue`` filled with the submitted job ads.
    """
    import htcondor

    # The result is not used below; the call is kept as in the original.
    schedd_address = get_schedd_address(schedd)

    worker_description = {
        'MY.DaskWorkerName': '"htcondor--$F(MY.JobId)--"',
        'RequestCpus': '"MY.DaskWorkerCores"',
        'RequestMemory': '"floor(MY.DaskWorkerMemory / 1048576)"',
        'RequestDisk': '"floor(MY.DaskWorkerDisk / 1024)"',
        'MY.JobId': '"$(ClusterId).$(ProcId)"',
        'MY.DaskWorkerCores': '1',
        'MY.DaskWorkerMemory': '2000000000',
        'MY.DaskWorkerDisk': '2000000000',
        'use_x509userproxy': 'true',
        'Log': 'logs/dask_$(Cluster)_$(Process).log',
        'output': 'logs/dask_$(Cluster)_$(Process).out',
        'error': 'logs/dask_$(Cluster)_$(Process).err',
        'should_transfer_files': 'YES',
        'when_to_transfer_output': 'ON_EXIT_OR_EVICT',
        'Environment': "",
        'Arguments': "'python -m distributed.cli.dask_worker ${DASK_SCHED} --nthreads 1 --memory-limit 2.00GB --name ${USER}_${logname} --no-nanny --death-timeout 60 --worker-port 10002:10100'",
        'Executable': '"dask_worker.sh"',
    }

    with schedd.transaction() as transaction:
        worker_job = htcondor.Submit(worker_description)
        submitted_ads = []
        worker_job.queue(transaction, n_workers, submitted_ads)

    for worker_ad in submitted_ads:
        logger.info('Submitted worker: %s', worker_ad)
    return submitted_ads
def submit_sleep_job():
    """Submit a sleep job and return the cluster ID"""
    description = {"Executable": "/usr/bin/sleep", "Arguments": "300"}
    sleep_job = htcondor.Submit(description)
    scheduler = htcondor.Schedd()
    with scheduler.transaction() as txn:
        return sleep_job.queue(txn)
def start_workers(
    self,
    n=1,
    memory_per_worker=None,
    disk_per_worker=None,
    procs_per_worker=None,
    threads_per_worker=None,
    worker_timeout=None,
    transfer_files=None,
    extra_attribs=None,
):
    """Queue ``n`` worker jobs built from ``JOB_TEMPLATE``.

    Resource arguments left as ``None`` (or otherwise falsy) fall back to the
    same-named attribute on ``self``. ``procs_per_worker`` is ignored with a
    warning. Each returned job ad is stored in ``self.jobs`` under its
    ``JobId``.

    Raises:
        ValueError: if ``n`` or any resolved resource value is below 1.
    """
    n = int(n)
    if n < 1:
        raise ValueError("n must be >= 1")

    if procs_per_worker:
        self.logger.warning("Multiple processes and adaptive scaling"
                            " don't mix; ignoring procs_per_worker")

    memory_per_worker = int(memory_per_worker or self.memory_per_worker)
    if memory_per_worker < 1:
        raise ValueError("memory_per_worker must be >= 1 (MB)")

    disk_per_worker = int(disk_per_worker or self.disk_per_worker)
    if disk_per_worker < 1:
        raise ValueError("disk_per_worker must be >= 1 (KB)")

    threads_per_worker = int(threads_per_worker or self.threads_per_worker)
    if threads_per_worker < 1:
        raise ValueError("threads_per_worker must be >= 1")

    worker_timeout = int(worker_timeout or self.worker_timeout)
    if worker_timeout < 1:
        raise ValueError("worker_timeout must be >= 1 (sec)")

    # A non-string collection of files becomes one comma-separated string.
    transfer_files = transfer_files or self.transfer_files
    if transfer_files and not isinstance(transfer_files, str):
        transfer_files = ', '.join(transfer_files)

    job = htcondor.Submit(JOB_TEMPLATE)
    job['MY.DaskSchedulerAddress'] = '"' + self.scheduler_address + '"'
    job['MY.DaskNProcs'] = "1"
    job['MY.DaskNThreads'] = str(threads_per_worker)
    job['RequestMemory'] = str(memory_per_worker)
    job['RequestDisk'] = str(disk_per_worker)
    job['MY.DaskSchedulerId'] = '"' + self.scheduler.id + '"'
    job['MY.DaskWorkerTimeout'] = str(worker_timeout)
    job['LogDir'] = self.logdir

    if self.script:
        job['Executable'] = self.script.name
        # The worker tarball always goes first, then the optional extras.
        input_files = self.worker_tarball
        if transfer_files:
            input_files = input_files + ', ' + transfer_files
        if self.pre_script:
            input_files = input_files + ', ' + self.pre_script
        job['Transfer_Input_Files'] = input_files
    elif transfer_files:
        job['Transfer_Input_Files'] = transfer_files

    if extra_attribs:
        job.update(extra_attribs)

    classads = []
    with self.schedd.transaction() as txn:
        clusterid = job.queue(txn, count=n, ad_results=classads)

    summary = "%d job(s) submitted to cluster %s." % (n, clusterid)
    self.logger.info(summary)
    for worker_ad in classads:
        self.jobs[worker_ad['JobId']] = worker_ad
def submit_sleeper():
    """Queue a ``/bin/sleep 20s`` job logging to ``LOGFILE``; return what ``queue`` returns."""
    description = {
        "executable": "/bin/sleep",
        "arguments": "20s",
        'log': LOGFILE,
    }
    sleeper = htcondor.Submit(description)
    scheduler = htcondor.Schedd()
    with scheduler.transaction() as txn:
        queued = sleeper.queue(txn)
    return queued
def start(self, jupyter_args: List[str]) -> "JupyterJobManager":
    """Start a Jupyter server as a local-universe HTCondor job.

    Runs ``sys.executable -m jupyter <jupyter_args> --no-browser -y`` and
    stores the value returned by ``queue`` in ``self.cluster_id``.

    Raises:
        click.ClickException: if ``has_running_job()`` reports a job.

    Returns:
        ``self``.
    """
    if self.has_running_job():
        raise click.ClickException(
            "You already have a running Jupyter notebook server; "
            'use "dask-chtc jupyter status" subcommand to see it\'s logs.'
        )

    self.prep_log_files()

    command_line = ["-m", "jupyter", *jupyter_args, "--no-browser", "-y"]
    arguments = " ".join(command_line)

    description = {
        "universe": "local",
        "JobBatchName": " ".join(("jupyter", *jupyter_args)),
        "executable": sys.executable,
        "arguments": arguments,
        "initialdir": Path.cwd(),
        "output": self.out.as_posix(),
        "error": self.err.as_posix(),
        "log": self.event_log.as_posix(),
        "stream_output": "true",
        "stream_error": "true",
        "getenv": "true",
        "environment": f"{MARKER_KEY}={MARKER_VALUE}",
        "transfer_executable": "false",
        "transfer_output_files": '""',
        # job_max_vacate_time doesn't actually work in local universe,
        # but might some day:
        # https://htcondor-wiki.cs.wisc.edu/index.cgi/tktview?tn=7746
        "job_max_vacate_time": "60",
        f"My.{MARKER_KEY}": MARKER_VALUE,
    }
    sub = htcondor.Submit(description)
    logger.debug(f"HTCondor job submit description:\n{sub}")

    scheduler = htcondor.Schedd()
    with scheduler.transaction() as txn:
        self.cluster_id = sub.queue(txn)
    logger.debug(f"Submitted job with cluster ID {self.cluster_id}")

    return self
def get_submit(submit_config):
    """Build an ``htcondor.Submit`` from a submit configuration mapping.

    ``transfer_input_files`` (an iterable of paths) is joined with ``', '``.
    ``transfer_output_remaps`` (a mapping, optional) is rendered as a quoted
    ``"name = target; ..."`` string.

    The conversion is done on a shallow copy, so ``submit_config`` is not
    modified and may be passed again. Values that are already strings are
    passed through as they are.

    Raises:
        KeyError: if ``submit_config`` has no ``transfer_input_files`` key.
    """
    description = dict(submit_config)

    input_files = description['transfer_input_files']
    if not isinstance(input_files, str):
        description['transfer_input_files'] = ', '.join(input_files)

    if 'transfer_output_remaps' in description:
        remaps = description['transfer_output_remaps']
        if not isinstance(remaps, str):
            description['transfer_output_remaps'] = '"{}"'.format('; '.join(
                "{} = {}".format(*entry) for entry in remaps.items()
            ))

    return htcondor.Submit(description)
def submit_pythonbindings(self, njobsmax=None):
    """Queue the entries of ``self.submittables`` through the python bindings.

    At most ``njobsmax`` jobs are queued in total (default ``1e7``). One tuple
    per processed entry is appended to ``self.submitted``. Under
    ``qondor.DRYMODE`` nothing is queued and the cluster id is recorded as 0.
    """
    qondor.utils.check_proxy()
    if not self.submittables:
        return
    import htcondor

    if njobsmax is None:
        njobsmax = 1e7
    n_jobs_summed = sum(count for _, count in self.submittables)
    n_jobs_total = min(n_jobs_summed, njobsmax)
    logger.info("Submitting all jobs; %s out of %s", n_jobs_total, n_jobs_summed)

    schedd = qondor.schedd.get_best_schedd()
    n_jobs_todo = n_jobs_total
    ads = []
    with qondor.utils.switchdir(self.rundir), qondor.schedd._transaction(schedd) as transaction:
        # One Submit object is reused for every entry; keys are overwritten per entry.
        submit_object = htcondor.Submit()
        for sub_orig, njobs in self.submittables:
            # Copy so that the stored entry keeps its unformatted environment.
            sub = sub_orig.copy()
            sub["environment"] = qondor.schedd.format_env_htcondor(sub["environment"])
            njobs = min(njobs, n_jobs_todo)
            n_jobs_todo -= njobs
            # Load the dict into the submit object
            for key in sub:
                submit_object[key] = sub[key]
            new_ads = []
            if qondor.DRYMODE:
                cluster_id = 0
                n_reported = njobs
            else:
                cluster_id = int(submit_object.queue(transaction, njobs, new_ads))
                n_reported = len(new_ads)
            logger.warning(
                "Submitted %s jobs for i_cluster %s (%s) to htcondor cluster %s",
                n_reported,
                sub_orig["environment"]["QONDORICLUSTER"],
                sub_orig["environment"]["QONDORCLUSTERNAME"],
                cluster_id,
            )
            ads.extend(new_ads)
            proc_ids = [ad["ProcId"] for ad in new_ads]
            self.submitted.append((sub_orig, cluster_id, len(new_ads), proc_ids))
            if n_jobs_todo == 0:
                break
    logger.info("Summary: Submitted %s jobs to cluster %s", n_jobs_total, cluster_id)
def job_node_kwargs(node):
    """Return keyword arguments describing ``node``.

    The submit description is parsed from the text of ``node.file``; the other
    entries are copied from the same-named attributes of ``node``.
    """
    description = htcondor.Submit(node.file.read_text())
    return {
        "name": node.name,
        "submit_description": description,
        "dir": node.dir,
        "noop": node.noop,
        "done": node.done,
        "pre": node.pre,
        "post": node.post,
    }
def test_save_and_load_submit(tmpdir):
    """A submit object written with ``save_submit`` reads back with the same ``foo`` value."""
    target_dir = Path(tmpdir.mkdir('save_and_load_submit_test_dir'))
    original = htcondor.Submit({'foo': 'bar'})

    htio.save_submit(target_dir, original)
    restored = htio.load_submit(target_dir)

    assert restored['foo'] == original['foo']
def test_save_and_load_submit(tmpdir):
    """Round-trip a submit object through ``htio`` and compare its ``foo`` entry."""
    save_dir = Path(tmpdir.mkdir("save_and_load_submit_test_dir"))
    submit_before = htcondor.Submit({"foo": "bar"})
    htio.save_submit(save_dir, submit_before)

    submit_after = htio.load_submit(save_dir)
    assert submit_after["foo"] == submit_before["foo"]
def make_outer_dag(
    dest_dir,
    requirements,
    source_dir,
    test_mode,
    transfer_manifest_path,
    unique_id,
    working_dir,
):
    """Build the outer DAG: a ``calc_work`` layer followed by the ``inner`` sub-DAG.

    ``calc_work`` has a post script that runs ``THIS_FILE write_subdag ...``;
    the ``inner`` sub-DAG (``working_dir / "inner.dag"``) has a post script
    that runs ``THIS_FILE analyze <transfer_manifest_path>``.
    """
    # Only import htcondor.dags submit-side
    import htcondor.dags as dags

    outer_dag = dags.DAG()

    calc_work_settings = {
        "output": "calc_work.out",
        "error": "calc_work.err",
        "log": "calc_work.log",
        "arguments": "generate {} {}".format(source_dir, '--test-mode' if test_mode else ''),
        "should_transfer_files": "yes",
    }
    # Shared descriptors are applied last, so they win on duplicate keys.
    calc_work_settings.update(shared_submit_descriptors(unique_id, requirements))
    calc_work_description = htcondor.Submit(calc_work_settings)

    # Options that do not apply are passed as empty-string arguments.
    write_subdag_arguments = [
        "write_subdag",
        source_dir,
        "source_manifest.txt",
        dest_dir,
        "destination_manifest.txt",
        transfer_manifest_path,
        "--requirements_file=requirements.txt" if requirements is not None else "",
        "--unique-id={}".format(unique_id) if unique_id is not None else "",
        "--test-mode" if test_mode else "",
    ]
    calc_work_layer = outer_dag.layer(
        name="calc_work",
        submit_description=calc_work_description,
        post=dags.Script(executable=THIS_FILE, arguments=write_subdag_arguments),
    )

    calc_work_layer.child_subdag(
        name="inner",
        dag_file=working_dir / "inner.dag",
        post=dags.Script(executable=THIS_FILE, arguments=["analyze", transfer_manifest_path]),
    )

    return outer_dag
def condorSubmit(skimdir, dirname, filename, index):
    """Submit one skim job for ``filename`` to HTCondor, retrying until it succeeds.

    The output file is named ``<dirname>_<index>.root`` and is remapped to
    ``<skimdir>/<dirname>/`` when the job exits. Requires ``CMSSW_BASE`` in
    the environment.

    Submission errors are retried without limit. Only ``Exception`` is
    swallowed, so Ctrl-C and ``SystemExit`` still stop the loop.
    """
    # Condor class
    job = htcondor.Submit()
    schedd = htcondor.Schedd()

    skimFilename = dirname + "_{}.root".format(index)
    cmssw_base = os.environ["CMSSW_BASE"]

    # Condor configuration
    job["executable"] = "{}/src/ttjet/nanoskimmer/batch/produceSkim.sh".format(cmssw_base)
    job["getenv"] = "True"
    job["arguments"] = " ".join([filename, skimFilename])
    job["universe"] = "vanilla"
    job["should_transfer_files"] = "YES"
    job["transfer_input_files"] = ",".join([
        cmssw_base + "/src/ttjet",
        cmssw_base + "/src/x509",
    ])

    job["log"] = "{}/{}/log/job_$(Cluster).log".format(skimdir, dirname)
    job["output"] = "{}/{}/log/job_$(Cluster).out".format(skimdir, dirname)
    job["error"] = "{}/{}/log/job_$(Cluster).err".format(skimdir, dirname)

    job["when_to_transfer_output"] = "ON_EXIT"
    job["transfer_output_remaps"] = '"' + '{outFile} = {skimDir}/{dirName}/{outFile}'.format(
        outFile=skimFilename, skimDir=skimdir, dirName=dirname) + '"'

    # Aggressively submit: keep retrying until the schedd accepts the job.
    while True:
        try:
            with schedd.transaction() as txn:
                job.queue(txn)
        except Exception:
            # Deliberate best-effort retry on any submission error.
            continue
        # Single-argument call form prints the same on Python 2 and 3.
        print("Submit job for file {}".format(filename))
        break
def prepareSubmission(self, cpu, memory, disk, jobID, jobName, command):
    """Build the ``htcondor.Submit`` object that runs ``command`` via ``/bin/sh -c``.

    Args:
        cpu: requested CPUs; rounded up to an integer.
        memory: requested memory; divided by 1024 and submitted with a ``KB``
            suffix (presumably given in bytes -- confirm with callers).
        disk: requested disk; handled like ``memory``.
        jobID: value stored in the ``+ToilJobID`` attribute.
        jobName: value stored (quoted) in the ``+ToilJobName`` attribute.
        command: shell command line to execute.

    Extra ``name=value`` pairs separated by ``;`` may be supplied in the
    ``TOIL_HTCONDOR_PARAMS`` environment variable. Blank entries (for example
    from a trailing ``;``) are ignored.

    Raises:
        ValueError: if an extra parameter has no ``=`` or would override a
            parameter set here.
    """
    # Convert resource requests
    cpu = int(math.ceil(cpu))  # integer CPUs
    memory = float(memory)/1024  # memory in KB
    disk = float(disk)/1024  # disk in KB

    # NOTE: formatStdOutErrPath() puts files in the Toil workflow directory, which defaults
    # to being in the system temporary directory ($TMPDIR, /tmp) which is unlikely to be on
    # a shared filesystem. So to make this work we need to set should_transfer_files = Yes
    # in the submit file, so that HTCondor will write the standard output/error files on the
    # compute node, then transfer back once the job has completed.
    stdoutfile = self.boss.formatStdOutErrPath(jobID, 'htcondor', '$(cluster)', 'std_output')
    stderrfile = self.boss.formatStdOutErrPath(jobID, 'htcondor', '$(cluster)', 'std_error')
    condorlogfile = self.boss.formatStdOutErrPath(jobID, 'htcondor', '$(cluster)', 'job_events')

    # Execute the entire command as /bin/sh -c "command"
    # TODO: Transfer the jobStore directory if using a local file store with a relative path.
    submit_parameters = {
        'executable': '/bin/sh',
        'transfer_executable': 'False',
        # Workaround for HTCondor Python bindings Unicode conversion bug
        'arguments': '''"-c '{0}'"'''.format(command).encode('utf-8'),
        'environment': self.getEnvString(),
        'getenv': 'True',
        'should_transfer_files': 'Yes',  # See note above for stdoutfile, stderrfile
        'output': stdoutfile,
        'error': stderrfile,
        'log': condorlogfile,
        'request_cpus': '{0}'.format(cpu),
        'request_memory': '{0:.3f}KB'.format(memory),
        'request_disk': '{0:.3f}KB'.format(disk),
        'leave_in_queue': '(JobStatus == 4)',
        '+IsToilJob': 'True',
        '+ToilJobID': '{0}'.format(jobID),
        '+ToilJobName': '"{0}"'.format(jobName),
        '+ToilJobKilled': 'False',
    }

    # Extra parameters for HTCondor
    extra_parameters = os.getenv('TOIL_HTCONDOR_PARAMS')
    if extra_parameters is not None:
        logger.debug("Extra HTCondor parameters added to submit file from TOIL_HTCONDOR_PARAMS env. variable: {}".format(extra_parameters))
        for parameter_value in extra_parameters.split(';'):
            if not parameter_value.strip():
                # Tolerate a trailing or doubled ';'.
                continue
            parameter, separator, value = parameter_value.partition('=')
            if not separator:
                raise ValueError("Extra parameter is not of the form name=value: {}".format(parameter_value))
            parameter = parameter.strip()
            value = value.strip()
            if parameter in submit_parameters:
                raise ValueError("Some extra parameters are incompatible: {}".format(extra_parameters))
            submit_parameters[parameter] = value

    # Return the Submit object
    return htcondor.Submit(submit_parameters)
def submit(self, description, count=1, itemdata=None):
    """Queue jobs for ``description`` on the local schedd.

    Returns a ``handles.ClusterHandle`` wrapping the result of
    ``queue_with_itemdata``.
    """
    sub = htcondor.Submit(dict(description))

    submit_message = "Submitting jobs with description:\n{}\nCount: {}\nItemdata: {}".format(
        sub, count, itemdata)
    logger.debug(submit_message)

    with self.use_config():
        schedd = self.get_local_schedd()
        with schedd.transaction() as txn:
            result = sub.queue_with_itemdata(txn, count, itemdata)

        logger.debug("Got submit result:\n{}".format(result))
        return handles.ClusterHandle(self, result)
def __submit_one(txn, job_cfg, cfg):
    """Queue ``job_cfg`` inside ``txn`` and return a record describing the new batch job."""
    queued = htcondor.Submit(job_cfg).queue(txn)
    return {
        "batch_id": int(queued),
        "batch": Batch.condor,
        "config_file": cfg,
        "stderr_log": job_cfg['error'],
        "stdout_log": job_cfg['output'],
        "job_log": job_cfg['log'],
        "status": Status.CREATED,
    }
def __submit_python(self, jsd, n):
    """
    submit using the python bindings

    :param JobSubmissionDescription jsd: instance of JobSubmissionDescription
    :param int n: number of jobs to submit
    :return int: the clusterid of jobs submitted
    """
    description = htcondor.Submit(jsd.items())
    with self.schedd.transaction() as txn:
        return description.queue(txn, n)
def submit_with_python(self, jdl_list, use_spool=False):
    """Submit every JDL in ``jdl_list`` in one transaction on ``self.schedd``.

    Returns ``(batchIDs_list, errStr)`` where the ids are
    ``"<cluster>.<proc>"`` strings. A ``RuntimeError`` raised during
    submission is logged and re-raised. ``use_spool`` is not used.
    """
    # Make logger
    tmpLog = core_utils.make_logger(
        baseLogger,
        'submissionHost={0}'.format(self.submissionHost),
        method_name='CondorJobSubmit.submit_with_python',
    )
    tmpLog.debug('Start')

    errStr = ''
    batchIDs_list = []

    # Dummy submit objects are used only to turn each JDL into a plain dict.
    jdl_map_list = []
    for jdl in jdl_list:
        jdl_map_list.append(dict(htcondor.Submit(jdl).items()))

    submit_obj = htcondor.Submit()
    try:
        with self.schedd.transaction() as txn:
            # TODO: Currently spool is not supported in htcondor.Submit ...
            submit_result = submit_obj.queue_with_itemdata(txn, 1, iter(jdl_map_list))
            clusterid = submit_result.cluster()
            first_proc = submit_result.first_proc()
            last_proc = first_proc + submit_result.num_procs()
            for procid in range(first_proc, last_proc):
                batchIDs_list.append('{0}.{1}'.format(clusterid, procid))
    except RuntimeError as e:
        errStr = '{0}: {1}'.format(e.__class__.__name__, e)
        tmpLog.error('submission failed: {0}'.format(errStr))
        raise

    if batchIDs_list:
        n_jobs = len(batchIDs_list)
        tmpLog.debug('submitted {0} jobs: {1}'.format(n_jobs, ' '.join(batchIDs_list)))
    elif not errStr:
        tmpLog.error('submitted nothing')
    tmpLog.debug('Done')

    return (batchIDs_list, errStr)
def condor_submit_process(mp_queue, host, jdl_map_list):
    """
    Function for new process to submit condor

    Puts ``(batchIDs_list, errStr)`` on ``mp_queue``. Failures while locating
    the schedd, and ``RuntimeError`` during submission, are reported through
    ``errStr``.
    """
    errStr = ''
    batchIDs_list = []

    # Parse schedd and pool name out of "schedd,pool[,...]"
    condor_schedd = None
    condor_pool = None
    if host in ('LOCAL', 'None'):
        tmpLog.debug('submissionHost is {0}, treated as local schedd. Skipped'.format(host))
    else:
        try:
            condor_schedd, condor_pool = host.split(',')[0:2]
        except ValueError:
            tmpLog.error('Invalid submissionHost: {0} . Skipped'.format(host))

    # Get schedd
    try:
        collector = htcondor.Collector(condor_pool) if condor_pool else htcondor.Collector()
        if condor_schedd:
            scheddAd = collector.locate(htcondor.DaemonTypes.Schedd, condor_schedd)
        else:
            scheddAd = collector.locate(htcondor.DaemonTypes.Schedd)
        schedd = htcondor.Schedd(scheddAd)
    except Exception as e:
        errStr = 'create condor collector and schedd failed; {0}: {1}'.format(
            e.__class__.__name__, e)
    else:
        submit_obj = htcondor.Submit()
        try:
            with schedd.transaction() as txn:
                # TODO: Currently spool is not supported in htcondor.Submit ...
                submit_result = submit_obj.queue_with_itemdata(txn, 1, iter(jdl_map_list))
                clusterid = submit_result.cluster()
                first_proc = submit_result.first_proc()
                last_proc = first_proc + submit_result.num_procs()
                for procid in range(first_proc, last_proc):
                    batchIDs_list.append('{0}.{1}'.format(clusterid, procid))
        except RuntimeError as e:
            errStr = 'submission failed; {0}: {1}'.format(e.__class__.__name__, e)

    mp_queue.put((batchIDs_list, errStr))
def start(self, jupyter_args: List[str]) -> "JupyterJobManager":
    """Launch ``python -m jupyter`` as a local-universe HTCondor job.

    The value returned by ``queue`` is stored in ``self.cluster_id`` and
    ``self`` is returned.

    Raises:
        click.ClickException: if ``has_running_job()`` reports a job.
    """
    if self.has_running_job():
        message = 'You already have a running Jupyter notebook server; try the "status" subcommand to see it.'
        raise click.ClickException(message)

    self.prep_log_files()

    arguments = " ".join(["-m", "jupyter"] + list(jupyter_args) + ["--no-browser", "-y"])
    batch_name = " ".join(("jupyter", *jupyter_args))

    sub = htcondor.Submit(
        {
            "universe": "local",
            "JobBatchName": batch_name,
            "executable": sys.executable,
            "arguments": arguments,
            "initialdir": Path.cwd(),
            "output": self.out.as_posix(),
            "error": self.err.as_posix(),
            "log": self.event_log.as_posix(),
            "stream_output": "true",
            "stream_error": "true",
            "getenv": "true",
            "transfer_executable": "false",
            "transfer_output_files": '""',
            f"My.{MARKER}": "true",
        }
    )
    logger.debug(f"HTCondor job submit description:\n{sub}")

    with htcondor.Schedd().transaction() as txn:
        self.cluster_id = sub.queue(txn)
    logger.debug(f"Submitted job with cluster ID {self.cluster_id}")

    return self
def Submit(jobdesc, log, appjobid, schedd):
    """Queue ``jobdesc`` on ``schedd`` and return the value ``queue`` yields.

    Logs an error and returns ``None`` when the module-level ``queuelist`` is
    empty.
    """
    global queuelist

    if len(queuelist) == 0:
        log.error("%s: no cluster free for submission" % appjobid)
        return None

    # This method only works with condor version >= 8.5.8 but is needed to
    # get $() variable expansion working
    description = htcondor.Submit(dict(jobdesc))
    with schedd.transaction() as txn:
        return description.queue(txn)
def htcondor_submit(sub, njobs=1, submission_dir='.'):
    """
    Submits the submission dict `sub` to the best scheduler.
    Returns the cluster id (as int) and the list of class ads filled in by
    `queue`. The submission is done from inside `submission_dir`.
    """
    import htcondor

    schedd = qondor.get_best_schedd(renew=True)
    with qondor.utils.switchdir(submission_dir):
        submit_object = htcondor.Submit(sub)
        with schedd.transaction() as transaction:
            ad = []
            cluster_id = int(submit_object.queue(transaction, njobs, ad))
    return cluster_id, ad
def submit(MHc, Mh):
    """Submit one LHE production job for the mass point (``MHc``, ``Mh``).

    Creates the output directory and its ``log`` subdirectory, then queues the
    job. Requires ``USER`` and ``CMSSW_BASE`` in the environment.

    Submission errors are retried without limit. Only ``Exception`` is
    swallowed, so Ctrl-C and ``SystemExit`` still stop the loop.
    """
    job = htcondor.Submit()
    schedd = htcondor.Schedd()

    lhefile = "unweighted_events.lhe"
    outdir = "/nfs/dust/cms/user/{}/Signal/Hc+hTol4b_MHc{}_Mh{}/LHE/".format(
        os.environ["USER"], MHc, Mh)
    outfile = "Hc+hTol4b_MHc{}_Mh{}_{}.lhe".format(
        MHc, Mh, str(time.time()).replace(".", ""))

    # Create <outdir>/log (and thereby <outdir>) without going through a
    # shell. The isdir() guard keeps this usable on Python 2, whose
    # os.makedirs has no exist_ok argument.
    log_dir = os.path.join(outdir, "log")
    if not os.path.isdir(log_dir):
        os.makedirs(log_dir)

    cmssw_base = os.environ["CMSSW_BASE"]

    # Condor configuration
    job["executable"] = "{}/src/ChargedHiggs/MCproduction/batch/produceLHE.sh".format(cmssw_base)
    job["universe"] = "vanilla"
    job["should_transfer_files"] = "YES"
    job["transfer_input_files"] = ",".join([
        cmssw_base + "/src/ChargedHiggs/MCproduction/MG5_aMC_v2_6_4",
        cmssw_base + "/src/command.txt",
        cmssw_base + "/src/ChargedHiggs/MCproduction/SLHA/Hc+hTol4b_MHc{}_Mh{}.shla".format(MHc, Mh),
    ])

    # NOTE(review): these paths are relative, while the log directory above is
    # created under outdir -- confirm which location is intended.
    job["log"] = "log/job_$(Cluster).log"
    job["output"] = "log/job_$(Cluster).out"
    job["error"] = "log/job_$(Cluster).err"

    job["when_to_transfer_output"] = "ON_EXIT"
    job["transfer_output_remaps"] = '"' + '{} = {}/{}'.format(lhefile, outdir, outfile) + '"'

    # Aggressively submit: keep retrying until the schedd accepts the job.
    while True:
        try:
            with schedd.transaction() as txn:
                job.queue(txn)
            # Single-argument call form prints the same on Python 2 and 3.
            print("Submit job for LHE file production")
            break
        except Exception:
            # Deliberate best-effort retry on any submission error.
            pass
def _coreExecution(self, handler, particles):
    """Submit one HTCondor job per particle and block until all have left the queue.

    The executable named in ``self.submitf['darwin']['executable']`` is looked
    up inside ``handler.optdir``; the process exits with status 1 if it is
    missing. Cluster ids of the submitted jobs are stored in ``self.ids``.
    When ``should_transfer_files`` is ``YES`` the job ads are spooled after
    submission and the output is retrieved once the jobs are gone.
    """
    schedd = htcondor.Schedd()
    conf = self.submitf

    executable = conf['darwin']['executable']
    executable_path = os.path.join(handler.optdir, executable)
    conf['htcondor']['executable'] = executable_path
    if not os.path.exists(executable_path):
        logger.error('executable "{}" not found'.format(executable_path))
        sys.exit(1)

    transfer_files = 'should_transfer_files' in conf['htcondor'] and \
        conf['htcondor']['should_transfer_files'] in ('YES',)

    # secure the job id from condor
    self.ids = []
    for p in particles:
        arguments = p.coordinate.format()
        formatted_args = ['-{} {}'.format(k, v) for k, v in arguments.items()]
        conf['htcondor']['arguments'] = ' '.join(formatted_args)
        conf['htcondor']['initialdir'] = handler.particlepath(p.name)

        # get redirect of htcondor submit file to a dict
        sub = htcondor.Submit(dict(conf.items('htcondor')))
        with schedd.transaction() as txn:
            ads = []
            clusterid = sub.queue(txn, ad_results=ads)
        self.ids.append(clusterid)

        if transfer_files:
            schedd.spool(ads)

    if not self.ids:
        # Nothing was submitted; an empty constraint must not be polled.
        return

    req = ' || '.join('(ClusterId == {})'.format(cluster) for cluster in self.ids)
    proj = ['ClusterId', 'JobStatus']

    # Poll until none of the submitted clusters is left in the queue.
    while True:
        remaining = sum(1 for _ in schedd.xquery(requirements=req, projection=proj))
        if remaining == 0:
            break
        time.sleep(self.refresh_rate)

    if transfer_files:
        for clusterid in self.ids:
            # The original mixed '%d' with str.format(), so the id was never
            # substituted; it also used self._schedd rather than the schedd
            # the jobs were submitted through.
            schedd.retrieve("ClusterId == {}".format(clusterid))
def prepareSubmission(self, cpu, memory, disk, jobID, jobName, command):
    """Build the ``htcondor.Submit`` object that runs ``command`` via ``/bin/sh -c``.

    Args:
        cpu: requested CPUs; rounded up to an integer.
        memory: requested memory; divided by 1024 and submitted with a ``KB``
            suffix (presumably given in bytes -- confirm with callers).
        disk: requested disk; handled like ``memory``.
        jobID: value stored in the ``+ToilJobID`` attribute.
        jobName: value stored (quoted) in the ``+ToilJobName`` attribute.
        command: shell command line to execute.

    Extra ``name=value`` pairs separated by ``;`` may be supplied in the
    ``TOIL_HTCONDOR_PARAMS`` environment variable. Blank entries (for example
    from a trailing ``;``) are ignored.

    Raises:
        ValueError: if an extra parameter has no ``=`` or would override a
            parameter set here.
    """
    # Convert resource requests
    cpu = int(math.ceil(cpu))  # integer CPUs
    memory = float(memory) / 1024  # memory in KB
    disk = float(disk) / 1024  # disk in KB

    # Execute the entire command as /bin/sh -c "command"
    # TODO: Transfer the jobStore directory if using a local file store with a relative path.
    submit_parameters = {
        'executable': '/bin/sh',
        'transfer_executable': 'False',
        # Workaround for HTCondor Python bindings Unicode conversion bug
        'arguments': '''"-c '{0}'"'''.format(command).encode('utf-8'),
        'environment': self.getEnvString(),
        'getenv': 'True',
        'request_cpus': '{0}'.format(cpu),
        'request_memory': '{0:.3f}KB'.format(memory),
        'request_disk': '{0:.3f}KB'.format(disk),
        'leave_in_queue': '(JobStatus == 4)',
        '+IsToilJob': 'True',
        '+ToilJobID': '{0}'.format(jobID),
        '+ToilJobName': '"{0}"'.format(jobName),
        '+ToilJobKilled': 'False',
    }

    # Extra parameters for HTCondor
    extra_parameters = os.getenv('TOIL_HTCONDOR_PARAMS')
    if extra_parameters is not None:
        logger.debug(
            "Extra HTCondor parameters added to submit file from TOIL_HTCONDOR_PARAMS env. variable: {}"
            .format(extra_parameters))
        for parameter_value in extra_parameters.split(';'):
            if not parameter_value.strip():
                # Tolerate a trailing or doubled ';'.
                continue
            parameter, separator, value = parameter_value.partition('=')
            if not separator:
                raise ValueError(
                    "Extra parameter is not of the form name=value: {}".format(parameter_value))
            parameter = parameter.strip()
            value = value.strip()
            if parameter in submit_parameters:
                raise ValueError(
                    "Some extra parameters are incompatible: {}".format(extra_parameters))
            submit_parameters[parameter] = value

    # Return the Submit object
    return htcondor.Submit(submit_parameters)
def test_client_metrics(self):
    """Check that every glidein client metric is present in graphite with a non-zero datapoint.

    If the collector reports no startd, a sleep job is submitted first and the
    test waits 60 seconds before querying.
    """
    collector = htcondor.Collector()
    startd = collector.locateAll(htcondor.DaemonTypes.Startd)
    if len(startd) == 0:
        # Submitting some sleep jobs
        sleep_job = htcondor.Submit({
            "executable": "/bin/sleep",
            "arguments": "5m",
            "request_memory": "500",
        })
        schedd = htcondor.Schedd()
        with schedd.transaction() as txn:
            sleep_job.queue(txn, 1)
        # Waiting for the glideins to start
        time.sleep(60)

    client_uuid = 'pyglideinpyglideinclient'
    partition = 'Cluster'
    metrics = [
        'glideins.launched',
        'glideins.running',
        'glideins.idle',
        'glideins.avg_idle_time',
        'glideins.min_idle_time',
        'glideins.max_idle_time',
    ]

    for metric in metrics:
        path = '.'.join([self.metrics_namespace, client_uuid, partition, metric])
        url = 'http://{}/render?target={}'.format(self.metrics_graphite_server, path)
        url += '&format=json&from=-5min'
        output = requests.get(url).json()

        self.assertTrue(len(output) > 0,
                        msg='{} client metric not found'.format(path))
        if len(output) > 0:
            series = output[0]
            self.assertTrue(len(series['datapoints']) > 0,
                            msg='No datapoints found for {}.'.format(path))
            self.assertTrue(series['tags']['name'] == path,
                            msg='Metrics mismatch for {}.'.format(path))
            # The first element of each datapoint is compared against zero.
            not_zeros = any(point[0] != 0.0 for point in series['datapoints'])
            self.assertTrue(
                not_zeros,
                msg='Add datapoints are zero for {}.'.format(path))
def create(self, htcondor_job):
    """
    Submit a job & return the job id

    Returns None when submission raises or yields a falsy id.
    """
    try:
        description = htcondor.Submit(htcondor_job)
        with self._schedd.transaction() as txn:
            job_id = description.queue(txn, 1)
    except Exception:
        return None
    return job_id or None
def submit(self):
    '''Submit the test jobs for all prepIDs to HTCondor.

    A ``.jdl`` job file is written for every prepID. The jobs are only
    submitted when ``self.dryrun`` is false; on a dry run the
    prepID/campaign map is printed instead.
    '''
    prepid_campaign_map = self._get_prepid_campaign_map()

    for campaign_name, prepids in prepid_campaign_map.items():
        # Per-campaign directories for job output and job files
        outdir = mcmtest_path(f'output/{self.request_name}/{campaign_name}')
        if not os.path.exists(outdir):
            os.makedirs(outdir)

        jobfiledir = mcmtest_path(f'job_files/{self.request_name}/{campaign_name}')
        if not os.path.exists(jobfiledir):
            os.makedirs(jobfiledir)

        for prepid in prepids:
            # Executable arguments and per-prepID log files
            self.submission_settings['arguments'] = ' '.join(['$(proxy_path)', prepid])
            self.submission_settings['output'] = pjoin(outdir, f'out_{prepid}.txt')
            self.submission_settings['log'] = pjoin(outdir, f'log_{prepid}.txt')
            self.submission_settings['error'] = pjoin(outdir, f'err_{prepid}.txt')

            sub = htcondor.Submit(self.submission_settings)

            # Write the job file to submit
            jobfile = pjoin(jobfiledir, f'job_{prepid}.jdl')
            with open(jobfile, 'w+') as f:
                f.write(str(sub) + '\nqueue 1\n')

            # Submit the job, if dry run is not specified
            if self.dryrun:
                continue
            jobid = condor_submit(jobfile)
            print(f'Submitted job: {prepid}, Job ID: {jobid}')

    if self.dryrun:
        print('Dry run requested. No submissions will be made.')
        print('PrepID and campaign information:')
        pprint(prepid_campaign_map)
def launch(self, job_id):
    """Submit the HTCondor job described by the database record ``job_id``.

    The environment variables listed for the job's type are exported to
    ``os.environ`` (the job is submitted with ``getenv``), then the job is
    queued and the record is updated with the cluster id or an error message.

    Returns:
        Tuple ``(status, msg, cluster_id)``.
    """
    status = STATUS_OK
    msg = ''
    cluster_id = ''

    # Get the job info from the database
    job_info = self.db.get_job_info(job_id)
    spec = job_info['spec']

    # Load environment variables required for the job
    env = spec['env']
    executable = spec['executable']
    log_path = spec['log']
    out_path = spec['output']
    err_path = spec['error']
    for envvar in self.job_types[job_info['type']]['env']:
        os.environ[envvar] = '{}'.format(env[envvar])

    # log_path is passed below as the event-log *file*, so create the
    # directory that contains it rather than a directory at that path.
    log_dir = os.path.dirname(log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    job_spec = {
        "executable": executable,  # the program to run on the execute node
        "output": out_path,  # anything the job prints to standard output will end up in this file
        "error": err_path,  # anything the job prints to standard error will end up in this file
        "log": log_path,  # this file will contain a record of what happened to the job
        "getenv": "True",
    }

    # Submit the HTCondor job
    htcondor_job = htcondor.Submit(job_spec)
    htcondor_schedd = htcondor.Schedd()
    with htcondor_schedd.transaction() as txn:
        # queues one job in the current transaction
        cluster_id = htcondor_job.queue(txn)

    if not isinstance(cluster_id, int):
        msg = 'Error submitting Condor job'
        status = STATUS_ERROR
        self.update_job(job_id, updates={
            'msg': msg,
        })
    else:
        self.update_job(job_id, updates={
            'cluster_id': cluster_id,
            'status': 'submitted',
        })
    return status, msg, cluster_id