def Scan(config, ctr_dirs):
    """
    Query the OAR host for all jobs in /[controldir]/processing with
    ``oarstat``. If a job has stopped running, the diagnostics and comments
    files are updated. Finally ``gm-kick`` is executed on all jobs with an
    exit code.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """

    configure(config, set_slurm)
    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs:
        return
    if Config.remote_host:
        # NOTE: Assuming 256 B of TCP window needed for each job (squeue)
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key, (2 << 7)*len(jobs))

    execute = execute_local if not Config.remote_host else execute_remote
    #args = Config.slurm_bin_path + '/squeue -a -h -o %i:%T -t all -j ' + ','.join(jobs.keys())
    # Assumed intent: one -fj flag per job ID (the original join was missing
    # the surrounding spaces).
    args = Config.slurm_bin_path + '/oarstat -fj ' + ' -fj '.join(jobs.keys())
def Cancel(config, jobid):
    """
    Cancel a job. The TERM signal is sent to allow the process to terminate
    gracefully within 5 seconds, followed by a KILL signal.

    :param str config: path to arc.conf
    :param str jobid: local job ID
    :return: ``True`` if successfully cancelled, else ``False``
    :rtype: :py:obj:`bool`
    """

    debug('----- starting forkCancel.py -----', 'fork.Cancel')
    configure(config)

    if Config.remote_host:
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key)

    info('Killing job with pid %s' % jobid, 'fork.Cancel')
    if not Config.remote_host:
        import signal
        try:
            # The local job ID is the process ID; os.kill expects an integer.
            os.kill(int(jobid), signal.SIGTERM)
            time.sleep(5)
            os.kill(int(jobid), signal.SIGKILL)
        except OSError:
            # Job already died or terminated gracefully after SIGTERM
            pass
        except:
            return False
    else:
        args = 'kill -s TERM %s; sleep 5; kill -s KILL %s' % (jobid, jobid)
        handle = execute_remote(args)

    debug('----- exiting forkCancel.py -----', 'fork.Cancel')
    return True
def Cancel(config, jobid):
    """
    Cancel a job running at an LSF host with ``bkill``.

    :param str config: path to arc.conf
    :param str jobid: local job ID
    :return: ``True`` if successfully cancelled, else ``False``
    :rtype: :py:obj:`bool`
    """

    configure(config, set_lsf)
    return cancel('lsf', jobid)
def Cancel(config, jobid):
    """
    Cancel a job running at a SLURM host with ``scancel``.

    :param str config: path to arc.conf
    :param str jobid: local job ID
    :return: ``True`` if successfully cancelled, else ``False``
    :rtype: :py:obj:`bool`
    """

    verify_job_id(jobid)
    configure(config, set_slurm)
    return cancel("slurm", jobid)
def Cancel(config, jobid):
    """
    Cancel a job running at an OAR host with ``oardel``.

    :param str config: path to arc.conf
    :param str jobid: local job ID
    :return: ``True`` if successfully cancelled, else ``False``
    :rtype: :py:obj:`bool`
    """

    verify_job_id(jobid)
    configure(config, set_slurm)
    cmd = '%s/%s' % (Config.slurm_bin_path, 'oardel')
    return cancel([cmd, jobid], jobid)
def Scan(config, ctr_dirs):
    """
    Query the local or remote (SSH) machine for all jobs in
    /[controldir]/processing. If the job has stopped running, the exit code
    is read and the comments file is updated.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """

    configure(config, set_fork)
    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs:
        return
    if Config.remote_host:
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key)

    execute = execute_local if not Config.remote_host else execute_remote
    args = 'ps -opid ' + (' '.join(jobs.keys()))
    if '__FORK_TEST' in os.environ:
        handle = execute(args, env=dict(os.environ))
    else:
        handle = execute(args)
    if handle.returncode != 0:
        debug('Got error code %i from ps -opid' % handle.returncode, 'fork.Scan')
        debug('Error output is:\n' + ''.join(handle.stderr), 'fork.Scan')

    running = [line.strip() for line in handle.stdout]
    for localid, job in jobs.items():
        if localid in running:
            continue
        if set_exit_code_from_diag(job):
            job.message = MESSAGES[job.state]
        else:
            job.exitcode = -1
        with open(job.lrms_done_file, 'w') as f:
            f.write('%i %s\n' % (job.exitcode, job.message))
        write_comments(job)
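# The membership test in Scan above relies on raw `ps -opid` output: a 'PID'
# header line followed by one PID per line for each process that still exists.
# A minimal standalone sketch of the same check; subprocess stands in for the
# execute_local helper, which is not shown in this module, and the behaviour
# assumes a procps-style ps.
import subprocess

def _sketch_still_running(pids):
    """Return the subset of pids that ps still reports (illustrative only)."""
    # ps exits non-zero when some requested PIDs no longer exist, so the
    # return code is ignored and only the printed table is parsed.
    proc = subprocess.Popen(['ps', '-opid'] + list(pids), stdout=subprocess.PIPE)
    stdout, _ = proc.communicate()
    running = [line.strip() for line in stdout.decode().splitlines()]
    return [pid for pid in pids if pid in running]

# _sketch_still_running(['1', '999999']) would typically return ['1'] on Linux.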
def Submit(config, jobdesc):
    """
    Submits a job to the queue specified in arc.conf. This method executes
    the required RunTimeEnvironment scripts and assembles the bash job
    script. The job script is written to file and submitted with ``oarsub``.

    :param str config: path to arc.conf
    :param jobdesc: job description object
    :type jobdesc: :py:class:`arc.JobDescription`
    :return: local job ID if successfully submitted, else ``None``
    :rtype: :py:obj:`str`
    """

    configure(config, set_slurm)

    validate_attributes(jobdesc)
    if Config.remote_host:
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key)

    # Run RTE stage0
    debug('----- starting slurmSubmitter.py -----', 'slurm.Submit')
    RTE_stage0(jobdesc, 'SLURM', SBATCH_ACCOUNT='OtherAttributes.SBATCH_ACCOUNT')

    set_grid_global_jobid(jobdesc)

    # Create script file and write job script
    jobscript = get_job_script(jobdesc)
    script_file = write_script_file(jobscript)

    debug('Created file %s' % script_file, 'slurm.Submit')
    debug('SLURM jobname: %s' % jobdesc.Identification.JobName, 'slurm.Submit')
    debug('SLURM job script built', 'slurm.Submit')
    debug('----------------- BEGIN job script -----', 'slurm.Submit')
    emptylines = 0
    for line in jobscript.split('\n'):
        if not line:
            emptylines += 1
        else:
            debug(emptylines*'\n' + line.replace("%", "%%"), 'slurm.Submit')
            emptylines = 0
    if emptylines > 1:
        debug((emptylines-1)*'\n', 'slurm.Submit')
    debug('----------------- END job script -----', 'slurm.Submit')

    if 'ONLY_WRITE_JOBSCRIPT' in os.environ and os.environ['ONLY_WRITE_JOBSCRIPT'] == 'yes':
        return "-1"

    #######################################
    #  Submit the job
    ######################################
    execute = execute_local if not Config.remote_host else execute_remote
    directory = jobdesc.OtherAttributes['joboption;directory']

    debug('Session directory: %s' % directory, 'slurm.Submit')

    SLURM_TRIES = 0
    handle = None
    while SLURM_TRIES < 10:
        args = '%s/oarsub %s' % (Config.slurm_bin_path, script_file)
        verbose('Executing \'%s\' on %s' %
                (args, Config.remote_host if Config.remote_host else 'localhost'), 'slurm.Submit')
        handle = execute(args)
        if handle.returncode == 0:
            break
        if handle.returncode == 198 or wait_for_queue(handle):
            debug('Waiting for queue to decrease', 'slurm.Submit')
            time.sleep(60)
            SLURM_TRIES += 1
            continue
        break  # Other error than full queue

    if handle.returncode == 0:
        # TODO: Test what happens when the jobqueue is full or when the slurm
        # ctld is not responding. SLURM 1.x and 2.2.x outputs the jobid into
        # STDERR and STDOUT respectively. Concat them, and let sed sort it out.
        # From the exit code we know that the job was submitted, so this
        # is safe. Ulf Tigerstedt <*****@*****.**> 1.5.2011
        localid = get_job_id(handle)
        if localid:
            debug('Job submitted successfully!', 'slurm.Submit')
            debug('Local job id: ' + localid, 'slurm.Submit')
            debug('----- exiting slurmSubmitter.py -----', 'slurm.Submit')
            return localid

    debug('job *NOT* submitted successfully!', 'slurm.Submit')
    debug('got error code from oarsub: %d !' % handle.returncode, 'slurm.Submit')
    debug('Output is:\n' + ''.join(handle.stdout), 'slurm.Submit')
    debug('Error output is:\n' + ''.join(handle.stderr), 'slurm.Submit')
    debug('----- exiting slurmSubmitter.py -----', 'slurm.Submit')
def Submit(config, jobdesc):
    """
    Submits an ATLAS job to the ScGrid host specified in arc.conf. This
    method executes the required RunTimeEnvironment scripts and assembles
    the bash job script. The job script is written to file and submitted
    with SCEAPI.

    :param str config: path to arc.conf
    :param jobdesc: job description object
    :type jobdesc: :py:class:`arc.JobDescription`
    :return: local job ID if successfully submitted, else ``None``
    :rtype: :py:obj:`str`
    """

    import fcntl
    # Allow only one submit at the same time
    _lock = open("/tmp/sceapi-submit-job.lock", "a")
    fcntl.flock(_lock, fcntl.LOCK_EX)

    configure(config, set_sceapi)
    client = setup_api()

    validate_attributes(jobdesc)

    # Run RTE stage0
    debug("----- starting sceapiSubmitter.py -----", "sceapi.Submit")
    rel = re.compile(r"APPS/HEP/ATLAS-(?P<release>[\d\.]+-[\w_-]+)")
    release = None
    for rte in jobdesc.Resources.RunTimeEnvironment.getSoftwareList():
        match = rel.match(str(rte))
        if match:
            release = match.groupdict()["release"]
            break
    if not release:
        raise ArcError("ATLAS release not specified", "sceapi.Submit")

    # Create job dict
    jobJSDL = assemble_dict(jobdesc, release)
    args = jobJSDL.pop("arguments")
    input_dict = get_input_dict(jobdesc, args)

    debug("SCEAPI jobname: %s" % jobdesc.Identification.JobName, "sceapi.Submit")
    debug("SCEAPI job dict built", "sceapi.Submit")
    debug("----------------- BEGIN job dict -----", "sceapi.Submit")
    for key, val in jobJSDL.items():
        debug("%s : %s" % (key, val), "sceapi.Submit")
    debug("----------------- END job dict -----", "sceapi.Submit")

    #######################################
    #  Submit the job
    ######################################
    directory = jobdesc.OtherAttributes["joboption;directory"]
    debug("session directory: %s" % directory, "sceapi.Submit")

    resp = client.submitJSON(jobJSDL)
    handle = None
    try:
        handle = json.loads(resp, "utf8")
    except:
        raise ArcError("SCEAPI client response:\n%s" % str(resp), "sceapi.Submit")

    failure = ""
    if handle["status_code"] == 0:
        jobid = handle["gidujid"]["ujid"]
        gid = handle["gidujid"]["gid"]
        upload_tries = 0
        ret_code = -1
        while upload_tries < 5:
            resp_text = client.putfiles(gid, input_dict)
            try:
                ret_code = json.loads(resp_text, "utf8")["status_code"]
                assert ret_code == 0
                break
            except AssertionError:
                sleep(2)
                upload_tries += 1
            except:
                raise ArcError("SCEAPI client response:\n%s" % str(resp_text), "sceapi")
        if ret_code == 0:
            if json.loads(client.run(jobid), "utf8")["status_code"] == 0:
                debug("job submitted successfully!", "sceapi.Submit")
                debug("local job id: %s" % jobid, "sceapi.Submit")
                debug("----- exiting sceapiSubmitter.py -----", "sceapi.Submit")
                return jobid
            failure = "Start job query failed."
        else:
            failure = "Failed to upload input files."
    else:
        failure = "Status code %i: %s" % (handle["status_code"], translate(handle["status_reason"]))

    debug("job *NOT* submitted successfully!", "sceapi.Submit")
    if failure:
        debug(failure.encode("utf-8"), "sceapi.Submit")
    debug("----- exiting sceapiSubmitter.py -----", "sceapi.Submit")
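# The ATLAS release is picked out of the requested RunTimeEnvironment names by
# the regex used in Submit above. A quick standalone check of which strings it
# accepts; the RTE names below are made-up examples, not taken from a real site.
import re

_rel = re.compile(r"APPS/HEP/ATLAS-(?P<release>[\d\.]+-[\w_-]+)")
for _rte in ("APPS/HEP/ATLAS-17.2.7-X86_64-SLC5-GCC43-OPT", "ENV/PROXY"):
    _match = _rel.match(_rte)
    print("%s -> %s" % (_rte, _match.groupdict()["release"] if _match else None))
# The first string matches and yields '17.2.7-X86_64-SLC5-GCC43-OPT'; the second
# does not. If no requested RTE matches, Submit raises the
# "ATLAS release not specified" error.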
def Scan(config, ctr_dirs):
    """
    Query SCEAPI for all jobs in /[controldir]/processing. If the job has
    stopped running, the diagnostics file is updated and ``gm-kick`` is
    executed.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """

    configure(config, set_sceapi)
    # The SCEAPI client is needed below; setup_api() is the same helper used
    # by Submit.
    client = setup_api()

    time.sleep(30)

    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs:
        return

    query = "length=%i&ujids=%s" % (len(jobs), ",".join(jobs))
    resp = client.bjobs(query)
    sce_jobs = []
    try:
        ret_json = json.loads(resp, "utf8")
        sce_jobs = ret_json["jobs_list"]
    except:
        error("SCEAPI client response:\n%s" % str(resp), "sceapi.Scan")

    for jdict in sce_jobs:
        localid = str(jdict["ujid"])
        job = jobs[localid]
        job.state = str(jdict["status"])
        if job.state in RUNNING:
            continue

        if not download_output(job, client):
            add_failure(job)
            error("Failed to download all files for job %s." % job.globalid, "sceapi.Scan")
            if os.path.isfile(job.count_file):  # Count file is removed after 5 failures
                continue

        job.exitcode = (-1, 0)[job.state == "20"]
        job.message = MESSAGES[job.state]

        re_time = re.compile(r"^(?P<dd>\d\d\d)d(?P<HH>\d\d)h(?P<MM>\d\d)m(?P<SS>\d\d)s")
        walltime = re_time.match(jdict["walltime"]).groupdict()
        y = int(walltime["dd"]) // 365
        d = int(walltime["dd"]) % 365
        m, d = d // 30, d % 30
        walltime["yyyy"] = "%04i" % y
        walltime["mm"] = "%02i" % m
        walltime["dd"] = "%02i" % d
        zero_time = arc.common.Time("0000-00-00T00:00:00")
        job.WallTime = arc.common.Time(str(get_MDS(walltime))) - zero_time
        job.Processors = jdict["corenum"]

        put_cleanup_file(client, job.localid)
        with open(job.lrms_done_file, "w") as f:
            f.write("%d %s\n" % (job.exitcode, job.message))
        update_diag(job)
        gm_kick([job])
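# SCEAPI reports walltime as a 'dddHHhMMmSSs'-style string: three digits of
# days followed by hours, minutes and seconds. Scan splits the day count into
# pseudo years/months before handing the dict to get_MDS (not shown here).
# A standalone illustration with a made-up walltime value:
import re

_re_time = re.compile(r"^(?P<dd>\d\d\d)d(?P<HH>\d\d)h(?P<MM>\d\d)m(?P<SS>\d\d)s")
_walltime = _re_time.match("400d03h20m11s").groupdict()
_y, _d = int(_walltime["dd"]) // 365, int(_walltime["dd"]) % 365
_m, _d = _d // 30, _d % 30
_walltime["yyyy"], _walltime["mm"], _walltime["dd"] = "%04i" % _y, "%02i" % _m, "%02i" % _d
# _walltime now holds yyyy='0001', mm='01', dd='05' plus HH='03', MM='20', SS='11'.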
def Submit(config, jobdesc):
    """
    Submits a job to the SLURM queue specified in arc.conf. This method
    executes the required RunTimeEnvironment scripts and assembles the bash
    job script. The job script is written to file and submitted with
    ``sbatch``.

    :param str config: path to arc.conf
    :param jobdesc: job description object
    :type jobdesc: :py:class:`arc.JobDescription`
    :return: local job ID if successfully submitted, else ``None``
    :rtype: :py:obj:`str`
    """

    configure(config, set_slurm)

    validate_attributes(jobdesc)
    if Config.remote_host:
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key)

    # Run RTE stage0
    debug("----- starting slurmSubmitter.py -----", "slurm.Submit")
    RTE_stage0(jobdesc, "SLURM", SBATCH_ACCOUNT="OtherAttributes.SBATCH_ACCOUNT")

    # Create script file and write job script
    jobscript = get_job_script(jobdesc)
    script_file = write_script_file(jobscript)

    debug("SLURM jobname: %s" % jobdesc.Identification.JobName, "slurm.Submit")
    debug("SLURM job script built", "slurm.Submit")
    debug("----------------- BEGIN job script -----", "slurm.Submit")
    for line in jobscript.split("\n"):
        debug(line, "slurm.Submit")
    debug("----------------- END job script -----", "slurm.Submit")

    if "ONLY_WRITE_JOBSCRIPT" in os.environ and os.environ["ONLY_WRITE_JOBSCRIPT"] == "yes":
        return

    #######################################
    #  Submit the job
    ######################################
    execute = execute_local if not Config.remote_host else execute_remote
    directory = jobdesc.OtherAttributes["joboption;directory"]

    debug("Session directory: %s" % directory, "slurm.Submit")

    SLURM_TRIES = 0
    handle = None
    while SLURM_TRIES < 10:
        args = "%s/sbatch %s" % (Config.slurm_bin_path, script_file)
        verbose(
            "Executing '%s' on %s" % (args, Config.remote_host if Config.remote_host else "localhost"),
            "slurm.Submit"
        )
        handle = execute(args)
        if handle.returncode == 0:
            break
        if handle.returncode == 198 or wait_for_queue(handle):
            debug("Waiting for queue to decrease", "slurm.Submit")
            time.sleep(60)
            SLURM_TRIES += 1
            continue
        break  # Other error than full queue

    if handle.returncode == 0:
        # TODO: Test what happens when the jobqueue is full or when the slurm
        # ctld is not responding. SLURM 1.x and 2.2.x outputs the jobid into
        # STDERR and STDOUT respectively. Concat them, and let sed sort it out.
        # From the exit code we know that the job was submitted, so this
        # is safe. Ulf Tigerstedt <*****@*****.**> 1.5.2011
        localid = get_job_id(handle)
        if localid:
            debug("Job submitted successfully!", "slurm.Submit")
            debug("Local job id: " + localid, "slurm.Submit")
            debug("----- exiting slurmSubmitter.py -----", "slurm.Submit")
            return localid

    debug("job *NOT* submitted successfully!", "slurm.Submit")
    debug("got error code from sbatch: %d !" % handle.returncode, "slurm.Submit")
    debug("Output is:\n" + "".join(handle.stdout), "slurm.Submit")
    debug("Error output is:\n" + "".join(handle.stderr), "slurm.Submit")
    debug("----- exiting slurmSubmitter.py -----", "slurm.Submit")
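# The TODO above refers to older SLURM versions printing "Submitted batch job
# <id>" on stderr instead of stdout. get_job_id is defined elsewhere in the
# backend; a minimal sketch of such a helper, assuming `handle` exposes stdout
# and stderr as lists of lines, could look like this:
import re

def _sketch_get_job_id(handle):
    """Extract the numeric job ID from sbatch output (illustrative only)."""
    # Scan stdout and stderr together, since the ID may appear on either stream.
    for line in list(handle.stdout) + list(handle.stderr):
        match = re.search(r"Submitted batch job (\d+)", line)
        if match:
            return match.group(1)
    return None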
def Scan(config, ctr_dirs):
    """
    Query the SLURM host for all jobs in /[controldir]/processing with
    ``squeue``. If the job has stopped running, more detailed information is
    fetched with ``scontrol``, and the diagnostics and comments files are
    updated. Finally ``gm-kick`` is executed on all jobs with an exit code.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """

    configure(config, set_slurm)
    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs:
        return
    if Config.remote_host:
        # NOTE: Assuming 256 B of TCP window needed for each job (squeue)
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key, (2 << 7) * len(jobs))

    execute = execute_local if not Config.remote_host else execute_remote
    args = Config.slurm_bin_path + "/squeue -a -h -o %i:%T -t all -j " + ",".join(jobs.keys())
    if "__SLURM_TEST" in os.environ:
        handle = execute(args, env=dict(os.environ))
    else:
        handle = execute(args)
    if handle.returncode != 0:
        debug("Got error code %i from squeue" % handle.returncode, "slurm.Scan")
        debug("Error output is:\n" + "".join(handle.stderr), "slurm.Scan")

    # Slurm can report StartTime and EndTime in at least these two formats:
    # 2010-02-15T15:30:29 (MDS)
    # 02/15-15:25:15
    # Python does not support duplicate named groups.
    # Have to use separate regex if we want to use named groups.
    date_MDS = re.compile(r"^(?P<YYYY>\d\d\d\d)-(?P<mm>\d\d)-(?P<dd>\d\d)T(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$")
    date_2 = re.compile(r"^(?P<mm>\d\d)/(?P<dd>\d\d)-(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$")

    for line in handle.stdout:
        try:
            localid, state = line.strip().split(":", 1)
        except:
            if line:
                warn("Failed to parse squeue line: " + line, "slurm.Scan")
            continue

        job = jobs[localid]
        job.state = state
        if job.state in RUNNING:
            continue
        if not job.state:
            set_exit_code_from_diag(job)
        job.message = MESSAGES.get(job.state, "")

        args = Config.slurm_bin_path + "/scontrol -o show job %s" % localid
        scontrol_handle = execute(args)
        if scontrol_handle.returncode != 0:
            debug("Got error code %i from scontrol" % scontrol_handle.returncode, "slurm.Scan")
            debug("Error output is:\n" + "".join(scontrol_handle.stderr), "slurm.Scan")

        try:
            scontrol_dict = dict(item.split("=", 1) for item in
                                 re.split(" (?=[^ =]+=)", scontrol_handle.stdout[0]))
            job = jobs[scontrol_dict["JobId"]]
        except:
            warn("Failed to parse scontrol line: " + line, "slurm.Scan")
            continue

        if "ExitCode" in scontrol_dict:
            ec1, ec2 = scontrol_dict["ExitCode"].split(":")
            job.exitcode = int(ec2) + 256 if int(ec2) != 0 else int(ec1)
        else:
            job.exitcode = 0 if state == "COMPLETED" else -1
        if (state == "NODE_FAIL" or state == "CANCELLED") and \
           ("ExitCode" not in scontrol_dict or job.exitcode == 0):
            job.exitcode = 15
            job.message = "Job was cancelled by SLURM"

        if "StartTime" in scontrol_dict:
            match = date_MDS.match(scontrol_dict["StartTime"]) or date_2.match(scontrol_dict["StartTime"])
            scontrol_dict["StartTime"] = get_MDS(match.groupdict())
            job.LRMSStartTime = arc.common.Time(scontrol_dict["StartTime"])
        if "EndTime" in scontrol_dict:
            match = date_MDS.match(scontrol_dict["EndTime"]) or date_2.match(scontrol_dict["EndTime"])
            scontrol_dict["EndTime"] = get_MDS(match.groupdict())
            job.LRMSEndTime = arc.common.Time(scontrol_dict["EndTime"])
        if "StartTime" in scontrol_dict and "EndTime" in scontrol_dict:
            job.WallTime = job.LRMSEndTime - job.LRMSStartTime
        if "NumCPUs" in scontrol_dict:
            job.Processors = scontrol_dict["NumCPUs"]

        with open(job.lrms_done_file, "w") as f:
            f.write("%d %s\n" % (job.exitcode, job.message))
        write_comments(job)
        update_diag(job)

    kicklist = [job for job in jobs.itervalues() if job.state not in RUNNING]
    kicklist.extend([job for job in jobs.itervalues() if job.state == "CANCELLED"])  # kick twice
    gm_kick(kicklist)
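# The two scontrol time formats handled in Scan can be exercised in isolation.
# get_MDS is defined elsewhere in the backend, so this sketch only shows the
# named groups each regex produces; the sample stamps are the ones quoted in
# the comment above.
import re

_date_MDS = re.compile(r"^(?P<YYYY>\d\d\d\d)-(?P<mm>\d\d)-(?P<dd>\d\d)T(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$")
_date_2 = re.compile(r"^(?P<mm>\d\d)/(?P<dd>\d\d)-(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$")

for _stamp in ("2010-02-15T15:30:29", "02/15-15:25:15"):
    _groups = (_date_MDS.match(_stamp) or _date_2.match(_stamp)).groupdict()
    print("%s -> %s" % (_stamp, sorted(_groups.items())))
# Note the short format carries no YYYY group, so get_MDS presumably has to
# supply the year itself.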
""" Main python file for the application """ from common import config, secrets from news.news import NewsPuller from datetime import datetime, timedelta, tzinfo import argparse, logging, pytz TAG = 'app.py -' # -m <mode ['debug', 'production']> parser = argparse.ArgumentParser(description='scrape stock statistics and news') parser.add_argument('-m', '--mode', help='mode in which to run the script') args = parser.parse_args() config.configure(args.mode) # Get yesterday's date, ensure that an hour has passed from yesterday (News API delay) current_time = datetime.now(tz=pytz.timezone('US/Eastern')) yesterday = current_time - timedelta(days=1) - timedelta(hours=1) if yesterday.hour < 1: logging.exception('%s Script ran too soon in the day', TAG) raise RuntimeError('Script was run too soon') # Run data collection scripts for those days news_puller = NewsPuller(secrets.NEWS_API_KEY) news_puller.pullTickerNews(yesterday)
def Submit(config, jobdesc):
    """
    Submits a job to the LSF queue specified in arc.conf. This method
    executes the required RunTimeEnvironment scripts and assembles the bash
    job script. The job script is written to file and submitted with
    ``bsub``.

    :param str config: path to arc.conf
    :param jobdesc: job description object
    :type jobdesc: :py:class:`arc.JobDescription`
    :return: local job ID if successfully submitted, else ``None``
    :rtype: :py:obj:`str`
    """

    configure(config, set_lsf)

    validate_attributes(jobdesc)
    if Config.remote_host:
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key)

    # Run RTE stage0
    debug('----- starting lsfSubmitter.py -----', 'lsf.Submit')
    RTE_stage0(jobdesc, 'LSF')

    # Create script file and write job script
    jobscript = get_job_script(jobdesc)
    script_file = write_script_file(jobscript)

    debug('LSF jobname: %s' % jobdesc.Identification.JobName, 'lsf.Submit')
    debug('LSF job script built', 'lsf.Submit')
    debug('----------------- BEGIN job script -----', 'lsf.Submit')
    for line in jobscript.split('\n'):
        debug(line, 'lsf.Submit')
    debug('----------------- END job script -----', 'lsf.Submit')

    if 'ONLY_WRITE_JOBSCRIPT' in os.environ and os.environ['ONLY_WRITE_JOBSCRIPT'] == 'yes':
        return False

    #######################################
    #  Submit the job
    ######################################
    execute = execute_local if not Config.remote_host else execute_remote
    directory = jobdesc.OtherAttributes['joboption;directory']

    debug('Session directory: %s' % directory, 'lsf.Submit')

    LSF_TRIES = 0
    args = '%s %s/bsub < %s' % (Config.lsf_setup, Config.lsf_bin_path, script_file)
    verbose('executing \'%s\' on %s' %
            (args, Config.remote_host if Config.remote_host else 'localhost'), 'lsf.Submit')
    handle = execute(args)

    if handle.returncode == 0:
        localid = get_job_id(handle)
        if localid:
            debug('Job submitted successfully!', 'lsf.Submit')
            debug('Local job id: ' + localid, 'lsf.Submit')
            debug('----- exiting lsfSubmitter.py -----', 'lsf.Submit')
            return localid

    debug('job *NOT* submitted successfully!', 'lsf.Submit')
    debug('got error code from bsub: %d !' % handle.returncode, 'lsf.Submit')
    debug('Output is:\n' + ''.join(handle.stdout), 'lsf.Submit')
    debug('Error output is:\n' + ''.join(handle.stderr), 'lsf.Submit')
    debug('----- exiting lsfSubmitter.py -----', 'lsf.Submit')
def Scan(config, ctr_dirs):
    """
    Query the LSF host for all jobs in /[controldir]/processing with
    ``bjobs``. If the job has stopped running, the exit code is read and the
    diagnostics and comments files are updated. Finally ``gm-kick`` is
    executed on all jobs with an exit code. If the exit code can not be read
    from the diagnostics file, it will (after 5 tries) be kicked with status
    UNKNOWN.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """

    configure(config, set_lsf)
    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs:
        return
    if Config.remote_host:
        # NOTE: Assuming 256 B of TCP window needed for each job
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key, (2 << 7)*len(jobs))

    lsf_bin_path = Config.lsf_bin_path
    execute = execute_local if not Config.remote_host else execute_remote
    args = Config.lsf_setup + ' ' + lsf_bin_path + '/bjobs -w -W ' + ' '.join(jobs.keys())
    if '__LSF_TEST' in os.environ:
        handle = execute(args, env=dict(os.environ))
    else:
        handle = execute(args)

    def handle_job(info, in_lsf=True):
        job = jobs[info[0]]
        job.state = info[2]
        if job.state in RUNNING:
            if os.path.exists(job.count_file):
                os.remove(job.count_file)
            return
        if set_exit_code_from_diag(job):
            if in_lsf:
                start, end = info[-2:]
                re_date = re.compile(r'^(?P<mm>\d\d)/(?P<dd>\d\d)-(?P<HH>\d\d):(?P<MM>\d\d)')
                job.LRMSStartTime = arc.common.Time(get_MDS(re_date.match(start).groupdict()))
                if end != '-':
                    job.LRMSEndTime = arc.common.Time(get_MDS(re_date.match(end).groupdict()))
                    job.WallTime = job.LRMSEndTime - job.LRMSStartTime
            # Job finished and exitcode found
            job.message = MESSAGES[job.state]
            return
        # else
        add_failure(job)

    # Handle jobs known to LSF
    for line in handle.stdout[1:]:
        try:
            info = line.strip().split()
            assert(len(info) == 15)
            handle_job(info)
        except Exception as e:
            if line:
                warn('Failed to parse bjobs line: %s\n%s' % (line, str(e)), 'lsf.Scan')

    # Handle jobs lost in LSF
    if handle.returncode != 0:
        debug('Got error code %i from bjobs' % handle.returncode, 'lsf.Scan')
        debug('Error output is:\n' + ''.join(handle.stderr), 'lsf.Scan')
        lost_job = re.compile(r'Job <(\d+)> is not found')
        for line in handle.stderr:
            match = lost_job.match(line)
            if match:
                handle_job([match.groups()[0], None, 'UNKNOWN'], False)

    kicklist = []
    for job in jobs.itervalues():
        if hasattr(job, 'exitcode'):
            with open(job.lrms_done_file, 'w') as f:
                f.write('%d %s\n' % (job.exitcode, job.message))
            write_comments(job)
            update_diag(job)
            kicklist.append(job)
    gm_kick(kicklist)
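# handle_job only uses a few of the 15 whitespace-separated columns that
# `bjobs -w -W` prints: the job ID (info[0]), the status (info[2]) and the
# start/finish stamps (info[-2:]) in mm/dd-HH:MM form. A synthetic line,
# purely illustrative, shows the indices in play:
_line = "1234 griduser DONE normal host1 host2 gridjob 02/15-14:00 default 00:12 0M 0M - 02/15-14:01 02/15-15:25"
_info = _line.split()

assert len(_info) == 15
_jobid, _state = _info[0], _info[2]   # '1234', 'DONE'
_start, _end = _info[-2:]             # '02/15-14:01', '02/15-15:25'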
def Submit(config, jobdesc):
    """
    Submits a job to the local or remote (SSH) machine. This method executes
    the required RunTimeEnvironment scripts and assembles the bash job
    script. The job script is written to file and run.

    :param str config: path to arc.conf
    :param jobdesc: job description object
    :type jobdesc: :py:class:`arc.JobDescription`
    :return: local job ID if successfully submitted, else ``None``
    :rtype: :py:obj:`str`
    """

    configure(config, set_fork)

    validate_attributes(jobdesc)
    if Config.remote_host:
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key)

    # Run RTE stage0
    debug('----- starting forkSubmitter.py -----', 'fork.Submit')
    RTE_stage0(jobdesc, 'fork')

    # Create tmp script file and write job script
    jobscript = get_job_script(jobdesc)
    script_file = write_script_file(jobscript)

    debug('Fork jobname: %s' % jobdesc.Identification.JobName, 'fork.Submit')
    debug('Fork job script built', 'fork.Submit')
    debug('----------------- BEGIN job script -----', 'fork.Submit')
    for line in jobscript.split('\n'):
        debug(line, 'fork.Submit')
    debug('----------------- END job script -----', 'fork.Submit')

    if 'ONLY_WRITE_JOBSCRIPT' in os.environ and os.environ['ONLY_WRITE_JOBSCRIPT'] == 'yes':
        return

    #######################################
    #  Submit the job
    ######################################
    execute = execute_local if not Config.remote_host else execute_remote
    directory = jobdesc.OtherAttributes['joboption;directory']

    debug('Session directory: %s' % directory, 'fork.Submit')

    handle = execute(script_file)

    if handle.returncode == 0:
        jobid = None
        try:
            # The first stdout line carries the local job ID after a fixed
            # 5-character prefix.
            jobid = handle.stdout[0][5:]
            debug('Job submitted successfully!', 'fork.Submit')
            debug('Local job id: ' + jobid, 'fork.Submit')
            debug('----- exiting forkSubmitter.py -----', 'fork.Submit')
            return jobid
        except:
            pass

    debug('Job *NOT* submitted successfully!', 'fork.Submit')
    debug('Got error code: %d !' % (handle.returncode), 'fork.Submit')
    debug('Output is:\n' + ''.join(handle.stdout), 'fork.Submit')
    debug('Error output is:\n' + ''.join(handle.stderr), 'fork.Submit')
    debug('----- exiting forkSubmitter.py -----', 'fork.Submit')