Example #1
def Scan(config, ctr_dirs):
    """
    Query the SLURM host for all jobs in /[controldir]/processing with ``squeue``.
    If the job has stopped running, more detailed information is fetched with ``scontrol``,
    and the diagnostics and comments files are updated. Finally ``gm-kick`` is executed
    on all jobs with an exit code.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories 
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """
    
    configure(config, set_slurm)
    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs: return
    if Config.remote_host:
        # NOTE: Assuming 256 B of TCP window needed for each job (squeue)
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key, (2 << 7)*len(jobs)) 

    execute = execute_local if not Config.remote_host else execute_remote
    #args = Config.slurm_bin_path + '/squeue -a -h -o %i:%T -t all -j ' + ','.join(jobs.keys())
    args = Config.slurm_bin_path + '/oarstat -fj ' + ' -fj '.join(jobs.keys())
Example #2
def Cancel(config, jobid):
    """
    Cancel a job. The TERM signal is sent to allow the process to terminate
    gracefully within 5 seconds, followed by a KILL signal.

    :param str config: path to arc.conf
    :param str jobid: local job ID
    :return: ``True`` if successfully cancelled, else ``False``
    :rtype: :py:obj:`bool`
    """

    debug('----- starting forkCancel.py -----', 'fork.Cancel')

    configure(config)
    if Config.remote_host:
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key)

    info('Killing job with pid %s' % jobid, 'fork.Cancel')
    if not Config.remote_host:
        import signal
        try:
            os.kill(int(jobid), signal.SIGTERM)
            time.sleep(5)
            os.kill(int(jobid), signal.SIGKILL)
        except OSError:
            # Job already died or terminated gracefully after SIGTERM
            pass
        except:
            return False
    else:
        args = 'kill -s TERM %s; sleep 5; kill -s KILL %s' % (jobid, jobid)
        handle = execute_remote(args)

    debug('----- exiting forkCancel.py -----', 'fork.Cancel')
    return True
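
Note: the TERM-then-KILL sequence above gives the process five seconds to exit cleanly before it is forcibly killed. Below is a minimal local sketch of the same pattern (a hypothetical helper, not part of the ARC backend), which additionally probes the process with signal 0 so SIGKILL is only sent if the process survived SIGTERM:

import os
import signal
import time

def terminate_gracefully(pid, grace=5):
    """Send SIGTERM, wait, then SIGKILL only if the process is still alive."""
    try:
        os.kill(pid, signal.SIGTERM)
    except OSError:
        return True                       # process already gone
    time.sleep(grace)
    try:
        os.kill(pid, 0)                   # signal 0 only checks for existence
    except OSError:
        return True                       # exited after SIGTERM
    try:
        os.kill(pid, signal.SIGKILL)
    except OSError:
        pass
    return True
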
Example #3
def Cancel(config, jobid):
    """
    Cancel a job running at an LSF host with ``bkill``.

    :param str config: path to arc.conf
    :param str jobid: local job ID
    :return: ``True`` if successfully cancelled, else ``False``
    :rtype: :py:obj:`bool`
    """

    configure(config, set_lsf)
    return cancel('lsf', jobid)
Example #4
def Cancel(config, jobid):
    """
    Cancel a job running at a SLURM host with ``scancel``.

    :param str config: path to arc.conf
    :param str jobid: local job ID
    :return: ``True`` if successfully cancelled, else ``False``
    :rtype: :py:obj:`bool`
    """

    verify_job_id(jobid)
    configure(config, set_slurm)
    return cancel("slurm", jobid)
Example #5
def Cancel(config, jobid):
    """
    Cancel a job running at a SLURM host with ``scancel``.

    :param str config: path to arc.conf
    :param str jobid: local job ID
    :return: ``True`` if successfully cancelled, else ``False``
    :rtype: :py:obj:`bool`
    """

    verify_job_id(jobid)
    configure(config, set_slurm)
    cmd = '%s/%s' % (Config.slurm_bin_path, 'oardel')
    return cancel([cmd, jobid], jobid)
Example #6
def Scan(config, ctr_dirs):
    """
    Query the local or remote (SSH) machine for all jobs in /[controldir]/processing.
    If the job has stopped running, the exit code is read and the comments file
    is updated.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories 
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """

    configure(config, set_fork)
    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs: return
    if Config.remote_host:
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key)

    execute = execute_local if not Config.remote_host else execute_remote
    args = 'ps -opid ' + (' '.join(jobs.keys()))
    if '__FORK_TEST' in os.environ:
        handle = execute(args, env=dict(os.environ))
    else:
        handle = execute(args)
    if handle.returncode != 0:
        debug('Got error code %i from ps -opid' % handle.returncode, 'fork.Scan')
        debug('Error output is:\n' + ''.join(handle.stderr), 'fork.Scan')

    running = [line.strip() for line in handle.stdout]
    for localid, job in jobs.items():
        if localid in running:
            continue
        if set_exit_code_from_diag(job):
            job.message = MESSAGES[job.state]
        else:
            job.exitcode = -1
        
        with open(job.lrms_done_file, 'w') as f:
            f.write('%i %s\n' % (job.exitcode, job.message))
        write_comments(job)
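
The loop above treats any PID missing from the ``ps -opid`` output as a finished job. A quick illustration with made-up values (``ps`` prints a PID header line followed by one PID per process that still exists, so the header never collides with a numeric job ID):

pids = ['1234', '5678']                         # hypothetical local job IDs
stdout = ['  PID\n', ' 1234\n']                 # example output of `ps -opid 1234 5678`
running = [line.strip() for line in stdout]     # ['PID', '1234']
finished = [pid for pid in pids if pid not in running]
print(finished)                                 # ['5678'] -> gets an lrms_done entry
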
Example #7
def Submit(config, jobdesc):
    """
    Submits a job to the SLURM queue specified in arc.conf. This method executes the required
    RunTimeEnvironment scripts and assembles the bash job script. The job script is
    written to file and submitted with ``sbatch``.

    :param str config: path to arc.conf
    :param jobdesc: job description object
    :type jobdesc: :py:class:`arc.JobDescription`
    :return: local job ID if successfully submitted, else ``None``
    :rtype: :py:obj:`str`
    """

    configure(config, set_slurm)

    validate_attributes(jobdesc)
    if Config.remote_host:
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key)
        
    # Run RTE stage0
    debug('----- starting slurmSubmitter.py -----', 'slurm.Submit')
    RTE_stage0(jobdesc, 'SLURM', SBATCH_ACCOUNT = 'OtherAttributes.SBATCH_ACCOUNT')

    set_grid_global_jobid(jobdesc)

    # Create script file and write job script
    jobscript = get_job_script(jobdesc)
    script_file = write_script_file(jobscript)
    debug('Created file %s' % script_file, 'slurm.Submit')

    debug('SLURM jobname: %s' % jobdesc.Identification.JobName, 'slurm.Submit')
    debug('SLURM job script built', 'slurm.Submit')
    debug('----------------- BEGIN job script -----', 'slurm.Submit')
    emptylines = 0
    for line in jobscript.split('\n'):
        if not line:
            emptylines += 1
        else:
            debug(emptylines*'\n' + line.replace("%", "%%"), 'slurm.Submit')
            emptylines = 0
    if emptylines > 1:
        debug((emptylines-1)*'\n', 'slurm.Submit')
    debug('----------------- END job script -----', 'slurm.Submit')

    if 'ONLY_WRITE_JOBSCRIPT' in os.environ and os.environ['ONLY_WRITE_JOBSCRIPT'] == 'yes':
        return "-1"

    #######################################
    #  Submit the job
    ######################################

    execute = execute_local if not Config.remote_host else execute_remote
    directory = jobdesc.OtherAttributes['joboption;directory']

    debug('Session directory: %s' % directory, 'slurm.Submit')

    SLURM_TRIES = 0
    handle = None
    while SLURM_TRIES < 10:
        args = '%s/oarsub %s' % (Config.slurm_bin_path, script_file)
        verbose('Executing \'%s\' on %s' % 
                (args, Config.remote_host if Config.remote_host else 'localhost'), 'slurm.Submit')
        handle = execute(args)
        if handle.returncode == 0:
            break
        if handle.returncode == 198 or wait_for_queue(handle):
            debug('Waiting for queue to decrease', 'slurm.Submit')
            time.sleep(60)
            SLURM_TRIES += 1
            continue
        break # Other error than full queue

    if handle.returncode == 0:
        # TODO: Test what happens when the jobqueue is full or when the slurm
        # ctld is not responding. SLURM 1.x and 2.2.x outputs the jobid into 
        # STDERR and STDOUT respectively. Concat them, and let sed sort it out. 
        # From the exit code we know that the job was submitted, so this
        # is safe. Ulf Tigerstedt <*****@*****.**> 1.5.2011 
        localid = get_job_id(handle)
        if localid:
            debug('Job submitted successfully!', 'slurm.Submit')
            debug('Local job id: ' + localid, 'slurm.Submit')
            debug('----- exiting slurmSubmitter.py -----', 'slurm.Submit')
            return localid

    debug('job *NOT* submitted successfully!', 'slurm.Submit')
    debug('got error code from sbatch: %d !' % handle.returncode, 'slurm.Submit')
    debug('Output is:\n' + ''.join(handle.stdout), 'slurm.Submit')
    debug('Error output is:\n' + ''.join(handle.stderr), 'slurm.Submit')
    debug('----- exiting slurmSubmitter.py -----', 'slurm.Submit')
Example #8
def Submit(config, jobdesc):
    """    
    Submits an ATLAS job to the ScGrid host specified in arc.conf. This method executes the required
    RunTimeEnvironment scripts and assembles the bash job script. The job script is
    written to file and submitted with SCEAPI.
                                                                                     
    :param str config: path to arc.conf
    :param jobdesc: job description object  
    :type jobdesc: :py:class:`arc.JobDescription`
    :return: local job ID if successfully submitted, else ``None``
    :rtype: :py:obj:`str`
    """

    import fcntl

    # Allow only one submit at the same time
    _lock = open("/tmp/sceapi-submit-job.lock", "a")
    fcntl.flock(_lock, fcntl.LOCK_EX)

    configure(config, set_sceapi)
    client = setup_api()

    validate_attributes(jobdesc)

    # Run RTE stage0
    debug("----- starting sceapiSubmitter.py -----", "sceapi.Submit")
    rel = re.compile(r"APPS/HEP/ATLAS-(?P<release>[\d\.]+-[\w_-]+)")
    release = None
    for rte in jobdesc.Resources.RunTimeEnvironment.getSoftwareList():
        match = rel.match(str(rte))
        if match:
            release = match.groupdict()["release"]
            break
    if not release:
        raise ArcError("ATLAS release not specified", "sceapi.Submit")

    # Create job dict
    jobJSDL = assemble_dict(jobdesc, release)
    args = jobJSDL.pop("arguments")
    input_dict = get_input_dict(jobdesc, args)

    debug("SCEAPI jobname: %s" % jobdesc.Identification.JobName, "sceapi.Submit")
    debug("SCEAPI job dict built", "sceapi.Submit")
    debug("----------------- BEGIN job dict -----", "sceapi.Submit")
    for key, val in jobJSDL.items():
        debug("%s : %s" % (key, val), "sceapi.Submit")
    debug("----------------- END job dict -----", "sceapi.Submit")

    #######################################
    #  Submit the job
    ######################################

    directory = jobdesc.OtherAttributes["joboption;directory"]
    debug("session directory: %s" % directory, "sceapi.Submit")
    resp = client.submitJSON(jobJSDL)
    handle = None
    try:
        handle = json.loads(resp, "utf8")
    except:
        raise ArcError("SCEAPI client response:\n%s" % str(resp), "sceapi.Submit")

    failure = ""
    if handle["status_code"] == 0:

        jobid = handle["gidujid"]["ujid"]
        gid = handle["gidujid"]["gid"]

        upload_tries = 0
        ret_code = -1
        while upload_tries < 5:
            resp_text = client.putfiles(gid, input_dict)
            try:
                ret_code = json.loads(resp_text, "utf8")["status_code"]
                assert ret_code == 0
                break
            except AssertionError:
                sleep(2)
                upload_tries += 1
            except:
                raise ArcError("SCEAPI client response:\n%s" % str(resp_text), "sceapi")

        if ret_code == 0:
            if json.loads(client.run(jobid), "utf8")["status_code"] == 0:
                debug("job submitted successfully!", "sceapi.Submit")
                debug("local job id: %s" % jobid, "sceapi.Submit")
                debug("----- exiting sceapiSubmitter.py -----", "sceapi.Submit")
                return jobid
            failure = "Start job query failed."
        else:
            failure = "Failed to upload input files."
    else:
        failure = "Status code %i: %s" % (handle["status_code"], translate(handle["status_reason"]))

    debug("job *NOT* submitted successfully!", "sceapi.Submit")
    if failure:
        debug(failure.encode("utf-8"), "sceapi.Submit")
    debug("----- exiting sceapiSubmitter.py -----", "sceapi.Submit")
Example #9
def Scan(config, ctr_dirs):
    """
    Query SCEAPI for all jobs in /[controldir]/processing. If the job has stopped running,
    the diagnostics file is updated and ``gm-kick`` is executed.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories 
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """

    configure(config, set_sceapi)
    client = setup_api()  # SCEAPI client, needed for the bjobs query below
    time.sleep(30)

    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs:
        return

    query = "length=%i&ujids=%s" % (len(jobs), ",".join(jobs))
    resp = client.bjobs(query)
    sce_jobs = []

    try:
        ret_json = json.loads(resp, "utf8")
        sce_jobs = ret_json["jobs_list"]
    except:
        error("SCEAPI client response:\n%s" % str(resp), "sceapi.Scan")

    for jdict in sce_jobs:

        localid = str(jdict["ujid"])
        job = jobs[localid]
        job.state = str(jdict["status"])

        if job.state in RUNNING:
            continue

        if not download_output(job, client):
            add_failure(job)
            error("Failed to download all files for job %s." % job.globalid, "sceapi.Scan")
            if os.path.isfile(job.count_file):  # Count file is removed after 5 failures
                continue

        job.exitcode = (-1, 0)[job.state == "20"]
        job.message = MESSAGES[job.state]

        re_time = re.compile(r"^(?P<dd>\d\d\d)d(?P<HH>\d\d)h(?P<MM>\d\d)m(?P<SS>\d\d)s")
        walltime = re_time.match(jdict["walltime"]).groupdict()
        y = int(walltime["dd"]) / 365
        d = int(walltime["dd"]) % 365
        m, d = d / 30, d % 30
        walltime["yyyy"] = "%04i" % y
        walltime["mm"] = "%02i" % m
        walltime["dd"] = "%02i" % d
        zero_time = arc.common.Time("0000-00-00T00:00:00")

        job.WallTime = arc.common.Time(str(get_MDS(walltime))) - zero_time
        job.Processors = jdict["corenum"]
        put_cleanup_file(client, job.localid)

        with open(job.lrms_done_file, "w") as f:
            f.write("%d %s\n" % (job.exitcode, job.message))
        update_diag(job)
        gm_kick([job])
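
The walltime handling above converts SCEAPI's 'DDDdHHhMMmSSs' string into an MDS-style timestamp so it can be turned into an ``arc.common.Time`` interval relative to ``zero_time``. A worked example of the arithmetic (``get_MDS`` is defined in the shared helpers; the formatting line below only approximates its output):

import re

re_time = re.compile(r"^(?P<dd>\d\d\d)d(?P<HH>\d\d)h(?P<MM>\d\d)m(?P<SS>\d\d)s")
walltime = re_time.match("400d05h30m15s").groupdict()
y, d = int(walltime["dd"]) // 365, int(walltime["dd"]) % 365   # 1 year, 35 days
m, d = d // 30, d % 30                                         # 1 month, 5 days
# Rough stand-in for the string get_MDS() would build:
mds = "%04i-%02i-%02iT%s:%s:%s" % (y, m, d, walltime["HH"], walltime["MM"], walltime["SS"])
print(mds)   # 0001-01-05T05:30:15
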
Example #10
def Submit(config, jobdesc):
    """
    Submits a job to the SLURM queue specified in arc.conf. This method executes the required
    RunTimeEnvironment scripts and assembles the bash job script. The job script is
    written to file and submitted with ``sbatch``.

    :param str config: path to arc.conf
    :param jobdesc: job description object
    :type jobdesc: :py:class:`arc.JobDescription`
    :return: local job ID if successfully submitted, else ``None``
    :rtype: :py:obj:`str`
    """

    configure(config, set_slurm)

    validate_attributes(jobdesc)
    if Config.remote_host:
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key)

    # Run RTE stage0
    debug("----- starting slurmSubmitter.py -----", "slurm.Submit")
    RTE_stage0(jobdesc, "SLURM", SBATCH_ACCOUNT="OtherAttributes.SBATCH_ACCOUNT")

    # Create script file and write job script
    jobscript = get_job_script(jobdesc)
    script_file = write_script_file(jobscript)

    debug("SLURM jobname: %s" % jobdesc.Identification.JobName, "slurm.Submit")
    debug("SLURM job script built", "slurm.Submit")
    debug("----------------- BEGIN job script -----", "slurm.Submit")
    for line in jobscript.split("\n"):
        debug(line, "slurm.Submit")
    debug("----------------- END job script -----", "slurm.Submit")

    if "ONLY_WRITE_JOBSCRIPT" in os.environ and os.environ["ONLY_WRITE_JOBSCRIPT"] == "yes":
        return

    #######################################
    #  Submit the job
    ######################################

    execute = execute_local if not Config.remote_host else execute_remote
    directory = jobdesc.OtherAttributes["joboption;directory"]

    debug("Session directory: %s" % directory, "slurm.Submit")

    SLURM_TRIES = 0
    handle = None
    while SLURM_TRIES < 10:
        args = "%s/sbatch %s" % (Config.slurm_bin_path, script_file)
        verbose(
            "Executing '%s' on %s" % (args, Config.remote_host if Config.remote_host else "localhost"), "slurm.Submit"
        )
        handle = execute(args)
        if handle.returncode == 0:
            break
        if handle.returncode == 198 or wait_for_queue(handle):
            debug("Waiting for queue to decrease", "slurm.Submit")
            time.sleep(60)
            SLURM_TRIES += 1
            continue
        break  # Other error than full queue

    if handle.returncode == 0:
        # TODO: Test what happens when the jobqueue is full or when the slurm
        # ctld is not responding. SLURM 1.x and 2.2.x outputs the jobid into
        # STDERR and STDOUT respectively. Concat them, and let sed sort it out.
        # From the exit code we know that the job was submitted, so this
        # is safe. Ulf Tigerstedt <*****@*****.**> 1.5.2011
        localid = get_job_id(handle)
        if localid:
            debug("Job submitted successfully!", "slurm.Submit")
            debug("Local job id: " + localid, "slurm.Submit")
            debug("----- exiting submitSubmitter.py -----", "slurm.Submit")
            return localid

    debug("job *NOT* submitted successfully!", "slurm.Submit")
    debug("got error code from sbatch: %d !" % handle.returncode, "slurm.Submit")
    debug("Output is:\n" + "".join(handle.stdout), "slurm.Submit")
    debug("Error output is:\n" + "".join(handle.stderr), "slurm.Submit")
    debug("----- exiting slurmSubmitter.py -----", "slurm.Submit")
Example #11
def Scan(config, ctr_dirs):
    """
    Query the SLURM host for all jobs in /[controldir]/processing with ``squeue``.
    If the job has stopped running, more detailed information is fetched with ``scontrol``,
    and the diagnostics and comments files are updated. Finally ``gm-kick`` is executed
    on all jobs with an exit code.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories 
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """

    configure(config, set_slurm)
    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs:
        return
    if Config.remote_host:
        # NOTE: Assuming 256 B of TCP window needed for each job (squeue)
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key, (2 << 7) * len(jobs))

    execute = execute_local if not Config.remote_host else execute_remote
    args = Config.slurm_bin_path + "/squeue -a -h -o %i:%T -t all -j " + ",".join(jobs.keys())
    if os.environ.has_key("__SLURM_TEST"):
        handle = execute(args, env=dict(os.environ))
    else:
        handle = execute(args)
    if handle.returncode != 0:
        debug("Got error code %i from squeue" % handle.returncode, "slurm.Scan")
        debug("Error output is:\n" + "".join(handle.stderr), "slurm.Scan")

    # Slurm can report StartTime and EndTime in at least these two formats:
    # 2010-02-15T15:30:29 (MDS)
    # 02/15-15:25:15
    # Python does not support duplicate named groups.
    # Have to use separate regex if we want to use named groups.
    date_MDS = re.compile(r"^(?P<YYYY>\d\d\d\d)-(?P<mm>\d\d)-(?P<dd>\d\d)T(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$")
    date_2 = re.compile(r"^(?P<mm>\d\d)/(?P<dd>\d\d)-(?P<HH>\d\d):(?P<MM>\d\d):(?P<SS>\d\d)$")

    for line in handle.stdout:
        try:
            localid, state = line.strip().split(":", 1)
        except:
            if line:
                warn("Failed to parse squeue line: " + line, "slurm.Scan")
            continue
        job = jobs[localid]
        job.state = state
        if job.state in RUNNING:
            continue

        if not job.state:
            set_exit_code_from_diag(job)
        job.message = MESSAGES.get(job.state, "")

        args = Config.slurm_bin_path + "/scontrol -o show job %s" % localid
        scontrol_handle = execute(args)
        if scontrol_handle.returncode != 0:
            debug("Got error code %i from scontrol" % scontrol_handle.returncode, "slurm.Scan")
            debug("Error output is:\n" + "".join(scontrol_handle.stderr), "slurm.Scan")

        try:
            scontrol_dict = dict(item.split("=", 1) for item in re.split(" (?=[^ =]+=)", scontrol_handle.stdout[0]))
            job = jobs[scontrol_dict["JobId"]]
        except:
            warn("Failed to parse scontrol line: " + line, "slurm.Scan")
            continue

        if "ExitCode" in scontrol_dict:
            ec1, ec2 = scontrol_dict["ExitCode"].split(":")
            job.exitcode = int(ec2) + 256 if int(ec2) != 0 else int(ec1)
        else:
            job.exitcode = 0 if state == "COMPLETED" else -1

        if (state == "NODE_FAIL" or state == "CANCELLED") and ("ExitCode" not in scontrol_dict or job.exitcode == 0):
            job.exitcode = 15
            job.message = "Job was cancelled by SLURM"

        if "StartTime" in scontrol_dict:
            match = date_MDS.match(scontrol_dict["StartTime"]) or date_2.match(scontrol_dict["StartTime"])
            scontrol_dict["StartTime"] = get_MDS(match.groupdict())
            job.LRMSStartTime = arc.common.Time(scontrol_dict["StartTime"])
        if "EndTime" in scontrol_dict:
            match = date_MDS.match(scontrol_dict["EndTime"]) or date_2.match(scontrol_dict["EndTime"])
            scontrol_dict["EndTime"] = get_MDS(match.groupdict())
            job.LRMSEndTime = arc.common.Time(scontrol_dict["EndTime"])

        if "StartTime" in scontrol_dict and "EndTime" in scontrol_dict:
            job.WallTime = job.LRMSEndTime - job.LRMSStartTime

        if "NumCPUs" in scontrol_dict:
            job.Processors = scontrol_dict["NumCPUs"]

        with open(job.lrms_done_file, "w") as f:
            f.write("%d %s\n" % (job.exitcode, job.message))
        write_comments(job)
        update_diag(job)

    kicklist = [job for job in jobs.itervalues() if job.state not in RUNNING]
    kicklist.extend([job for job in jobs.itervalues() if job.state == "CANCELLED"])  # kick twice
    gm_kick(kicklist)
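
The ``scontrol -o show job`` parsing above uses a lookahead split so that values containing spaces stay attached to their key. A small standalone illustration with made-up output:

import re

line = "JobId=1042 JobName=my analysis job UserId=alice(1000) ExitCode=0:0 NumCPUs=4"
fields = re.split(" (?=[^ =]+=)", line)
# ['JobId=1042', 'JobName=my analysis job', 'UserId=alice(1000)', 'ExitCode=0:0', 'NumCPUs=4']
scontrol_dict = dict(item.split("=", 1) for item in fields)
print(scontrol_dict["JobName"])    # 'my analysis job'
print(scontrol_dict["ExitCode"])   # '0:0' -> ec1 = ec2 = 0 -> job.exitcode = 0
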
Example #12
"""
    Main python file for the application
"""
from common import config, secrets
from news.news import NewsPuller
from datetime import datetime, timedelta, tzinfo
import argparse, logging, pytz

TAG = 'app.py -'

# -m <mode ['debug', 'production']>
parser = argparse.ArgumentParser(description='scrape stock statistics and news')
parser.add_argument('-m', '--mode', help='mode in which to run the script')
args = parser.parse_args()

config.configure(args.mode)

# Get yesterday's date, ensure that an hour has passed from yesterday (News API delay)
current_time = datetime.now(tz=pytz.timezone('US/Eastern'))
yesterday = current_time - timedelta(days=1) - timedelta(hours=1) 
if yesterday.hour < 1:
    logging.error('%s Script ran too soon in the day', TAG)
    raise RuntimeError('Script was run too soon')

# Run data collection scripts for those days
news_puller = NewsPuller(secrets.NEWS_API_KEY)
news_puller.pullTickerNews(yesterday)
Example #13
def Submit(config, jobdesc):
    """
    Submits a job to the LSF queue specified in arc.conf. This method executes the required
    RunTimeEnvironment scripts and assembles the bash job script. The job script is
    written to file and submitted with ``bsub``.

    :param str config: path to arc.conf
    :param jobdesc: job description object
    :type jobdesc: :py:class:`arc.JobDescription`
    :return: local job ID if successfully submitted, else ``None``
    :rtype: :py:obj:`str`
    """

    configure(config, set_lsf)

    validate_attributes(jobdesc)
    if Config.remote_host:
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key)
        
    # Run RTE stage0
    debug('----- starting lsfSubmitter.py -----', 'lsf.Submit')
    RTE_stage0(jobdesc, 'LSF')

    # Create script file and write job script
    jobscript = get_job_script(jobdesc)
    script_file = write_script_file(jobscript)

    debug('LSF jobname: %s' % jobdesc.Identification.JobName, 'lsf.Submit')
    debug('LSF job script built', 'lsf.Submit')
    debug('----------------- BEGIN job script -----', 'lsf.Submit')
    for line in jobscript.split('\n'):
        debug(line, 'lsf.Submit')
    debug('----------------- END job script -----', 'lsf.Submit')

    if 'ONLY_WRITE_JOBSCRIPT' in os.environ and os.environ['ONLY_WRITE_JOBSCRIPT'] == 'yes':
        return False

    #######################################
    #  Submit the job
    ######################################

    execute = execute_local if not Config.remote_host else execute_remote
    directory = jobdesc.OtherAttributes['joboption;directory']

    debug('Session directory: %s' % directory, 'lsf.Submit')

    LSF_TRIES = 0
    args = '%s %s/bsub < %s' % (Config.lsf_setup, Config.lsf_bin_path, script_file)
    verbose('executing \'%s\' on %s' % (args, Config.remote_host if Config.remote_host else 'localhost'), 'lsf.Submit')
    handle = execute(args)

    if handle.returncode == 0:
        localid = get_job_id(handle)
        if localid:
            debug('Job submitted successfully!', 'lsf.Submit')
            debug('Local job id: ' + localid, 'lsf.Submit')
            debug('----- exiting lsfSubmitter.py -----', 'lsf.Submit')
            return localid

    debug('job *NOT* submitted successfully!', 'lsf.Submit')
    debug('got error code from bsub: %d !' % handle.returncode, 'lsf.Submit')
    debug('Output is:\n' + ''.join(handle.stdout), 'lsf.Submit')
    debug('Error output is:\n' + ''.join(handle.stderr), 'lsf.Submit')
    debug('----- exiting lsfSubmitter.py -----', 'lsf.Submit')
Example #14
def Scan(config, ctr_dirs):
    """
    Query the LSF host for all jobs in /[controldir]/processing with ``bjobs``.
    If the job has stopped running, the exit code is read and the 
    diagnostics and comments files are updated. Finally ``gm-kick`` is executed
    on all jobs with an exit code.

    If the exit code can not be read from the diagnostics file, it will (after
    5 tries) be kicked with status UNKNOWN.

    :param str config: path to arc.conf
    :param ctr_dirs: list of paths to control directories 
    :type ctr_dirs: :py:obj:`list` [ :py:obj:`str` ... ]
    """

    configure(config, set_lsf)
    if Config.scanscriptlog:
        scanlogfile = arc.common.LogFile(Config.scanscriptlog)
        arc.common.Logger_getRootLogger().addDestination(scanlogfile)
        arc.common.Logger_getRootLogger().setThreshold(Config.log_threshold)

    jobs = get_jobs(ctr_dirs)
    if not jobs: return
    if Config.remote_host:
        # NOTE: Assuming 256 B of TCP window needed for each job
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key, (2 << 7)*len(jobs))

    lsf_bin_path = Config.lsf_bin_path
    execute = execute_local if not Config.remote_host else execute_remote
    args = Config.lsf_setup + ' ' + lsf_bin_path + '/bjobs -w -W ' + ' '.join(jobs.keys()) 
    if '__LSF_TEST' in os.environ:
        handle = execute(args, env=dict(os.environ))
    else:
        handle = execute(args)

    def handle_job(info, in_lsf = True):
        job = jobs[info[0]]
        job.state = info[2]
        if job.state in RUNNING:
            if os.path.exists(job.count_file):
                os.remove(job.count_file)
            return

        if set_exit_code_from_diag(job):
            if in_lsf:
                start, end = info[-2:]
                re_date = re.compile(r'^(?P<mm>\d\d)/(?P<dd>\d\d)-(?P<HH>\d\d):(?P<MM>\d\d)')
                job.LRMSStartTime = arc.common.Time(get_MDS(re_date.match(start).groupdict()))
                if end != '-':
                    job.LRMSEndTime = arc.common.Time(get_MDS(re_date.match(end).groupdict()))
                    job.WallTime = job.LRMSEndTime - job.LRMSStartTime
            # Job finished and exitcode found
            job.message = MESSAGES[job.state]
            return
        # else
        add_failure(job)

    # Handle jobs known to LSF
    for line in handle.stdout[1:]:
        try:
            info = line.strip().split()
            assert(len(info) == 15)
            handle_job(info)
        except Exception as e:
            if line:
                warn('Failed to parse bjobs line: %s\n%s' % (line, str(e)), 'lsf.Scan')

    # Handle jobs lost in LSF
    if handle.returncode != 0:
        debug('Got error code %i from bjobs' % handle.returncode, 'lsf.Scan')
        debug('Error output is:\n' + ''.join(handle.stderr), 'lsf.Scan')
        lost_job = re.compile(r'Job <(\d+)> is not found')
        for line in handle.stderr:
            match = lost_job.match(line)
            if match:
                handle_job([match.groups()[0], None, 'UNKNOWN'], False)

    kicklist = []
    for job in jobs.itervalues():
        if hasattr(job, 'exitcode'):
            with open(job.lrms_done_file, 'w') as f:
                f.write('%d %s\n' % (job.exitcode, job.message))
            write_comments(job)
            update_diag(job)
            kicklist.append(job)
    gm_kick(kicklist)
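
The docstring notes that a job whose exit code cannot be read from the diagnostics file is retried and, after 5 scans, kicked with status UNKNOWN. ``add_failure`` and the count file are implemented in the shared helpers; the following is only a hypothetical sketch of that counting pattern, not the actual ARC code:

import os

def add_failure_sketch(job, max_failures=5):
    """Bump a per-job failure counter; once the limit is hit, give up on the job."""
    count = 0
    if os.path.isfile(job.count_file):
        with open(job.count_file) as f:
            count = int(f.read().strip() or 0)
    count += 1
    if count < max_failures:
        with open(job.count_file, 'w') as f:
            f.write(str(count))           # try again on the next scan
    else:
        if os.path.isfile(job.count_file):
            os.remove(job.count_file)     # removed count file => stop retrying
        job.state = 'UNKNOWN'
        job.exitcode = -1
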
Example #15
def Submit(config, jobdesc):
    """
    Submits a job to the local or remote (SSH) machine. This method executes the required
    RunTimeEnvironment scripts and assembles the bash job script. The job script is
    written to file and run.

    :param str config: path to arc.conf
    :param jobdesc: job description object
    :type jobdesc: :py:class:`arc.JobDescription`
    :return: local job ID if successfully submitted, else ``None``
    :rtype: :py:obj:`str`
    """

    configure(config, set_fork)

    validate_attributes(jobdesc)
    if Config.remote_host:
        ssh_connect(Config.remote_host, Config.remote_user, Config.private_key)
  
    # Run RTE stage0
    debug('----- starting forkSubmitter.py -----', 'fork.Submit')
    RTE_stage0(jobdesc, 'fork')

    # Create tmp script file and write job script
    jobscript = get_job_script(jobdesc)
    script_file = write_script_file(jobscript)

    debug('Fork jobname: %s' % jobdesc.Identification.JobName, 'fork.Submit')
    debug('Fork job script built', 'fork.Submit')
    debug('----------------- BEGIN job script -----', 'fork.Submit')
    for line in jobscript.split('\n'):
        debug(line, 'fork.Submit')
    debug('----------------- END job script -----', 'fork.Submit')

    if 'ONLY_WRITE_JOBSCRIPT' in os.environ and os.environ['ONLY_WRITE_JOBSCRIPT'] == 'yes':
        return

    #######################################
    #  Submit the job
    ######################################
    
    execute = execute_local if not Config.remote_host else execute_remote
    directory = jobdesc.OtherAttributes['joboption;directory']

    debug('Session directory: %s' % directory, 'fork.Submit')

    handle = execute(script_file)

    if handle.returncode == 0:
        jobid = None
        try:
            jobid = handle.stdout[0][5:]
            debug('Job submitted successfully!', 'fork.Submit')
            debug('Local job id: ' + jobid, 'fork.Submit') 
            debug('----- exiting forkSubmitter.py -----', 'fork.Submit')
            return jobid
        except:
            pass

    debug('Job *NOT* submitted successfully!', 'fork.Submit')
    debug('Got error code: %d !' % (handle.returncode), 'fork.Submit')
    debug('Output is:\n' + ''.join(handle.stdout), 'fork.Submit')
    debug('Error output is:\n' + ''.join(handle.stderr), 'fork.Submit')
    debug('----- exiting forkSubmitter.py -----', 'fork.Submit')