Example #1
def create_core_dump(pid=None, workdir=None):
    """
    Create core dump and copy it to work directory
    """

    if not pid or not workdir:
        logger.warning(
            'cannot create core file since pid or workdir is unknown')
        return

    cmd = "gdb --pid %d -ex 'generate-core-file'" % pid
    exit_code, stdout, stderr = execute(cmd)
    if not exit_code:
        path = locate_core_file(pid=pid)
        if path:
            try:
                copy(path, workdir)
            except Exception as error:
                logger.warning('failed to copy core file: %s', error)
            else:
                logger.debug('copied core dump to workdir')

    else:
        logger.warning('failed to execute command: %s, stdout+err=%s', cmd,
                       stdout + stderr)
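
A possible call site, as a minimal sketch: both arguments must be supplied, or the function logs a warning and returns without creating anything. The pid and directory below are placeholders, not values from the pilot code.

# Illustrative invocation: `payload_pid` would be the process id of the
# payload whose core should be dumped; the directory is a placeholder.
payload_pid = 12345  # placeholder pid
create_core_dump(pid=payload_pid, workdir='/tmp/scratch')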
Example #2
def post_memory_monitor_action(job):
    """
    Perform post action items for memory monitor.

    :param job: job object.
    :return:
    """

    nap = 3
    path1 = os.path.join(job.workdir, get_memory_monitor_summary_filename())
    path2 = os.environ.get('PILOT_HOME')
    i = 0
    maxretry = 20
    while i <= maxretry:
        if os.path.exists(path1):
            break
        logger.info(
            "taking a short nap (%d s) to allow the memory monitor to finish writing to the summary file (#%d/#%d)"
            % (nap, i, maxretry))
        time.sleep(nap)
        i += 1

    try:
        copy(path1, path2)
    except Exception as e:
        logger.warning('failed to copy memory monitor output: %s' % e)
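
The poll-and-nap loop above is a general pattern; a self-contained sketch of the same idea (the helper name is ours, not from the pilot code):

import os
import time

def wait_for_file(path, nap=3, maxretry=20):
    """Return True once path exists, polling every nap seconds."""
    for _ in range(maxretry + 1):
        if os.path.exists(path):
            return True
        time.sleep(nap)
    return False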
Example #3
def download_transform(url, transform_name, workdir):
    """
    Download the transform from the given url
    :param url: download URL with path to transform (string).
    :param transform_name: trf name (string).
    :param workdir: work directory (string).
    :return:
    """

    status = False
    diagnostics = ""
    path = os.path.join(workdir, transform_name)
    cmd = 'curl -sS "%s" > %s' % (url, path)
    trial = 1
    max_trials = 3

    # test if $HARVESTER_WORKDIR is set
    harvester_workdir = os.environ.get('HARVESTER_WORKDIR')
    if harvester_workdir is not None:
        # skip curl by setting max_trials = 0
        max_trials = 0
        source_path = os.path.join(harvester_workdir, transform_name)
        try:
            copy(source_path, path)
            status = True
        except Exception as error:
            status = False
            diagnostics = "Failed to copy file %s to %s : %s" % (source_path,
                                                                 path, error)
            logger.error(diagnostics)

    # try to download the trf a maximum of 3 times
    while trial <= max_trials:
        logger.info("executing command [trial %d/%d]: %s" %
                    (trial, max_trials, cmd))

        exit_code, stdout, stderr = execute(cmd, mute=True)
        if not stdout:
            stdout = "(None)"
        if exit_code != 0:
            # Analyze exit code / output
            diagnostics = "curl command failed: %d, %s, %s" % (exit_code,
                                                               stdout, stderr)
            logger.warning(diagnostics)
            if trial == max_trials:
                logger.fatal('could not download transform: %s' % stdout)
                status = False
                break
            else:
                logger.info("will try again after 60 s")
                sleep(60)
        else:
            logger.info("curl command returned: %s" % stdout)
            status = True
            break
        trial += 1

    return status, diagnostics
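
A hedged usage sketch; the URL and work directory are illustrative, and the function is assumed to be imported along with its pilot dependencies:

status, diagnostics = download_transform(
    'https://some.base.url/trf/user/runAnalysis.py',  # illustrative URL
    'runAnalysis.py',
    '/tmp/workdir')
if not status:
    logger.warning('transform download failed: %s' % diagnostics)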
Example #4
def copy_output(job, job_scratch_dir, work_dir):
    """
    Copy the job output files from the scratch directory to the work directory.

    :param job: job object.
    :param job_scratch_dir: scratch directory (string).
    :param work_dir: work directory (string).
    :return: 0 (int).
    """
    cp_start = time.time()
    try:
        for outfile in job.output_files:
            # look for the file in the scratch directory explicitly instead
            # of relying on the current working directory
            src = os.path.join(job_scratch_dir, outfile)
            if os.path.exists(src):
                copy(src, os.path.join(work_dir, outfile))
        os.chdir(work_dir)
    except IOError:
        raise FileHandlingFailure(
            "Copy from scratch dir to access point failed")
    finally:
        cp_time = time.time() - cp_start
        logger.info("Copy of outputs took: {0} sec.".format(cp_time))
    return 0
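
The start/stop timing wrapped around the copy can also be expressed as a context manager; a standalone sketch of that alternative (not part of the pilot code):

import time
from contextlib import contextmanager

@contextmanager
def timed(label):
    """Log how long the enclosed block took, even if it raised."""
    start = time.time()
    try:
        yield
    finally:
        logger.info("{0} took: {1:.2f} sec.".format(label, time.time() - start))

# with timed("Copy of outputs"):
#     copy_output(job, job_scratch_dir, work_dir)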
Example #5
def get_analysis_trf(transform, workdir):
    """
    Prepare to download the user analysis transform with curl.
    The function will verify the download location from a known list of hosts.

    :param transform: full trf path (url) (string).
    :param workdir: work directory (string).
    :return: exit code (int), diagnostics (string), transform_name (string)
    """

    ec = 0
    diagnostics = ""

    # test if $HARVESTER_WORKDIR is set
    harvester_workdir = os.environ.get('HARVESTER_WORKDIR')
    if harvester_workdir is not None:
        search_pattern = "%s/jobO.*.tar.gz" % harvester_workdir
        logger.debug("search_pattern - %s" % search_pattern)
        jobopt_files = glob.glob(search_pattern)
        for jobopt_file in jobopt_files:
            logger.debug("jobopt_file = %s workdir = %s" % (jobopt_file, workdir))
            try:
                copy(jobopt_file, workdir)
            except Exception as e:
                logger.error("could not copy file %s to %s : %s" % (jobopt_file, workdir, e))

    if '/' in transform:
        transform_name = transform.split('/')[-1]
    else:
        logger.warning('did not detect any / in %s (using full transform name)' % transform)
        transform_name = transform

    # is the command already available? (e.g. if already downloaded by a preprocess/main process step)
    if os.path.exists(os.path.join(workdir, transform_name)):
        logger.info('script %s is already available - no need to download again' % transform_name)
        return ec, diagnostics, transform_name

    original_base_url = ""

    # verify the base URL
    for base_url in get_valid_base_urls():
        if transform.startswith(base_url):
            original_base_url = base_url
            break

    if original_base_url == "":
        diagnostics = "invalid base URL: %s" % transform
        return errors.TRFDOWNLOADFAILURE, diagnostics, ""

    # try to download from the required location, if not - switch to backup
    status = False
    for base_url in get_valid_base_urls(order=original_base_url):
        trf = re.sub(original_base_url, base_url, transform)
        logger.debug("attempting to download script: %s" % trf)
        status, diagnostics = download_transform(trf, transform_name, workdir)
        if status:
            break

    if not status:
        return errors.TRFDOWNLOADFAILURE, diagnostics, ""

    logger.info("successfully downloaded script")
    path = os.path.join(workdir, transform_name)
    logger.debug("changing permission of %s to 0o755" % path)
    try:
        os.chmod(path, 0o755)  # Python 2/3
    except Exception as e:
        diagnostics = "failed to chmod %s: %s" % (transform_name, e)
        return errors.CHMODTRF, diagnostics, ""

    return ec, diagnostics, transform_name
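
A call-site sketch under the snippet's assumptions (the transform URL must start with one of the bases from get_valid_base_urls(); the URL and directory here are illustrative):

ec, diagnostics, trf_name = get_analysis_trf(
    'https://some.base.url/trf/user/runAthena-00-00-12',  # illustrative
    '/tmp/workdir')
if ec:
    logger.warning('could not get transform: %s' % diagnostics)
else:
    logger.info('transform ready: %s' % trf_name)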
Example #6
def get_command(job,
                xdata,
                queue,
                script,
                eventtype,
                localsite,
                remotesite,
                external_dir,
                label='stage-in',
                container_type='container'):
    """
    Get the middleware container execution command.

    Note: this function is tailor made for stage-in/out.

    :param job: job object.
    :param xdata: list of FileSpec objects.
    :param queue: queue name (string).
    :param script: name of stage-in/out script (string).
    :param eventtype: event type (string).
    :param localsite: local site name (string).
    :param remotesite: remote site name (string).
    :param external_dir: input or output files directory (string).
    :param label: optional 'stage-[in|out]' (string).
    :param container_type: optional 'container/bash' (string).
    :return: stage-in/out command (string).
    :raises PilotException: for stage-in/out related failures
    """

    if label == 'stage-out':
        filedata_dictionary = get_filedata_strings(xdata)
    else:
        filedata_dictionary = get_filedata(xdata)

        # write file data to file
        try:
            status = write_json(
                path.join(job.workdir,
                          config.Container.stagein_replica_dictionary),
                filedata_dictionary)
        except Exception as exc:
            diagnostics = 'exception caught in get_command(): %s' % exc
            logger.warning(diagnostics)
            raise PilotException(diagnostics)
        else:
            if not status:
                diagnostics = 'failed to write replica dictionary to file'
                logger.warning(diagnostics)
                raise PilotException(diagnostics)

    # copy pilot source into container directory, unless it is already there
    diagnostics = copy_pilot_source(job.workdir)
    if diagnostics:
        raise PilotException(diagnostics)

    final_script_path = path.join(job.workdir, script)
    environ['PYTHONPATH'] = environ.get('PYTHONPATH', '') + ':' + job.workdir
    script_path = path.join('pilot/scripts', script)
    full_script_path = path.join(job.workdir, script_path)
    copy(full_script_path, final_script_path)

    if container_type == 'container':
        # correct the path when containers have been used
        final_script_path = path.join('.', script)
        workdir = '/srv'
    else:
        # for container_type=bash we need to add the rucio setup
        pilot_user = environ.get('PILOT_USER', 'generic').lower()
        user = __import__('pilot.user.%s.container' % pilot_user, globals(),
                          locals(), [pilot_user], 0)  # Python 2/3
        try:
            final_script_path = user.get_middleware_container_script(
                '', final_script_path, asetup=True)
        except PilotException:
            final_script_path = 'python %s' % final_script_path
        workdir = job.workdir

    cmd = "%s -d -w %s -q %s --eventtype=%s --localsite=%s --remotesite=%s --produserid=\"%s\" --jobid=%s" % \
          (final_script_path, workdir, queue, eventtype, localsite, remotesite, job.produserid.replace(' ', '%20'), job.jobid)

    if label == 'stage-in':
        cmd += " --eventservicemerge=%s --usepcache=%s --usevp=%s --replicadictionary=%s" % \
               (job.is_eventservicemerge, job.infosys.queuedata.use_pcache, job.use_vp, config.Container.stagein_replica_dictionary)
        if external_dir:
            cmd += ' --inputdir=%s' % external_dir
    else:  # stage-out
        cmd += ' --lfns=%s --scopes=%s --datasets=%s --ddmendpoints=%s --guids=%s' % \
               (filedata_dictionary['lfns'], filedata_dictionary['scopes'], filedata_dictionary['datasets'],
                filedata_dictionary['ddmendpoints'], filedata_dictionary['guids'])
        if external_dir:
            cmd += ' --outputdir=%s' % external_dir

    cmd += ' --taskid=%s' % job.taskid
    cmd += ' --jobdefinitionid=%s' % job.jobdefinitionid
    cmd += ' --catchall=%s' % job.infosys.queuedata.catchall

    if container_type == 'bash':
        cmd += '\nexit $?'

    return cmd
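
A hedged stage-in call sketch; the job object, FileSpec list and queue name come from the surrounding pilot machinery, and the site and queue values below are placeholders:

# Illustrative only: `job` is a pilot job object and job.indata its list
# of FileSpec objects; the event type, site and queue names are placeholders.
cmd = get_command(job, job.indata, 'SOME_QUEUE', 'stagein.py',
                  eventtype='get_sm', localsite='SITE', remotesite='SITE',
                  external_dir='', label='stage-in',
                  container_type='container')
exit_code, stdout, stderr = execute(cmd)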