Example #1
def declare_output(job, work_report, worker_stageout_declaration):
    out_file_report = {}
    out_file_report[job.jobid] = []
    for outfile in job.output_files:
        logger.debug(
            "File {} will be checked and declared for stage out".format(
                outfile))
        if os.path.exists(outfile):
            file_desc = {}
            if outfile == job.log_file:
                file_desc['filetype'] = 'log'
            else:
                file_desc['filetype'] = 'output'
            file_desc['path'] = os.path.abspath(outfile)
            file_desc['fsize'] = os.path.getsize(outfile)
            if 'guid' in job.output_files[outfile]:
                file_desc['guid'] = job.output_files[outfile]['guid']
            elif work_report.get('outputfiles', {}).get(outfile):
                file_desc['guid'] = work_report['outputfiles'][outfile]['guid']
            out_file_report[job.jobid].append(file_desc)
        else:
            logger.info(
                "Expected output file {0} is missing; job {1} will be marked as failed".format(
                    outfile, job.jobid))
            set_pilot_state(job=job, state='failed')

    if out_file_report[job.jobid]:
        write_json(worker_stageout_declaration, out_file_report)
        logger.debug(
            'Stageout declared in: {0}'.format(worker_stageout_declaration))
        logger.debug('Report for stageout: {}'.format(out_file_report))
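
Every example on this page funnels its result through a write_json helper that returns True on success and False on failure. For orientation, a minimal sketch of such a helper is shown below; this is an illustrative assumption, not necessarily the project's actual implementation.

import json

def write_json(path, dictionary):
    # serialize the given dictionary to the given path; return True on success
    try:
        with open(path, 'w') as fh:
            json.dump(dictionary, fh, indent=4)
    except (IOError, TypeError) as exc:
        # IOError covers filesystem problems, TypeError non-serializable content
        print('failed to write json to %s: %s' % (path, exc))
        return False
    return True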
Example #2
def get_metadata_dict_from_txt(path, storejson=False, jobid=None):
    """
    Convert memory monitor text output to json, store it, and return a selection as a dictionary.

    :param path: path to the memory monitor text output (string).
    :param storejson: store dictionary on disk if True (boolean).
    :param jobid: job id (string).
    :return: prmon metadata (dictionary).
    """

    # get the raw memory monitor output, convert to dictionary
    dictionary = convert_text_file_to_dictionary(path)

    if dictionary and storejson:
        # add metadata
        dictionary['type'] = 'MemoryMonitorData'
        dictionary['pandaid'] = jobid

        path = os.path.join(os.path.dirname(path),
                            get_memory_monitor_output_filename(suffix='json'))
        logger.debug('writing prmon dictionary to: %s' % path)
        write_json(path, dictionary)
    else:
        logger.debug('nothing to write (no prmon dictionary or storejson not set)')

    # filter dictionary?
    # ..

    return dictionary
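
A usage sketch follows; the path and job id below are made up for illustration.

# illustrative call; assumes a prmon text file exists at the given (made-up) path
metadata = get_metadata_dict_from_txt('/scratch/job_dir/memory_monitor_output.txt',
                                      storejson=True,
                                      jobid='4242424242')
logger.debug('prmon metadata: %s' % metadata)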
Example #3
def process_jobreport(payload_report_file, job_scratch_path,
                      job_communication_point):
    """
    Copy job report file to make it accessible by Harvester. Shrink job report file.

    :param payload_report_file: name of job report (string).
    :param job_scratch_path: path to scratch directory (string).
    :param job_communication_point: path to updated job report accessible by Harvester (string).
    :raises FileHandlingFailure: in case of IOError.
    """

    src_file = os.path.join(job_scratch_path, payload_report_file)
    dst_file = os.path.join(job_communication_point, payload_report_file)

    try:
        logger.info("Copy of payload report [{0}] to access point: {1}".format(
            payload_report_file, job_communication_point))
        # shrink jobReport
        job_report = read_json(src_file)
        if 'executor' in job_report:
            for executor in job_report['executor']:
                if 'logfileReport' in executor:
                    executor['logfileReport'] = {}

        write_json(dst_file, job_report)

    except IOError:
        logger.error("job report copy failed, execution terminated: \n %s" %
                     sys.exc_info()[1])
        raise FileHandlingFailure("Job report copy from RAM failed")
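
The shrinking step only empties the potentially large logfileReport sub-dictionaries and keeps the rest of the report intact; a minimal illustration with made-up content:

# made-up job report content for illustration
job_report = {'executor': [{'name': 'athena',
                            'logfileReport': {'countSummary': {'WARNING': 42}}}]}
for executor in job_report['executor']:
    if 'logfileReport' in executor:
        executor['logfileReport'] = {}
# job_report is now {'executor': [{'name': 'athena', 'logfileReport': {}}]}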
Example #4
def publish_job_report(job, args, job_report_file="jobReport.json"):
    """
    Copy job report file to make it accessible by Harvester. Shrink job report file.

    :param job: job object.
    :param args: Pilot arguments object.
    :param job_report_file: name of job report (string).
    """

    src_file = join(job.workdir, job_report_file)
    dst_file = join(args.harvester_workdir, job_report_file)

    try:
        logger.info("copy of payload report [{0}] to access point: {1}".format(
            job_report_file, args.harvester_workdir))
        # shrink jobReport
        job_report = read_json(src_file)
        if 'executor' in job_report:
            for executor in job_report['executor']:
                if 'logfileReport' in executor:
                    executor['logfileReport'] = {}

        write_json(dst_file, job_report)

    except IOError as exc:
        logger.error("job report copy failed: %s" % exc)
Example #5
def get_schedconfig_queuedata(queue):
    """
    Return and store the schedconfig queuedata.

    :param queue: PanDA queue name (e.g. BNL_PROD_MCORE)
    :return: schedconfig queuedata json dictionary
    """

    # read it locally if the queuedata file already exists
    filename = os.path.join(os.environ.get('PILOT_HOME', ''),
                            config.Information.queuedata)
    if os.path.exists(filename):
        queuedata = read_json(filename)
        return queuedata

    url = config.Information.schedconfig
    if url == "":
        logger.fatal('URL for schedconfig not set')
        return False
    else:
        # add the queuename to the URL
        if not url.endswith('/'):
            url += '/'
        url += queue + '.all.json'
    queuedata = retrieve_json(url)

    # also write the queuedata to disk
    if not write_json(filename, queuedata):
        logger.warning("failed to write queuedata json to file")
    else:
        logger.info("wrote queuedata to local file %s" % filename)

    return queuedata
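
retrieve_json is assumed to download and parse the JSON document at the constructed URL. A minimal sketch under that assumption (not the project's actual implementation):

import json
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2

def retrieve_json(url):
    # download and parse a json document; return None on failure
    try:
        return json.loads(urlopen(url, timeout=20).read().decode('utf-8'))
    except Exception as exc:
        logger.warning('failed to retrieve %s: %s' % (url, exc))
        return None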
Example #6
def request_new_jobs(njobs=1):
    """
    Inform Harvester that the pilot is ready to process new jobs by creating a job request file with the desired
    number of jobs.

    :param njobs: Number of jobs. Default is 1 since on grids and clouds the pilot does not know how many jobs it can
    process before it runs out of time.
    :return:
    """

    path = get_job_request_file_name()
    dictionary = {'nJobs': njobs}

    # write it to file
    try:
        write_json(path, dictionary)
    except FileHandlingFailure:
        # re-raise without discarding the original exception details
        raise
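
In use, asking Harvester for two jobs simply writes {"nJobs": 2} to the job request file, whose name and location come from get_job_request_file_name():

# ask Harvester for two new jobs
request_new_jobs(njobs=2)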
Example #7
def write_pilot_timing(pilot_timing_dictionary):
    """
    Write the given pilot timing dictionary to file.

    :param pilot_timing_dictionary: pilot timing dictionary.
    :return:
    """
    timing_file = config.Pilot.timing_file
    #rank, max_ranks = get_ranks_info()
    #if rank is not None:
    #    timing_file += '_{0}'.format(rank)
    path = os.path.join(os.environ.get('PILOT_HOME', ''), timing_file)
    if write_json(path, pilot_timing_dictionary):
        logger.debug('updated pilot timing dictionary: %s' % path)
    else:
        logger.warning('failed to update pilot timing dictionary: %s' % path)
Example #8
def publish_work_report(work_report=None,
                        worker_attributes_file="worker_attributes.json"):
    """
    Publish the work report to file.
    The work report dictionary should contain the fields defined in get_initial_work_report().

    :param work_report: work report dictionary.
    :param worker_attributes_file: name of the worker attributes file (string).
    :return:
    """

    if work_report:
        work_report['timestamp'] = time_stamp()
        if "outputfiles" in work_report:
            del (work_report["outputfiles"])
        if "inputfiles" in work_report:
            del (work_report["inputfiles"])
        if "xml" in work_report:
            del (work_report["xml"])
        if write_json(worker_attributes_file, work_report):
            logger.info("work report published: {0}".format(work_report))
Example #9
def publish_work_report(work_report=None,
                        worker_attributes_file="worker_attributes.json"):
    """
    Publish the work report to file.
    The work report dictionary should contain the fields defined in get_initial_work_report().

    :param work_report: work report dictionary.
    :param worker_attributes_file: name of the worker attributes file (string).
    :raises FileHandlingFailure: in case of IOError.
    :return: True if the report was written, False otherwise (Boolean).
    """

    if work_report:
        try:
            work_report['timestamp'] = time_stamp()
            if "outputfiles" in work_report:
                del (work_report["outputfiles"])
            if "inputfiles" in work_report:
                del (work_report["inputfiles"])
            if "xml" in work_report:
                del (work_report["xml"])
            if write_json(worker_attributes_file, work_report):
                logger.info("work report published: {0}".format(work_report))
                return True
            else:
                logger.error(
                    "work report publish failed: {0}".format(work_report))
                return False
        except IOError as exc:
            logger.error("work report write failed: {0}".format(exc))
            return False
        except Exception as e:
            logger.error("write json file failed: {0}".format(e))
            return False
    else:
        # no work report given, nothing to publish
        return False
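
A usage sketch with a made-up work report; the outputfiles, inputfiles and xml fields are stripped before the report is written:

# made-up work report for illustration; real reports come from get_initial_work_report()
report = {'jobStatus': 'finished',
          'outputfiles': {'out.root': {'guid': 'abc123'}},
          'xml': '<POOLFILECATALOG/>'}
if publish_work_report(work_report=report):
    logger.info('report written without the outputfiles/xml fields')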
Example #10
    except Exception as e:
        err = str(e)
        errcode = -1
        message(err)

    # put file statuses in a dictionary to be written to file
    file_dictionary = {}  # {'error': [error_diag, -1], 'lfn1': [status, status_code], 'lfn2': .., ..}
    if xfiles:
        message('stagein script summary of transferred files:')
        for fspec in xfiles:
            add_to_dictionary(file_dictionary, fspec.lfn, fspec.status,
                              fspec.status_code, fspec.turl)
            status = fspec.status if fspec.status else "(not transferred)"
            message(" -- lfn=%s, status_code=%s, status=%s" %
                    (fspec.lfn, fspec.status_code, status))

    # add error info, if any
    if err:
        errcode, err = extract_error_info(err)
    add_to_dictionary(file_dictionary, 'error', err, errcode, None)
    _status = write_json(
        os.path.join(args.workdir, config.Container.stagein_status_dictionary),
        file_dictionary)
    if err:
        message("containerised file transfers failed: %s" % err)
        exit(TRANSFER_ERROR)

    message("containerised file transfers finished")
    exit(0)
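
add_to_dictionary stores a list of status values per LFN (and is called with more values per key in Example #14 below). A plausible sketch of such a helper, given those call sites, is shown here as an assumption rather than the project's actual code:

def add_to_dictionary(dictionary, key, *values):
    # store the given values as a list under the given key
    dictionary[key] = list(values)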
Example #11
def publish_stageout_files(job, event_status_file):
    """
    Publishing of work report to file.
    The work report dictionary should contain the fields defined in get_initial_work_report().

    :param args: Pilot arguments object.
    :param job: job object.
    :param event status file name:

    :return: Boolean. status of writing the file information to a json
    """

    # get the harvester workdir from the event_status_file
    work_dir = dirname(event_status_file)

    out_file_report = {}
    out_file_report[job.jobid] = []

    # check both the log file(s) (logdata) and the output file(s) (outdata) from the FileSpec objects
    for fspec in job.logdata + job.outdata:
        logger.debug(
            "File {} will be checked and declared for stage out".format(
                fspec.lfn))
        # find the first instance of the file
        filename = basename(fspec.surl)
        path = findfile(work_dir, filename)
        logger.debug("Found file {} at path {}".format(fspec.lfn, path))
        file_desc = {'type': fspec.filetype,
                     'path': path,
                     'guid': fspec.guid,
                     'fsize': fspec.filesize,
                     'chksum': get_checksum_value(fspec.checksum)}
        logger.debug("File description: {}".format(file_desc))
        out_file_report[job.jobid].append(file_desc)

    if out_file_report[job.jobid]:
        if write_json(event_status_file, out_file_report):
            logger.debug('Stageout declared in: {0}'.format(event_status_file))
            logger.debug('Report for stageout: {}'.format(out_file_report))
            return True
        else:
            logger.debug(
                'Failed to declare stageout in: {0}'.format(event_status_file))
            return False
    else:
        logger.debug('No report for stageout')
        return False
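
findfile is assumed to return the path to the first instance of the given file name below work_dir. A minimal sketch under that assumption, using os.walk:

import os

def findfile(work_dir, name):
    # return the full path of the first file called name below work_dir, or '' if absent
    for root, _dirs, files in os.walk(work_dir):
        if name in files:
            return os.path.join(root, name)
    return ''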
Example #12
def get_command(job,
                xdata,
                queue,
                script,
                eventtype,
                localsite,
                remotesite,
                external_dir,
                label='stage-in',
                container_type='container'):
    """
    Get the middleware container execution command.

    Note: this function is tailor made for stage-in/out.

    :param job: job object.
    :param xdata: list of FileSpec objects.
    :param queue: queue name (string).
    :param script: name of stage-in/out script (string).
    :param eventtype: event type (string).
    :param localsite: local site name (string).
    :param remotesite: remote site name (string).
    :param external_dir: input or output files directory (string).
    :param label: optional 'stage-[in|out]' (string).
    :param container_type: optional 'container/bash' (string).
    :return: stage-in/out command (string).
    :raises PilotException: for stage-in/out related failures
    """

    if label == 'stage-out':
        filedata_dictionary = get_filedata_strings(xdata)
    else:
        filedata_dictionary = get_filedata(xdata)

        # write file data to file
        try:
            status = write_json(
                path.join(job.workdir,
                          config.Container.stagein_replica_dictionary),
                filedata_dictionary)
        except Exception as exc:
            diagnostics = 'exception caught in get_command(): %s' % exc
            logger.warning(diagnostics)
            raise PilotException(diagnostics)
        else:
            if not status:
                diagnostics = 'failed to write replica dictionary to file'
                logger.warning(diagnostics)
                raise PilotException(diagnostics)

    # copy pilot source into container directory, unless it is already there
    diagnostics = copy_pilot_source(job.workdir)
    if diagnostics:
        raise PilotException(diagnostics)

    final_script_path = path.join(job.workdir, script)
    environ['PYTHONPATH'] = environ.get('PYTHONPATH', '') + ':' + job.workdir
    script_path = path.join('pilot/scripts', script)
    full_script_path = path.join(job.workdir, script_path)
    copy(full_script_path, final_script_path)

    if container_type == 'container':
        # correct the path when containers have been used
        final_script_path = path.join('.', script)
        workdir = '/srv'
    else:
        # for container_type=bash we need to add the rucio setup
        pilot_user = environ.get('PILOT_USER', 'generic').lower()
        user = __import__('pilot.user.%s.container' % pilot_user, globals(),
                          locals(), [pilot_user], 0)  # Python 2/3
        try:
            final_script_path = user.get_middleware_container_script(
                '', final_script_path, asetup=True)
        except PilotException:
            final_script_path = 'python %s' % final_script_path
        workdir = job.workdir

    cmd = "%s -d -w %s -q %s --eventtype=%s --localsite=%s --remotesite=%s --produserid=\"%s\" --jobid=%s" % \
          (final_script_path, workdir, queue, eventtype, localsite, remotesite, job.produserid.replace(' ', '%20'), job.jobid)

    if label == 'stage-in':
        cmd += " --eventservicemerge=%s --usepcache=%s --usevp=%s --replicadictionary=%s" % \
               (job.is_eventservicemerge, job.infosys.queuedata.use_pcache, job.use_vp, config.Container.stagein_replica_dictionary)
        if external_dir:
            cmd += ' --inputdir=%s' % external_dir
    else:  # stage-out
        cmd += ' --lfns=%s --scopes=%s --datasets=%s --ddmendpoints=%s --guids=%s' % \
               (filedata_dictionary['lfns'], filedata_dictionary['scopes'], filedata_dictionary['datasets'],
                filedata_dictionary['ddmendpoints'], filedata_dictionary['guids'])
        if external_dir:
            cmd += ' --outputdir=%s' % external_dir

    cmd += ' --taskid=%s' % job.taskid
    cmd += ' --jobdefinitionid=%s' % job.jobdefinitionid
    cmd += ' --catchall=%s' % job.infosys.queuedata.catchall

    if container_type == 'bash':
        cmd += '\nexit $?'

    return cmd
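
For orientation, a stage-in command assembled by this function with container_type='container' could look roughly like the following; every value is made up:

./my_stagein_script.py -d -w /srv -q SOME_QUEUE --eventtype=simul --localsite=SITE_A \
    --remotesite=SITE_B --produserid="some%20user" --jobid=1234 --eventservicemerge=False \
    --usepcache=False --usevp=False --replicadictionary=stagein_replica_dictionary.json \
    --inputdir=/srv/input --taskid=5678 --jobdefinitionid=90 --catchall=''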
Example #13
    except Exception as error:
        print("caught exception: %s (skipping remote file open verification)" %
              error)
        exit(1)
    else:
        if not logname:
            print("remote file open verification not desired")
            exit(0)

    establish_logging(args, filename=logname)
    logger = logging.getLogger(__name__)

    # get the file info
    file_list_dictionary = get_file_lists(args.turls)
    turls = file_list_dictionary.get('turls')
    processed_turls_dictionary = {}
    if turls:
        message('got TURLs: %s' % str(turls))
        for turl in turls:
            processed_turls_dictionary[turl] = try_open_file(turl)

        # write dictionary to file with results
        _status = write_json(
            os.path.join(args.workdir,
                         config.Pilot.remotefileverification_dictionary),
            processed_turls_dictionary)
    else:
        message('no TURLs to verify')

    exit(0)
Example #14
    # put file statuses in a dictionary to be written to file
    file_dictionary = {}  # {'error': [error_diag, -1], 'lfn1': [status, status_code], 'lfn2': .., ..}
    if xfiles:
        message('stageout script summary of transferred files:')
        for fspec in xfiles:
            add_to_dictionary(file_dictionary, fspec.lfn, fspec.status,
                              fspec.status_code, fspec.surl, fspec.turl,
                              fspec.checksum.get('adler32'), fspec.filesize)
            status = fspec.status if fspec.status else "(not transferred)"
            message(
                " -- lfn=%s, status_code=%s, status=%s, surl=%s, turl=%s, checksum=%s, filesize=%s"
                % (fspec.lfn, fspec.status_code, status, fspec.surl,
                   fspec.turl, fspec.checksum.get('adler32'), fspec.filesize))

    # add error info, if any
    if err:
        errcode, err = extract_error_info(err)
    add_to_dictionary(file_dictionary, 'error', err, errcode, None, None, None,
                      None)
    path = os.path.join(args.workdir,
                        config.Container.stageout_status_dictionary)
    if os.path.exists(path):
        path += '.log'
    _status = write_json(path, file_dictionary)
    if err:
        message("containerised file transfers failed: %s" % err)
        exit(TRANSFER_ERROR)

    message("wrote %s" % path)
    message("containerised file transfers finished")
    exit(0)