def copy_jobreport(job_working_dir, worker_communication_point, payload_report_file, workerattributesfile):
    src_file = os.path.join(job_working_dir, payload_report_file)
    dst_file = os.path.join(worker_communication_point, payload_report_file)

    try:
        logger.info(
            "Copy of payload report [{0}] to access point: {1}".format(payload_report_file, worker_communication_point))
        cp_start = time.time()
        # shrink jobReport
        job_report = read_json(src_file)
        if 'executor' in job_report:
            for executor in job_report['executor']:
                if 'logfileReport' in executor:
                    executor['logfileReport'] = {}

        with open(dst_file, 'w') as job_report_outfile:
            json.dump(job_report, job_report_outfile)
        cp_time = time.time() - cp_start
        logger.info("Copy of payload report file took: {0} sec.".format(cp_time))
    except Exception as exc:
        logger.error("Job report copy failed, execution terminated: \n %s " % exc)
        work_report = dict()
        work_report["jobStatus"] = "failed"
        work_report["pilotErrorCode"] = 1103  # Should be changed to Pilot2 errors
        work_report["exitMsg"] = str(sys.exc_info()[1])
        main_exit(1103, work_report, workerattributesfile)
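The "shrink" step above only empties each executor's 'logfileReport'; everything else in the report is copied through unchanged. A minimal sketch of that transformation on a hand-built dictionary (field names follow the code above; read_json is assumed to behave like json.load):

job_report = {
    'exitCode': 0,
    'executor': [
        {'name': 'athena', 'logfileReport': {'countSummary': {'WARNING': 3}}},
    ],
}
for executor in job_report.get('executor', []):
    if 'logfileReport' in executor:
        executor['logfileReport'] = {}
# job_report still has 'exitCode', but every 'logfileReport' is now {}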
Example 2
def publish_job_report(job, args, job_report_file="jobReport.json"):
    """
    Copy job report file to make it accessible by Harvester. Shrink job report file.

    :param job: job object.
    :param args: Pilot arguments object.
    :param job_report_file: name of job report (string).
    :raises FileHandlingFailure: in case of IOError.
    :return: True if the report was copied successfully, False otherwise (Boolean).
    """

    src_file = os.path.join(job.workdir, job_report_file)
    dst_file = os.path.join(args.harvester_workdir, job_report_file)

    try:
        logger.info("copy of payload report [{0}] to access point: {1}".format(
            job_report_file, args.harvester_workdir))
        # shrink jobReport
        job_report = read_json(src_file)
        if 'executor' in job_report:
            for executor in job_report['executor']:
                if 'logfileReport' in executor:
                    executor['logfileReport'] = {}

        # write_json() returns True on success, so its status can be returned directly
        return write_json(dst_file, job_report)

    except IOError:
        logger.error("job report copy failed")
        return False
Example 3
def process_jobreport(payload_report_file, job_scratch_path,
                      job_communication_point):
    """
    Copy job report file to make it accessible by Harvester. Shrink job report file.

    :param payload_report_file: name of job report (string).
    :param job_scratch_path: path to scratch directory (string).
    :param job_communication_point: path to updated job report accessible by Harvester (string).
    :raises FileHandlingFailure: in case of IOError.
    """

    src_file = os.path.join(job_scratch_path, payload_report_file)
    dst_file = os.path.join(job_communication_point, payload_report_file)

    try:
        logger.info("Copy of payload report [{0}] to access point: {1}".format(
            payload_report_file, job_communication_point))
        # shrink jobReport
        job_report = read_json(src_file)
        if 'executor' in job_report:
            for executor in job_report['executor']:
                if 'logfileReport' in executor:
                    executor['logfileReport'] = {}

        write_json(dst_file, job_report)

    except IOError:
        logger.error("Job report copy failed, execution terminated':  \n %s " %
                     (sys.exc_info()[1]))
        raise FileHandlingFailure("Job report copy from RAM failed")
Example 4
def get_schedconfig_queuedata(queue):
    """
    Return and store the schedconfig queuedata.

    :param queue: PanDA queue name (e.g. BNL_PROD_MCORE)
    :return: schedconfig queuedata json dictionary
    """

    # read it locally if the queuedata file already exists
    filename = os.path.join(os.environ.get('PILOT_HOME', ''),
                            config.Information.queuedata)
    if os.path.exists(filename):
        queuedata = read_json(filename)
        return queuedata

    url = config.Information.schedconfig
    if url == "":
        logger.fatal('URL for schedconfig not set')
        return False
    # add the queue name to the URL
    if not url.endswith('/'):
        url += '/'
    url += queue + '.all.json'
    queuedata = retrieve_json(url)

    # also write the queuedata to disk
    if not write_json(filename, queuedata):
        logger.warning("failed to write queuedata json to file")
    else:
        logger.info("wrote queuedata to local file %s" % filename)

    return queuedata
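For illustration, the URL built above for the docstring's example queue would look like this (the base URL is a made-up placeholder, not the real config.Information.schedconfig value):

url = 'https://example.org/schedconfig'  # hypothetical base URL
queue = 'BNL_PROD_MCORE'
if not url.endswith('/'):
    url += '/'
url += queue + '.all.json'
# url == 'https://example.org/schedconfig/BNL_PROD_MCORE.all.json'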
Example 5
def handle_updated_job_object(job, xdata, label='stage-in'):
    """
    Handle updated job object fields.

    :param job: job object.
    :param xdata: list of FileSpec objects.
    :param label: 'stage-in/out' (string).
    :return: None.
    :raises: StageInFailure, StageOutFailure
    """

    dictionary_name = (config.Container.stagein_status_dictionary if label == 'stage-in'
                       else config.Container.stageout_status_dictionary)

    # read the JSON file created by the stage-in/out script
    if path.exists(path.join(job.workdir, dictionary_name + '.log')):
        dictionary_name += '.log'
    file_dictionary = read_json(path.join(job.workdir, dictionary_name))

    # update the job object accordingly
    if file_dictionary:
        # get file info and set essential parameters
        for fspec in xdata:
            try:
                fspec.status = file_dictionary[fspec.lfn][0]
                fspec.status_code = file_dictionary[fspec.lfn][1]
                if label == 'stage-in':
                    fspec.turl = file_dictionary[fspec.lfn][2]
                    fspec.ddmendpoint = file_dictionary[fspec.lfn][3]
                else:
                    fspec.surl = file_dictionary[fspec.lfn][2]
                    fspec.turl = file_dictionary[fspec.lfn][3]
                    fspec.checksum['adler32'] = file_dictionary[fspec.lfn][4]
                    fspec.filesize = file_dictionary[fspec.lfn][5]
            except Exception as exc:
                msg = "exception caught while reading file dictionary: %s" % exc
                logger.warning(msg)
                if label == 'stage-in':
                    raise StageInFailure(msg)
                else:
                    raise StageOutFailure(msg)

        # get main error info ('error': [error_diag, error_code])
        error_diag = file_dictionary['error'][0]
        error_code = file_dictionary['error'][1]
        if error_code:
            job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
                error_code, msg=error_diag)
    else:
        msg = "%s file dictionary not found" % label
        logger.warning(msg)
        if label == 'stage-in':
            raise StageInFailure(msg)
        else:
            raise StageOutFailure(msg)
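Reading the list indices used above back out, the status dictionary written by the stage-in/out script would look roughly like this (a sketch reconstructed from the code, not an official format; the LFN and values are invented):

file_dictionary = {
    # stage-in entries: [status, status_code, turl, ddmendpoint]
    'EVNT.01234._000001.pool.root.1':
        ['transferred', 0, 'root://some.se//path/EVNT.01234._000001.pool.root.1', 'SOME_DATADISK'],
    # stage-out entries would instead be [status, status_code, surl, turl, adler32, filesize]
    # main error info, checked after the per-file loop
    'error': ['', 0],  # [error_diag, error_code]
}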
Example 6
def read_pilot_timing():
    """
    Read the pilot timing dictionary from file.

    :return: pilot timing dictionary (json dictionary).
    """

    pilot_timing_dictionary = {}

    path = os.path.join(os.environ.get('PILOT_HOME', ''), config.Pilot.timing_file)
    if os.path.exists(path):
        pilot_timing_dictionary = read_json(path)

    return pilot_timing_dictionary
Example 7
def get_job(harvesterpath):
    """
    Return the job description and the MPI rank (if applicable).

    :param harvesterpath: path to config.Harvester.jobs_list_file (string).
    :return: job object, rank (int).
    """

    rank = 0
    job = None
    logger.info("Going to read job definition from file")

    pandaids_list_filename = os.path.join(harvesterpath,
                                          config.Harvester.jobs_list_file)
    if not os.path.isfile(pandaids_list_filename):
        logger.info("File with PanDA IDs are missing. Nothing to execute.")
        return job, rank

    harvesterpath = os.path.abspath(harvesterpath)
    rank, max_ranks = get_ranks_info()

    pandaids = read_json(pandaids_list_filename)
    logger.info('Got {0} job ids'.format(len(pandaids)))
    pandaid = pandaids[rank]
    job_workdir = os.path.join(harvesterpath, str(pandaid))

    logger.info('Rank: {0} with job {1} will have work directory {2}'.format(
        rank, pandaid, job_workdir))

    job_def_filename = os.path.join(job_workdir,
                                    config.Harvester.pandajob_file)
    jobs_dict = read_json(job_def_filename)
    job_dict = jobs_dict[str(pandaid)]
    job = JobDescription()
    job.load(job_dict)

    return job, rank
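Taken together, get_job() assumes a Harvester directory layout like the sketch below (the file names come from config.Harvester; the PanDA ID is invented for illustration):

# harvesterpath/
#     <config.Harvester.jobs_list_file>      JSON list of PanDA IDs, e.g. [1234567]
#     1234567/
#         <config.Harvester.pandajob_file>   JSON dict keyed by the ID:
#             {"1234567": { ...job definition... }}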
Example 8
def get_memory_values(workdir, name=""):
    """
    Find the values in the memory monitor output file.

    In case the summary JSON file has not yet been produced, create a summary dictionary with the same format
    using the output text file (produced by the memory monitor and which is updated once per minute).

    FORMAT:
       {"Max":{"maxVMEM":40058624,"maxPSS":10340177,"maxRSS":16342012,"maxSwap":16235568},
        "Avg":{"avgVMEM":19384236,"avgPSS":5023500,"avgRSS":6501489,"avgSwap":5964997},
        "Other":{"rchar":NN,"wchar":NN,"rbytes":NN,"wbytes":NN}}

    :param workdir: relevant work directory (string).
    :param name: name of memory monitor (string).
    :return: memory values dictionary.
    """

    summary_dictionary = {}

    # Get the path to the proper memory info file (priority ordered)
    path = get_memory_monitor_info_path(workdir, allowtxtfile=True)
    if os.path.exists(path):
        logger.info("using path: %s (trf name=%s)" % (path, name))

        # Does a JSON summary file exist? If so, there's no need to calculate maximums and averages in the pilot
        if path.lower().endswith('json'):
            # Read the dictionary from the JSON file
            summary_dictionary = read_json(path)
        else:
            # Loop over the output file, line by line, and look for the maximum PSS value
            if name == "prmon":
                summary_dictionary = get_average_summary_dictionary_prmon(path)
            else:
                summary_dictionary = get_average_summary_dictionary(path)
            logger.debug('summary_dictionary=%s (trf name=%s)' %
                         (str(summary_dictionary), name))
    else:
        if path == "":
            logger.warning("filename not set for memory monitor output")
        # otherwise the memory output file has normally just not been produced yet

    return summary_dictionary
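A hedged usage sketch, pulling the maximum PSS out of the summary returned above (the keys follow the FORMAT block in the docstring; workdir and logger are assumed to exist):

summary = get_memory_values(workdir, name='prmon')
if summary and 'Max' in summary:
    max_pss = summary['Max'].get('maxPSS', 0)  # units as written by the memory monitor
    logger.info('maxPSS=%d' % max_pss)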
Example 9
def parse_job_definition_file(filename):
    """
    This function parses the Harvester job definition file and re-packages the job definition dictionaries.
    The format of the Harvester job definition dictionary is:
    dict = { job_id: { key: value, .. }, .. }
    The function returns a list of these dictionaries each re-packaged as
    dict = { key: value } (where the job_id is now one of the key-value pairs: 'jobid': job_id)

    :param filename: file name (string).
    :return: list of job definition dictionaries.
    """

    job_definitions_list = []

    # re-package dictionaries
    job_definitions_dict = read_json(filename)
    if job_definitions_dict:
        for job_id in job_definitions_dict:
            res = {'jobid': job_id}
            res.update(job_definitions_dict[job_id])
            job_definitions_list.append(res)

    return job_definitions_list
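For example, a two-job Harvester definition file is re-packaged like this (the IDs and the key are invented for illustration):

# input, as returned by read_json(filename):
#     {"1234567": {"taskID": "42"}, "1234568": {"taskID": "42"}}
# output of parse_job_definition_file(filename):
#     [{"jobid": "1234567", "taskID": "42"},
#      {"jobid": "1234568", "taskID": "42"}]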
Example 10
    # get the args from the arg parser
    args = get_args()
    args.debug = True
    args.nopilotlog = False

    establish_logging(args, filename=config.Pilot.stageinlog)
    logger = logging.getLogger(__name__)

    #ret = verify_args()
    #if ret:
    #    exit(ret)

    # get the file info
    try:
        replica_dictionary = read_json(
            os.path.join(args.workdir, args.replicadictionary))
    except Exception as e:
        message('exception caught reading json: %s' % e)
        exit(1)

#    file_list_dictionary = get_file_lists(args.lfns, args.scopes, args.filesizes, args.checksums, args.allowlans,
#                                          args.allowwans, args.directaccesslans, args.directaccesswans, args.istars,
#                                          args.accessmodes, args.storagetokens, args.guids)
#    lfns = file_list_dictionary.get('lfns')
#    scopes = file_list_dictionary.get('scopes')
#    filesizes = file_list_dictionary.get('filesizes')
#    checksums = file_list_dictionary.get('checksums')
#    allowlans = file_list_dictionary.get('allowlans')
#    allowwans = file_list_dictionary.get('allowwans')
#    directaccesslans = file_list_dictionary.get('directaccesslans')
#    directaccesswans = file_list_dictionary.get('directaccesswans')
Example 11
    #ret = verify_args()
    #if ret:
    #    exit(ret)

    # get the file info
    lfns, scopes = get_file_lists(args.lfns, args.scopes)
    if len(lfns) != len(scopes):
        message('file lists not same length: len(lfns)=%d, len(scopes)=%d' % (len(lfns), len(scopes)))

    # get the initial trace report
    path = os.path.join(args.workdir, args.tracereportname)
    if not os.path.exists(path):
        message('file does not exist: %s' % path)
        exit(NO_TRACEREPORT)

    trace_report = read_json(path)
    if not trace_report:
        message('failed to read trace report')
        exit(NO_TRACEREPORT)

    try:
        infoservice = InfoService()
        infoservice.init(args.queuename, infosys.confinfo, infosys.extinfo)
        infosys.init(args.queuename)  # is this correct? otherwise infosys.queuedata doesn't get set
    except Exception as e:
        message(e)

    # perform stage-in (single transfers)
    err = ""
    for lfn, scope in zip(lfns, scopes):
        try:
Example 12
def run(args):
    """
    Main execution function for the generic HPC workflow.

    :param args: pilot arguments.
    :returns: traces object.
    """

    # set the communication point; the worker report should be placed there, matching the Harvester working directory
    if args.harvester_workdir:
        communication_point = args.harvester_workdir
    else:
        communication_point = os.getcwd()
    work_report = get_initial_work_report()
    worker_attributes_file = config.Harvester.workerAttributesFile
    worker_stageout_declaration = config.Harvester.StageOutnFile
    payload_report_file = config.Payload.jobreport
    payload_stdout_file = config.Payload.payloadstdout
    payload_stderr_file = config.Payload.payloadstderr

    try:
        logger.info('setting up signal handling')
        signal.signal(signal.SIGINT, functools.partial(interrupt, args))

        logger.info('setting up tracing')
        traces = namedtuple('traces', ['pilot'])
        traces.pilot = {'state': SUCCESS, 'nr_jobs': 0}

        if args.hpc_resource == '':
            logger.critical('hpc resource not specified, cannot continue')
            traces.pilot['state'] = FAILURE
            return traces

        # get the resource reference
        resource = __import__('pilot.resource.%s' % args.hpc_resource,
                              globals(), locals(), [args.hpc_resource], 0)

        # get the user reference
        user = __import__('pilot.user.%s.common' % args.pilot_user.lower(),
                          globals(), locals(), [args.pilot_user.lower()], 0)

        # get job (and rank)
        add_to_pilot_timing('0', PILOT_PRE_GETJOB, time.time(), args)
        job, rank = resource.get_job(communication_point)
        add_to_pilot_timing(job.jobid, PILOT_POST_GETJOB, time.time(), args)

        add_to_pilot_timing(job.jobid, PILOT_PRE_SETUP, time.time(), args)
        # cd to the job working directory
        work_dir = resource.set_job_workdir(job, communication_point)
        work_report['workdir'] = work_dir
        worker_attributes_file = os.path.join(work_dir, worker_attributes_file)
        logger.debug("Worker attributes will be publeshied in: {0}".format(
            worker_attributes_file))

        set_pilot_state(job=job, state="starting")
        work_report["jobStatus"] = job.state
        publish_work_report(work_report, worker_attributes_file)

        # Get HPC specific setup commands
        logger.info('setup for resource %s: %s' %
                    (args.hpc_resource, str(resource.get_setup())))
        setup_str = "; ".join(resource.get_setup())

        # Prepare job scratch directory (RAM disk etc.)
        job_scratch_dir = resource.set_scratch_workdir(job, work_dir, args)

        my_command = " ".join([job.script, job.script_parameters])
        my_command = resource.command_fix(my_command, job_scratch_dir)
        my_command = setup_str + my_command
        add_to_pilot_timing(job.jobid, PILOT_POST_SETUP, time.time(), args)

        # Basic execution. Should be replaced with something like 'run_payload'
        logger.debug("Going to launch: {0}".format(my_command))
        logger.debug("Current work directory: {0}".format(job_scratch_dir))
        payloadstdout = open(payload_stdout_file, "w")
        payloadstderr = open(payload_stderr_file, "w")

        add_to_pilot_timing(job.jobid, PILOT_PRE_PAYLOAD, time.time(), args)
        set_pilot_state(job=job, state="running")
        work_report["jobStatus"] = job.state
        work_report["startTime"] = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S")
        start_time = time.asctime(time.localtime(time.time()))
        job.startTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        publish_work_report(work_report, worker_attributes_file)

        stime = time.time()
        t0 = os.times()
        exit_code, stdout, stderr = execute(my_command,
                                            stdout=payloadstdout,
                                            stderr=payloadstderr,
                                            shell=True)
        logger.debug("Payload exit code: {0}".format(exit_code))
        t1 = os.times()
        exetime = time.time() - stime
        end_time = time.asctime(time.localtime(time.time()))
        t = [x - y for x, y in zip(t1, t0)]
        t_tot = sum(t[2:3])
        job.endTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        payloadstdout.close()
        payloadstderr.close()
        add_to_pilot_timing(job.jobid, PILOT_POST_PAYLOAD, time.time(), args)

        state = 'finished' if exit_code == 0 else 'failed'
        set_pilot_state(job=job, state=state)
        job.exitcode = exit_code

        work_report["startTime"] = job.startTime
        work_report["endTime"] = job.endTime
        work_report["jobStatus"] = job.state
        work_report["cpuConsumptionTime"] = t_tot
        work_report["transExitCode"] = job.exitcode

        log_jobreport = "\nPayload exit code: {0} JobID: {1} \n".format(
            exit_code, job.jobid)
        log_jobreport += "CPU comsumption time: {0}  JobID: {1} \n".format(
            t_tot, job.jobid)
        log_jobreport += "Start time: {0}  JobID: {1} \n".format(
            start_time, job.jobid)
        log_jobreport += "End time: {0}  JobID: {1} \n".format(
            end_time, job.jobid)
        log_jobreport += "Execution time: {0} sec.  JobID: {1} \n".format(
            exetime, job.jobid)
        logger.info(log_jobreport)
        log_jobreport = "\nJob report start time: {0}\nJob report end time: {1}".format(
            job.startTime, job.endTime)
        logger.debug(log_jobreport)

        # Parse job report file and update of work report
        if os.path.exists(payload_report_file):
            payload_report = user.parse_jobreport_data(
                read_json(payload_report_file))
            work_report.update(payload_report)
            resource.process_jobreport(payload_report_file, job_scratch_dir,
                                       work_dir)

        resource.postprocess_workdir(job_scratch_dir)

        # output files should not be packed with logs
        protectedfiles = list(job.output_files.keys())

        # log file not produced (yet), so should be excluded
        if job.log_file in protectedfiles:
            protectedfiles.remove(job.log_file)
        else:
            logger.info("Log files was not declared")

        logger.info("Cleanup of working directory")

        protectedfiles.extend(
            [worker_attributes_file, worker_stageout_declaration])
        user.remove_redundant_files(job_scratch_dir, protectedfiles)
        res = tar_files(job_scratch_dir, protectedfiles, job.log_file)
        if res > 0:
            raise FileHandlingFailure("Log file tar failed")

        add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)
        # copy output to the shared FS for stage-out
        if job_scratch_dir != work_dir:
            copy_output(job, job_scratch_dir, work_dir)
        add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

        logger.info("Declare stage-out")
        add_to_pilot_timing(job.jobid, PILOT_PRE_FINAL_UPDATE, time.time(),
                            args)
        declare_output(job, work_report, worker_stageout_declaration)

        logger.info("All done")
        publish_work_report(work_report, worker_attributes_file)
        traces.pilot['state'] = SUCCESS
        logger.debug("Final report: {0}".format(work_report))
        add_to_pilot_timing(job.jobid, PILOT_POST_FINAL_UPDATE, time.time(),
                            args)

    except Exception as e:
        work_report["jobStatus"] = "failed"
        work_report["exitMsg"] = str(e)
        publish_work_report(work_report, worker_attributes_file)
        logger.exception('exception caught:')
        traces.pilot['state'] = FAILURE

    return traces
Example 13
    try:
        job_id = panda_ids[rank]
    except IndexError:
        logger.critical("Pilot has no job for rank {0}".format(rank))
        logger.critical("Exiting pilot")
        main_exit(1)

    logger.debug("Job [{0}] will be processed".format(job_id))
    os.chdir(str(job_id))
    worker_communication_point = os.getcwd()

    work_report['workdir'] = worker_communication_point
    workerAttributesFile = os.path.join(worker_communication_point, workerAttributesFile)
    trans_job_workdir = os.path.join(scratch_path, str(job_id))

    jobs_dict = read_json("HPCJobs.json")
    job_dict = jobs_dict[str(job_id)]

    job = JobDescription()
    job.load(job_dict)
    # add path to input files in RAM
    for inp_file in job.input_files:
        job.input_files[inp_file]["scratch_path"] = os.path.join(trans_job_workdir, inp_file)

    job.startTime = ""
    job.endTime = ""
    setup_str = "; ".join(get_setup(job))

    job_working_dir = titan_prepare_wd(scratch_path, trans_job_workdir, worker_communication_point, job,
                                       workerAttributesFile)