Example #1
def process_jobreport(payload_report_file, job_scratch_path,
                      job_communication_point):
    """
    Copy the job report file so that Harvester can access it, shrinking it on the way (executor logfile reports are emptied).

    :param payload_report_file: name of job report (string).
    :param job_scratch_path: path to scratch directory (string).
    :param job_communication_point: path to updated job report accessible by Harvester (string).
    :raises FileHandlingFailure: in case of IOError.
    """

    src_file = os.path.join(job_scratch_path, payload_report_file)
    dst_file = os.path.join(job_communication_point, payload_report_file)

    try:
        logger.info("Copy of payload report [{0}] to access point: {1}".format(
            payload_report_file, job_communication_point))
        # shrink jobReport
        job_report = read_json(src_file)
        if 'executor' in job_report:
            for executor in job_report['executor']:
                if 'logfileReport' in executor:
                    executor['logfileReport'] = {}

        write_json(dst_file, job_report)

    except IOError:
        logger.error("Job report copy failed, execution terminated':  \n %s " %
                     (sys.exc_info()[1]))
        raise FileHandlingFailure("Job report copy from RAM failed")
Example #2
def write_file(path, contents, mute=True):
    """
    Write the given contents to a file.

    :param path: full path for file (string).
    :param contents: file contents (string).
    :param mute: suppress the stdout info message if True (boolean).
    :raises PilotException: FileHandlingFailure.
    :return: True if successful, otherwise False.
    """

    status = False

    f = open_file(path, 'w')
    if f:
        try:
            f.write(contents)
        except IOError as e:
            raise FileHandlingFailure(e)
        else:
            status = True
        f.close()

    if not mute:
        logger.info('created file: %s' % path)

    return status
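A quick sketch of a call (illustrative path; mute=False enables the info message):

if write_file('/tmp/pilot_test.txt', 'hello pilot\n', mute=False):
    logger.debug('file written')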
Example #3
def write_json(filename,
               data,
               sort_keys=True,
               indent=4,
               separators=(',', ': ')):
    """
    Write the dictionary to a JSON file.

    :param filename: file name (string).
    :param data: object to be written to file (dictionary or list).
    :param sort_keys: should entries be sorted? (boolean).
    :param indent: indentation level, default 4 (int).
    :param separators: field separators (default (',', ': ') for dictionaries, use e.g. (',\n') for lists) (tuple)
    :raises PilotException: FileHandlingFailure.
    :return: status (boolean).
    """

    status = False

    try:
        with open(filename, 'w') as fh:
            dumpjson(data,
                     fh,
                     sort_keys=sort_keys,
                     indent=indent,
                     separators=separators)
    except IOError as exc:
        raise FileHandlingFailure(exc)
    else:
        status = True

    return status
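A short usage sketch (dumpjson is presumably json.dump imported under an alias; the path and payload are illustrative):

job_metadata = {'jobid': 12345, 'state': 'finished'}
if write_json('/tmp/job_metadata.json', job_metadata):
    logger.debug('metadata written')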
Example #4
def postprocess_workdir(workdir):
    """
    Post-processing of working directory. Unlink paths.

    :param workdir: path to directory to be processed (string).
    :raises FileHandlingFailure: in case of IOError.
    """

    pseudo_dir = "poolcond"
    try:
        # check and remove relative to workdir (the caller may not have chdir'ed there)
        if os.path.exists(os.path.join(workdir, pseudo_dir)):
            remove(os.path.join(workdir, pseudo_dir))
    except IOError:
        raise FileHandlingFailure(
            "Post processing of working directory failed")
Example #5
def copy_output(job, job_scratch_dir, work_dir):
    """
    Copy the declared output files from the scratch directory to the access point.

    :param job: job object (provides the output_files dictionary).
    :param job_scratch_dir: path to the job scratch directory (string).
    :param work_dir: path to the shared work directory (string).
    :raises FileHandlingFailure: in case of IOError.
    :return: 0 (int).
    """

    cp_start = time.time()
    try:
        for outfile in job.output_files.keys():
            if os.path.exists(os.path.join(job_scratch_dir, outfile)):
                copy(os.path.join(job_scratch_dir, outfile),
                     os.path.join(work_dir, outfile))
        os.chdir(work_dir)
    except IOError:
        raise FileHandlingFailure(
            "Copy from scratch dir to access point failed")
    finally:
        cp_time = time.time() - cp_start
        logger.info("Copy of outputs took: {0} sec.".format(cp_time))
    return 0
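A minimal sketch of the stage-out call (assumes a job object with an output_files dictionary, as used above; paths are illustrative):

# copy declared outputs from the RAM-disk scratch area back to the shared work directory
copy_output(job, '/tmp/scratch/12345', '/home/harvester/work')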
Example #6
def open_file(filename, mode):
    """
    Open and return a file pointer for the given mode.
    Note: the caller needs to close the file.

    :param filename: file name (string).
    :param mode: file mode (character).
    :raises PilotException: FileHandlingFailure.
    :return: file pointer.
    """

    f = None
    try:
        f = open(filename, mode)
    except IOError as exc:
        raise FileHandlingFailure(exc)

    return f
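Usage sketch (the helper does not close the file, so the caller must):

fh = open_file('/tmp/pilot.log', 'a')  # illustrative path
if fh:
    fh.write('pilot started\n')
    fh.close()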
Example #7
def move(path1, path2):
    """
    Move a file from path1 to path2.

    :param path1: source path (string).
    :param path2: destination path (string).
    :raises PilotException: FileHandlingFailure, NoSuchFile.
    """

    if not os.path.exists(path1):
        logger.warning('file move failure: path does not exist: %s', path1)
        raise NoSuchFile("File does not exist: %s" % path1)

    try:
        import shutil
        shutil.move(path1, path2)
    except IOError as exc:
        logger.warning("exception caught during file move: %s", exc)
        raise FileHandlingFailure(exc)
    else:
        logger.info("moved %s to %s", path1, path2)
Example #8
def copy(path1, path2):
    """
    Copy path1 to path2.

    :param path1: file path (string).
    :param path2: file path (string).
    :raises PilotException: FileHandlingFailure, NoSuchFile.
    :return: None.
    """

    if not os.path.exists(path1):
        logger.warning('file copy failure: path does not exist: %s', path1)
        raise NoSuchFile("File does not exist: %s" % path1)

    try:
        copy2(path1, path2)
    except IOError as exc:
        logger.warning("exception caught during file copy: %s", exc)
        raise FileHandlingFailure(exc)
    else:
        logger.info("copied %s to %s", path1, path2)
Example #9
def open_file(filename, mode):
    """
    Open and return a file pointer for the given mode.
    Note: the caller needs to close the file.

    :param filename: file name (string).
    :param mode: file mode (character).
    :raises PilotException: FileHandlingFailure.
    :return: file pointer.
    """

    f = None
    if not mode == 'w' and not os.path.exists(filename):
        raise NoSuchFile("File does not exist: %s" % filename)

    try:
        f = open(filename, mode)
    except IOError as e:
        raise FileHandlingFailure(e)

    return f
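Unlike the variant in Example 6, this version fails fast with NoSuchFile when a non-write mode targets a missing file. A quick sketch:

try:
    fh = open_file('/tmp/does_not_exist.txt', 'r')  # illustrative path
except NoSuchFile as exc:
    logger.warning(str(exc))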
Example #10
def calculate_checksum(filename, algorithm='adler32'):
    """
    Calculate the checksum value for the given file.
    The default algorithm is adler32. MD5 is also supported.
    Valid algorithms are 1) adler32/adler/ad32/ad, 2) md5/md5sum/md.

    :param filename: file name (string).
    :param algorithm: optional algorithm string.
    :raises FileHandlingFailure, NotImplementedError: exception raised when file does not exist or for unknown algorithm.
    :return: checksum value (string).
    """

    if not os.path.exists(filename):
        raise FileHandlingFailure('file does not exist: %s' % filename)

    if algorithm in ('adler32', 'adler', 'ad', 'ad32'):
        return calculate_adler32_checksum(filename)
    elif algorithm in ('md5', 'md5sum', 'md'):
        return calculate_md5_checksum(filename)
    else:
        msg = 'unknown checksum algorithm: %s' % algorithm
        logger.warning(msg)
        raise NotImplementedError(msg)
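calculate_adler32_checksum is not shown in this listing; a minimal sketch of what such a helper might look like, streaming the file in chunks to bound memory use (the real implementation may differ):

import zlib

def calculate_adler32_checksum_sketch(filename):
    """Illustrative adler32 helper: read the file in 64 kB chunks."""
    checksum = 1  # adler32 starting value
    with open(filename, 'rb') as fh:
        for chunk in iter(lambda: fh.read(65536), b''):
            checksum = zlib.adler32(chunk, checksum)
    # mask to unsigned 32 bits and format as 8 hex digits
    return '{0:08x}'.format(checksum & 0xffffffff)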
Example #11
def write_file(path, contents, mute=True, mode='w', unique=False):
    """
    Write the given contents to a file.
    If unique=True, then if the file already exists, an index will be added (e.g. 'out.txt' -> 'out-1.txt').

    :param path: full path for file (string).
    :param contents: file contents (object).
    :param mute: boolean to control stdout info message.
    :param mode: file mode (e.g. 'w', 'r', 'a', 'wb', 'rb') (string).
    :param unique: file must be unique (Boolean).
    :raises PilotException: FileHandlingFailure.
    :return: True if successful, otherwise False.
    """

    status = False

    # add an incremental file name (add -%d if path already exists) if necessary
    if unique:
        path = get_nonexistant_path(path)

    f = open_file(path, mode)
    if f:
        try:
            f.write(contents)
        except IOError as exc:
            raise FileHandlingFailure(exc)
        else:
            status = True
        f.close()

    if not mute:
        if 'w' in mode:
            logger.info('created file: %s', path)
        if 'a' in mode:
            logger.info('appended file: %s', path)

    return status
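A sketch of the unique=True behaviour (assuming get_nonexistant_path appends an increasing index, as the docstring describes):

write_file('/tmp/out.txt', 'first\n', unique=True)   # writes /tmp/out.txt
write_file('/tmp/out.txt', 'second\n', unique=True)  # writes /tmp/out-1.txt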
Example #12
def set_scratch_workdir(job, work_dir, args):
    """
    Copy input files and some db files to RAM disk.

    :param job: job object.
    :param work_dir: job working directory (permanent FS) (string).
    :param args: args dictionary to collect timing metrics.
    :return: job working directory in scratch (string).
    """

    scratch_path = config.HPC.scratch
    du = disk_usage(scratch_path)
    logger.info("Scratch dir available space: {0} used: {1}".format(
        du.free, du.used))
    job_scratch_dir = os.path.join(scratch_path, str(job.jobid))
    for inp_file in job.input_files:
        job.input_files[inp_file]["scratch_path"] = job_scratch_dir
    logger.debug("Job scratch path: {0}".format(job_scratch_dir))
    # special data, that should be preplaced in RAM disk
    dst_db_path = 'sqlite200/'
    dst_db_filename = 'ALLP200.db'
    dst_db_path_2 = 'geomDB/'
    dst_db_filename_2 = 'geomDB_sqlite'
    tmp_path = 'tmp/'
    src_file = '/ccs/proj/csc108/AtlasReleases/21.0.15/DBRelease/current/sqlite200/ALLP200.db'
    src_file_2 = '/ccs/proj/csc108/AtlasReleases/21.0.15/DBRelease/current/geomDB/geomDB_sqlite'

    if os.path.exists(scratch_path):
        try:
            add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEIN, time.time(),
                                args)
            logger.debug("Prepare \'tmp\' dir in scratch ")
            if not os.path.exists(scratch_path + tmp_path):
                os.makedirs(scratch_path + tmp_path)
            logger.debug("Prepare dst and copy sqlite db files")
            t0 = time.time()
            if not os.path.exists(scratch_path + dst_db_path):
                os.makedirs(scratch_path + dst_db_path)
            shutil.copyfile(src_file,
                            scratch_path + dst_db_path + dst_db_filename)
            logger.debug("")
            sql_cp_time = time.time() - t0
            logger.debug("Copy of sqlite files took: {0}".format(sql_cp_time))
            logger.debug("Prepare dst and copy geomDB files")
            t0 = time.time()
            if not os.path.exists(scratch_path + dst_db_path_2):
                os.makedirs(scratch_path + dst_db_path_2)
            shutil.copyfile(src_file_2,
                            scratch_path + dst_db_path_2 + dst_db_filename_2)
            geomdb_cp_time = time.time() - t0
            logger.debug(
                "Copy of geomDB files took: {0} s".format(geomdb_cp_time))
            logger.debug("Prepare job scratch dir")
            t0 = time.time()
            if not os.path.exists(job_scratch_dir):
                os.makedirs(job_scratch_dir)
            logger.debug("Copy input file")
            for inp_file in job.input_files:
                logger.debug("Copy: {0} to {1}".format(
                    os.path.join(work_dir, inp_file),
                    job.input_files[inp_file]["scratch_path"]))
                shutil.copyfile(
                    os.path.join(work_dir, inp_file),
                    os.path.join(job.input_files[inp_file]["scratch_path"],
                                 inp_file))
            input_cp_time = time.time() - t0
            logger.debug(
                "Copy of input files took: {0} s".format(input_cp_time))
        except IOError as e:
            logger.error("I/O error({0}): {1}".format(e.errno, e.strerror))
            logger.error(
                "Copy to scratch failed, execution terminated': \n %s " %
                (sys.exc_info()[1]))
            raise FileHandlingFailure("Copy to RAM disk failed")
        finally:
            add_to_pilot_timing(job.jobid, PILOT_POST_STAGEIN, time.time(),
                                args)
    else:
        logger.info('Scratch directory (%s) does not exist' % scratch_path)
        return work_dir

    os.chdir(job_scratch_dir)
    logger.debug("Current directory: {0}".format(os.getcwd()))
    true_dir = '/ccs/proj/csc108/AtlasReleases/21.0.15/nfs_db_files'
    pseudo_dir = "./poolcond"
    os.symlink(true_dir, pseudo_dir)
    du = disk_usage(scratch_path)
    logger.info("Scratch dir available space for job: {0} used: {1}".format(
        du.free, du.used))

    return job_scratch_dir
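Schematically, a successful call leaves the scratch area (config.HPC.scratch) laid out as follows and returns the per-job directory, which is also the new working directory:

# <scratch>/tmp/
# <scratch>/sqlite200/ALLP200.db
# <scratch>/geomDB/geomDB_sqlite
# <scratch>/<jobid>/                <- returned job_scratch_dir (and new CWD)
# <scratch>/<jobid>/<input files>
# <scratch>/<jobid>/poolcond -> /ccs/proj/csc108/AtlasReleases/21.0.15/nfs_db_files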
Example #13
def run(args):
    """
    Main execution function for the generic HPC workflow.

    :param args: pilot arguments.
    :returns: traces object.
    """

    # set the communication point: the worker report is placed there, matched with the Harvester working directory
    if args.harvester_workdir:
        communication_point = args.harvester_workdir
    else:
        communication_point = os.getcwd()
    work_report = get_initial_work_report()
    worker_attributes_file = config.Harvester.workerAttributesFile
    worker_stageout_declaration = config.Harvester.StageOutnFile
    payload_report_file = config.Payload.jobreport
    payload_stdout_file = config.Payload.payloadstdout
    payload_stderr_file = config.Payload.payloadstderr

    try:
        logger.info('setting up signal handling')
        signal.signal(signal.SIGINT, functools.partial(interrupt, args))

        logger.info('setting up tracing')
        traces = namedtuple('traces', ['pilot'])
        traces.pilot = {'state': SUCCESS, 'nr_jobs': 0}

        if args.hpc_resource == '':
            logger.critical('hpc resource not specified, cannot continue')
            traces.pilot['state'] = FAILURE
            return traces

        # get the resource reference
        resource = __import__('pilot.resource.%s' % args.hpc_resource,
                              globals(), locals(), [args.hpc_resource], -1)

        # get the user reference
        user = __import__('pilot.user.%s.common' % args.pilot_user.lower(),
                          globals(), locals(), [args.pilot_user.lower()], -1)

        # get job (and rank)
        add_to_pilot_timing('0', PILOT_PRE_GETJOB, time.time(), args)
        job, rank = resource.get_job(communication_point)
        add_to_pilot_timing(job.jobid, PILOT_POST_GETJOB, time.time(), args)
        # cd to job working directory

        add_to_pilot_timing(job.jobid, PILOT_PRE_SETUP, time.time(), args)
        work_dir = resource.set_job_workdir(job, communication_point)
        work_report['workdir'] = work_dir
        worker_attributes_file = os.path.join(work_dir, worker_attributes_file)
        logger.debug("Worker attributes will be publeshied in: {0}".format(
            worker_attributes_file))

        set_pilot_state(job=job, state="starting")
        work_report["jobStatus"] = job.state
        publish_work_report(work_report, worker_attributes_file)

        # Get HPC specific setup commands
        logger.info('setup for resource %s: %s' %
                    (args.hpc_resource, str(resource.get_setup())))
        setup_str = "; ".join(resource.get_setup())

        # Prepare job scratch directory (RAM disk etc.)
        job_scratch_dir = resource.set_scratch_workdir(job, work_dir, args)

        my_command = " ".join([job.script, job.script_parameters])
        my_command = resource.command_fix(my_command, job_scratch_dir)
        my_command = setup_str + my_command
        add_to_pilot_timing(job.jobid, PILOT_POST_SETUP, time.time(), args)

        # Basic execution. Should be replaced with something like 'run_payload'
        logger.debug("Going to launch: {0}".format(my_command))
        logger.debug("Current work directory: {0}".format(job_scratch_dir))
        payloadstdout = open(payload_stdout_file, "w")
        payloadstderr = open(payload_stderr_file, "w")

        add_to_pilot_timing(job.jobid, PILOT_PRE_PAYLOAD, time.time(), args)
        set_pilot_state(job=job, state="running")
        work_report["jobStatus"] = job.state
        work_report["startTime"] = datetime.utcnow().strftime(
            "%Y-%m-%d %H:%M:%S")
        start_time = time.asctime(time.localtime(time.time()))
        job.startTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        publish_work_report(work_report, worker_attributes_file)

        stime = time.time()
        t0 = os.times()
        exit_code, stdout, stderr = execute(my_command,
                                            stdout=payloadstdout,
                                            stderr=payloadstderr,
                                            shell=True)
        logger.debug("Payload exit code: {0}".format(exit_code))
        t1 = os.times()
        exetime = time.time() - stime
        end_time = time.asctime(time.localtime(time.time()))
        t = [x - y for x, y in zip(t1, t0)]
        t_tot = sum(t[2:3])
        job.endTime = datetime.utcnow().strftime("%Y-%m-%d %H:%M:%S")
        payloadstdout.close()
        payloadstderr.close()
        add_to_pilot_timing(job.jobid, PILOT_POST_PAYLOAD, time.time(), args)

        state = 'finished' if exit_code == 0 else 'failed'
        set_pilot_state(job=job, state=state)
        job.exitcode = exit_code

        work_report["startTime"] = job.startTime
        work_report["endTime"] = job.endTime
        work_report["jobStatus"] = job.state
        work_report["cpuConsumptionTime"] = t_tot
        work_report["transExitCode"] = job.exitcode

        log_jobreport = "\nPayload exit code: {0} JobID: {1} \n".format(
            exit_code, job.jobid)
        log_jobreport += "CPU comsumption time: {0}  JobID: {1} \n".format(
            t_tot, job.jobid)
        log_jobreport += "Start time: {0}  JobID: {1} \n".format(
            start_time, job.jobid)
        log_jobreport += "End time: {0}  JobID: {1} \n".format(
            end_time, job.jobid)
        log_jobreport += "Execution time: {0} sec.  JobID: {1} \n".format(
            exetime, job.jobid)
        logger.info(log_jobreport)
        log_jobreport = "\nJob report start time: {0}\nJob report end time: {1}".format(
            job.startTime, job.endTime)
        logger.debug(log_jobreport)

        # Parse the job report file and update the work report
        if os.path.exists(payload_report_file):
            payload_report = user.parse_jobreport_data(
                read_json(payload_report_file))
            work_report.update(payload_report)
            resource.process_jobreport(payload_report_file, job_scratch_dir,
                                       work_dir)

        resource.postprocess_workdir(job_scratch_dir)

        # output files should not be packed with logs
        protectedfiles = list(job.output_files.keys())

        # log file not produced (yet), so should be excluded
        if job.log_file in protectedfiles:
            protectedfiles.remove(job.log_file)
        else:
            logger.info("Log files was not declared")

        logger.info("Cleanup of working directory")

        protectedfiles.extend(
            [worker_attributes_file, worker_stageout_declaration])
        user.remove_redundant_files(job_scratch_dir, protectedfiles)
        res = tar_files(job_scratch_dir, protectedfiles, job.log_file)
        if res > 0:
            raise FileHandlingFailure("Log file tar failed")

        add_to_pilot_timing(job.jobid, PILOT_PRE_STAGEOUT, time.time(), args)
        # Copy of output to shared FS for stageout
        if not job_scratch_dir == work_dir:
            copy_output(job, job_scratch_dir, work_dir)
        add_to_pilot_timing(job.jobid, PILOT_POST_STAGEOUT, time.time(), args)

        logger.info("Declare stage-out")
        add_to_pilot_timing(job.jobid, PILOT_PRE_FINAL_UPDATE, time.time(),
                            args)
        declare_output(job, work_report, worker_stageout_declaration)

        logger.info("All done")
        publish_work_report(work_report, worker_attributes_file)
        traces.pilot['state'] = SUCCESS
        logger.debug("Final report: {0}".format(work_report))
        add_to_pilot_timing(job.jobid, PILOT_POST_FINAL_UPDATE, time.time(),
                            args)

    except Exception as e:
        work_report["jobStatus"] = "failed"
        work_report["exitMsg"] = str(e)
        publish_work_report(work_report, worker_attributes_file)
        logging.exception('exception caught:')
        traces.pilot['state'] = FAILURE

    return traces
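A schematic invocation from a launcher (args must carry at least harvester_workdir, hpc_resource and pilot_user; in the real pilot these come from its command-line parser):

traces = run(args)
sys.exit(0 if traces.pilot['state'] == SUCCESS else 1)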