Example #1
0
File: mv.py Project: brinick/pilot2
def copy_out(files, copy_type="mv", **kwargs):
    """
    Stage out the given files with a plain local `mv` or `cp`.

    After a successful transfer, and only when no `output_dir` keyword is
    given, an output list is written with `create_output_list` (for ARC CE),
    using the parent directory of the work directory and the optional
    `ddmconf` keyword.

    :param files: list of `FileSpec` objects
    :param copy_type: transfer method, "mv" or "cp" (string)
    :param kwargs: must contain `workdir`; `output_dir` and `ddmconf` are optional
    :return: the `files` list that was passed in
    :raises PilotException: StageOutFailure
    """

    if copy_type not in ("cp", "mv"):
        raise StageOutFailure("incorrect method for copy out")

    workdir = kwargs.get('workdir')
    if not workdir:
        raise StageOutFailure("Workdir is not specified")

    exit_code, stdout, stderr = move_all_files(files, copy_type, workdir)
    if exit_code != 0:
        # the stdout of the failed transfer is used as the error message
        raise StageOutFailure(stdout)

    # Create output list for ARC CE if necessary
    init_dir = os.path.dirname(workdir)
    logger.debug('init_dir for output.list=%s' % init_dir)
    if not kwargs.get('output_dir', ''):
        create_output_list(files, init_dir, kwargs.get('ddmconf', None))

    return files
Example #2
0
def handle_updated_job_object(job, xdata, label='stage-in'):
    """
    Copy the results written by the stage-in/out script back onto the FileSpec objects and the job.

    The status dictionary is read as JSON from the job work directory. Its name
    comes from `config.Container` and the variant with a '.log' suffix is used
    when that file exists. Each lfn maps to a list that is read positionally:
    [status, status_code, turl, ddmendpoint] for stage-in and
    [status, status_code, surl, turl, adler32 checksum, filesize] for stage-out.
    The 'error' entry holds [error_diag, error_code]; a non-zero error code is
    added to the job's pilot error codes.

    :param job: job object.
    :param xdata: list of FileSpec objects.
    :param label: 'stage-in/out' (string).
    :return:
    :raises: StageInFailure, StageOutFailure
    """

    def _failure(message):
        # build the exception that matches the staging direction
        return StageInFailure(message) if label == 'stage-in' else StageOutFailure(message)

    if label == 'stage-in':
        dictionary_name = config.Container.stagein_status_dictionary
    else:
        dictionary_name = config.Container.stageout_status_dictionary

    # read the JSON file created by the stage-in/out script
    log_variant = dictionary_name + '.log'
    if path.exists(path.join(job.workdir, log_variant)):
        dictionary_name = log_variant
    file_dictionary = read_json(path.join(job.workdir, dictionary_name))

    # nothing to update from if the dictionary is missing or empty
    if not file_dictionary:
        msg = "%s file dictionary not found" % label
        logger.warning(msg)
        raise _failure(msg)

    # get file info and set essential parameters
    for fspec in xdata:
        try:
            info = file_dictionary[fspec.lfn]
            fspec.status = info[0]
            fspec.status_code = info[1]
            if label == 'stage-in':
                fspec.turl = info[2]
                fspec.ddmendpoint = info[3]
            else:
                fspec.surl = info[2]
                fspec.turl = info[3]
                fspec.checksum['adler32'] = info[4]
                fspec.filesize = info[5]
        except Exception as exc:
            msg = "exception caught while reading file dictionary: %s" % exc
            logger.warning(msg)
            raise _failure(msg)

    # get main error info ('error': [error_diag, error_code])
    error_info = file_dictionary['error']
    error_diag = error_info[0]
    error_code = error_info[1]
    if error_code:
        job.piloterrorcodes, job.piloterrordiags = errors.add_error_code(
            error_code, msg=error_diag)
Example #3
0
def _stage_out_api(fspec, summary_file_path, trace_report, trace_report_out, transfer_timeout):
    """
    Upload a single file with the rucio UploadClient and verify that it reached the storage.

    :param fspec: FileSpec object of the file to upload.
    :param summary_file_path: passed on to the upload client as `summary_file_path`.
    :param trace_report: trace object handed to the upload client when `tracing_rucio` is set.
    :param trace_report_out: passed on to the upload client as `traces_copy_out`.
    :param transfer_timeout: optional transfer time-out; only forwarded when truthy.
    :return: trace_report_out.
    :raises StageOutFailure: if `verify_stage_out` fails or reports that the file does not exist.
    """

    # init. upload client
    from rucio.client.uploadclient import UploadClient
    upload_client = UploadClient(logger=logger)

    # tracing follows the `tracing_rucio` setting defined outside this function
    if hasattr(upload_client, 'tracing'):
        upload_client.tracing = tracing_rucio
    if tracing_rucio:
        upload_client.trace = trace_report

    # file specifications before the upload
    upload_item = {
        'path': fspec.surl or getattr(fspec, 'pfn', None) or os.path.join(fspec.workdir, fspec.lfn),
        'rse': fspec.ddmendpoint,
        'did_scope': fspec.scope,
        'no_register': True,
    }

    if transfer_timeout:
        upload_item['transfer_timeout'] = transfer_timeout

    # if fspec.storageId and int(fspec.storageId) > 0:
    #     if fspec.turl and fspec.is_nondeterministic:
    #         f['pfn'] = fspec.turl
    # elif fspec.lfn and '.root' in fspec.lfn:
    #     f['guid'] = fspec.guid
    if fspec.lfn and '.root' in fspec.lfn:
        upload_item['guid'] = fspec.guid

    # proceed with the upload
    logger.info('_stage_out_api: %s' % str(upload_item))
    result = None

    # upload client raises an exception if any file failed; only the known
    # UnboundLocalError from the client is tolerated here
    try:
        result = upload_client.upload([upload_item],
                                      summary_file_path=summary_file_path,
                                      traces_copy_out=trace_report_out)
    except UnboundLocalError:
        logger.warning('rucio still needs a bug fix of the summary in the uploadclient')

    logger.debug('Rucio upload client returned %s' % result)

    # any failure below, including the explicit one, is reported as StageOutFailure
    try:
        file_exists = verify_stage_out(fspec)
        logger.info('File exists at the storage: %s' % str(file_exists))
        if not file_exists:
            raise StageOutFailure('stageOut: Physical check after upload failed.')
    except Exception as exc:
        msg = 'stageOut: File existence verification failed with: %s' % str(exc)
        logger.info(msg)
        raise StageOutFailure(msg)

    return trace_report_out
Example #4
0
def copy_out_old(files):
    """
    Upload the given files with the lsm-put tool.

    Nothing is returned; a failure is reported through an exception.

    :param files: Files to upload
    :raises PilotException: StageOutFailure
    """

    lsm_available = check_for_lsm(dst_in=False)
    if not lsm_available:
        raise StageOutFailure("No LSM tools found")

    transfer_result = move_all_files_out(files)
    exit_code, stdout, stderr = transfer_result
    if exit_code != 0:
        # the stdout of the failed transfer is used as the error message
        raise StageOutFailure(stdout)
Example #5
0
def copy_out(files, **kwargs):
    """
    Upload the given files one by one with the gfal-copy command.

    For every file the trace report is updated and sent, and the file status
    and status code are set. The first failing transfer raises, so files after
    it are not processed.

    :param files: Files to upload
    :param kwargs: `trace_report` is required; `workdir` is an optional fallback
                   source directory. All keywords are also passed to `execute`.
    :return: the `files` list that was passed in
    :raises: PilotException in case of errors
    """

    if not check_for_gfal():
        raise StageOutFailure("No GFAL2 tools found")

    trace_report = kwargs.get('trace_report')

    for fspec in files:
        trace_report.update(scope=fspec.scope, dataset=fspec.dataset, url=fspec.surl, filesize=fspec.filesize)
        trace_report.update(catStart=time(), filename=fspec.lfn, guid=fspec.guid.replace('-', ''))

        source_dir = fspec.workdir or kwargs.get('workdir') or '.'
        timeout = get_timeout(fspec.filesize)

        local_file = fspec.surl or os.path.join(source_dir, fspec.lfn)
        source = "file://%s" % os.path.abspath(local_file)
        destination = fspec.turl

        # assemble the command line piece by piece
        parts = ['gfal-copy --verbose -f', ' -t %s' % timeout]
        if fspec.checksum:
            first_checksum = list(fspec.checksum.items())[0]  # Python 2/3
            parts.extend(['-K', '%s:%s' % first_checksum])
        parts.extend([source, destination])

        rcode, stdout, stderr = execute(" ".join(parts), **kwargs)

        if not rcode:
            # transfer succeeded: record it and move on to the next file
            fspec.status_code = 0
            fspec.status = 'transferred'
            trace_report.update(clientState='DONE', stateReason='OK', timeEnd=time())
            trace_report.send()
            continue

        # error occurred
        if rcode in (errno.ETIMEDOUT, errno.ETIME):
            error = {'rcode': ErrorCodes.STAGEOUTTIMEOUT,
                     'state': 'CP_TIMEOUT',
                     'error': 'Copy command timed out: %s' % stderr}
        else:
            error = resolve_common_transfer_errors(stdout + stderr, is_stagein=False)

        fspec.status = 'failed'
        fspec.status_code = error.get('rcode')
        trace_report.update(clientState=error.get('state', None) or 'STAGEOUT_ATTEMPT_FAILED',
                            stateReason=error.get('error', 'unknown error'),
                            timeEnd=time())
        trace_report.send()
        raise PilotException(error.get('error'), code=error.get('rcode'), state=error.get('state'))

    return files
Example #6
0
File: mv.py Project: ptrlv/pilot2
def copy_out(files, copy_type="mv", **kwargs):
    """
    Stage out the given files with a plain local `mv` or `cp`.

    After a successful transfer an output list is always written with
    `create_output_list` (for ARC CE), using the parent directory of the work
    directory.

    :param files: list of `FileSpec` objects
    :param copy_type: transfer method, "mv" or "cp" (string)
    :param kwargs: must contain `workdir`
    :return: the `files` list that was passed in
    :raises PilotException: StageOutFailure
    """

    allowed_methods = ("cp", "mv")
    if copy_type not in allowed_methods:
        raise StageOutFailure("Incorrect method for copy out")

    workdir = kwargs.get('workdir')
    if not workdir:
        raise StageOutFailure("Workdir is not specified")

    exit_code, stdout, stderr = move_all_files(files, copy_type, workdir)
    if exit_code != 0:
        # the stdout of the failed transfer is used as the error message
        raise StageOutFailure(stdout)

    # Create output list for ARC CE
    init_dir = os.path.dirname(workdir)
    create_output_list(files, init_dir)

    return files
Example #7
0
    def stageout_es_real(self, output_file):
        """
        Stage out event service output file.

        The transfer is first attempted to the storage resolved for the
        activities ['es_events', 'pw']. If that attempt fails with any error
        other than MISSINGOUTPUTFILE, a second attempt is made to the storage
        resolved for ['es_failover', 'pw'], unless it resolves to the same
        endpoint as the first attempt.

        :param output_file: output file name.
        :return: tuple (ddmendpoint, storage_id, filesize, checksum) of the transferred file.
        :raises PilotException: if the file could not be transferred (StageOutFailure
                                for unexpected errors and for an inconsistent final status).
        """

        job = self.get_job()
        logger.info('prepare to stage-out eventservice files')

        error = None
        file_data = {
            'scope': 'transient',
            'lfn': os.path.basename(output_file),
        }
        file_spec = FileSpec(filetype='output', **file_data)
        xdata = [file_spec]
        kwargs = dict(workdir=job.workdir,
                      cwd=job.workdir,
                      usecontainer=False,
                      job=job)

        # only becomes True once the client exists, i.e. once a failover attempt is possible at all
        try_failover = False
        activity = [
            'es_events', 'pw'
        ]  ## FIX ME LATER: replace `pw` with `write_lan` once AGIS is updated (acopytools)

        try:
            client = StageOutESClient(job.infosys, logger=logger)
            try_failover = True

            client.prepare_destinations(
                xdata, activity
            )  ## IF ES job should be allowed to write only at `es_events` astorages, then fix activity names here
            client.transfer(xdata, activity=activity, **kwargs)
        except PilotException as exc:
            # Python 3 deletes the name bound by `except ... as` when the handler
            # ends, so the exception must be stored in `error` explicitly for the
            # checks further down to see it
            error = exc
            logger.error(exc.get_detail())
        except Exception as exc:
            logger.error(traceback.format_exc())
            error = StageOutFailure("stageOut failed with error=%s" % exc)

        logger.info('Summary of transferred files:')
        logger.info(" -- lfn=%s, status_code=%s, status=%s" %
                    (file_spec.lfn, file_spec.status_code, file_spec.status))

        if error:
            logger.error(
                'Failed to stage-out eventservice file(%s): error=%s' %
                (output_file, error.get_detail()))
        elif file_spec.status != 'transferred':
            # no exception was raised, yet the file is not marked as transferred
            msg = 'Failed to stage-out ES file(%s): logic corrupted: unknown internal error, fspec=%s' % (
                output_file, file_spec)
            logger.error(msg)
            raise StageOutFailure(msg)

        failover_storage_activity = ['es_failover', 'pw']

        if try_failover and error and error.get_error_code() not in [
                ErrorCodes.MISSINGOUTPUTFILE
        ]:  ## try to failover to other storage

            xdata2 = [FileSpec(filetype='output', **file_data)]

            try:
                client.prepare_destinations(xdata2, failover_storage_activity)
                if xdata2[0].ddmendpoint != xdata[
                        0].ddmendpoint:  ## skip transfer to same output storage
                    msg = 'Will try to failover ES transfer to astorage with activity=%s, rse=%s' % (
                        failover_storage_activity, xdata2[0].ddmendpoint)
                    logger.info(msg)
                    # NOTE(review): the transfer is given the primary `activity`, not
                    # `failover_storage_activity` - confirm that this is intended
                    client.transfer(xdata2, activity=activity, **kwargs)

                    logger.info(
                        'Summary of transferred files (failover transfer):')
                    logger.info(" -- lfn=%s, status_code=%s, status=%s" %
                                (xdata2[0].lfn, xdata2[0].status_code,
                                 xdata2[0].status))

            except PilotException as e:
                if e.get_error_code() == ErrorCodes.NOSTORAGE:
                    logger.info(
                        'Failover ES storage is not defined for activity=%s .. skipped'
                        % failover_storage_activity)
                else:
                    logger.error(
                        'Transfer to failover storage=%s failed .. skipped, error=%s'
                        % (xdata2[0].ddmendpoint, e.get_detail()))
            except Exception:
                logger.error('Failover ES stageout failed .. skipped')
                logger.error(traceback.format_exc())

            # a successful failover transfer cancels the error of the first attempt
            if xdata2[0].status == 'transferred':
                error = None
                file_spec = xdata2[0]

        if error:
            raise error

        storage_id = infosys.get_storage_id(file_spec.ddmendpoint)

        return file_spec.ddmendpoint, storage_id, file_spec.filesize, file_spec.checksum
Example #8
0
def containerise_middleware(job,
                            xdata,
                            queue,
                            eventtype,
                            localsite,
                            remotesite,
                            container_options,
                            external_dir,
                            label='stage-in',
                            container_type='container'):
    """
    Run the stage-in/out step through a separate script, optionally inside a container.

    The script command is built with `get_command`. For container_type='container'
    it is wrapped by the `create_middleware_container_command` function of the
    pilot user module selected through the PILOT_USER environment variable
    (default 'generic'); for any other value the script is run as is. If the
    command could be executed, its stdout and stderr are written to files in the
    job work directory. Finally the file statuses and errors reported by the
    script are applied with `handle_updated_job_object`; this is also done when
    executing the command raised an exception, which is only logged.

    Note: this function is tailor made for stage-in/out.

    :param job: job object.
    :param xdata: list of FileSpec objects.
    :param queue: queue name (string).
    :param eventtype: passed on to `get_command`.
    :param localsite: passed on to `get_command`.
    :param remotesite: passed on to `get_command`.
    :param container_options: container options from queuedata (string).
    :param external_dir: input or output files directory (string).
    :param label: optional 'stage-in/out' (String).
    :param container_type: optional 'container/bash'
    :raises StageInFailure: for stage-in failures
    :raises StageOutFailure: for stage-out failures
    :return:
    """

    # not used below; the call is kept as it was
    cwd = getcwd()

    # get the name of the stage-in/out isolation script
    if label == 'stage-in':
        script = config.Container.middleware_container_stagein_script
    else:
        script = config.Container.middleware_container_stageout_script

    # a PilotException from the command builders simply propagates to the caller
    cmd = get_command(job, xdata, queue, script, eventtype, localsite, remotesite, external_dir,
                      label=label, container_type=container_type)

    if container_type == 'container':
        # add bits and pieces needed to run the cmd in a container
        pilot_user = environ.get('PILOT_USER', 'generic').lower()
        module_name = 'pilot.user.%s.container' % pilot_user
        user = __import__(module_name, globals(), locals(), [pilot_user], 0)  # Python 2/3
        cmd = user.create_middleware_container_command(job.workdir, cmd, container_options, label=label)
    else:
        logger.warning('%s will not be done in a container (but it will be done by a script)', label)

    try:
        logger.info('*** executing %s (logging will be redirected) ***', label)
        exit_code, stdout, stderr = execute(cmd, job=job, usecontainer=False)
    except Exception as exc:
        logger.info('*** %s has failed ***', label)
        logger.warning('exception caught: %s', exc)
    else:
        if exit_code == 0:
            logger.info('*** %s has finished ***', label)
        else:
            logger.info('*** %s has failed ***', label)
            logger.warning('stderr:\n%s', stderr)
            logger.warning('stdout:\n%s', stdout)
        logger.debug('%s script returned exit_code=%d', label, exit_code)

        # write stdout+stderr to files
        try:
            _stdout_name, _stderr_name = get_logfile_names(label)
            for log_name, content in ((_stdout_name, stdout), (_stderr_name, stderr)):
                write_file(path.join(job.workdir, log_name), content, mute=False)
        except PilotException as exc:
            msg = 'exception caught: %s' % exc
            failure = StageInFailure if label == 'stage-in' else StageOutFailure
            raise failure(msg)

    # handle errors, file statuses, etc (the stage-in/out scripts write errors and file status to a json file)
    handle_updated_job_object(job, xdata, label=label)
Example #9
0
def _stage_out_api(fspec, summary_file_path, trace_report, trace_report_out,
                   transfer_timeout):
    """
    Upload a single file with the rucio UploadClient and verify that it reached the storage.

    A failing upload is tolerated in two cases: the client raised
    UnboundLocalError (known problem with the summary in the upload client), or
    the first entry of `trace_report_out` carries a 'stateReason' (the returned
    exit code is then -1). Any other upload exception is re-raised. The file
    existence check is always performed after a tolerated failure as well.

    :param fspec: FileSpec object of the file to upload.
    :param summary_file_path: passed on to the upload client as `summary_file_path`.
    :param trace_report: trace object handed to the upload client when `tracing_rucio` is set.
    :param trace_report_out: list passed on to the upload client as `traces_copy_out`.
    :param transfer_timeout: optional transfer time-out; only forwarded when truthy.
    :return: exit code (0, or -1 for a failed upload with a known state reason), trace_report_out.
    :raises StageOutFailure: if `verify_stage_out` fails or reports that the file does not exist.
    :raises Exception: the upload exception, if no state reason was recorded for it.
    """

    ec = 0

    # init. upload client
    from rucio.client.uploadclient import UploadClient
    upload_client = UploadClient(logger=logger)

    # tracing follows the `tracing_rucio` setting defined outside this function
    if hasattr(upload_client, 'tracing'):
        upload_client.tracing = tracing_rucio
    if tracing_rucio:
        upload_client.trace = trace_report

    # file specifications before the upload
    f = {}
    f['path'] = fspec.surl or getattr(fspec, 'pfn', None) or os.path.join(
        fspec.workdir, fspec.lfn)
    f['rse'] = fspec.ddmendpoint
    f['did_scope'] = fspec.scope
    f['no_register'] = True

    if transfer_timeout:
        f['transfer_timeout'] = transfer_timeout
    f['connection_timeout'] = 60 * 60

    # if fspec.storageId and int(fspec.storageId) > 0:
    #     if fspec.turl and fspec.is_nondeterministic:
    #         f['pfn'] = fspec.turl
    # elif fspec.lfn and '.root' in fspec.lfn:
    #     f['guid'] = fspec.guid
    if fspec.lfn and '.root' in fspec.lfn:
        f['guid'] = fspec.guid

    logger.info('rucio API stage-out dictionary: %s' % f)

    # upload client raises an exception if any file failed
    try:
        logger.info('*** rucio API uploading file (taking over logging) ***')
        logger.debug('summary_file_path=%s' % summary_file_path)
        logger.debug('trace_report_out=%s' % trace_report_out)
        result = upload_client.upload([f],
                                      summary_file_path=summary_file_path,
                                      traces_copy_out=trace_report_out)
    except UnboundLocalError:
        # must come before the generic handler: UnboundLocalError is an
        # Exception subclass, so it would never be reached after it
        logger.warning('*** rucio API upload client failed ***')
        logger.warning(
            'rucio still needs a bug fix of the summary in the uploadclient')
    except Exception as e:
        logger.warning('*** rucio API upload client failed ***')
        logger.warning('caught exception: %s' % e)
        import traceback
        logger.error(traceback.format_exc())
        logger.debug('trace_report_out=%s' % trace_report_out)
        # without a recorded state reason the failure cannot be classified: give up
        if not trace_report_out or not trace_report_out[0].get('stateReason'):
            raise
        ec = -1
    else:
        logger.warning('*** rucio API upload client finished ***')
        logger.debug('client returned %s' % result)

    # any failure below, including the explicit one, is reported as StageOutFailure
    try:
        file_exists = verify_stage_out(fspec)
        logger.info('file exists at the storage: %s' % str(file_exists))
        if not file_exists:
            raise StageOutFailure('physical check after upload failed')
    except Exception as e:
        msg = 'file existence verification failed with: %s' % e
        logger.info(msg)
        raise StageOutFailure(msg)

    return ec, trace_report_out