Code Example #1
File: downloadclient.py Project: ahandresf/rucio
    def _split_did_str(self, did_str):
        """
        Splits a given DID string (e.g. 'scope1:name.file') into its scope and name part
        (This function is meant to be used as class internal only)

        :param did_str: the DID string that will be split

        :returns: the scope and name parts of the given DID

        :raises InputValidationError: if the given DID string is not valid
        """
        did = did_str.split(':')
        if len(did) == 2:
            did_scope = did[0]
            did_name = did[1]
        elif len(did) == 1:
            did = did_str.split('.')
            did_scope = did[0]
            if did_scope == 'user' or did_scope == 'group':
                did_scope = '%s.%s' % (did[0], did[1])
            did_name = did_str
        else:
            raise InputValidationError('%s is not a valid DID. Too many colons.' % did_str)

        if did_name.endswith('/'):
            did_name = did_name[:-1]

        return did_scope, did_name
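
A brief usage sketch of the splitting rules (the DIDs are hypothetical, and download_client stands for an initialized DownloadClient instance, so the calls are shown as comments only):

# Hypothetical inputs and the scope/name pairs the method above would return:
# download_client._split_did_str('user.jdoe:data.file.root')
#     -> ('user.jdoe', 'data.file.root')            # explicit scope before the colon
# download_client._split_did_str('user.jdoe.data.file.root')
#     -> ('user.jdoe', 'user.jdoe.data.file.root')  # scope derived from 'user.<account>'
# download_client._split_did_str('mc16:dataset.name/')
#     -> ('mc16', 'dataset.name')                   # trailing '/' stripped from the name
# A string with more than one colon raises InputValidationError.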
Code Example #2
def parse_replicas_metalink(root):
    """
    Transforms the metalink tree into a list of dictionaries where
    each dictionary describes a file with its replicas.
    Will be called by parse_replicas_from_file and parse_replicas_from_string.

    :param root: root node of the metalink tree

    :returns: a list with a dictionary for each file
    """
    files = []

    # metalink namespace
    ns = '{urn:ietf:params:xml:ns:metalink}'
    str_to_bool = {'true': True, 'True': True, 'false': False, 'False': False}

    # loop over all <file> tags of the metalink string
    for file_tag_obj in root.findall(ns + 'file'):
        # search for identity-tag
        identity_tag_obj = file_tag_obj.find(ns + 'identity')
        if not ElementTree.iselement(identity_tag_obj):
            raise InputValidationError(
                'Failed to locate identity-tag inside %s' %
                ElementTree.tostring(file_tag_obj))

        cur_file = {
            'did': identity_tag_obj.text,
            'adler32': None,
            'md5': None,
            'sources': []
        }

        parent_dids = set()
        parent_dids_tag_obj = file_tag_obj.find(ns + 'parents')
        if ElementTree.iselement(parent_dids_tag_obj):
            for did_tag_obj in parent_dids_tag_obj.findall(ns + 'did'):
                parent_dids.add(did_tag_obj.text)
        cur_file['parent_dids'] = parent_dids

        size_tag_obj = file_tag_obj.find(ns + 'size')
        cur_file['bytes'] = int(
            size_tag_obj.text) if ElementTree.iselement(size_tag_obj) else None

        for hash_tag_obj in file_tag_obj.findall(ns + 'hash'):
            hash_type = hash_tag_obj.get('type')
            if hash_type:
                cur_file[hash_type] = hash_tag_obj.text

        for url_tag_obj in file_tag_obj.findall(ns + 'url'):
            key_rename_map = {'location': 'rse'}
            src = {}
            for k, v in url_tag_obj.items():
                k = key_rename_map.get(k, k)
                src[k] = str_to_bool.get(v, v)
            src['pfn'] = url_tag_obj.text
            cur_file['sources'].append(src)

        files.append(cur_file)

    return files
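
The namespace handling above can be exercised in isolation; the metalink snippet below is hypothetical and only mirrors the tags the parser looks for (identity, size, hash, url):

from xml.etree import ElementTree

ns = '{urn:ietf:params:xml:ns:metalink}'
# hypothetical two-source metalink snippet, not real server output
metalink_str = """<metalink xmlns="urn:ietf:params:xml:ns:metalink">
  <file name="file.root">
    <identity>user.jdoe:file.root</identity>
    <size>1048576</size>
    <hash type="adler32">0a1b2c3d</hash>
    <url location="SITE_A" priority="1">root://site-a.example.org//path/file.root</url>
    <url location="SITE_B" priority="2">https://site-b.example.org/path/file.root</url>
  </file>
</metalink>"""

root = ElementTree.fromstring(metalink_str)
for file_tag in root.findall(ns + 'file'):
    print(file_tag.find(ns + 'identity').text)        # user.jdoe:file.root
    print(int(file_tag.find(ns + 'size').text))       # 1048576
    for url_tag in file_tag.findall(ns + 'url'):
        print(url_tag.get('location'), url_tag.text)  # RSE name and PFN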
Code Example #3
File: uploadclient.py Project: davidgcameron/rucio
    def _collect_and_validate_file_info(self, items):
        """
        Checks if there are any inconsistencies within the given input
        options and stores the output of _collect_file_info for every file
        (This function is meant to be used as class internal only)

        :param items: list of dictionaries with all input files and options

        :returns: a list of dictionaries containing all descriptions of the files to upload

        :raises InputValidationError: if an input option has a wrong format
        """
        logger = self.logger
        files = []
        for item in items:
            path = item.get('path')
            pfn = item.get('pfn')
            recursive = item.get('recursive')
            if not path:
                logger(
                    logging.WARNING,
                    'Skipping source entry because the key "path" is missing')
                continue
            if not item.get('rse'):
                logger(logging.WARNING,
                       'Skipping file %s because no rse was given' % path)
                continue
            if pfn:
                item['force_scheme'] = pfn.split(':')[0]
            if os.path.isdir(path) and not recursive:
                dname, subdirs, fnames = next(os.walk(path))
                for fname in fnames:
                    file = self._collect_file_info(os.path.join(dname, fname),
                                                   item)
                    files.append(file)
                if not len(fnames) and not len(subdirs):
                    logger(logging.WARNING,
                           'Skipping %s because it is empty.' % dname)
                elif not len(fnames):
                    logger(
                        logging.WARNING,
                        'Skipping %s because it has no files in it. Subdirectories are not supported.'
                        % dname)
            elif os.path.isdir(path) and recursive:
                files.extend(self._recursive(item))
            elif os.path.isfile(path) and not recursive:
                file = self._collect_file_info(path, item)
                files.append(file)
            elif os.path.isfile(path) and recursive:
                logger(logging.WARNING,
                       'Skipping %s because of --recursive flag' % path)
            else:
                logger(logging.WARNING, 'No such file or directory: %s' % path)

        if not len(files):
            raise InputValidationError('No valid input files given')

        return files
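
A sketch of the item list this method expects (hypothetical paths and RSE name; 'path' and 'rse' are mandatory per the checks above, the remaining keys optional):

# hypothetical input for _collect_and_validate_file_info
items = [
    {'path': '/data/file1.root', 'rse': 'MOCK-RSE'},                    # single file
    {'path': '/data/rundir', 'rse': 'MOCK-RSE', 'recursive': True},     # folder, resolved recursively
    {'path': '/data/file2.root', 'rse': 'MOCK-RSE',
     'pfn': 'root://host.example.org:1094//path/file2.root'},           # pfn forces its scheme
]
# files = upload_client._collect_and_validate_file_info(items)  # assumes an initialized UploadClient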
Code Example #4
    def collect_and_validate_file_info(self, sources_with_settings):
        logger = self.logger
        files = []
        for settings in sources_with_settings:
            path = settings.get('path')
            pfn = settings.get('pfn')
            if not path:
                logger.warning(
                    'Skipping source entry because the key "path" is missing')
                continue
            if not settings.get('rse'):
                logger.warning('Skipping file %s because no rse was given' %
                               path)
                continue
            if pfn:
                if settings.get('no_register'):
                    logger.warning(
                        'Upload with given pfn implies that no_register is True'
                    )
                    settings['no_register'] = True
                scheme = settings.get('scheme')
                pfn_scheme = pfn.split(':')[0]
                if scheme and scheme != pfn_scheme:
                    logger.warning(
                        'PFN scheme (%s) overrides given scheme (%s)' %
                        (pfn_scheme, scheme))
                    scheme = pfn_scheme
                settings['scheme'] = pfn_scheme

            if os.path.isdir(path):
                dname, subdirs, fnames = next(os.walk(path))
                for fname in fnames:
                    file = self.collect_file_info(os.path.join(dname, fname),
                                                  settings)
                    files.append(file)
                if not len(fnames) and not len(subdirs):
                    logger.warning('Skipping %s because it is empty.' % dname)
                elif not len(fnames):
                    logger.warning(
                        'Skipping %s because it has no files in it. Subdirectories are not supported.'
                        % dname)
            elif os.path.isfile(path):
                file = self.collect_file_info(path, settings)
                files.append(file)
            else:
                logger.warning('No such file or directory: %s' % path)

        if not len(files):
            raise InputValidationError('No valid input files given')

        return files
Code Example #5
    def _collect_and_validate_file_info(self, sources_with_settings):
        """
        Checks if there are any inconsistencies within the given input
        options and stores the output of _collect_file_info for every file
        (This function is meant to be used as class internal only)

        :param sources_with_settings: list of dictionaries with all input files and options

        :returns: a list of dictionaries containing all descriptions of the files to upload

        :raises InputValidationError: if an input option has a wrong format
        """
        logger = self.logger
        files = []
        for settings in sources_with_settings:
            path = settings.get('path')
            pfn = settings.get('pfn')
            if not path:
                logger.warning('Skipping source entry because the key "path" is missing')
                continue
            if not settings.get('rse'):
                logger.warning('Skipping file %s because no rse was given' % path)
                continue
            if pfn:
                if settings.get('no_register'):
                    logger.warning('Upload with given pfn implies that no_register is True')
                    settings['no_register'] = True
                settings['force_scheme'] = pfn.split(':')[0]

            if os.path.isdir(path):
                dname, subdirs, fnames = next(os.walk(path))
                for fname in fnames:
                    file = self._collect_file_info(os.path.join(dname, fname), settings)
                    files.append(file)
                if not len(fnames) and not len(subdirs):
                    logger.warning('Skipping %s because it is empty.' % dname)
                elif not len(fnames):
                    logger.warning('Skipping %s because it has no files in it. Subdirectories are not supported.' % dname)
            elif os.path.isfile(path):
                file = self._collect_file_info(path, settings)
                files.append(file)
            else:
                logger.warning('No such file or directory: %s' % path)

        if not len(files):
            raise InputValidationError('No valid input files given')

        return files
Code Example #6
File: uploadclient.py Project: davidgcameron/rucio
    def _recursive(self, item):
        """
        If the --recursive flag is set, it replicates the folder structure recursively into collections.
        A folder can only contain either other folders or files, but not both:
            - If it has folders, the root folder will be a container
            - If it has files, the root folder will be a dataset
            - If it is empty, it does not create anything

        :param item: dictionary containing all descriptions of the files to upload
        """
        files = []
        datasets = []
        containers = []
        attach = []
        scope = item.get('did_scope') if item.get(
            'did_scope') is not None else self.default_file_scope
        rse = item.get('rse')
        path = item.get('path')
        if path[-1] == '/':
            path = path[0:-1]
        i = 0
        path = os.path.abspath(path)
        for root, dirs, fnames in os.walk(path):
            if len(dirs) > 0 and len(fnames) > 0 and i == 0:
                self.logger(
                    logging.ERROR,
                    'A container can only have either collections or files, not both'
                )
                raise InputValidationError('Invalid input folder structure')
            if len(fnames) > 0:
                datasets.append({
                    'scope': scope,
                    'name': root.split('/')[-1],
                    'rse': rse
                })
                self.logger(logging.DEBUG,
                            'Appended dataset with DID %s:%s' % (scope, path))
                for fname in fnames:
                    file = self._collect_file_info(os.path.join(root, fname),
                                                   item)
                    file['dataset_scope'] = scope
                    file['dataset_name'] = root.split('/')[-1]
                    files.append(file)
                    self.logger(
                        logging.DEBUG,
                        'Appended file with DID %s:%s' % (scope, fname))
            elif len(dirs) > 0:
                containers.append({
                    'scope': scope,
                    'name': root.split('/')[-1]
                })
                self.logger(
                    logging.DEBUG,
                    'Appended container with DID %s:%s' % (scope, path))
                attach.extend([{
                    'scope': scope,
                    'name': root.split('/')[-1],
                    'rse': rse,
                    'dids': {
                        'scope': scope,
                        'name': dir_
                    }
                } for dir_ in dirs])
            elif len(dirs) == 0 and len(fnames) == 0:
                self.logger(logging.WARNING,
                            'The folder %s is empty, skipping' % root)
                continue
            i += 1
        # if everything went ok, replicate the folder structure in Rucio storage
        for dataset in datasets:
            try:
                self.client.add_dataset(scope=dataset['scope'],
                                        name=dataset['name'],
                                        rse=dataset['rse'])
                self.logger(
                    logging.INFO, 'Created dataset with DID %s:%s' %
                    (dataset['scope'], dataset['name']))
            except RucioException as error:
                self.logger(logging.ERROR, error)
                self.logger(
                    logging.ERROR,
                    'It was not possible to create dataset with DID %s:%s' %
                    (dataset['scope'], dataset['name']))
        for container in containers:
            try:
                self.client.add_container(scope=container['scope'],
                                          name=container['name'])
                self.logger(
                    logging.INFO, 'Created container with DID %s:%s' %
                    (container['scope'], container['name']))
            except RucioException as error:
                self.logger(logging.ERROR, error)
                self.logger(
                    logging.ERROR,
                    'It was not possible to create container with DID %s:%s' %
                    (container['scope'], container['name']))
        for att in attach:
            try:
                self.client.attach_dids(scope=att['scope'],
                                        name=att['name'],
                                        dids=[att['dids']])
                self.logger(
                    logging.INFO, 'DIDs attached to collection %s:%s' %
                    (att['scope'], att['name']))
            except RucioException as error:
                self.logger(logging.ERROR, error)
                self.logger(
                    logging.ERROR,
                    'It was not possible to attach to collection with DID %s:%s'
                    % (att['scope'], att['name']))
        return files
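
The folder layout accepted by this method can be pictured as follows (hypothetical paths; mixing files and subdirectories at the same level raises InputValidationError):

# hypothetical directory tree and the collections _recursive would derive from it
#
#   /data/topdir/              -> container 'topdir'  (contains only subdirectories)
#   /data/topdir/run01/        -> dataset 'run01'     (contains only files), attached to 'topdir'
#   /data/topdir/run01/f1.root -> file attached to dataset 'run01'
#   /data/topdir/run02/        -> dataset 'run02', attached to 'topdir'
#   /data/topdir/run02/f2.root -> file attached to dataset 'run02'
#   /data/topdir/empty/        -> skipped with a warning
#
item = {'path': '/data/topdir', 'rse': 'MOCK-RSE', 'did_scope': 'user.jdoe', 'recursive': True}
# files = upload_client._recursive(item)  # assumes an initialized UploadClient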
Code Example #7
File: uploadclient.py Project: davidgcameron/rucio
    def upload(self, items, summary_file_path=None, traces_copy_out=None):
        """
        :param items: List of dictionaries. Each dictionary describing a file to upload. Keys:
            path                  - path of the file that will be uploaded
            rse                   - rse expression/name (e.g. 'CERN-PROD_DATADISK') where to upload the file
            did_scope             - Optional: custom did scope (Default: user.<account>)
            did_name              - Optional: custom did name (Default: name of the file)
            dataset_scope         - Optional: custom dataset scope
            dataset_name          - Optional: custom dataset name
            force_scheme          - Optional: force a specific scheme (if PFN upload this will be overwritten) (Default: None)
            pfn                   - Optional: use a given PFN (this sets no_register to True, and no_register becomes mandatory)
            no_register           - Optional: if True, the file will not be registered in the rucio catalogue
            register_after_upload - Optional: if True, the file will be registered after successful upload
            lifetime              - Optional: the lifetime of the file after it was uploaded
            transfer_timeout      - Optional: time after the upload will be aborted
            guid                  - Optional: guid of the file
            recursive             - Optional: if set, parses the folder structure recursively into collections
        :param summary_file_path: Optional: a path where a summary in form of a json file will be stored
        :param traces_copy_out: reference to an external list, where the traces should be uploaded

        :returns: 0 on success

        :raises InputValidationError: if any input arguments are in a wrong format
        :raises RSEWriteBlocked: if a given RSE is not available for writing
        :raises NoFilesUploaded: if no files were successfully uploaded
        :raises NotAllFilesUploaded: if not all files were successfully uploaded
        """

        # helper to get rse from rse_expression:
        def _pick_random_rse(rse_expression):
            rses = [r['rse'] for r in self.client.list_rses(rse_expression)]  # can raise InvalidRSEExpression
            random.shuffle(rses)
            return rses[0]

        logger = self.logger
        self.trace['uuid'] = generate_uuid()

        # check given sources, resolve dirs into files, and collect meta infos
        files = self._collect_and_validate_file_info(items)
        logger(
            logging.DEBUG,
            'Num. of files that upload client is processing: {}'.format(
                len(files)))

        # check if RSE of every file is available for writing
        # and cache rse settings
        registered_dataset_dids = set()
        registered_file_dids = set()
        rse_expression = None
        for file in files:
            rse_expression = file['rse']
            rse = self.rse_expressions.setdefault(
                rse_expression, _pick_random_rse(rse_expression))

            if not self.rses.get(rse):
                rse_settings = self.rses.setdefault(
                    rse, rsemgr.get_rse_info(rse, vo=self.client.vo))
                if rse_settings['availability_write'] != 1:
                    raise RSEWriteBlocked(
                        '%s is not available for writing. No actions have been taken'
                        % rse)

            dataset_scope = file.get('dataset_scope')
            dataset_name = file.get('dataset_name')
            file['rse'] = rse
            if dataset_scope and dataset_name:
                dataset_did_str = ('%s:%s' % (dataset_scope, dataset_name))
                file['dataset_did_str'] = dataset_did_str
                registered_dataset_dids.add(dataset_did_str)

            registered_file_dids.add('%s:%s' %
                                     (file['did_scope'], file['did_name']))
        wrong_dids = registered_file_dids.intersection(registered_dataset_dids)
        if len(wrong_dids):
            raise InputValidationError(
                'DIDs used to address both files and datasets: %s' %
                str(wrong_dids))
        logger(logging.DEBUG, 'Input validation done.')

        # clear this set again to ensure that we only try to register datasets once
        registered_dataset_dids = set()
        num_succeeded = 0
        summary = []
        for file in files:
            basename = file['basename']
            logger(logging.INFO, 'Preparing upload for file %s' % basename)

            no_register = file.get('no_register')
            register_after_upload = file.get(
                'register_after_upload') and not no_register
            pfn = file.get('pfn')
            force_scheme = file.get('force_scheme')
            delete_existing = False

            trace = copy.deepcopy(self.trace)
            # appending trace to list reference, if the reference exists
            if traces_copy_out is not None:
                traces_copy_out.append(trace)

            rse = file['rse']
            trace['scope'] = file['did_scope']
            trace['datasetScope'] = file.get('dataset_scope', '')
            trace['dataset'] = file.get('dataset_name', '')
            trace['remoteSite'] = rse
            trace['filesize'] = file['bytes']

            file_did = {'scope': file['did_scope'], 'name': file['did_name']}
            dataset_did_str = file.get('dataset_did_str')
            rse_settings = self.rses[rse]
            rse_sign_service = rse_settings.get('sign_url', None)
            is_deterministic = rse_settings.get('deterministic', True)
            if not is_deterministic and not pfn:
                logger(logging.ERROR,
                       'PFN has to be defined for NON-DETERMINISTIC RSE.')
                continue
            if pfn and is_deterministic:
                logger(
                    logging.WARNING,
                    'Upload with given pfn implies that no_register is True, except for non-deterministic RSEs'
                )
                no_register = True

            # resolving local area networks
            domain = 'wan'
            rse_attributes = {}
            try:
                rse_attributes = self.client.list_rse_attributes(rse)
            except Exception:
                logger(logging.WARNING,
                       'Attributes of the RSE: %s not available.' % rse)
            if (self.client_location and 'lan' in rse_settings['domain']
                    and 'site' in rse_attributes):
                if self.client_location['site'] == rse_attributes['site']:
                    domain = 'lan'
            logger(logging.DEBUG,
                   '{} domain is used for the upload'.format(domain))

            if not no_register and not register_after_upload:
                self._register_file(file, registered_dataset_dids)
            # if register_after_upload, file should be overwritten if it is not registered
            # otherwise if file already exists on RSE we're done
            if register_after_upload:
                if rsemgr.exists(rse_settings,
                                 pfn if pfn else file_did,
                                 domain=domain,
                                 auth_token=self.auth_token,
                                 logger=logger):
                    try:
                        self.client.get_did(file['did_scope'],
                                            file['did_name'])
                        logger(logging.INFO,
                               'File already registered. Skipping upload.')
                        trace['stateReason'] = 'File already exists'
                        continue
                    except DataIdentifierNotFound:
                        logger(
                            logging.INFO,
                            'File already exists on RSE. Previous left overs will be overwritten.'
                        )
                        delete_existing = True
            elif not is_deterministic and not no_register:
                if rsemgr.exists(rse_settings,
                                 pfn,
                                 domain=domain,
                                 auth_token=self.auth_token,
                                 logger=logger):
                    logger(
                        logging.INFO,
                        'File already exists on RSE with given pfn. Skipping upload. Existing replica has to be removed first.'
                    )
                    trace['stateReason'] = 'File already exists'
                    continue
                elif rsemgr.exists(rse_settings,
                                   file_did,
                                   domain=domain,
                                   auth_token=self.auth_token,
                                   logger=logger):
                    logger(
                        logging.INFO,
                        'File already exists on RSE with different pfn. Skipping upload.'
                    )
                    trace['stateReason'] = 'File already exists'
                    continue
            else:
                if rsemgr.exists(rse_settings,
                                 pfn if pfn else file_did,
                                 domain=domain,
                                 auth_token=self.auth_token,
                                 logger=logger):
                    logger(logging.INFO,
                           'File already exists on RSE. Skipping upload')
                    trace['stateReason'] = 'File already exists'
                    continue

            # protocol handling and upload
            protocols = rsemgr.get_protocols_ordered(rse_settings=rse_settings,
                                                     operation='write',
                                                     scheme=force_scheme,
                                                     domain=domain)
            protocols.reverse()
            success = False
            state_reason = ''
            logger(logging.DEBUG, str(protocols))
            while not success and len(protocols):
                protocol = protocols.pop()
                cur_scheme = protocol['scheme']
                logger(logging.INFO,
                       'Trying upload with %s to %s' % (cur_scheme, rse))
                lfn = {}
                lfn['filename'] = basename
                lfn['scope'] = file['did_scope']
                lfn['name'] = file['did_name']

                for checksum_name in GLOBALLY_SUPPORTED_CHECKSUMS:
                    if checksum_name in file:
                        lfn[checksum_name] = file[checksum_name]

                lfn['filesize'] = file['bytes']

                sign_service = None
                if cur_scheme == 'https':
                    sign_service = rse_sign_service

                trace['protocol'] = cur_scheme
                trace['transferStart'] = time.time()
                logger(logging.DEBUG,
                       'Processing upload with the domain: {}'.format(domain))
                try:
                    pfn = self._upload_item(
                        rse_settings=rse_settings,
                        rse_attributes=rse_attributes,
                        lfn=lfn,
                        source_dir=file['dirname'],
                        domain=domain,
                        force_scheme=cur_scheme,
                        force_pfn=pfn,
                        transfer_timeout=file.get('transfer_timeout'),
                        delete_existing=delete_existing,
                        sign_service=sign_service)
                    logger(logging.DEBUG, 'Upload done.')
                    success = True
                    file['upload_result'] = {
                        0: True,
                        1: None,
                        'success': True,
                        'pfn': pfn
                    }  # needs to be removed
                except (ServiceUnavailable, ResourceTemporaryUnavailable,
                        RSEOperationNotSupported, RucioException) as error:
                    logger(logging.WARNING, 'Upload attempt failed')
                    logger(logging.INFO,
                           'Exception: %s' % str(error),
                           exc_info=True)
                    state_reason = str(error)

            if success:
                num_succeeded += 1
                trace['transferEnd'] = time.time()
                trace['clientState'] = 'DONE'
                file['state'] = 'A'
                logger(logging.INFO,
                       'Successfully uploaded file %s' % basename)
                self._send_trace(trace)

                if summary_file_path:
                    summary.append(copy.deepcopy(file))

                if not no_register:
                    if register_after_upload:
                        self._register_file(file, registered_dataset_dids)
                    replica_for_api = self._convert_file_for_api(file)
                    if not self.client.update_replicas_states(
                            rse, files=[replica_for_api]):
                        logger(logging.WARNING,
                               'Failed to update replica state')

                # add file to dataset if needed
                if dataset_did_str and not no_register:
                    try:
                        self.client.attach_dids(file['dataset_scope'],
                                                file['dataset_name'],
                                                [file_did])
                    except Exception as error:
                        logger(logging.WARNING,
                               'Failed to attach file to the dataset')
                        logger(logging.DEBUG,
                               'Attaching to dataset {}'.format(str(error)))
            else:
                trace['clientState'] = 'FAILED'
                trace['stateReason'] = state_reason
                self._send_trace(trace)
                logger(logging.ERROR, 'Failed to upload file %s' % basename)

        if summary_file_path:
            logger(logging.DEBUG,
                   'Summary will be available at {}'.format(summary_file_path))
            final_summary = {}
            for file in summary:
                file_scope = file['did_scope']
                file_name = file['did_name']
                file_did_str = '%s:%s' % (file_scope, file_name)
                final_summary[file_did_str] = {
                    'scope': file_scope,
                    'name': file_name,
                    'bytes': file['bytes'],
                    'rse': file['rse'],
                    'pfn': file['upload_result'].get('pfn', ''),
                    'guid': file['meta']['guid']
                }

                for checksum_name in GLOBALLY_SUPPORTED_CHECKSUMS:
                    if checksum_name in file:
                        final_summary[file_did_str][checksum_name] = file[
                            checksum_name]

            with open(summary_file_path, 'w') as summary_file:
                json.dump(final_summary,
                          summary_file,
                          sort_keys=True,
                          indent=1)

        if num_succeeded == 0:
            raise NoFilesUploaded()
        elif num_succeeded != len(files):
            raise NotAllFilesUploaded()
        return 0
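
A call sketch for this method (hypothetical scope, dataset and RSE names; upload_client stands for an initialized UploadClient):

# hypothetical upload of one file into a dataset, registered only after a successful transfer
items = [{
    'path': '/data/file1.root',
    'rse': 'MOCK-RSE_DATADISK',
    'did_scope': 'user.jdoe',
    'dataset_scope': 'user.jdoe',
    'dataset_name': 'user.jdoe.test.dataset',
    'register_after_upload': True,
    'transfer_timeout': 360,
}]
# upload_client.upload(items, summary_file_path='upload_summary.json')
# raises NoFilesUploaded / NotAllFilesUploaded if some or all transfers fail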
Code Example #8
File: downloadclient.py Project: ahandresf/rucio
    def _prepare_items_for_download(self, items):
        """
        Resolves wildcarded DIDs, get DID details (e.g. type), and collects
        the available replicas for each DID
        (This function is meant to be used as class internal only)

        :param items: list of dictionaries containing the items to prepare

        :returns: list of dictionaries, one dict for each file to download

        :raises InputValidationError: if the given input is not valid or incomplete
        """
        logger = self.logger

        logger.info('Processing %d item(s) for input' % len(items))
        resolved_items = []
        # resolve input: extend rse expression, resolve wildcards, get did type
        for item in items:
            did_str = item.get('did')
            if not did_str:
                raise InputValidationError('The key did is mandatory')

            logger.debug('Processing item %s' % did_str)

            new_item = copy.deepcopy(item)

            # extend RSE expression to exclude tape RSEs for non-admin accounts
            if not self.is_admin:
                rse = new_item.get('rse')
                new_item['rse'] = 'istape=False' if not rse else '(%s)&istape=False' % rse
                logger.debug('RSE-Expression: %s' % new_item['rse'])

            # resolve any wildcards in the input dids
            did_scope, did_name = self._split_did_str(did_str)
            logger.debug('Split DID: %s:%s' % (did_scope, did_name))
            new_item['scope'] = did_scope
            if '*' in did_name:
                logger.debug('Resolving wildcarded DID %s' % did_str)
                for dids in self.client.list_dids(did_scope, filters={'name': did_name}, type='all', long=True):
                    logger.debug('%s - %s:%s' % (dids['did_type'], did_scope, dids['name']))
                    new_item['type'] = dids['did_type'].upper()
                    new_item['name'] = dids['name']
                    new_item['did'] = '%s:%s' % (did_scope, dids['name'])
                    # append a copy, since new_item is mutated again on the next wildcard match
                    resolved_items.append(copy.deepcopy(new_item))
            else:
                new_item['type'] = self.client.get_did(did_scope, did_name)['type'].upper()
                new_item['name'] = did_name
                resolved_items.append(new_item)

        # this list will have one dict for each file to download
        file_items = []

        # get replicas for every file of the given dids
        logger.debug('%d DIDs after processing input' % len(resolved_items))
        for item in resolved_items:
            did_scope = item['scope']
            did_name = item['name']
            did_str = item['did']

            logger.debug('Processing: %s' % item)

            # since we are using metalink we need to explicitly
            # give all schemes (probably due to a bad server site implementation)
            force_scheme = item.get('force_scheme')
            if force_scheme:
                schemes = force_scheme if isinstance(force_scheme, list) else [force_scheme]
            else:
                schemes = ['davs', 'gsiftp', 'https', 'root', 'srm', 'file']

            # get PFNs of files and datasets
            metalink_str = self.client.list_replicas([{'scope': did_scope, 'name': did_name}],
                                                     schemes=schemes,
                                                     rse_expression=item.get('rse'),
                                                     client_location=self.client_location,
                                                     metalink=True)
            files_with_pfns = self._parse_list_replica_metalink(metalink_str)

            nrandom = item.get('nrandom')
            if nrandom:
                logger.info('Selecting %d random replicas from dataset %s' % (nrandom, did_str))
                random.shuffle(files_with_pfns)
                files_with_pfns = files_with_pfns[0:nrandom]

            for file_item in files_with_pfns:
                file_did_scope = file_item['scope']
                file_did_name = file_item['name']
                file_did_str = '%s:%s' % (file_did_scope, file_did_name)

                logger.debug('Queueing file: %s' % file_did_str)

                # put the input options from item into the file item
                file_item.update(item)

                dest_dir_name = file_did_scope
                if item['type'] != 'FILE':
                    # if the did is a dataset, scope and name were updated wrongly
                    file_item['scope'] = file_did_scope
                    file_item['name'] = file_did_name
                    file_item['did'] = file_did_str
                    file_item['dataset_scope'] = did_scope
                    file_item['dataset_name'] = did_name
                    dest_dir_name = did_name

                dest_dir_path = self._prepare_dest_dir(item.get('base_dir', '.'),
                                                       dest_dir_name, file_did_name,
                                                       item.get('no_subdir'))
                file_item['dest_dir_path'] = dest_dir_path
                dest_file_path = os.path.join(dest_dir_path, file_did_name)
                file_item['dest_file_path'] = dest_file_path
                file_item['temp_file_path'] = dest_file_path + '.part'

                file_items.append(file_item)

        return file_items
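
An input sketch for this method (hypothetical DIDs and RSE; only 'did' is mandatory, and wildcards in the name part are resolved via list_dids):

# hypothetical input for _prepare_items_for_download
items = [
    {'did': 'user.jdoe:user.jdoe.test.dataset', 'rse': 'MOCK-RSE', 'nrandom': 5},
    {'did': 'user.jdoe:file.*.root', 'base_dir': '/tmp/downloads', 'no_subdir': True},
]
# file_items = download_client._prepare_items_for_download(items)  # assumes an initialized DownloadClient
# each returned dict carries scope/name, the resolved sources and dest_dir_path/dest_file_path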
Code Example #9
File: downloadclient.py Project: ahandresf/rucio
    def download_file_from_archive(self, items, trace_custom_fields={}):
        """
        Download files from archives. This function can only download files, not datasets.

        :param items: List of dictionaries. Each dictionary describing a file to download. Keys:
            did                 - DID string of the archive file (e.g. 'scope:file.name'). Wildcards are not allowed
            archive             - DID string of the archive from which the file should be extracted
            rse                 - Optional: rse name (e.g. 'CERN-PROD_DATADISK'). RSE Expressions are allowed
            base_dir            - Optional: Base directory where the downloaded files will be stored. (Default: '.')
            no_subdir           - Optional: If true, files are written directly into base_dir and existing files are overwritten. (Default: False)
        :param trace_custom_fields: Custom key value pairs to send with the traces

        :returns: a list of dictionaries with an entry for each file, containing the input options, the did, and the clientState
                  clientState can be one of the following: ALREADY_DONE, DONE, FILE_NOT_FOUND, FAIL_VALIDATE, FAILED

        :raises InputValidationError: if one of the input items is in the wrong format
        :raises NoFilesDownloaded: if no files could be downloaded
        :raises NotAllFilesDownloaded: if not all files could be downloaded
        :raises SourceNotFound: if xrdcp was unable to find the PFN
        :raises ServiceUnavailable: if xrdcp failed
        :raises RucioException: if something unexpected went wrong during the download
        """
        logger = self.logger
        trace = copy.deepcopy(self.trace_tpl)
        trace['uuid'] = generate_uuid()
        log_prefix = 'Extracting files: '

        logger.info('Processing %d item(s) for input' % len(items))
        for item in items:
            archive = item.get('archive')
            file_extract = item.get('did')
            rse_name = item.get('rse')
            if not archive or not file_extract:
                raise InputValidationError('File DID and archive DID are mandatory')
            if '*' in archive:
                logger.debug(archive)
                raise InputValidationError('Cannot use PFN download with wildcard in DID')

            file_extract_scope, file_extract_name = self._split_did_str(file_extract)
            archive_scope, archive_name = self._split_did_str(archive)

            # listing all available replicas of the given archive file
            rse_expression = 'istape=False' if not rse_name else '(%s)&istape=False' % rse_name
            archive_replicas = self.client.list_replicas([{'scope': archive_scope, 'name': archive_name}],
                                                         schemes=['root'],
                                                         rse_expression=rse_expression,
                                                         unavailable=False,
                                                         client_location=self.client_location)

            # preparing trace
            trace['scope'] = archive_scope
            trace['dataset'] = archive_name
            trace['filename'] = file_extract

            # preparing output directories
            dest_dir_path = self._prepare_dest_dir(item.get('base_dir', '.'),
                                                   os.path.join(archive_scope, archive_name + '.extracted'), file_extract,
                                                   item.get('no_subdir'))
            logger.debug('%sPreparing output destination %s' % (log_prefix, dest_dir_path))

            # validation and customisation of list of replicas
            archive_replicas = list(archive_replicas)
            if len(archive_replicas) != 1:
                raise RucioException('No replicas for DID found or dataset was given.')
            # materialise as list so that pop() works (dict views have no pop())
            archive_pfns = list(archive_replicas[0]['pfns'].keys())
            if len(archive_pfns) == 0:
                raise InputValidationError('No PFNs for replicas of archive %s' % archive)

            # checking whether file already exists
            success = False
            dest_file_path = os.path.join(dest_dir_path, file_extract)
            if os.path.isfile(dest_file_path):
                logger.info('%s%s File exists already locally: %s' % (log_prefix, file_extract_name, dest_dir_path))
                trace['clientState'] = 'ALREADY_DONE'
                trace['transferStart'] = time.time()
                trace['transferEnd'] = time.time()
                send_trace(trace, self.client.host, self.client.user_agent)
                success = True

            # DOWNLOAD, iteration over different rses until success
            retry_counter = 0
            while not success and len(archive_pfns):
                retry_counter += 1
                pfn = archive_pfns.pop()
                trace['rse'] = archive_replicas[0]['pfns'][pfn]['rse']
                try:
                    start_time = time.time()
                    cmd = 'xrdcp -vf %s -z %s file://%s' % (pfn, file_extract_name, dest_dir_path)
                    logger.debug('%sExecuting: %s' % (log_prefix, cmd))
                    status, out, err = execute(cmd)
                    end_time = time.time()
                    trace['transferStart'] = start_time
                    trace['transferEnd'] = end_time
                    if status == 54:
                        trace['clientState'] = 'FAILED'
                        raise SourceNotFound(err)
                    elif status != 0:
                        trace['clientState'] = 'FAILED'
                        raise RucioException(err)
                    else:
                        success = True
                        item['clientState'] = 'DONE'
                        trace['clientState'] = 'DONE'
                except Exception as e:
                    trace['clientState'] = 'FAILED'
                    raise ServiceUnavailable(e)
                send_trace(trace, self.client.host, self.client.user_agent)
            if not success:
                raise RucioException('Failed to download file %s after %d retries' % (file_extract_name, retry_counter))
        return self._check_output(items)
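
An input sketch for the archive extraction above (hypothetical DIDs; 'did' and 'archive' are both mandatory and wildcards are rejected):

# hypothetical request to extract one file from an archive replica via xrdcp
items = [{
    'did': 'user.jdoe:payload.root',      # file inside the archive
    'archive': 'user.jdoe:archive.zip',   # archive that contains it
    'rse': 'MOCK-RSE',
    'base_dir': '/tmp/extracted',
}]
# result = download_client.download_file_from_archive(items)  # assumes an initialized DownloadClient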
Code Example #10
File: downloadclient.py Project: ahandresf/rucio
    def download_pfns(self, items, num_threads=2, trace_custom_fields={}):
        """
        Download items with a given PFN. This function can only download files, not datasets.

        :param items: List of dictionaries. Each dictionary describing a file to download. Keys:
            pfn                 - PFN string of this file
            did                 - DID string of this file (e.g. 'scope:file.name'). Wildcards are not allowed
            rse                 - rse name (e.g. 'CERN-PROD_DATADISK'). RSE Expressions are not allowed
            base_dir            - Optional: Base directory where the downloaded files will be stored. (Default: '.')
            no_subdir           - Optional: If true, files are written directly into base_dir and existing files are overwritten. (Default: False)
            ignore_checksum     - Optional: If true, the checksum validation is skipped (for pfn downloads the checksum must be given explicitly). (Default: True)
            transfer_timeout    - Optional: Timeout time for the download protocols. (Default: None)
        :param num_threads: Suggestion of number of threads to use for the download. It will be lowered if it's too high.
        :param trace_custom_fields: Custom key value pairs to send with the traces

        :returns: a list of dictionaries with an entry for each file, containing the input options, the did, and the clientState
                  clientState can be one of the following: ALREADY_DONE, DONE, FILE_NOT_FOUND, FAIL_VALIDATE, FAILED

        :raises InputValidationError: if one of the input items is in the wrong format
        :raises NoFilesDownloaded: if no files could be downloaded
        :raises NotAllFilesDownloaded: if not all files could be downloaded
        :raises RucioException: if something unexpected went wrong during the download
        """
        logger = self.logger
        trace_custom_fields['uuid'] = generate_uuid()

        logger.info('Processing %d item(s) for input' % len(items))
        input_items = []
        for item in items:
            did_str = item.get('did')
            pfn = item.get('pfn')
            rse = item.get('rse')

            if not did_str or not pfn or not rse:
                logger.debug(item)
                raise InputValidationError('The keys did, pfn, and rse are mandatory')

            logger.debug('Preparing PFN download of %s (%s) from %s' % (did_str, pfn, rse))

            if '*' in did_str:
                logger.debug(did_str)
                raise InputValidationError('Cannot use PFN download with wildcard in DID')

            did_scope, did_name = self._split_did_str(did_str)
            dest_dir_path = self._prepare_dest_dir(item.get('base_dir', '.'),
                                                   did_scope, did_name,
                                                   item.get('no_subdir'))

            item['scope'] = did_scope
            item['name'] = did_name
            item['sources'] = [{'pfn': pfn, 'rse': rse}]
            dest_file_path = os.path.join(dest_dir_path, did_name)
            item['dest_dir_path'] = dest_dir_path
            item['dest_file_path'] = dest_file_path
            item['temp_file_path'] = dest_file_path + '.part'
            item.setdefault('ignore_checksum', True)

            input_items.append(item)

        num_files_in = len(input_items)
        output_items = self._download_multithreaded(input_items, num_threads, trace_custom_fields)
        num_files_out = len(output_items)

        if num_files_in != num_files_out:
            raise RucioException('%d items were in the input queue but only %d are in the output queue' % (num_files_in, num_files_out))

        return self._check_output(output_items)
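
An input sketch for PFN downloads (hypothetical DID, PFN and RSE; all three keys are mandatory, and ignore_checksum defaults to True for PFN downloads):

# hypothetical single-file PFN download
items = [{
    'did': 'user.jdoe:file1.root',
    'pfn': 'root://host.example.org:1094//path/user.jdoe/file1.root',
    'rse': 'MOCK-RSE',
    'base_dir': '/tmp/downloads',
}]
# result = download_client.download_pfns(items, num_threads=2)  # assumes an initialized DownloadClient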
Code Example #11
    def upload(self, items, summary_file_path=None):
        """

        :param items: List of dictionaries. Each dictionary describing a file to upload. Keys:
            path             - path of the file that will be uploaded
            rse              - rse name (e.g. 'CERN-PROD_DATADISK') where to upload the file
            did_scope        - Optional: custom did scope (Default: user.<account>)
            did_name         - Optional: custom did name (Default: name of the file)
            dataset_scope    - Optional: custom dataset scope
            dataset_name     - Optional: custom dataset name
            force_scheme     - Optional: force a specific scheme (if PFN upload this will be overwritten) (Default: None)
            pfn              - Optional: use a given PFN (this sets no_register to True)
            no_register      - Optional: if True, the file will not be registered in the rucio catalogue
            lifetime         - Optional: the lifetime of the file after it was uploaded
            transfer_timeout - Optional: time after the upload will be aborted
            guid             - Optional: guid of the file
        :param summary_file_path: Optional: a path where a summary in form of a json file will be stored

        :returns: 0 on success

        :raises InputValidationError: if any input arguments are in a wrong format
        :raises RSEBlacklisted: if a given RSE is not available for writing
        :raises NoFilesUploaded: if no files were successfully uploaded
        :raises NotAllFilesUploaded: if not all files were successfully uploaded
        """
        logger = self.logger

        self.trace['uuid'] = generate_uuid()

        # check given sources, resolve dirs into files, and collect meta infos
        files = self._collect_and_validate_file_info(items)

        # check if RSE of every file is available for writing
        # and cache rse settings
        registered_dataset_dids = set()
        registered_file_dids = set()
        for file in files:
            rse = file['rse']
            if not self.rses.get(rse):
                rse_settings = self.rses.setdefault(rse,
                                                    rsemgr.get_rse_info(rse))
                if rse_settings['availability_write'] != 1:
                    raise RSEBlacklisted(
                        '%s is blacklisted for writing. No actions have been taken'
                        % rse)

            dataset_scope = file.get('dataset_scope')
            dataset_name = file.get('dataset_name')
            if dataset_scope and dataset_name:
                dataset_did_str = ('%s:%s' % (dataset_scope, dataset_name))
                file['dataset_did_str'] = dataset_did_str
                registered_dataset_dids.add(dataset_did_str)

            registered_file_dids.add('%s:%s' %
                                     (file['did_scope'], file['did_name']))

        wrong_dids = registered_file_dids.intersection(registered_dataset_dids)
        if len(wrong_dids):
            raise InputValidationError(
                'DIDs used to address both files and datasets: %s' %
                str(wrong_dids))

        # clear this set again to ensure that we only try to register datasets once
        registered_dataset_dids = set()
        num_succeeded = 0
        summary = []
        for file in files:
            basename = file['basename']
            logger.info('Preparing upload for file %s' % basename)

            no_register = file.get('no_register')
            pfn = file.get('pfn')
            force_scheme = file.get('force_scheme')

            rse = file['rse']
            self.trace['scope'] = file['did_scope']
            self.trace['datasetScope'] = file.get('dataset_scope', '')
            self.trace['dataset'] = file.get('dataset_name', '')
            self.trace['remoteSite'] = rse
            self.trace['filesize'] = file['bytes']

            file_did = {'scope': file['did_scope'], 'name': file['did_name']}
            dataset_did_str = file.get('dataset_did_str')

            if not no_register:
                self._register_file(file, registered_dataset_dids)

            rse_settings = self.rses[rse]
            # if file already exists on RSE we're done
            if rsemgr.exists(rse_settings, file_did):
                logger.info('File already exists on RSE. Skipping upload')
                continue

            protocols = rsemgr.get_protocols_ordered(rse_settings=rse_settings,
                                                     operation='write',
                                                     scheme=force_scheme)
            protocols.reverse()
            success = False
            while not success and len(protocols):
                protocol = protocols.pop()
                cur_scheme = protocol['scheme']
                logger.info('Trying upload with %s to %s' % (cur_scheme, rse))
                lfn = {}
                lfn['filename'] = basename
                lfn['scope'] = file['did_scope']
                lfn['name'] = file['did_name']
                lfn['adler32'] = file['adler32']
                lfn['filesize'] = file['bytes']

                self.trace['protocol'] = cur_scheme
                self.trace['transferStart'] = time.time()
                try:
                    state = rsemgr.upload(
                        rse_settings=rse_settings,
                        lfns=lfn,
                        source_dir=file['dirname'],
                        force_scheme=cur_scheme,
                        force_pfn=pfn,
                        transfer_timeout=file.get('transfer_timeout'))
                    success = True
                    file['upload_result'] = state
                except (ServiceUnavailable,
                        ResourceTemporaryUnavailable) as error:
                    logger.warning('Upload attempt failed')
                    logger.debug('Exception: %s' % str(error))

            if success:
                num_succeeded += 1
                self.trace['transferEnd'] = time.time()
                self.trace['clientState'] = 'DONE'
                file['state'] = 'A'
                logger.info('Successfully uploaded file %s' % basename)
                send_trace(self.trace, self.client.host,
                           self.client.user_agent)

                if summary_file_path:
                    summary.append(copy.deepcopy(file))

                # add file to dataset if needed
                if dataset_did_str and not no_register:
                    try:
                        self.client.attach_dids(file['dataset_scope'],
                                                file['dataset_name'],
                                                [file_did])
                    except Exception as error:
                        logger.warning('Failed to attach file to the dataset')
                        logger.debug(error)
                if not no_register:
                    replica_for_api = self._convert_file_for_api(file)
                    if not self.client.update_replicas_states(
                            rse, files=[replica_for_api]):
                        logger.warning('Failed to update replica state')
            else:
                logger.error('Failed to upload file %s' % basename)

        if summary_file_path:
            final_summary = {}
            for file in summary:
                file_scope = file['did_scope']
                file_name = file['did_name']
                file_did_str = '%s:%s' % (file_scope, file_name)
                final_summary[file_did_str] = {
                    'scope': file['scope'],
                    'name': file['name'],
                    'bytes': file['bytes'],
                    'rse': file['rse'],
                    'pfn': file['upload_result']['pfn'],
                    'guid': file['meta']['guid'],
                    'adler32': file['adler32'],
                    'md5': file['md5']
                }
            with open(summary_file_path, 'w') as summary_file:
                json.dump(final_summary,
                          summary_file,
                          sort_keys=True,
                          indent=1)

        if num_succeeded == 0:
            raise NoFilesUploaded()
        elif num_succeeded != len(files):
            raise NotAllFilesUploaded()
        return 0
Code Example #12
File: downloadclient.py Project: TiO2/rucio
    def download_dids(self, items, num_threads=2, trace_custom_fields={}):
        """
        Download items with given DIDs. This function can also download datasets and wildcarded DIDs.

        :param items: List of dictionaries. Each dictionary describing an item to download. Keys:
            did                 - DID string of the file or dataset (e.g. 'scope:file.name'). Wildcards are allowed
            rse                 - Optional: rse name (e.g. 'CERN-PROD_DATADISK') or rse expression from where to download
            force_scheme        - Optional: force a specific scheme to download this item. (Default: None)
            base_dir            - Optional: base directory where the downloaded files will be stored. (Default: '.')
            no_subdir           - Optional: If true, files are written directly into base_dir and existing files are overwritten. (Default: False)
            nrandom             - Optional: if the DID addresses a dataset, nrandom files will be randomly chosen for download from the dataset
            ignore_checksum     - Optional: If true, skips the checksum validation between the downloaded file and the rucio catalogue. (Default: False)
            transfer_timeout    - Optional: Timeout time for the download protocols. (Default: None)
        :param num_threads: Suggestion of number of threads to use for the download. It will be lowered if it's too high.
        :param trace_custom_fields: Custom key value pairs to send with the traces

        :returns: a list of dictionaries with an entry for each file, containing the input options, the did, and the clientState

        :raises InputValidationError: if one of the input items is in the wrong format
        :raises NoFilesDownloaded: if no files could be downloaded
        :raises NotAllFilesDownloaded: if not all files could be downloaded
        :raises RucioException: if something unexpected went wrong during the download
        """
        logger = self.logger
        trace_custom_fields['uuid'] = generate_uuid()

        logger.info('Processing %d item(s) for input' % len(items))
        resolved_items = []
        for item in items:
            did_str = item.get('did')
            if not did_str:
                raise InputValidationError('The key did is mandatory')

            logger.debug('Processing item %s' % did_str)

            new_item = copy.deepcopy(item)

            # extend RSE expression to exclude tape RSEs for non-admin accounts
            if not self.is_admin:
                rse = new_item.get('rse')
                new_item['rse'] = 'istape=False' if not rse else '(%s)&istape=False' % rse
                logger.debug('RSE-Expression: %s' % new_item['rse'])

            # resolve any wildcards in the input dids
            did_scope, did_name = self._split_did_str(did_str)
            logger.debug('Split DID: %s:%s' % (did_scope, did_name))
            new_item['scope'] = did_scope
            if '*' in did_name:
                logger.debug('Resolving wildcarded DID %s' % did_str)
                for dsn in self.client.list_dids(did_scope,
                                                 filters={'name': did_name},
                                                 type='all'):
                    logger.debug('%s:%s' % (did_scope, dsn))
                    resolved_item = copy.deepcopy(new_item)  # one dict per matched DID; reusing new_item would make every entry alias the same object
                    resolved_item['name'] = dsn
                    resolved_item['did'] = '%s:%s' % (did_scope, dsn)
                    resolved_items.append(resolved_item)
            else:
                new_item['name'] = did_name
                resolved_items.append(new_item)

        input_items = []

        # get replicas for every file of the given dids
        logger.debug('%d DIDs after processing input' % len(resolved_items))
        for item in resolved_items:
            did_scope = item['scope']
            did_name = item['name']
            did_str = item['did']

            logger.debug('Processing: %s' % item)

            # get type of given did
            did_type = self.client.get_did(did_scope, did_name)['type'].upper()
            logger.debug('Type: %s' % did_type)

            # get replicas (RSEs) with PFNs for each file (especially if it's a dataset)
            files_with_replicas = self.client.list_replicas(
                [{
                    'scope': did_scope,
                    'name': did_name
                }],
                schemes=item.get('force_scheme'),
                rse_expression=item.get('rse'),
                client_location=detect_client_location())

            nrandom = item.get('nrandom')
            if nrandom:
                logger.info('Selecting %d random replicas from dataset %s' %
                            (nrandom, did_str))
                files_with_replicas = list(files_with_replicas)
                random.shuffle(files_with_replicas)
                files_with_replicas = files_with_replicas[0:nrandom]

            for file_item in files_with_replicas:
                file_did_scope = file_item['scope']
                file_did_name = file_item['name']
                file_did_str = '%s:%s' % (file_did_scope, file_did_name)

                logger.debug('Queueing file: %s' % file_did_str)

                # put the input options from item into the file item
                file_item.update(item)

                dest_dir_name = file_did_scope
                if did_type == 'DATASET':
                    # if the DID is a dataset, the file's scope and name were overwritten by the dataset values above; restore them
                    file_item['scope'] = file_did_scope
                    file_item['name'] = file_did_name
                    file_item['did'] = file_did_str
                    file_item['dataset_scope'] = did_scope
                    file_item['dataset_name'] = did_name
                    dest_dir_name = did_name

                dest_dir_path = self._prepare_dest_dir(
                    item.get('base_dir', '.'), dest_dir_name, file_did_name,
                    item.get('no_subdir'))
                file_item['dest_dir_path'] = dest_dir_path

                input_items.append(file_item)

        num_files_in = len(input_items)
        output_items = self._download_multithreaded(input_items, num_threads,
                                                    trace_custom_fields)
        num_files_out = len(output_items)

        if num_files_in != num_files_out:
            raise RucioException(
                '%d items were in the input queue but only %d are in the output queue'
                % (num_files_in, num_files_out))

        return self._check_output(output_items)
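
The docstring above spells out the per-item keys accepted by download_dids. A small usage sketch under those assumptions (the DIDs, thread count and target directory are made up for illustration):

from rucio.client.downloadclient import DownloadClient

download_client = DownloadClient()

items = [
    # a single file DID
    {'did': 'user.jdoe:file.name.root'},
    # a wildcarded dataset DID: 5 random files, written below /tmp/downloads
    {'did': 'user.jdoe:dataset.2018.*', 'nrandom': 5, 'base_dir': '/tmp/downloads'},
]

results = download_client.download_dids(items, num_threads=4)
for entry in results:
    print(entry['did'], entry['clientState'])
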
Code example #13
0
File: uploadclient.py Project: poush/rucio
    def upload(self, sources_with_settings, summary_file_path=None):
        """
        :param sources_with_settings: list of dictionaries of file descriptions; a value of None means the key is optional:
        [{'path': 'file1',
          'rse': 'rse_name1',
          'did_scope': None,
          'did_name': None,
          'dataset_name': None,
          'dataset_scope': None,
          'scheme': None,
          'pfn': None,
          'no_register': None,
          'lifetime': None },

         {'path': 'file2',
          'rse': 'rse_name2',
          'did_scope': None,
          'did_name': None,
          'dataset_name': None,
          'dataset_scope': None,
          'scheme': None,
          'pfn': None,
          'no_register': None,
          'lifetime': None }]

        :raises InputValidationError: if any input arguments are in a wrong format
        :raises RSEBlacklisted: if a given RSE is not available for writing
        """
        logger = self.logger

        self.trace['uuid'] = generate_uuid()

        # check given sources, resolve dirs into files, and collect meta infos
        files = self.collect_and_validate_file_info(sources_with_settings)

        # check if RSE of every file is available for writing
        # and cache rse settings
        registered_dataset_dids = set()
        registered_file_dids = set()
        for file in files:
            rse = file['rse']
            if not self.rses.get(rse):
                rse_settings = self.rses.setdefault(rse, rsemgr.get_rse_info(rse))
                if rse_settings['availability_write'] != 1:
                    raise RSEBlacklisted('%s is blacklisted for writing. No actions have been taken' % rse)

            dataset_scope = file.get('dataset_scope')
            dataset_name = file.get('dataset_name')
            if dataset_scope and dataset_name:
                dataset_did_str = ('%s:%s' % (dataset_scope, dataset_name))
                file['dataset_did_str'] = dataset_did_str
                registered_dataset_dids.add(dataset_did_str)

            registered_file_dids.add('%s:%s' % (file['did_scope'], file['did_name']))

        wrong_dids = registered_file_dids.intersection(registered_dataset_dids)
        if len(wrong_dids):
            raise InputValidationError('DIDs used to address both files and datasets: %s' % str(wrong_dids))

        # clear this set again to ensure that we only try to register datasets once
        registered_dataset_dids = set()
        summary = []  # successfully uploaded files, collected for the optional summary file
        for file in files:
            basename = file['basename']
            logger.info('Preparing upload for file %s' % basename)

            no_register = file.get('no_register')
            pfn = file.get('pfn')
            scheme = file.get('scheme')

            rse = file['rse']  # needed here so the trace refers to this file's RSE
            self.trace['scope'] = file['did_scope']
            self.trace['datasetScope'] = file.get('dataset_scope', '')
            self.trace['dataset'] = file.get('dataset_name', '')
            self.trace['remoteSite'] = rse
            self.trace['filesize'] = file['bytes']

            file_scope = file['did_scope']
            file_name = file['did_name']
            file_did = {'scope': file_scope, 'name': file_name}
            file_did_str = '%s:%s' % (file_scope, file_name)
            dataset_did_str = file.get('dataset_did_str')

            rse_settings = self.rses[rse]

            # register a dataset if we need to
            if dataset_did_str and dataset_did_str not in registered_dataset_dids and not no_register:
                registered_dataset_dids.add(dataset_did_str)
                try:
                    self.client.add_dataset(scope=file['dataset_scope'],
                                            name=file['dataset_name'],
                                            rules=[{'account': self.account,
                                                    'copies': 1,
                                                    'rse_expression': rse,
                                                    'grouping': 'DATASET',
                                                    'lifetime': file['lifetime']}])
                    logger.info('Dataset %s successfully created' % dataset_did_str)
                except DataIdentifierAlreadyExists:
                    # TODO: Need to check the rules thing!!
                    logger.info("Dataset %s already exists" % dataset_did_str)

            replica_for_api = self.convert_file_for_api(file)
            try:
                # if the remote checksum is different this did must not be used
                meta = self.client.get_metadata(file_scope, file_name)
                logger.info('Comparing checksums of %s and %s' % (basename, file_did_str))
                if meta['adler32'] != file['adler32']:
                    logger.error('Local checksum %s does not match remote checksum %s' % (file['adler32'], meta['adler32']))
                    raise DataIdentifierAlreadyExists

                # add file to rse if it is not registered yet
                replicastate = list(self.client.list_replicas([file_did], all_states=True))
                if rse not in replicastate[0]['rses'] and not no_register:
                    logger.info('Adding replica at %s in Rucio catalog' % rse)
                    self.client.add_replicas(rse=file['rse'], files=[replica_for_api])
            except DataIdentifierNotFound:
                if not no_register:
                    logger.info('Adding replica at %s in Rucio catalog' % rse)
                    self.client.add_replicas(rse=file['rse'], files=[replica_for_api])
                    if not dataset_did_str:
                        # only need to add rules for files if no dataset is given
                        logger.info('Adding replication rule at %s' % rse)
                        self.client.add_replication_rule([file_did], copies=1, rse_expression=rse, lifetime=file['lifetime'])

            # upload the file only if it does not already exist on the RSE
            if not rsemgr.exists(rse_settings, file_did):
                protocols = rsemgr.get_protocols_ordered(rse_settings=rse_settings, operation='write', scheme=scheme)
                protocols.reverse()
                success = False
                while not success and len(protocols):
                    protocol = protocols.pop()
                    logger.info('Trying upload to %s with protocol %s' % (rse, protocol['scheme']))
                    lfn = {}
                    lfn['filename'] = file['basename']
                    lfn['scope'] = file['did_scope']
                    lfn['name'] = file['did_name']
                    lfn['adler32'] = file['adler32']
                    lfn['filesize'] = file['bytes']

                    self.trace['protocol'] = protocol['scheme']
                    self.trace['transferStart'] = time.time()
                    try:
                        state = rsemgr.upload(rse_settings=rse_settings,
                                              lfns=lfn,
                                              source_dir=file['dirname'],
                                              force_scheme=protocol['scheme'],
                                              force_pfn=pfn)
                        success = True
                        file['upload_result'] = state
                    except (ServiceUnavailable, ResourceTemporaryUnavailable) as error:
                        logger.warning('Upload attempt failed')
                        logger.debug('Exception: %s' % str(error))

                if success:
                    self.trace['transferEnd'] = time.time()
                    self.trace['clientState'] = 'DONE'
                    file['state'] = 'A'
                    logger.info('File %s successfully uploaded' % basename)
                    send_trace(self.trace, self.client.host, self.user_agent, logger=logger)
                    if summary_file_path:
                        summary.append(copy.deepcopy(file))
                else:
                    logger.error('Failed to upload file %s' % basename)
                    # TODO trace?
                    continue  # skip attach_did and update_states for this file
            else:
                logger.info('File already exists on RSE. Skipped upload')

            if not no_register:
                # add file to dataset if needed
                if dataset_did_str:
                    try:
                        logger.info('Attaching file to dataset %s' % dataset_did_str)
                        self.client.attach_dids(file['dataset_scope'], file['dataset_name'], [file_did])
                    except Exception as error:
                        logger.warning('Failed to attach file to the dataset')
                        logger.warning(error)

                logger.info('Setting replica state to available')
                replica_for_api = self.convert_file_for_api(file)
                self.client.update_replicas_states(rse, files=[replica_for_api])

        if summary_file_path:
            final_summary = {}
            for file in summary:
                file_scope = file['did_scope']
                file_name = file['did_name']
                file_did_str = '%s:%s' % (file_scope, file_name)
                final_summary[file_did_str] = {'scope': file['scope'],
                                               'name': file['name'],
                                               'bytes': file['bytes'],
                                               'rse': file['rse'],
                                               'pfn': file['upload_result']['pfn'],
                                               'guid': file['meta']['guid'],
                                               'adler32': file['adler32'],
                                               'md5': file['md5']}
            with open(summary_file_path, 'w') as summary_file:  # text mode: json.dump writes str
                json.dump(final_summary, summary_file, sort_keys=True, indent=1)
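
This older upload() variant expects each per-file dictionary to carry the keys listed in its docstring (None for unused options) and attaches files to a dataset when dataset_scope and dataset_name are set. A sketch of the caller side, assuming an instance of this client called upload_client and made-up scope, RSE and paths:

sources_with_settings = [
    {'path': '/data/run1/file1.root',   # hypothetical local file
     'rse': 'MOCK-RSE',
     'did_scope': 'user.jdoe',
     'did_name': None,                  # defaults to the file name
     'dataset_scope': 'user.jdoe',
     'dataset_name': 'user.jdoe.run1',  # both files are attached to this dataset
     'scheme': None,
     'pfn': None,
     'no_register': None,
     'lifetime': 86400},
    {'path': '/data/run1/file2.root',
     'rse': 'MOCK-RSE',
     'did_scope': 'user.jdoe',
     'did_name': None,
     'dataset_scope': 'user.jdoe',
     'dataset_name': 'user.jdoe.run1',
     'scheme': None,
     'pfn': None,
     'no_register': None,
     'lifetime': 86400},
]

upload_client.upload(sources_with_settings, summary_file_path='summary.json')
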
Code example #14
0
    def upload(self, items, summary_file_path=None, traces_copy_out=None):
        """
        :param items: List of dictionaries. Each dictionary describing a file to upload. Keys:
            path                  - path of the file that will be uploaded
            rse                   - rse name (e.g. 'CERN-PROD_DATADISK') where to upload the file
            did_scope             - Optional: custom did scope (Default: user.<account>)
            did_name              - Optional: custom did name (Default: name of the file)
            dataset_scope         - Optional: custom dataset scope
            dataset_name          - Optional: custom dataset name
            force_scheme          - Optional: force a specific scheme (if PFN upload this will be overwritten) (Default: None)
            pfn                   - Optional: use a given PFN (this sets no_register to True, and no_register becomes mandatory)
            no_register           - Optional: if True, the file will not be registered in the rucio catalogue
            register_after_upload - Optional: if True, the file will be registered after successful upload
            lifetime              - Optional: the lifetime of the file after it was uploaded
            transfer_timeout      - Optional: time after the upload will be aborted
            guid                  - Optional: guid of the file
        :param summary_file_path: Optional: a path where a summary in form of a json file will be stored
        :param traces_copy_out: reference to an external list, where the traces should be appended
        :returns: 0 on success
        :raises InputValidationError: if any input arguments are in a wrong format
        :raises RSEBlacklisted: if a given RSE is not available for writing
        :raises NoFilesUploaded: if no files were successfully uploaded
        :raises NotAllFilesUploaded: if not all files were successfully uploaded
        """

        logger = self.logger
        from admix.helper import helper
        helper.global_dictionary['logger'].Info('     r: start')
        self.trace['uuid'] = generate_uuid()

        helper.global_dictionary['logger'].Info('     r: collect and validate')

        # check given sources, resolve dirs into files, and collect meta infos
        files = self._collect_and_validate_file_info(items)

        # check if RSE of every file is available for writing
        # and cache rse settings
        registered_dataset_dids = set()
        registered_file_dids = set()

        helper.global_dictionary['logger'].Info('     r: check if files are available')
        for file in files:
            rse = file['rse']
            if not self.rses.get(rse):
                rse_settings = self.rses.setdefault(rse, rsemgr.get_rse_info(rse))
                if rse_settings['availability_write'] != 1:
                    raise RSEBlacklisted('%s is blacklisted for writing. No actions have been taken' % rse)

            dataset_scope = file.get('dataset_scope')
            dataset_name = file.get('dataset_name')
            if dataset_scope and dataset_name:
                dataset_did_str = ('%s:%s' % (dataset_scope, dataset_name))
                file['dataset_did_str'] = dataset_did_str
                registered_dataset_dids.add(dataset_did_str)

            registered_file_dids.add('%s:%s' % (file['did_scope'], file['did_name']))

        wrong_dids = registered_file_dids.intersection(registered_dataset_dids)
        if len(wrong_dids):
            raise InputValidationError('DIDs used to address both files and datasets: %s' % str(wrong_dids))

        # clear this set again to ensure that we only try to register datasets once
        registered_dataset_dids = set()
        num_succeeded = 0
        summary = []

        helper.global_dictionary['logger'].Info('     r: Starting loop')

        for file in files:
            helper.global_dictionary['logger'].Info('Start')
            basename = file['basename']
            logger.info('Preparing upload for file %s' % basename)

            no_register = file.get('no_register')
            register_after_upload = file.get('register_after_upload') and not no_register
            pfn = file.get('pfn')
            force_scheme = file.get('force_scheme')
            delete_existing = False

            helper.global_dictionary['logger'].Info('1')
            trace = copy.deepcopy(self.trace)
            helper.global_dictionary['logger'].Info('2')
            # appending trace to list reference, if the reference exists
            if traces_copy_out is not None:
                helper.global_dictionary['logger'].Info('3')
                traces_copy_out.append(trace)

            helper.global_dictionary['logger'].Info('4')
            rse = file['rse']  # needed here so the trace refers to this file's RSE
            trace['scope'] = file['did_scope']
            trace['datasetScope'] = file.get('dataset_scope', '')
            trace['dataset'] = file.get('dataset_name', '')
            trace['remoteSite'] = rse
            trace['filesize'] = file['bytes']

            file_did = {'scope': file['did_scope'], 'name': file['did_name']}
            dataset_did_str = file.get('dataset_did_str')

            rse_settings = self.rses[rse]
            rse_sign_service = rse_settings.get('sign_url', None)
            is_deterministic = rse_settings.get('deterministic', True)
            if not is_deterministic and not pfn:
                logger.error('PFN has to be defined for NON-DETERMINISTIC RSE.')
                continue
            if pfn and is_deterministic:
                logger.warning('Upload with given pfn implies that no_register is True, except non-deterministic RSEs')
                no_register = True

            helper.global_dictionary['logger'].Info('5')
            if not no_register and not register_after_upload:
                helper.global_dictionary['logger'].Info('6')
                self._register_file(file, registered_dataset_dids, helper)

            helper.global_dictionary['logger'].Info('7')
            # if register_after_upload, file should be overwritten if it is not registered
            # otherwise if file already exists on RSE we're done
#            if register_after_upload:
#                helper.global_dictionary['logger'].Info('8')
#                if rsemgr.exists(rse_settings, pfn if pfn else file_did):
#                    helper.global_dictionary['logger'].Info('9')
#                    try:
#                        helper.global_dictionary['logger'].Info('10')
#                        self.client.get_did(file['did_scope'], file['did_name'])
#                        logger.info('File already registered. Skipping upload.')
#                        trace['stateReason'] = 'File already exists'
#                        continue
#                    except DataIdentifierNotFound:
#                        logger.info('File already exists on RSE. Previous left overs will be overwritten.')
#                        delete_existing = True
#            elif not is_deterministic and not no_register:
#                if rsemgr.exists(rse_settings, pfn):
#                    logger.info('File already exists on RSE with given pfn. Skipping upload. Existing replica has to be removed first.')
#                    trace['stateReason'] = 'File already exists'
#                    continue
#                elif rsemgr.exists(rse_settings, file_did):
#                    logger.info('File already exists on RSE with different pfn. Skipping upload.')
#                    trace['stateReason'] = 'File already exists'
#                    continue
#            else:
#                if rsemgr.exists(rse_settings, pfn if pfn else file_did):
#                    logger.info('File already exists on RSE. Skipping upload')
#                    trace['stateReason'] = 'File already exists'
#                    continue

            helper.global_dictionary['logger'].Info('11')
            # resolving local area networks
            domain = 'wan'
#            rse_attributes = {}
#            try:
#                rse_attributes = self.client.list_rse_attributes(rse)
#                helper.global_dictionary['logger'].Info('12')
#            except:
#                logger.warning('Attributes of the RSE: %s not available.' % rse)
#            if (self.client_location and 'lan' in rse_settings['domain'] and 'site' in rse_attributes):
#                if self.client_location['site'] == rse_attributes['site']:
#                    domain = 'lan'


            # protocol handling and upload
            protocols = rsemgr.get_protocols_ordered(rse_settings=rse_settings, operation='write', scheme=force_scheme, domain=domain)
            helper.global_dictionary['logger'].Info('14')
            protocols.reverse()
            helper.global_dictionary['logger'].Info('15')
            success = False
            state_reason = ''
            while not success and len(protocols):
                helper.global_dictionary['logger'].Info('16')
                protocol = protocols.pop()
                cur_scheme = protocol['scheme']
                logger.info('Trying upload with %s to %s' % (cur_scheme, rse))
                lfn = {}
                lfn['filename'] = basename
                lfn['scope'] = file['did_scope']
                lfn['name'] = file['did_name']

                for checksum_name in GLOBALLY_SUPPORTED_CHECKSUMS:
                    if checksum_name in file:
                        lfn[checksum_name] = file[checksum_name]

                lfn['filesize'] = file['bytes']

                sign_service = None
                if cur_scheme == 'https':
                    sign_service = rse_sign_service

                trace['protocol'] = cur_scheme
                trace['transferStart'] = time.time()
                try:
                    helper.global_dictionary['logger'].Info('17')
                    state = rsemgr.upload(rse_settings=rse_settings,
                                          lfns=lfn,
                                          source_dir=file['dirname'],
                                          force_scheme=cur_scheme,
                                          force_pfn=pfn,
                                          transfer_timeout=file.get('transfer_timeout'),
                                          delete_existing=delete_existing,
                                          sign_service=sign_service)
                    helper.global_dictionary['logger'].Info('18')
                    success = state['success']
                    file['upload_result'] = state
                except (ServiceUnavailable, ResourceTemporaryUnavailable) as error:
                    logger.warning('Upload attempt failed')
                    logger.debug('Exception: %s' % str(error))
                    state_reason = str(error)

            helper.global_dictionary['logger'].Info('19')

            if success:
                num_succeeded += 1
                trace['transferEnd'] = time.time()
                trace['clientState'] = 'DONE'
                file['state'] = 'A'
                logger.info('Successfully uploaded file %s' % basename)
                print('Successfully uploaded file %s' % basename)

                self._send_trace(trace)

                helper.global_dictionary['logger'].Info('Before if deepcopy')
                if summary_file_path:
                    helper.global_dictionary['logger'].Info('Before deepcopy')
                    summary.append(copy.deepcopy(file))

                helper.global_dictionary['logger'].Info('Before if register')
                if not no_register:
                    helper.global_dictionary['logger'].Info('Before if2 register')
                    if register_after_upload:
                        helper.global_dictionary['logger'].Info('Before register')
                        self._register_file(file, registered_dataset_dids, helper)
                    replica_for_api = self._convert_file_for_api(file)
                    helper.global_dictionary['logger'].Info('Before if register2')
                    if not self.client.update_replicas_states(rse, files=[replica_for_api]):
                        helper.global_dictionary['logger'].Info('Before if register3')
                        logger.warning('Failed to update replica state')

                # add file to dataset if needed
                helper.global_dictionary['logger'].Info('Before if attach')
                if dataset_did_str and not no_register:
                    try:
                        helper.global_dictionary['logger'].Info('Before attach')
                        self.client.attach_dids(file['dataset_scope'], file['dataset_name'], [file_did])
                        helper.global_dictionary['logger'].Info('After attach')
                    except Exception as error:
                        helper.global_dictionary['logger'].Info('Failed to attach file to the dataset')
                        logger.warning('Failed to attach file to the dataset')
                        logger.debug(error)
                helper.global_dictionary['logger'].Info('Really finished')
            else:
                trace['clientState'] = 'FAILED'
                trace['stateReason'] = state_reason
                self._send_trace(trace)
                logger.error('Failed to upload file %s' % basename)

        if summary_file_path:
            final_summary = {}
            for file in summary:
                file_scope = file['did_scope']
                file_name = file['did_name']
                file_did_str = '%s:%s' % (file_scope, file_name)
                final_summary[file_did_str] = {'scope': file_scope,
                                               'name': file_name,
                                               'bytes': file['bytes'],
                                               'rse': file['rse'],
                                               'pfn': file['upload_result'].get('pfn', ''),
                                               'guid': file['meta']['guid']}

                for checksum_name in GLOBALLY_SUPPORTED_CHECKSUMS:
                    if checksum_name in file:
                        final_summary[file_did_str][checksum_name] = file[checksum_name]

            with open(summary_file_path, 'w') as summary_file:  # text mode: json.dump writes str
                json.dump(final_summary, summary_file, sort_keys=True, indent=1)

        if num_succeeded == 0:
            raise NoFilesUploaded()
        elif num_succeeded != len(files):
            raise NotAllFilesUploaded()
        return 0
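
Compared to the upstream client, this fork adds admix logging plus the register_after_upload item key and the traces_copy_out parameter documented above. A hedged sketch of a call that collects the per-file traces in a local list, assuming upload_client is an instance of this patched client, the admix helper's global logger is already configured, and using a placeholder file path and RSE:

traces = []
items = [{'path': '/data/file1.root',      # hypothetical file
          'rse': 'MOCK-RSE',
          'register_after_upload': True,   # register the replica only after a successful transfer
          'transfer_timeout': 600}]

upload_client.upload(items, summary_file_path='summary.json', traces_copy_out=traces)

# each trace ends up with clientState 'DONE' or 'FAILED' plus protocol and timing fields
for trace in traces:
    print(trace.get('clientState'), trace.get('protocol'))
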