Beispiel #1
0
def get_aip_info(aic_dir):
    """Get AIP UUID, name and label from the objects directory and METS files.

    The ``objects`` directory inside ``aic_dir`` is scanned for regular files
    whose filename is a UUID. The first line of each such file is taken as the
    AIP name, and the file is deleted once it has been read. For every AIP
    found, its METS file is extracted through ``storage_service.extract_file``
    and the Dublin Core title is used as the label.

    :param aic_dir: Path to the AIC directory containing ``objects``.
    :returns: List of dicts with the keys ``name``, ``uuid`` and ``label``;
        ``label`` is an empty string when no title is found.
    """
    objects_dir = os.path.join(aic_dir, 'objects')
    # Parse out AIP names and UUIDs.
    # The only contents of the folder should be a bunch of files whose
    # filenames are AIP UUIDs, and the contents are the AIP name.
    uuid_pattern = re.compile(
        r'^[0-9A-Fa-f]{8}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{4}-[0-9A-Fa-f]{12}$')
    aips = []
    for filename in os.listdir(objects_dir):
        file_path = os.path.join(objects_dir, filename)
        if not (os.path.isfile(file_path) and uuid_pattern.match(filename)):
            continue
        with open(file_path, 'r') as f:
            # readline() keeps the line terminator; strip it so it does not
            # end up in the AIP name or in the METS path built from it below.
            aip_name = f.readline().rstrip('\r\n')
        os.remove(file_path)
        aips.append({'name': aip_name, 'uuid': filename})

    # Fetch the METS file and parse out the Dublin Core metadata with the label
    dc_root = 'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore/'
    for aip in aips:
        mets_in_aip = "{aip_name}-{aip_uuid}/data/METS.{aip_uuid}.xml".format(
            aip_name=aip['name'], aip_uuid=aip['uuid'])
        mets_path = os.path.join(objects_dir, "METS.{}.xml".format(aip['uuid']))
        try:
            storage_service.extract_file(aip['uuid'], mets_in_aip, mets_path)
            root = etree.parse(mets_path)
            # Title may be namespaced as dc: or dcterms: depending on version
            aip['label'] = (
                root.findtext(dc_root + 'dc:title', namespaces=ns.NSMAP)
                or root.findtext(dc_root + 'dcterms:title', namespaces=ns.NSMAP)
                or '')
        finally:
            # Never leave the extracted METS behind in the objects directory,
            # even when extraction or parsing fails part-way.
            if os.path.isfile(mets_path):
                os.remove(mets_path)

    print('AIP info:', aips)
    return aips
Beispiel #2
0
def get_aips_in_aic(mets_root, temp_dir, uuid):
    """Look up how many AIPs an AIC holds, based on its AIC METS file.

    The AIP METS document is asked (via ``am``) for the AIC METS filename and
    the AIP directory name; the AIC METS file is then extracted from the
    package into ``temp_dir`` and handed to ``am.find_aips_in_aic``.

    :param mets_root: AIP METS document root.
    :param temp_dir: Path to tempdir where the AIC METS file is written.
    :param uuid: AIC UUID.

    :returns: Result of ``am.find_aips_in_aic`` for the AIC METS, or None
        when the filename or directory name is not found or the file could
        not be extracted.
    """
    # Both lookups run up front; either one missing means nothing to fetch.
    mets_name = am.find_aic_mets_filename(mets_root)
    package_dirname = am.find_aip_dirname(mets_root)
    if any(value is None for value in (mets_name, package_dirname)):
        return None

    # Path inside the package vs. local destination for the extracted copy.
    path_in_package = os.path.join(package_dirname, "data", mets_name)
    local_copy = os.path.join(temp_dir, os.path.basename(mets_name))
    storageService.extract_file(uuid, path_in_package, local_copy)

    # Only parse when the extraction actually produced a file.
    if os.path.isfile(local_copy):
        return am.find_aips_in_aic(etree.parse(local_copy))
    return None
Beispiel #3
0
def upload_dip_metadata_to_atom(aip_name, aip_uuid, parent_slug):
    """
    Describe the original objects of an AIP under an AtoM resource.

    The AIP METS file is extracted into a temporary file, a "File" level
    container titled after the AIP is created below ``parent_slug``, and one
    "Item" level child with a digital object is added for each file in the
    METS whose use is "original". Return the slug of the new container.
    Request failures during extraction and AtoM client errors are re-raised
    as ``AtomMetadataUploadError``.
    """
    with tempfile.NamedTemporaryFile() as mets_file:
        # Download METS file
        path_in_aip = "{}-{}/data/METS.{}.xml".format(aip_name, aip_uuid, aip_uuid)
        logger.debug("Extracting file %s into %s", path_in_aip, mets_file.name)
        try:
            extract_file(aip_uuid, path_in_aip, mets_file.name)
        except requests.exceptions.RequestException:
            raise AtomMetadataUploadError

        atom = get_atom_client()
        mets_doc = METSDocument.fromfile(mets_file.name)

        # Create file container
        try:
            logger.info(
                "Creating file container with slug %s and title %s",
                parent_slug,
                aip_name,
            )
            container_slug = atom.add_child(
                parent_slug=parent_slug, title=aip_name, level="File"
            )
        except (AtomError, CommunicationError):
            raise AtomMetadataUploadError

        # Add objects: directories and non-original files are skipped.
        originals = (
            entry
            for entry in mets_doc.all_files()
            if not (entry.type == "Directory" or entry.use != "original")
        )
        for entry in originals:
            digital_object = dict(
                title=os.path.basename(entry.path),
                usage="Offline",
                file_uuid=entry.file_uuid,
                aip_uuid=aip_uuid,
            )
            _load_premis(digital_object, entry)
            child_title = os.path.basename(entry.path)
            try:
                logger.info("Creating child with title %s", child_title)
                child_slug = atom.add_child(
                    parent_slug=container_slug, title=child_title, level="Item"
                )
                logger.info(
                    "Adding digital object to new child with slug %s", child_slug
                )
                atom.add_digital_object(child_slug, **digital_object)
            except (AtomError, CommunicationError):
                raise AtomMetadataUploadError

        return container_slug
Beispiel #4
0
def upload_dip_metadata_to_atom(aip_name, aip_uuid, parent_slug):
    """
    Write to a AtoM's resource (parent_slug) the metadata of the objects of a
    AIP given its name and UUID. Return the slug of the new container resource
    created to hold the metadata objects.

    Raises AtomMetadataUploadError when the METS file cannot be extracted
    (request failure) or when the AtoM client reports an error.
    """
    premis_ns = '{info:lc/xmlns/premis-v2}'

    def premis_xpath(*tags):
        """Build a descendant XPath of PREMIS v2 elements from tag names."""
        return './/' + '/'.join(premis_ns + tag for tag in tags)

    # Digital object property -> location of its value in the PREMIS object.
    premis_props = (
        ('size', premis_xpath('objectCharacteristics', 'size')),
        ('format_name', premis_xpath(
            'objectCharacteristics', 'format', 'formatDesignation',
            'formatName')),
        ('format_version', premis_xpath(
            'objectCharacteristics', 'format', 'formatDesignation',
            'formatVersion')),
        ('format_registry_name', premis_xpath(
            'objectCharacteristics', 'format', 'formatRegistry',
            'formatRegistryName')),
        ('format_registry_key', premis_xpath(
            'objectCharacteristics', 'format', 'formatRegistry',
            'formatRegistryKey')),
    )

    def add_prop_from_xml(dict_, name, el, xpath):
        """
        Write to a dictionary a new pair with the given key and the value
        taken from the text attribute of the element matched by the given
        XPath query. Nothing is written when there is no match or the
        matched element has no text.
        """
        res = el.find(xpath)
        if res is not None and res.text:
            dict_[name] = res.text
            logger.debug('Extracted property %s from METS: %s', name,
                         res.text)
            # Return here so the failure message below is only logged when
            # the property really could not be extracted.
            return
        logger.debug('Failed to extract property %s from METS: not found',
                     name)

    with tempfile.NamedTemporaryFile() as temp:
        # Download METS file
        mets_path = '{}-{}/data/METS.{}.xml'.format(aip_name, aip_uuid,
                                                    aip_uuid)
        logger.debug('Extracting file %s into %s', mets_path, temp.name)
        try:
            extract_file(aip_uuid, mets_path, temp.name)
        except requests.exceptions.RequestException:
            raise AtomMetadataUploadError

        client = get_atom_client()
        mw = METSDocument.fromfile(temp.name)

        # Create file container
        try:
            logger.info('Creating file container with slug %s and title %s',
                        parent_slug, aip_name)
            file_slug = client.add_child(parent_slug=parent_slug,
                                         title=aip_name,
                                         level='File')
        except (AtomError, CommunicationError):
            raise AtomMetadataUploadError

        # Add objects
        for item in mw.all_files():
            if item.type == 'Directory' or item.use != 'original':
                continue
            title = os.path.basename(item.path)
            attrs = {
                'title': title,
                'usage': 'Offline',
                'file_uuid': item.file_uuid,
                'aip_uuid': aip_uuid,
            }
            # Technical metadata is read from the first amdSec of the file.
            amdsec = item.amdsecs[0].serialize()
            for prop_name, xpath in premis_props:
                add_prop_from_xml(attrs, prop_name, amdsec, xpath)

            try:
                logger.info('Creating child with title %s', title)
                slug = client.add_child(parent_slug=file_slug,
                                        title=title,
                                        level='Item')
                logger.info('Adding digital object to new child with slug %s',
                            slug)
                client.add_digital_object(slug, **attrs)
            except (AtomError, CommunicationError):
                raise AtomMetadataUploadError

        return file_slug
Beispiel #5
0
    def process_package(
        self, es_client, package_info, temp_dir, delete_before_reindexing, is_aic=False
    ):
        """Index package in 'aips' and 'aipfiles' indices.

        The package METS file is extracted into ``temp_dir`` for indexing and
        is removed again before this method returns or raises.

        :param es_client: Elasticsearch client.
        :param package_info: Package info dict returned by Storage
        Service.
        :param temp_dir: Path to tempdir for downloaded METS files.
        :param delete_before_reindexing: Boolean of whether to delete
        package from indices prior to reindexing.
        :param is_aic: Optional boolean to indicate if package being indexed
        is an AIC.

        :returns: Boolean indicating success.
        """
        uuid = package_info["uuid"]

        # Download the AIP METS file to a temporary directory.
        mets_relative_path = am.relative_path_to_aip_mets_file(
            uuid, package_info["current_path"]
        )
        mets_filename = os.path.basename(mets_relative_path)
        mets_download_path = os.path.join(temp_dir, mets_filename)
        storageService.extract_file(uuid, mets_relative_path, mets_download_path)

        if not os.path.isfile(mets_download_path):
            error_message = "Unable to download AIP METS file from Storage Service"
            self.error(
                "Error indexing package {0}. Details: {1}".format(uuid, error_message)
            )
            return False

        # From here on the METS file exists on disk; the finally clause makes
        # sure it is deleted on every exit path, including exceptions that are
        # not handled below.
        try:
            aips_in_aic = None
            if is_aic:
                mets_root = etree.parse(mets_download_path)
                aips_in_aic = get_aips_in_aic(mets_root, temp_dir, uuid)

            package_name = am.package_name_from_path(
                package_info["current_path"], remove_uuid_suffix=True
            )

            aip_location = package_info.get("current_location", "")
            location_description = storageService.retrieve_storage_location_description(
                aip_location
            )

            if delete_before_reindexing:
                self.info(
                    "Deleting package {} from 'aips' and 'aipfiles' indices.".format(
                        uuid
                    )
                )
                es.delete_aip(es_client, uuid)
                es.delete_aip_files(es_client, uuid)

            # Only failures of the indexing call itself are reported as a
            # False result; errors from the steps above still propagate.
            try:
                es.index_aip_and_files(
                    client=es_client,
                    uuid=uuid,
                    aip_stored_path=package_info["current_full_path"],
                    mets_staging_path=mets_download_path,
                    name=package_name,
                    aip_size=package_info["size"],
                    aips_in_aic=aips_in_aic,
                    encrypted=package_info.get("encrypted", False),
                    location=location_description,
                )
                self.info("Successfully indexed package {}".format(uuid))
                return True
            except (ElasticsearchException, etree.XMLSyntaxError) as err:
                self.error(
                    "Error indexing package {0}. Details: {1}".format(uuid, err)
                )
                return False
        finally:
            os.remove(mets_download_path)