def find_aip_dirname(mets_root):
    """Find name of AIP directory within AIP METS document.

    :param mets_root: AIP METS document root.

    :returns: AIP dirname or None.
    """
    return xml_find_premis(mets_root, "mets:structMap/mets:div").get("LABEL")
def find_aic_mets_filename(mets_root):
    """Find name of AIC METS file within AIP METS document.

    :param mets_root: AIP METS document root.

    :returns: AIC METS filename or None.
    """
    return xml_find_premis(
        mets_root,
        "mets:fileSec/mets:fileGrp[@USE='metadata']/mets:file/mets:FLocat"
    ).get("{" + NSMAP["xlink"] + "}href")
Example #3
def get_aips_in_aic(mets_root, archive_path, temp_dir):
    """Return the number of AIPs in the AIC, extracted from AIC METS file."""
    # Find name of AIC METS file
    try:
        # aic_mets_filename includes metadata/
        aic_mets_filename = ns.xml_find_premis(
            mets_root,
            "mets:fileSec/mets:fileGrp[@USE='metadata']/mets:file/mets:FLocat",
        ).get("{" + ns.NSMAP["xlink"] + "}href")
        aip_dirname = ns.xml_find_premis(
            mets_root, "mets:structMap/mets:div").get("LABEL")
    except Exception:
        # Catch any parsing errors
        return None

    # Extract AIC METS file
    aic_mets_path = extract_file(
        archive_path=archive_path,
        destination_dir=temp_dir,
        relative_path=os.path.join(aip_dirname, "data", aic_mets_filename),
    )

    # Parse for number of AIPs
    aic_root = etree.parse(aic_mets_path)
    extent = ns.xml_find_premis(
        aic_root,
        "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore/dcterms:extent",
    )

    try:
        aips_in_aic = re.search(r"\d+", extent.text).group()
    except AttributeError:
        # Probably because extent was None
        # Or the search returned None
        return None

    return aips_in_aic
def find_aips_in_aic(aic_root):
    """Find extent of AIPs in AIC within AIC METS document.

    :param aic_root" AIC METS document root.

    :returns: Count of AIPs in AIC or None.
    """
    extent = xml_find_premis(
        aic_root,
        "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore/dcterms:extent",
    )
    try:
        return re.search(r"\d+", extent.text).group()
    except AttributeError:
        return None
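The digit-extraction step in find_aips_in_aic (and in get_aips_in_aic above) can be exercised on its own. A self-contained sketch; the sample extent text is invented and may not match what is actually written into dcterms:extent.

import re

sample_extent_text = "3 AIPs"  # hypothetical dcterms:extent content

match = re.search(r"\d+", sample_extent_text)
aips_in_aic = match.group() if match else None
print(aips_in_aic)  # -> "3" (a string, as find_aips_in_aic returns)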
Example #5
def processAIPThenDeleteMETSFile(path,
                                 temp_dir,
                                 es_client,
                                 delete_existing_data=False):
    archive_file = os.path.basename(path)

    # Regex match the UUID - AIP might end with .7z, .tar.bz2, or
    # something else.
    match = re.search(
        r"[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}",
        archive_file)
    if match is not None:
        aip_uuid = match.group()
    else:
        return -1

    print("Processing AIP", aip_uuid)

    if delete_existing_data is True:
        print("Deleting AIP", aip_uuid, "from aips/aip and aips/aipfile.")
        elasticSearchFunctions.delete_aip(es_client, aip_uuid)
        elasticSearchFunctions.delete_aip_files(es_client, aip_uuid)

    # AIP filenames are <name>-<uuid><extension>
    # Index of match end is right before the extension
    subdir = archive_file[:match.end()]
    aip_name = subdir[:-37]
    mets_file = "METS." + aip_uuid + ".xml"
    mets_file_relative_path = os.path.join("data", mets_file)
    if os.path.isfile(path):
        mets_file_relative_path = os.path.join(subdir, mets_file_relative_path)
    path_to_mets = extract_file(
        archive_path=path,
        destination_dir=temp_dir,
        relative_path=mets_file_relative_path,
    )

    # If AIC, need to extract number of AIPs in AIC to index as well
    aips_in_aic = None
    root = etree.parse(path_to_mets)
    try:
        aip_type = ns.xml_find_premis(
            root,
            "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore/dcterms:type"
        ).text
    except AttributeError:
        pass
    else:
        if aip_type == "Archival Information Collection":
            aips_in_aic = get_aips_in_aic(root, path, temp_dir)

    aip_info = storage_service.get_file_info(uuid=aip_uuid)

    if not aip_info:
        print("Information not found in Storage Service for AIP UUID: ",
              aip_uuid)
        return 1

    return elasticSearchFunctions.index_aip_and_files(
        client=es_client,
        uuid=aip_uuid,
        aip_stored_path=path,
        mets_staging_path=path_to_mets,
        name=aip_name,
        aip_size=aip_info[0]["size"],
        aips_in_aic=aips_in_aic,
        identifiers=[],  # TODO get these
    )
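A sketch of how processAIPThenDeleteMETSFile might be driven over a directory of packaged AIPs. The walk over the store and the Elasticsearch client construction are assumptions, not part of the original code; adjust the client setup to your deployment.

import os
import shutil
import tempfile

from elasticsearch import Elasticsearch  # assumed client setup


def reindex_aip_store(aip_store_path, es_url="http://localhost:9200"):
    """Walk a (hypothetical) AIP store and index every packaged AIP found."""
    es_client = Elasticsearch(es_url)
    for dirpath, _, filenames in os.walk(aip_store_path):
        for filename in filenames:
            path = os.path.join(dirpath, filename)
            temp_dir = tempfile.mkdtemp()
            try:
                # Returns -1 if no UUID is found in the filename, 1 if the
                # Storage Service has no record, otherwise the indexing result.
                result = processAIPThenDeleteMETSFile(
                    path, temp_dir, es_client, delete_existing_data=True)
                print("Result for", filename, ":", result)
            finally:
                shutil.rmtree(temp_dir)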
Example #6
def _index_aip_files(client,
                     uuid,
                     mets_path,
                     name,
                     identifiers=[],
                     printfn=print):
    """Index AIP files from AIP with UUID `uuid` and METS at path `mets_path`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param mets_path: path on disk where the AIP's METS file is located.
    :param name: AIP name.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :param printfn: optional print function.
    :return: number of files indexed.
    """
    # Parse XML
    tree = ElementTree.parse(mets_path)
    root = tree.getroot()

    # TODO: Add a conditional to toggle this
    _remove_tool_output_from_mets(tree)

    # Extract isPartOf (for AIPs) or identifier (for AICs) from DublinCore
    dublincore = ns.xml_find_premis(
        root, "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore")
    aic_identifier = None
    is_part_of = None
    if dublincore is not None:
        aip_type = ns.xml_findtext_premis(dublincore,
                                          "dc:type") or ns.xml_findtext_premis(
                                              dublincore, "dcterms:type")
        if aip_type == "Archival Information Collection":
            aic_identifier = ns.xml_findtext_premis(
                dublincore, "dc:identifier") or ns.xml_findtext_premis(
                    dublincore, "dcterms:identifier")
        elif aip_type == "Archival Information Package":
            is_part_of = ns.xml_findtext_premis(dublincore, "dcterms:isPartOf")

    # Establish structure to be indexed for each file item
    fileData = {
        "archivematicaVersion": version.get_version(),
        "AIPUUID": uuid,
        "sipName": name,
        "FILEUUID": "",
        "indexedAt": time.time(),
        "filePath": "",
        "fileExtension": "",
        "isPartOf": is_part_of,
        "AICID": aic_identifier,
        "METS": {
            "dmdSec": {},
            "amdSec": {}
        },
        "origin": get_dashboard_uuid(),
        "identifiers": identifiers,
        "transferMetadata": _extract_transfer_metadata(root),
    }

    # Index all files in a fileGrp with USE='original' or USE='metadata'
    original_files = ns.xml_findall_premis(
        root, "mets:fileSec/mets:fileGrp[@USE='original']/mets:file")
    metadata_files = ns.xml_findall_premis(
        root, "mets:fileSec/mets:fileGrp[@USE='metadata']/mets:file")
    files = original_files + metadata_files

    # Index each original and metadata file collected above
    for file_ in files:
        indexData = fileData.copy()  # Shallow copy: new dict, but nested dicts are shared

        # Get file UUID. If an ADMID exists, look in the amdSec for the UUID,
        # otherwise parse it out of the file ID.
        # 'Original' files have ADMIDs, 'Metadata' files don't
        admID = file_.attrib.get("ADMID", None)
        if admID is None:
            # Parse UUID from file ID
            fileUUID = None
            uuid_regex = r"\w{8}-?\w{4}-?\w{4}-?\w{4}-?\w{12}"
            uuids = re.findall(uuid_regex, file_.attrib["ID"])
            # Multiple UUIDs may be returned - if they are all identical, use that
            # UUID, otherwise use None.
            # To determine all UUIDs are identical, use the size of the set
            if len(set(uuids)) == 1:
                fileUUID = uuids[0]
        else:
            amdSecInfo = ns.xml_find_premis(
                root, "mets:amdSec[@ID='{}']".format(admID))
            fileUUID = ns.xml_findtext_premis(
                amdSecInfo,
                "mets:techMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue",
            )

            # Index amdSec information
            xml = ElementTree.tostring(amdSecInfo)
            indexData["METS"]["amdSec"] = _rename_dict_keys_with_child_dicts(
                _normalize_dict_values(xmltodict.parse(xml)))

        # Get the parent division for the file pointer
        # by searching the physical structural map section (structMap)
        file_id = file_.attrib.get("ID", None)
        file_pointer_division = ns.xml_find_premis(
            root,
            "mets:structMap[@TYPE='physical']//mets:fptr[@FILEID='{}']/..".
            format(file_id),
        )
        if file_pointer_division is not None:
            # If the parent division has a DMDID attribute then index
            # its data from the descriptive metadata section (dmdSec)
            dmd_section_id = file_pointer_division.attrib.get("DMDID", None)
            if dmd_section_id is not None:
                # dmd_section_id can contain one id (e.g., "dmdSec_2")
                # or more than one (e.g., "dmdSec_2 dmdSec_3",
                # when a file has both DC and non-DC metadata).
                # Attempt to index only the DC dmdSec if available
                for dmd_section_id_item in dmd_section_id.split():
                    dmd_section_info = ns.xml_find_premis(
                        root,
                        "mets:dmdSec[@ID='{}']/mets:mdWrap[@MDTYPE='DC']/mets:xmlData"
                        .format(dmd_section_id_item),
                    )
                    if dmd_section_info is not None:
                        xml = ElementTree.tostring(dmd_section_info)
                        data = _rename_dict_keys_with_child_dicts(
                            _normalize_dict_values(xmltodict.parse(xml)))
                        indexData["METS"]["dmdSec"] = data
                        break

        indexData["FILEUUID"] = fileUUID

        # Get file path from FLocat and extension
        filePath = ns.xml_find_premis(
            file_, "mets:FLocat").attrib["{http://www.w3.org/1999/xlink}href"]
        indexData["filePath"] = filePath
        _, fileExtension = os.path.splitext(filePath)
        if fileExtension:
            indexData["fileExtension"] = fileExtension[1:].lower()

        # Index data
        _wait_for_cluster_yellow_status(client)
        _try_to_index(client, indexData, "aipfiles", printfn=printfn)

        # Reset fileData['METS']['amdSec'] and fileData['METS']['dmdSec'],
        # since they are updated in the loop above.
        # See http://stackoverflow.com/a/3975388 for explanation
        fileData["METS"]["amdSec"] = {}
        fileData["METS"]["dmdSec"] = {}

    return len(files)
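The fallback that parses a file UUID out of the METS file ID relies on collapsing all regex matches into a set. A standalone illustration of that behaviour; the file IDs below are invented.

import re

UUID_REGEX = r"\w{8}-?\w{4}-?\w{4}-?\w{4}-?\w{12}"


def uuid_from_file_id(file_id):
    """Return the single UUID embedded in a METS file ID, or None if absent/ambiguous."""
    uuids = re.findall(UUID_REGEX, file_id)
    return uuids[0] if len(set(uuids)) == 1 else None


print(uuid_from_file_id("file-ae8d4290-fd52-4954-b72a-0f591bee2e2f"))  # the UUID
print(uuid_from_file_id("no-uuid-in-this-id"))                         # None
# Repeated, identical matches collapse to a single set member -> still the UUID
print(uuid_from_file_id(
    "file-ae8d4290-fd52-4954-b72a-0f591bee2e2f-ae8d4290-fd52-4954-b72a-0f591bee2e2f"))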
Example #7
def index_aip_and_files(
    client,
    uuid,
    aip_stored_path,
    mets_staging_path,
    name,
    aip_size,
    aips_in_aic=None,
    identifiers=[],
    encrypted=False,
    printfn=print,
):
    """Index AIP and AIP files with UUID `uuid` at path `path`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param aip_stored_path: path on disk where the AIP is located.
    :param mets_staging_path: path on disk where the AIP METS file is located.
    :param name: AIP name.
    :param aip_size: AIP size.
    :param aips_in_aic: optional number of AIPs stored in AIC.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :param encrypted: optional AIP encrypted boolean (defaults to `False`).
    :param printfn: optional print function.
    :return: 0 if succeeded, 1 otherwise.
    """
    # Stop if METS file is not at staging path.
    error_message = None
    if not os.path.exists(mets_staging_path):
        error_message = "METS file does not exist at: " + mets_staging_path
    if error_message:
        logger.error(error_message)
        printfn(error_message, file=sys.stderr)
        return 1
    printfn("AIP UUID: " + uuid)
    printfn("Indexing AIP ...")
    tree = ElementTree.parse(mets_staging_path)
    _remove_tool_output_from_mets(tree)
    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = ns.xml_find_premis(
        root, "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore")
    if dublincore is not None:
        aip_type = ns.xml_findtext_premis(dublincore,
                                          "dc:type") or ns.xml_findtext_premis(
                                              dublincore, "dcterms:type")
        if aip_type == "Archival Information Collection":
            aic_identifier = ns.xml_findtext_premis(
                dublincore, "dc:identifier") or ns.xml_findtext_premis(
                    dublincore, "dcterms:identifier")
        is_part_of = ns.xml_findtext_premis(dublincore, "dcterms:isPartOf")

    # Convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = _rename_dict_keys_with_child_dicts(
        _normalize_dict_values(xmltodict.parse(xml)))

    # Pull the create time from the METS header.
    # Old METS did not use `metsHdr`.
    created = time.time()
    mets_hdr = ns.xml_find_premis(root, "mets:metsHdr")
    if mets_hdr is not None:
        mets_created_attr = mets_hdr.get("CREATEDATE")
        if mets_created_attr:
            try:
                created = calendar.timegm(
                    time.strptime(mets_created_attr, "%Y-%m-%dT%H:%M:%S"))
            except ValueError:
                printfn("Failed to parse METS CREATEDATE: %s" %
                        (mets_created_attr))

    aip_data = {
        "uuid": uuid,
        "name": name,
        "filePath": aip_stored_path,
        "size": aip_size / (1024 * 1024),
        "mets": mets_data,
        "origin": get_dashboard_uuid(),
        "created": created,
        "AICID": aic_identifier,
        "isPartOf": is_part_of,
        "countAIPsinAIC": aips_in_aic,
        "identifiers": identifiers,
        "transferMetadata": _extract_transfer_metadata(root),
        "encrypted": encrypted,
    }
    _wait_for_cluster_yellow_status(client)
    _try_to_index(client, aip_data, "aips", printfn=printfn)
    printfn("Done.")
    printfn("Indexing AIP files ...")
    files_indexed = _index_aip_files(
        client=client,
        uuid=uuid,
        mets_path=mets_staging_path,
        name=name,
        identifiers=identifiers,
        printfn=printfn,
    )
    printfn("Files indexed: " + str(files_indexed))
    return 0
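The CREATEDATE handling above treats the naive metsHdr timestamp as UTC and falls back to the current time when parsing fails. A standalone sketch of the same conversion; the sample value is arbitrary.

import calendar
import time

mets_created_attr = "2020-01-01T12:00:00"  # hypothetical metsHdr/@CREATEDATE

try:
    created = calendar.timegm(
        time.strptime(mets_created_attr, "%Y-%m-%dT%H:%M:%S"))
except ValueError:
    created = time.time()  # same fallback as index_aip_and_files

print(created)  # 1577880000 for the sample value above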
Example #8
def update_rights(job, mets, sip_uuid, state):
    """
    Add rightsMDs for updated PREMIS Rights.
    """
    # Get original files to add rights to
    original_files = [f for f in mets.all_files() if f.use == "original"]

    # Check for deleted rights - exist in METS but not in DB
    # Cache rightsbasis in DB
    rightsmds_db = {}  # memoize
    for rightsbasis in models.RightsStatement.RIGHTS_BASIS_CHOICES:
        # ORIGINAL RightsStatements are unrelated to the old one.
        rightsmds_db[rightsbasis[0]] = models.RightsStatement.objects.filter(
            metadataappliestoidentifier=sip_uuid,
            metadataappliestotype_id=createmets2.SIPMetadataAppliesToType,
            rightsbasis=rightsbasis[0],
        ).exclude(status=models.METADATA_STATUS_ORIGINAL)

    for fsentry in original_files:
        rightsmds = [
            s for s in fsentry.amdsecs[0].subsections if s.subsection == "rightsMD"
        ]
        for r in rightsmds:
            # Don't follow MDRef pointers (see #1083 for more details).
            if isinstance(r.contents, metsrw.metadata.MDRef):
                continue
            if r.status == "superseded":
                continue
            rightsbasis = ns.xml_find_premis(
                r.contents.document, ".//premis:rightsBasis"
            )
            if rightsbasis is None:
                continue
            basis = rightsbasis.text
            if basis == "Other":
                otherrightsbasis = ns.xml_find_premis(
                    r.contents.document, ".//premis:otherRightsBasis"
                )
                if otherrightsbasis is not None:
                    basis = otherrightsbasis.text
            db_rights = rightsmds_db[basis]
            if (
                not db_rights
            ):  # TODO this may need to be more robust for RightsStatementRightsGranted
                job.pyprint("Rights", r.id_string, "looks deleted - making superseded")
                r.status = "superseded"

    # Check for newly added rights
    rights_list = models.RightsStatement.objects.filter(
        metadataappliestoidentifier=sip_uuid,
        metadataappliestotype_id=createmets2.SIPMetadataAppliesToType,
        status=models.METADATA_STATUS_ORIGINAL,
    )
    if not rights_list:
        job.pyprint("No new rights added")
    else:
        add_rights_elements(job, rights_list, original_files, state)

    # Check for updated rights
    rights_list = models.RightsStatement.objects.filter(
        metadataappliestoidentifier=sip_uuid,
        metadataappliestotype_id=createmets2.SIPMetadataAppliesToType,
        status=models.METADATA_STATUS_UPDATED,
    )
    if not rights_list:
        job.pyprint("No updated rights found")
    else:
        add_rights_elements(job, rights_list, original_files, state, updated=True)

    return mets
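A self-contained sketch of the rightsBasis / otherRightsBasis fallback used in update_rights, run against a hand-written PREMIS fragment rather than a metsrw document; the namespace URI and sample XML are assumptions for illustration only.

from lxml import etree

NSMAP = {"premis": "info:lc/xmlns/premis-v2"}  # assumed PREMIS v2 namespace

rights_xml = b"""
<premis:rightsStatement xmlns:premis="info:lc/xmlns/premis-v2">
  <premis:rightsBasis>Other</premis:rightsBasis>
  <premis:otherRightsInformation>
    <premis:otherRightsBasis>Policy</premis:otherRightsBasis>
  </premis:otherRightsInformation>
</premis:rightsStatement>
"""

doc = etree.fromstring(rights_xml)
basis = doc.findtext(".//premis:rightsBasis", namespaces=NSMAP)
if basis == "Other":
    other = doc.findtext(".//premis:otherRightsBasis", namespaces=NSMAP)
    if other is not None:
        basis = other
print(basis)  # -> "Policy"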