def _extract_transfer_metadata(doc):
    """Return a list of dicts, one per ``<transfer_metadata>`` element found
    in the METS amdSec/sourceMD sections of *doc*.
    """
    xpath = "mets:amdSec/mets:sourceMD/mets:mdWrap/mets:xmlData/transfer_metadata"
    results = []
    for element in doc.findall(xpath, namespaces=ns.NSMAP):
        serialized = ElementTree.tostring(element)
        results.append(xmltodict.parse(serialized)['transfer_metadata'])
    return results
def _extract_transfer_metadata(doc):
    """Parse every ``<transfer_metadata>`` element of *doc* into a dict."""
    elements = ns.xml_findall_premis(
        doc, "mets:amdSec/mets:sourceMD/mets:mdWrap/mets:xmlData/transfer_metadata"
    )
    parsed = []
    for element in elements:
        as_dict = xmltodict.parse(ElementTree.tostring(element))
        parsed.append(as_dict["transfer_metadata"])
    return parsed
def index_aip(client, uuid, name, filePath, pathToMETS, size=None, aips_in_aic=None, identifiers=None):
    """Index AIP-level data from the METS at `pathToMETS` into the 'aips' index.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP.
    :param name: AIP name.
    :param filePath: path on disk where the AIP is located.
    :param pathToMETS: path on disk where the AIP's METS file is located.
    :param size: optional AIP size in bytes; read from `filePath` if omitted.
    :param aips_in_aic: optional number of AIPs stored in AIC.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    """
    # Avoid the shared mutable default argument pitfall.
    if identifiers is None:
        identifiers = []
    tree = ElementTree.parse(pathToMETS)
    # TODO add a conditional to toggle this
    remove_tool_output_from_mets(tree)
    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    if dublincore is not None:
        # Both unqualified DC and DCTERMS element names may appear.
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        is_part_of = dublincore.findtext('dcterms:isPartOf', namespaces=ns.NSMAP)
    # convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = rename_dict_keys_with_child_dicts(
        normalize_dict_values(xmltodict.parse(xml)))
    aipData = {
        'uuid': uuid,
        'name': name,
        'filePath': filePath,
        # Size is indexed in megabytes.
        'size': (size or os.path.getsize(filePath)) / 1024 / 1024,
        'mets': mets_data,
        'origin': get_dashboard_uuid(),
        'created': os.path.getmtime(pathToMETS),
        'AICID': aic_identifier,
        'isPartOf': is_part_of,
        'countAIPsinAIC': aips_in_aic,
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
    }
    wait_for_cluster_yellow_status(client)
    try_to_index(client, aipData, 'aips', 'aip')
def index_aip(client, uuid, name, filePath, pathToMETS, size=None, aips_in_aic=None, identifiers=None, encrypted=False):
    """Index AIP-level data from the METS at `pathToMETS` into the 'aips' index.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP.
    :param name: AIP name.
    :param filePath: path on disk where the AIP is located.
    :param pathToMETS: path on disk where the AIP's METS file is located.
    :param size: optional AIP size in bytes; read from `filePath` if omitted.
    :param aips_in_aic: optional number of AIPs stored in AIC.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :param encrypted: optional AIP encrypted boolean (defaults to `False`).
    """
    # Avoid the shared mutable default argument pitfall.
    if identifiers is None:
        identifiers = []
    tree = ElementTree.parse(pathToMETS)
    # TODO add a conditional to toggle this
    remove_tool_output_from_mets(tree)
    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    if dublincore is not None:
        # Both unqualified DC and DCTERMS element names may appear.
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        is_part_of = dublincore.findtext('dcterms:isPartOf', namespaces=ns.NSMAP)
    # convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = rename_dict_keys_with_child_dicts(
        normalize_dict_values(xmltodict.parse(xml)))
    # Pull the create time from the METS header.
    # Old METS may not include a metsHdr; fall back to the current time.
    created = time.time()
    mets_hdr = root.find("mets:metsHdr", namespaces=ns.NSMAP)
    if mets_hdr is not None:
        mets_created_attr = mets_hdr.get('CREATEDATE')
        if mets_created_attr:
            try:
                created = calendar.timegm(
                    time.strptime(mets_created_attr, '%Y-%m-%dT%H:%M:%S'))
            except ValueError:
                print("Failed to parse METS CREATEDATE: %s" % (mets_created_attr))
    aipData = {
        'uuid': uuid,
        'name': name,
        'filePath': filePath,
        # Size is indexed in megabytes.
        'size': (size or os.path.getsize(filePath)) / 1024 / 1024,
        'mets': mets_data,
        'origin': get_dashboard_uuid(),
        'created': created,
        'AICID': aic_identifier,
        'isPartOf': is_part_of,
        'countAIPsinAIC': aips_in_aic,
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
        'encrypted': encrypted
    }
    wait_for_cluster_yellow_status(client)
    try_to_index(client, aipData, 'aips', 'aip')
def index_mets_file_metadata(client, uuid, metsFilePath, index, type_, sipName, identifiers=None):
    """Index one document per file described in the METS at `metsFilePath`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP the files belong to.
    :param metsFilePath: path on disk where the METS file is located.
    :param index: name of the Elasticsearch index to write to.
    :param type_: Elasticsearch document type.
    :param sipName: SIP name.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :return: number of files indexed.
    """
    # Avoid the shared mutable default argument pitfall.
    if identifiers is None:
        identifiers = []
    # parse XML
    tree = ElementTree.parse(metsFilePath)
    root = tree.getroot()
    # TODO add a conditional to toggle this
    remove_tool_output_from_mets(tree)
    # get SIP-wide dmdSec
    dmdSec = root.findall("mets:dmdSec/mets:mdWrap/mets:xmlData", namespaces=ns.NSMAP)
    dmdSecData = {}
    for item in dmdSec:
        # NOTE(review): only the last dmdSec survives this loop; if several
        # exist, earlier ones are silently discarded — confirm intended.
        xml = ElementTree.tostring(item)
        dmdSecData = xmltodict.parse(xml)
    # Extract isPartOf (for AIPs) or identifier (for AICs) from DublinCore
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    aic_identifier = None
    is_part_of = None
    if dublincore is not None:
        # Both unqualified DC and DCTERMS element names may appear.
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        elif aip_type == "Archival Information Package":
            is_part_of = dublincore.findtext('dcterms:isPartOf', namespaces=ns.NSMAP)
    # establish structure to be indexed for each file item
    fileData = {
        'archivematicaVersion': version.get_version(),
        'AIPUUID': uuid,
        'sipName': sipName,
        'FILEUUID': '',
        'indexedAt': time.time(),
        'filePath': '',
        'fileExtension': '',
        'isPartOf': is_part_of,
        'AICID': aic_identifier,
        'METS': {
            'dmdSec': rename_dict_keys_with_child_dicts(
                normalize_dict_values(dmdSecData)),
            'amdSec': {},
        },
        'origin': get_dashboard_uuid(),
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
    }
    # Index all files in a fileGrp with USE='original' or USE='metadata'
    original_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='original']/mets:file",
        namespaces=ns.NSMAP)
    metadata_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='metadata']/mets:file",
        namespaces=ns.NSMAP)
    files = original_files + metadata_files
    # Index AIC METS file if it exists
    for file_ in files:
        # Shallow copy: top-level keys are independent, but nested dicts
        # (e.g. 'METS') are shared with fileData — see reset at loop end.
        indexData = fileData.copy()
        # Get file UUID. If an ADMID exists, look in the amdSec for the UUID,
        # otherwise parse it out of the file ID.
        # 'Original' files have ADMIDs, 'Metadata' files don't
        admID = file_.attrib.get('ADMID', None)
        if admID is None:
            # Parse UUID from file ID
            fileUUID = None
            uuix_regex = r'\w{8}-?\w{4}-?\w{4}-?\w{4}-?\w{12}'
            uuids = re.findall(uuix_regex, file_.attrib['ID'])
            # Multiple UUIDs may be returned - if they are all identical, use that
            # UUID, otherwise use None.
            # To determine all UUIDs are identical, use the size of the set
            if len(set(uuids)) == 1:
                fileUUID = uuids[0]
        else:
            amdSecInfo = root.find("mets:amdSec[@ID='{}']".format(admID), namespaces=ns.NSMAP)
            fileUUID = amdSecInfo.findtext(
                "mets:techMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue",
                namespaces=ns.NSMAP)
            # Index amdSec information
            xml = ElementTree.tostring(amdSecInfo)
            indexData['METS']['amdSec'] = rename_dict_keys_with_child_dicts(
                normalize_dict_values(xmltodict.parse(xml)))
        indexData['FILEUUID'] = fileUUID
        # Get file path from FLocat and extension
        filePath = file_.find(
            'mets:FLocat',
            namespaces=ns.NSMAP).attrib['{http://www.w3.org/1999/xlink}href']
        indexData['filePath'] = filePath
        _, fileExtension = os.path.splitext(filePath)
        if fileExtension:
            indexData['fileExtension'] = fileExtension[1:].lower()
        # index data
        wait_for_cluster_yellow_status(client)
        try_to_index(client, indexData, index, type_)
        # Reset fileData['METS']['amdSec'], since it is updated in the loop
        # above. See http://stackoverflow.com/a/3975388 for explanation
        fileData['METS']['amdSec'] = {}
    print('Indexed AIP files and corresponding METS XML.')
    return len(files)
def _index_aip_files(client, uuid, mets_path, name, identifiers=None, printfn=print):
    """Index AIP files from AIP with UUID `uuid` and METS at path `mets_path`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param mets_path: path on disk where the AIP's METS file is located.
    :param name: AIP name.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :param printfn: optional print function.
    :return: number of files indexed.
    """
    # Avoid the shared mutable default argument pitfall.
    if identifiers is None:
        identifiers = []
    # Parse XML
    tree = ElementTree.parse(mets_path)
    root = tree.getroot()
    # TODO: Add a conditional to toggle this
    _remove_tool_output_from_mets(tree)
    # Extract isPartOf (for AIPs) or identifier (for AICs) from DublinCore
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    aic_identifier = None
    is_part_of = None
    if dublincore is not None:
        # Both unqualified DC and DCTERMS element names may appear.
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        elif aip_type == "Archival Information Package":
            is_part_of = dublincore.findtext('dcterms:isPartOf', namespaces=ns.NSMAP)
    # Establish structure to be indexed for each file item
    fileData = {
        'archivematicaVersion': version.get_version(),
        'AIPUUID': uuid,
        'sipName': name,
        'FILEUUID': '',
        'indexedAt': time.time(),
        'filePath': '',
        'fileExtension': '',
        'isPartOf': is_part_of,
        'AICID': aic_identifier,
        'METS': {
            'dmdSec': {},
            'amdSec': {},
        },
        'origin': get_dashboard_uuid(),
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
    }
    # Index all files in a fileGrp with USE='original' or USE='metadata'
    original_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='original']/mets:file",
        namespaces=ns.NSMAP)
    metadata_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='metadata']/mets:file",
        namespaces=ns.NSMAP)
    files = original_files + metadata_files
    # Index AIC METS file if it exists
    for file_ in files:
        # Shallow copy: top-level keys are independent, but nested dicts
        # (e.g. 'METS') are shared with fileData — see reset at loop end.
        indexData = fileData.copy()
        # Get file UUID. If an ADMID exists, look in the amdSec for the UUID,
        # otherwise parse it out of the file ID.
        # 'Original' files have ADMIDs, 'Metadata' files don't
        admID = file_.attrib.get('ADMID', None)
        if admID is None:
            # Parse UUID from file ID
            fileUUID = None
            uuix_regex = r'\w{8}-?\w{4}-?\w{4}-?\w{4}-?\w{12}'
            uuids = re.findall(uuix_regex, file_.attrib['ID'])
            # Multiple UUIDs may be returned - if they are all identical, use that
            # UUID, otherwise use None.
            # To determine all UUIDs are identical, use the size of the set
            if len(set(uuids)) == 1:
                fileUUID = uuids[0]
        else:
            amdSecInfo = root.find("mets:amdSec[@ID='{}']".format(admID), namespaces=ns.NSMAP)
            fileUUID = amdSecInfo.findtext(
                "mets:techMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue",
                namespaces=ns.NSMAP)
            # Index amdSec information
            xml = ElementTree.tostring(amdSecInfo)
            indexData['METS']['amdSec'] = _rename_dict_keys_with_child_dicts(
                _normalize_dict_values(xmltodict.parse(xml)))
        # Get the parent division for the file pointer
        # by searching the physical structural map section (structMap)
        file_id = file_.attrib.get('ID', None)
        file_pointer_division = root.find(
            "mets:structMap[@TYPE='physical']//mets:fptr[@FILEID='{}']/..".
            format(file_id),
            namespaces=ns.NSMAP)
        if file_pointer_division is not None:
            # If the parent division has a DMDID attribute then index
            # its data from the descriptive metadata section (dmdSec)
            dmd_section_id = file_pointer_division.attrib.get('DMDID', None)
            if dmd_section_id is not None:
                # dmd_section_id can contain one id (e.g., "dmdSec_2")
                # or more than one (e.g., "dmdSec_2 dmdSec_3",
                # when a file has both DC and non-DC metadata).
                # Attempt to index only the DC dmdSec if available
                for dmd_section_id_item in dmd_section_id.split():
                    dmd_section_info = root.find(
                        "mets:dmdSec[@ID='{}']/mets:mdWrap[@MDTYPE='DC']/mets:xmlData"
                        .format(dmd_section_id_item),
                        namespaces=ns.NSMAP)
                    if dmd_section_info is not None:
                        xml = ElementTree.tostring(dmd_section_info)
                        data = _rename_dict_keys_with_child_dicts(
                            _normalize_dict_values(xmltodict.parse(xml)))
                        indexData["METS"]["dmdSec"] = data
                        break
        indexData['FILEUUID'] = fileUUID
        # Get file path from FLocat and extension
        filePath = file_.find(
            'mets:FLocat',
            namespaces=ns.NSMAP).attrib['{http://www.w3.org/1999/xlink}href']
        indexData['filePath'] = filePath
        _, fileExtension = os.path.splitext(filePath)
        if fileExtension:
            indexData['fileExtension'] = fileExtension[1:].lower()
        # Index data
        _wait_for_cluster_yellow_status(client)
        _try_to_index(client, indexData, 'aipfiles', printfn=printfn)
        # Reset fileData['METS']['amdSec'] and fileData['METS']['dmdSec'],
        # since they are updated in the loop above.
        # See http://stackoverflow.com/a/3975388 for explanation
        fileData['METS']['amdSec'] = {}
        fileData['METS']['dmdSec'] = {}
    return len(files)
def index_aip_and_files(client, uuid, aip_stored_path, mets_staging_path, name,
                        aip_size, aips_in_aic=None, identifiers=None,
                        encrypted=False, printfn=print):
    """Index AIP and AIP files with UUID `uuid` at path `path`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param aip_stored_path: path on disk where the AIP is located.
    :param mets_staging_path: path on disk where the AIP METS file is located.
    :param name: AIP name.
    :param aip_size: AIP size.
    :param aips_in_aic: optional number of AIPs stored in AIC.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :param encrypted: optional AIP encrypted boolean (defaults to `False`).
    :param printfn: optional print function.
    :return: 0 if succeeded, 1 otherwise.
    """
    # Avoid the shared mutable default argument pitfall.
    if identifiers is None:
        identifiers = []
    # Stop if the METS file does not exist.
    error_message = None
    if not os.path.exists(mets_staging_path):
        error_message = 'METS file does not exist at: ' + mets_staging_path
    if error_message:
        logger.error(error_message)
        printfn(error_message, file=sys.stderr)
        return 1
    printfn('AIP UUID: ' + uuid)
    printfn('Indexing AIP ...')
    tree = ElementTree.parse(mets_staging_path)
    _remove_tool_output_from_mets(tree)
    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    if dublincore is not None:
        # Both unqualified DC and DCTERMS element names may appear.
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == 'Archival Information Collection':
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        is_part_of = dublincore.findtext('dcterms:isPartOf', namespaces=ns.NSMAP)
    # Convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = _rename_dict_keys_with_child_dicts(
        _normalize_dict_values(xmltodict.parse(xml)))
    # Pull the create time from the METS header.
    # Old METS may not include a metsHdr; fall back to the current time.
    created = time.time()
    mets_hdr = root.find('mets:metsHdr', namespaces=ns.NSMAP)
    if mets_hdr is not None:
        mets_created_attr = mets_hdr.get('CREATEDATE')
        if mets_created_attr:
            try:
                created = calendar.timegm(
                    time.strptime(mets_created_attr, '%Y-%m-%dT%H:%M:%S'))
            except ValueError:
                printfn('Failed to parse METS CREATEDATE: %s' % (mets_created_attr))
    aip_data = {
        'uuid': uuid,
        'name': name,
        'filePath': aip_stored_path,
        # Size is indexed in megabytes.
        'size': aip_size / (1024 * 1024),
        'mets': mets_data,
        'origin': get_dashboard_uuid(),
        'created': created,
        'AICID': aic_identifier,
        'isPartOf': is_part_of,
        'countAIPsinAIC': aips_in_aic,
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
        'encrypted': encrypted
    }
    _wait_for_cluster_yellow_status(client)
    _try_to_index(client, aip_data, 'aips', printfn=printfn)
    printfn('Done.')
    printfn('Indexing AIP files ...')
    files_indexed = _index_aip_files(
        client=client,
        uuid=uuid,
        mets_path=mets_staging_path,
        name=name,
        identifiers=identifiers,
        printfn=printfn,
    )
    printfn('Files indexed: ' + str(files_indexed))
    return 0
def index_aip_and_files(
    client,
    uuid,
    aip_stored_path,
    mets_staging_path,
    name,
    aip_size,
    aips_in_aic=None,
    identifiers=None,
    encrypted=False,
    printfn=print,
):
    """Index AIP and AIP files with UUID `uuid` at path `path`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param aip_stored_path: path on disk where the AIP is located.
    :param mets_staging_path: path on disk where the AIP METS file is located.
    :param name: AIP name.
    :param aip_size: AIP size.
    :param aips_in_aic: optional number of AIPs stored in AIC.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :param encrypted: optional AIP encrypted boolean (defaults to `False`).
    :param printfn: optional print function.
    :return: 0 if succeeded, 1 otherwise.
    """
    # Avoid the shared mutable default argument pitfall.
    if identifiers is None:
        identifiers = []
    # Stop if METS file is not at staging path.
    error_message = None
    if not os.path.exists(mets_staging_path):
        error_message = "METS file does not exist at: " + mets_staging_path
    if error_message:
        logger.error(error_message)
        printfn(error_message, file=sys.stderr)
        return 1
    printfn("AIP UUID: " + uuid)
    printfn("Indexing AIP ...")
    tree = ElementTree.parse(mets_staging_path)
    _remove_tool_output_from_mets(tree)
    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = ns.xml_find_premis(
        root, "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore")
    if dublincore is not None:
        # Both unqualified DC and DCTERMS element names may appear.
        aip_type = ns.xml_findtext_premis(dublincore, "dc:type") or ns.xml_findtext_premis(
            dublincore, "dcterms:type")
        if aip_type == "Archival Information Collection":
            aic_identifier = ns.xml_findtext_premis(
                dublincore, "dc:identifier") or ns.xml_findtext_premis(
                    dublincore, "dcterms:identifier")
        is_part_of = ns.xml_findtext_premis(dublincore, "dcterms:isPartOf")
    # Convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = _rename_dict_keys_with_child_dicts(
        _normalize_dict_values(xmltodict.parse(xml)))
    # Pull the create time from the METS header.
    # Old METS did not use `metsHdr`; fall back to the current time.
    created = time.time()
    mets_hdr = ns.xml_find_premis(root, "mets:metsHdr")
    if mets_hdr is not None:
        mets_created_attr = mets_hdr.get("CREATEDATE")
        if mets_created_attr:
            try:
                created = calendar.timegm(
                    time.strptime(mets_created_attr, "%Y-%m-%dT%H:%M:%S"))
            except ValueError:
                printfn("Failed to parse METS CREATEDATE: %s" % (mets_created_attr))
    aip_data = {
        "uuid": uuid,
        "name": name,
        "filePath": aip_stored_path,
        # Size is indexed in megabytes.
        "size": aip_size / (1024 * 1024),
        "mets": mets_data,
        "origin": get_dashboard_uuid(),
        "created": created,
        "AICID": aic_identifier,
        "isPartOf": is_part_of,
        "countAIPsinAIC": aips_in_aic,
        "identifiers": identifiers,
        "transferMetadata": _extract_transfer_metadata(root),
        "encrypted": encrypted,
    }
    _wait_for_cluster_yellow_status(client)
    _try_to_index(client, aip_data, "aips", printfn=printfn)
    printfn("Done.")
    printfn("Indexing AIP files ...")
    files_indexed = _index_aip_files(
        client=client,
        uuid=uuid,
        mets_path=mets_staging_path,
        name=name,
        identifiers=identifiers,
        printfn=printfn,
    )
    printfn("Files indexed: " + str(files_indexed))
    return 0