def filter_packages(
    package_list,
    statuses=("UPLOADED", "DEL_REQ"),
    package_types=("AIP", "AIC", "transfer", "DIP"),
    pipeline_uuid=None,
    filter_replicas=False,
):
    """Filter packages by status, type and origin pipeline.

    :param package_list: List of package info returned by Storage Service
        (list).
    :param statuses: Acceptable statuses for filter (tuple). Defaults to
        filtering out deleted packages.
    :param package_types: Acceptable package types for filter (tuple).
    :param pipeline_uuid: Acceptable pipeline UUID for filter (str). When
        None, the local dashboard UUID is used.
    :param filter_replicas: Option to filter out replicas (bool).
    :returns: Filtered package list.
    """
    if pipeline_uuid is None:
        pipeline_uuid = am.get_dashboard_uuid()
    origin_pipeline = "/api/v2/pipeline/{}/".format(pipeline_uuid)
    # A single comprehension replaces the two near-identical ones the
    # original duplicated; the replica test only applies when requested.
    return [
        package
        for package in package_list
        if package["status"] in statuses
        and package["package_type"] in package_types
        and package["origin_pipeline"] == origin_pipeline
        and (not filter_replicas or package["replicated_package"] is None)
    ]
def add_arguments(self, parser):
    """Entry point to add custom arguments."""
    delete_help = (
        "Delete AIP-related Elasticsearch data for AIPs with matching"
        " UUIDS before indexing AIP data"
    )
    parser.add_argument("-d", "--delete", action="store_true", help=delete_help)
    parser.add_argument(
        "--delete-all",
        action="store_true",
        help="Delete all AIP information in the index before starting.",
    )
    parser.add_argument(
        "-u",
        "--uuid",
        action="store",
        default="",
        help="Specify a single AIP by UUID to process",
    )
    parser.add_argument(
        "--pipeline",
        default=am.get_dashboard_uuid(),
        help="Pipeline UUID to use when filtering packages",
    )
def index_aip(client, uuid, name, filePath, pathToMETS, size=None,
              aips_in_aic=None, identifiers=None):
    """Index a single AIP document into the 'aips' index.

    :param client: ElasticSearch client.
    :param uuid: UUID of the AIP.
    :param name: AIP name.
    :param filePath: path on disk to the stored AIP.
    :param pathToMETS: path on disk to the AIP METS file.
    :param size: optional AIP size in bytes; falls back to the size of
        ``filePath`` when omitted.
    :param aips_in_aic: optional number of AIPs stored in the AIC.
    :param identifiers: optional additional identifiers (MODS, Islandora,
        etc.).
    """
    # Avoid the shared-state pitfall of a mutable default argument.
    if identifiers is None:
        identifiers = []

    tree = ElementTree.parse(pathToMETS)

    # TODO add a conditional to toggle this
    remove_tool_output_from_mets(tree)

    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    if dublincore is not None:
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        is_part_of = dublincore.findtext('dcterms:isPartOf',
                                         namespaces=ns.NSMAP)

    # convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = rename_dict_keys_with_child_dicts(
        normalize_dict_values(xmltodict.parse(xml)))

    aipData = {
        'uuid': uuid,
        'name': name,
        'filePath': filePath,
        # Size in megabytes.
        'size': (size or os.path.getsize(filePath)) / 1024 / 1024,
        'mets': mets_data,
        'origin': get_dashboard_uuid(),
        'created': os.path.getmtime(pathToMETS),
        'AICID': aic_identifier,
        'isPartOf': is_part_of,
        'countAIPsinAIC': aips_in_aic,
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
    }
    wait_for_cluster_yellow_status(client)
    try_to_index(client, aipData, 'aips', 'aip')
def index_aip(client, uuid, name, filePath, pathToMETS, size=None,
              aips_in_aic=None, identifiers=None, encrypted=False):
    """Index a single AIP document into the 'aips' index.

    :param client: ElasticSearch client.
    :param uuid: UUID of the AIP.
    :param name: AIP name.
    :param filePath: path on disk to the stored AIP.
    :param pathToMETS: path on disk to the AIP METS file.
    :param size: optional AIP size in bytes; falls back to the size of
        ``filePath`` when omitted.
    :param aips_in_aic: optional number of AIPs stored in the AIC.
    :param identifiers: optional additional identifiers (MODS, Islandora,
        etc.).
    :param encrypted: optional AIP encrypted boolean (defaults to `False`).
    """
    # Avoid the shared-state pitfall of a mutable default argument.
    if identifiers is None:
        identifiers = []

    tree = ElementTree.parse(pathToMETS)

    # TODO add a conditional to toggle this
    remove_tool_output_from_mets(tree)

    root = tree.getroot()
    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    if dublincore is not None:
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        is_part_of = dublincore.findtext('dcterms:isPartOf',
                                         namespaces=ns.NSMAP)

    # convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = rename_dict_keys_with_child_dicts(
        normalize_dict_values(xmltodict.parse(xml)))

    # Pull the create time from the METS header
    mets_hdr = root.find("mets:metsHdr", namespaces=ns.NSMAP)
    mets_created_attr = mets_hdr.get('CREATEDATE')

    # Fall back to "now" when the header carries no parseable CREATEDATE.
    created = time.time()

    if mets_created_attr:
        try:
            created = calendar.timegm(
                time.strptime(mets_created_attr, '%Y-%m-%dT%H:%M:%S'))
        except ValueError:
            print("Failed to parse METS CREATEDATE: %s" % (mets_created_attr))

    aipData = {
        'uuid': uuid,
        'name': name,
        'filePath': filePath,
        # Size in megabytes.
        'size': (size or os.path.getsize(filePath)) / 1024 / 1024,
        'mets': mets_data,
        'origin': get_dashboard_uuid(),
        'created': created,
        'AICID': aic_identifier,
        'isPartOf': is_part_of,
        'countAIPsinAIC': aips_in_aic,
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
        'encrypted': encrypted
    }
    wait_for_cluster_yellow_status(client)
    try_to_index(client, aipData, 'aips', 'aip')
def add_arguments(self, parser):
    """Entry point to add custom arguments."""
    parser.add_argument(
        "--transfer-backlog-dir", default=self.DEFAULT_TRANSFER_BACKLOG_DIR
    )
    parser.add_argument("--no-prompt", action="store_true")
    parser.add_argument(
        "--from-storage-service",
        action="store_true",
        help="Import packages from Storage Service",
    )
    parser.add_argument(
        "--pipeline",
        default=am.get_dashboard_uuid(),
        help="Pipeline UUID to use when filtering packages from Storage Service",
    )
def write_mets(mets_path, transfer_dir_path, base_path_placeholder, transfer_uuid):
    """Write a METS XML file for a transfer to disk.

    Args:
        mets_path: Output path for METS XML output
        transfer_dir_path: Location of the files on disk
        base_path_placeholder: The placeholder string for the base path,
            e.g. 'transferDirectory'
        transfer_uuid: The UUID for the transfer
    """
    normalized_dir = os.path.normpath(os.path.expanduser(transfer_dir_path))
    db_base_path = r"%{}%".format(base_path_placeholder)

    mets = metsrw.METSDocument()
    mets.objid = str(transfer_uuid)

    # Record the dashboard that produced this METS as a creator agent.
    dashboard_uuid = get_dashboard_uuid()
    if dashboard_uuid:
        mets.agents.append(
            metsrw.Agent(
                "CREATOR",
                type="SOFTWARE",
                name=str(dashboard_uuid),
                notes=["Archivematica dashboard UUID"],
            )
        )

    try:
        transfer = Transfer.objects.get(uuid=transfer_uuid)
    except Transfer.DoesNotExist:
        logger.info("No record in database for transfer: %s", transfer_uuid)
        raise

    if transfer.accessionid:
        mets.alternate_ids.append(
            metsrw.AltRecordID(transfer.accessionid, type="Accession ID")
        )

    # Build the file-entry tree from the on-disk layout plus DB records,
    # then serialize the whole document.
    fsentry_tree = FSEntriesTree(normalized_dir, db_base_path, transfer)
    fsentry_tree.scan()
    mets.append_file(fsentry_tree.root_node)
    mets.write(mets_path, pretty_print=True)
def createMetsHdr(sip_uuid):
    """Build and return a <metsHdr> element stamped with the current UTC
    time, carrying the dashboard UUID as a creator agent and, when the
    transfer has one, the accession number as an alternative record ID."""
    created = getUTCDate().strftime("%Y-%m-%dT%H:%M:%S")
    header = etree.Element(ns.metsBNS + "metsHdr", CREATEDATE=created)

    agent = etree.SubElement(
        header,
        ns.metsBNS + "agent",
        ROLE="CREATOR",
        TYPE="OTHER",
        OTHERTYPE="SOFTWARE",
    )
    etree.SubElement(agent, ns.metsBNS + "name").text = get_dashboard_uuid()
    etree.SubElement(agent, ns.metsBNS + "note").text = "Archivematica dashboard UUID"

    accession_number = getAccessionNumberFromTransfer(sip_uuid)
    if accession_number:
        alt_id = etree.SubElement(
            header, ns.metsBNS + "altRecordID", TYPE="Accession number"
        )
        alt_id.text = accession_number
    return header
def index_transfer_files(client, uuid, pathToTransfer, index, type_, status=''):
    """Indexes files in the Transfer with UUID `uuid` at path `pathToTransfer`.

    Returns the number of files indexed.

    client: ElasticSearch client
    uuid: UUID of the Transfer in the DB
    pathToTransfer: path on disk, including the transfer directory and a
        trailing / but not including objects/
    index, type: index and type in ElasticSearch
    status: optional Transfer status recorded on each document
    """
    files_indexed = 0
    ingest_date = str(datetime.datetime.today())[0:10]

    # Some files should not be indexed
    # This should match the basename of the file
    ignore_files = [
        'processingMCP.xml',
    ]

    # Get accessionId and name from Transfers table using UUID
    try:
        transfer = Transfer.objects.get(uuid=uuid)
        accession_id = transfer.accessionid
        transfer_name = transfer.currentlocation.split('/')[-2]
    except Transfer.DoesNotExist:
        accession_id = transfer_name = ''

    # Get dashboard UUID
    dashboard_uuid = get_dashboard_uuid()

    for filepath in list_files_in_dir(pathToTransfer):
        if os.path.isfile(filepath):
            # Get file UUID
            file_uuid = ''
            modification_date = ''
            relative_path = filepath.replace(pathToTransfer,
                                             '%transferDirectory%')
            try:
                f = File.objects.get(currentlocation=relative_path,
                                     transfer_id=uuid)
                file_uuid = f.uuid
                formats = _get_file_formats(f)
                bulk_extractor_reports = _list_bulk_extractor_reports(
                    pathToTransfer, file_uuid)
                # A File row may legitimately lack a modification time;
                # guard against calling strftime on None.
                if f.modificationtime is not None:
                    modification_date = f.modificationtime.strftime('%Y-%m-%d')
            except File.DoesNotExist:
                file_uuid = ''
                formats = []
                bulk_extractor_reports = []

            # Get file path info
            relative_path = relative_path.replace('%transferDirectory%',
                                                  transfer_name + '/')
            file_extension = os.path.splitext(filepath)[1][1:].lower()
            filename = os.path.basename(filepath)
            # Size in megabytes
            size = os.path.getsize(filepath) / (1024 * 1024)
            create_time = os.stat(filepath).st_ctime

            if filename not in ignore_files:
                print('Indexing {} (UUID: {})'.format(relative_path, file_uuid))

                # TODO Index Backlog Location UUID?
                indexData = {
                    'filename': filename,
                    'relative_path': relative_path,
                    'fileuuid': file_uuid,
                    'sipuuid': uuid,
                    'accessionid': accession_id,
                    'status': status,
                    'origin': dashboard_uuid,
                    'ingestdate': ingest_date,
                    'created': create_time,
                    'modification_date': modification_date,
                    'size': size,
                    'tags': [],
                    'file_extension': file_extension,
                    'bulk_extractor_reports': bulk_extractor_reports,
                    'format': formats,
                }

                wait_for_cluster_yellow_status(client)
                try_to_index(client, indexData, index, type_)

                files_indexed = files_indexed + 1
            else:
                print('Skipping indexing {}'.format(relative_path))

    if files_indexed > 0:
        client.indices.refresh()

    return files_indexed
def index_mets_file_metadata(client, uuid, metsFilePath, index, type_,
                             sipName, identifiers=None):
    """Index each original/metadata file described in an AIP METS file.

    :param client: ElasticSearch client.
    :param uuid: UUID of the AIP.
    :param metsFilePath: path on disk to the AIP METS file.
    :param index: index name in ElasticSearch.
    :param type_: document type in ElasticSearch.
    :param sipName: AIP name.
    :param identifiers: optional additional identifiers (MODS, Islandora,
        etc.).
    :returns: number of files indexed.
    """
    # Avoid the shared-state pitfall of a mutable default argument.
    if identifiers is None:
        identifiers = []

    # parse XML
    tree = ElementTree.parse(metsFilePath)
    root = tree.getroot()

    # TODO add a conditional to toggle this
    remove_tool_output_from_mets(tree)

    # get SIP-wide dmdSec
    dmdSec = root.findall("mets:dmdSec/mets:mdWrap/mets:xmlData",
                          namespaces=ns.NSMAP)
    dmdSecData = {}
    for item in dmdSec:
        xml = ElementTree.tostring(item)
        dmdSecData = xmltodict.parse(xml)

    # Extract isPartOf (for AIPs) or identifier (for AICs) from DublinCore
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    aic_identifier = None
    is_part_of = None
    if dublincore is not None:
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        elif aip_type == "Archival Information Package":
            is_part_of = dublincore.findtext('dcterms:isPartOf',
                                             namespaces=ns.NSMAP)

    # establish structure to be indexed for each file item
    fileData = {
        'archivematicaVersion': version.get_version(),
        'AIPUUID': uuid,
        'sipName': sipName,
        'FILEUUID': '',
        'indexedAt': time.time(),
        'filePath': '',
        'fileExtension': '',
        'isPartOf': is_part_of,
        'AICID': aic_identifier,
        'METS': {
            'dmdSec': rename_dict_keys_with_child_dicts(
                normalize_dict_values(dmdSecData)),
            'amdSec': {},
        },
        'origin': get_dashboard_uuid(),
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
    }

    # Index all files in a fileGrp with USE='original' or USE='metadata'
    original_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='original']/mets:file",
        namespaces=ns.NSMAP)
    metadata_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='metadata']/mets:file",
        namespaces=ns.NSMAP)
    files = original_files + metadata_files

    # Index AIC METS file if it exists
    for file_ in files:
        indexData = fileData.copy()  # Deep copy of dict, not of dict contents

        # Get file UUID. If an ADMID exists, look in the amdSec for the UUID,
        # otherwise parse it out of the file ID.
        # 'Original' files have ADMIDs, 'Metadata' files don't
        admID = file_.attrib.get('ADMID', None)
        if admID is None:
            # Parse UUID from file ID
            fileUUID = None
            uuix_regex = r'\w{8}-?\w{4}-?\w{4}-?\w{4}-?\w{12}'
            uuids = re.findall(uuix_regex, file_.attrib['ID'])
            # Multiple UUIDs may be returned - if they are all identical,
            # use that UUID, otherwise use None.
            # To determine all UUIDs are identical, use the size of the set
            if len(set(uuids)) == 1:
                fileUUID = uuids[0]
        else:
            amdSecInfo = root.find("mets:amdSec[@ID='{}']".format(admID),
                                   namespaces=ns.NSMAP)
            fileUUID = amdSecInfo.findtext(
                "mets:techMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue",
                namespaces=ns.NSMAP)

            # Index amdSec information
            xml = ElementTree.tostring(amdSecInfo)
            indexData['METS']['amdSec'] = rename_dict_keys_with_child_dicts(
                normalize_dict_values(xmltodict.parse(xml)))

        indexData['FILEUUID'] = fileUUID

        # Get file path from FLocat and extension
        filePath = file_.find(
            'mets:FLocat',
            namespaces=ns.NSMAP).attrib['{http://www.w3.org/1999/xlink}href']
        indexData['filePath'] = filePath

        _, fileExtension = os.path.splitext(filePath)
        if fileExtension:
            indexData['fileExtension'] = fileExtension[1:].lower()

        # index data
        wait_for_cluster_yellow_status(client)
        try_to_index(client, indexData, index, type_)

        # Reset fileData['METS']['amdSec'], since it is updated in the loop
        # above. See http://stackoverflow.com/a/3975388 for explanation
        fileData['METS']['amdSec'] = {}

    print('Indexed AIP files and corresponding METS XML.')
    return len(files)
def _index_aip_files(client, uuid, mets_path, name, identifiers=None,
                     printfn=print):
    """Index AIP files from AIP with UUID `uuid` and METS at path `mets_path`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param mets_path: path on disk where the AIP's METS file is located.
    :param name: AIP name.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :param printfn: optional print funtion.
    :return: number of files indexed.
    """
    # Avoid the shared-state pitfall of a mutable default argument.
    if identifiers is None:
        identifiers = []

    # Parse XML
    tree = ElementTree.parse(mets_path)
    root = tree.getroot()

    # TODO: Add a conditional to toggle this
    _remove_tool_output_from_mets(tree)

    # Extract isPartOf (for AIPs) or identifier (for AICs) from DublinCore
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    aic_identifier = None
    is_part_of = None
    if dublincore is not None:
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == "Archival Information Collection":
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        elif aip_type == "Archival Information Package":
            is_part_of = dublincore.findtext('dcterms:isPartOf',
                                             namespaces=ns.NSMAP)

    # Establish structure to be indexed for each file item
    fileData = {
        'archivematicaVersion': version.get_version(),
        'AIPUUID': uuid,
        'sipName': name,
        'FILEUUID': '',
        'indexedAt': time.time(),
        'filePath': '',
        'fileExtension': '',
        'isPartOf': is_part_of,
        'AICID': aic_identifier,
        'METS': {
            'dmdSec': {},
            'amdSec': {},
        },
        'origin': get_dashboard_uuid(),
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
    }

    # Index all files in a fileGrp with USE='original' or USE='metadata'
    original_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='original']/mets:file",
        namespaces=ns.NSMAP)
    metadata_files = root.findall(
        "mets:fileSec/mets:fileGrp[@USE='metadata']/mets:file",
        namespaces=ns.NSMAP)
    files = original_files + metadata_files

    # Index AIC METS file if it exists
    for file_ in files:
        indexData = fileData.copy()  # Deep copy of dict, not of dict contents

        # Get file UUID. If an ADMID exists, look in the amdSec for the UUID,
        # otherwise parse it out of the file ID.
        # 'Original' files have ADMIDs, 'Metadata' files don't
        admID = file_.attrib.get('ADMID', None)
        if admID is None:
            # Parse UUID from file ID
            fileUUID = None
            uuix_regex = r'\w{8}-?\w{4}-?\w{4}-?\w{4}-?\w{12}'
            uuids = re.findall(uuix_regex, file_.attrib['ID'])
            # Multiple UUIDs may be returned - if they are all identical,
            # use that UUID, otherwise use None.
            # To determine all UUIDs are identical, use the size of the set
            if len(set(uuids)) == 1:
                fileUUID = uuids[0]
        else:
            amdSecInfo = root.find("mets:amdSec[@ID='{}']".format(admID),
                                   namespaces=ns.NSMAP)
            fileUUID = amdSecInfo.findtext(
                "mets:techMD/mets:mdWrap/mets:xmlData/premis:object/premis:objectIdentifier/premis:objectIdentifierValue",
                namespaces=ns.NSMAP)

            # Index amdSec information
            xml = ElementTree.tostring(amdSecInfo)
            indexData['METS']['amdSec'] = _rename_dict_keys_with_child_dicts(
                _normalize_dict_values(xmltodict.parse(xml)))

        # Get the parent division for the file pointer
        # by searching the physical structural map section (structMap)
        file_id = file_.attrib.get('ID', None)
        file_pointer_division = root.find(
            "mets:structMap[@TYPE='physical']//mets:fptr[@FILEID='{}']/..".
            format(file_id),
            namespaces=ns.NSMAP)
        if file_pointer_division is not None:
            # If the parent division has a DMDID attribute then index
            # its data from the descriptive metadata section (dmdSec)
            dmd_section_id = file_pointer_division.attrib.get('DMDID', None)
            if dmd_section_id is not None:
                # dmd_section_id can contain one id (e.g., "dmdSec_2")
                # or more than one (e.g., "dmdSec_2 dmdSec_3",
                # when a file has both DC and non-DC metadata).
                # Attempt to index only the DC dmdSec if available
                for dmd_section_id_item in dmd_section_id.split():
                    dmd_section_info = root.find(
                        "mets:dmdSec[@ID='{}']/mets:mdWrap[@MDTYPE='DC']/mets:xmlData"
                        .format(dmd_section_id_item),
                        namespaces=ns.NSMAP)
                    if dmd_section_info is not None:
                        xml = ElementTree.tostring(dmd_section_info)
                        data = _rename_dict_keys_with_child_dicts(
                            _normalize_dict_values(xmltodict.parse(xml)))
                        indexData["METS"]["dmdSec"] = data
                        break

        indexData['FILEUUID'] = fileUUID

        # Get file path from FLocat and extension
        filePath = file_.find(
            'mets:FLocat',
            namespaces=ns.NSMAP).attrib['{http://www.w3.org/1999/xlink}href']
        indexData['filePath'] = filePath

        _, fileExtension = os.path.splitext(filePath)
        if fileExtension:
            indexData['fileExtension'] = fileExtension[1:].lower()

        # Index data
        _wait_for_cluster_yellow_status(client)
        _try_to_index(client, indexData, 'aipfiles', printfn=printfn)

        # Reset fileData['METS']['amdSec'] and fileData['METS']['dmdSec'],
        # since they are updated in the loop above.
        # See http://stackoverflow.com/a/3975388 for explanation
        fileData['METS']['amdSec'] = {}
        fileData['METS']['dmdSec'] = {}

    return len(files)
def index_aip_and_files(client, uuid, aip_stored_path, mets_staging_path,
                        name, aip_size, aips_in_aic=None, identifiers=None,
                        encrypted=False, printfn=print):
    """Index AIP and AIP files with UUID `uuid` at path `path`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param aip_stored_path: path on disk where the AIP is located.
    :param mets_staging_path: path on disk where the AIP METS file is located.
    :param name: AIP name.
    :param aip_size: AIP size.
    :param aips_in_aic: optional number of AIPs stored in AIC.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :param encrypted: optional AIP encrypted boolean (defaults to `False`).
    :param printfn: optional print funtion.
    :return: 0 if succeeded, 1 otherwise.
    """
    # Avoid the shared-state pitfall of a mutable default argument.
    if identifiers is None:
        identifiers = []

    # Stop if the METS file does not exist.
    error_message = None
    if not os.path.exists(mets_staging_path):
        error_message = 'METS file does not exist at: ' + mets_staging_path
    if error_message:
        logger.error(error_message)
        printfn(error_message, file=sys.stderr)
        return 1

    printfn('AIP UUID: ' + uuid)
    printfn('Indexing AIP ...')

    tree = ElementTree.parse(mets_staging_path)
    _remove_tool_output_from_mets(tree)
    root = tree.getroot()

    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = root.find(
        'mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore',
        namespaces=ns.NSMAP)
    if dublincore is not None:
        aip_type = dublincore.findtext(
            'dc:type', namespaces=ns.NSMAP) or dublincore.findtext(
                'dcterms:type', namespaces=ns.NSMAP)
        if aip_type == 'Archival Information Collection':
            aic_identifier = dublincore.findtext(
                'dc:identifier', namespaces=ns.NSMAP) or dublincore.findtext(
                    'dcterms:identifier', namespaces=ns.NSMAP)
        is_part_of = dublincore.findtext('dcterms:isPartOf',
                                         namespaces=ns.NSMAP)

    # Convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = _rename_dict_keys_with_child_dicts(
        _normalize_dict_values(xmltodict.parse(xml)))

    # Pull the create time from the METS header
    mets_hdr = root.find('mets:metsHdr', namespaces=ns.NSMAP)
    mets_created_attr = mets_hdr.get('CREATEDATE')

    # Fall back to "now" when the header carries no parseable CREATEDATE.
    created = time.time()

    if mets_created_attr:
        try:
            created = calendar.timegm(
                time.strptime(mets_created_attr, '%Y-%m-%dT%H:%M:%S'))
        except ValueError:
            printfn('Failed to parse METS CREATEDATE: %s' % (mets_created_attr))

    aip_data = {
        'uuid': uuid,
        'name': name,
        'filePath': aip_stored_path,
        # Size in megabytes.
        'size': aip_size / (1024 * 1024),
        'mets': mets_data,
        'origin': get_dashboard_uuid(),
        'created': created,
        'AICID': aic_identifier,
        'isPartOf': is_part_of,
        'countAIPsinAIC': aips_in_aic,
        'identifiers': identifiers,
        'transferMetadata': _extract_transfer_metadata(root),
        'encrypted': encrypted
    }
    _wait_for_cluster_yellow_status(client)
    _try_to_index(client, aip_data, 'aips', printfn=printfn)
    printfn('Done.')

    printfn('Indexing AIP files ...')
    files_indexed = _index_aip_files(
        client=client,
        uuid=uuid,
        mets_path=mets_staging_path,
        name=name,
        identifiers=identifiers,
        printfn=printfn,
    )
    printfn('Files indexed: ' + str(files_indexed))

    return 0
def _index_transfer_files(client, uuid, path, ingest_date, status="",
                          printfn=print):
    """Indexes files in the Transfer with UUID `uuid` at path `path`.

    :param client: ElasticSearch client.
    :param uuid: UUID of the Transfer in the DB.
    :param path: path on disk, including the transfer directory and a
                 trailing / but not including objects/.
    :param ingest_date: ingest date string recorded on each document.
    :param status: optional Transfer status.
    :param printfn: optional print funtion.
    :return: number of files indexed.
    """
    files_indexed = 0

    # Some files should not be indexed.
    # This should match the basename of the file.
    ignore_files = ["processingMCP.xml"]

    # Get accessionId and name from Transfers table using UUID
    try:
        transfer = Transfer.objects.get(uuid=uuid)
        accession_id = transfer.accessionid
        transfer_name = transfer.currentlocation.split("/")[-2]
    except Transfer.DoesNotExist:
        accession_id = transfer_name = ""

    # Get dashboard UUID
    dashboard_uuid = get_dashboard_uuid()

    for filepath in _list_files_in_dir(path):
        if os.path.isfile(filepath):
            # We need to account for the possibility of dealing with a BagIt
            # transfer package - the new default in Archivematica.
            # The BagIt is created when the package is sent to backlog hence
            # the locations in the database do not reflect the BagIt paths.
            # Strip the "data/" part when looking up the file entry.
            relative = os.path.relpath(filepath, path)
            # NOTE: str.lstrip("data/") strips *characters*, not a prefix,
            # and would mangle names like "database.txt"; strip the literal
            # prefix instead.
            if relative.startswith("data/"):
                relative = relative[len("data/"):]
            currentlocation = "%transferDirectory%" + relative

            # Defaults used when the File row is missing or incomplete;
            # modification_date must be bound even when the row exists but
            # has no modification time.
            file_uuid, modification_date = "", ""
            try:
                f = File.objects.get(currentlocation=currentlocation,
                                     transfer_id=uuid)
                file_uuid = f.uuid
                formats = _get_file_formats(f)
                bulk_extractor_reports = _list_bulk_extractor_reports(
                    path, file_uuid)
                if f.modificationtime is not None:
                    modification_date = f.modificationtime.strftime("%Y-%m-%d")
            except File.DoesNotExist:
                formats = []
                bulk_extractor_reports = []

            # Get file path info
            relative_path = filepath.replace(path, transfer_name + "/")
            file_extension = os.path.splitext(filepath)[1][1:].lower()
            filename = os.path.basename(filepath)
            # Size in megabytes
            size = os.path.getsize(filepath) / (1024 * 1024)
            create_time = os.stat(filepath).st_ctime

            if filename not in ignore_files:
                printfn("Indexing {} (UUID: {})".format(
                    relative_path, file_uuid))

                # TODO: Index Backlog Location UUID?
                indexData = {
                    "filename": filename,
                    "relative_path": relative_path,
                    "fileuuid": file_uuid,
                    "sipuuid": uuid,
                    "accessionid": accession_id,
                    "status": status,
                    "origin": dashboard_uuid,
                    "ingestdate": ingest_date,
                    "created": create_time,
                    "modification_date": modification_date,
                    "size": size,
                    "tags": [],
                    "file_extension": file_extension,
                    "bulk_extractor_reports": bulk_extractor_reports,
                    "format": formats,
                }

                _wait_for_cluster_yellow_status(client)
                _try_to_index(client, indexData, "transferfiles",
                              printfn=printfn)

                files_indexed = files_indexed + 1
            else:
                printfn("Skipping indexing {}".format(relative_path))

    return files_indexed
def index_aip_and_files(
    client,
    uuid,
    aip_stored_path,
    mets_staging_path,
    name,
    aip_size,
    aips_in_aic=None,
    identifiers=None,
    encrypted=False,
    printfn=print,
):
    """Index AIP and AIP files with UUID `uuid` at path `path`.

    :param client: The ElasticSearch client.
    :param uuid: The UUID of the AIP we're indexing.
    :param aip_stored_path: path on disk where the AIP is located.
    :param mets_staging_path: path on disk where the AIP METS file is located.
    :param name: AIP name.
    :param aip_size: AIP size.
    :param aips_in_aic: optional number of AIPs stored in AIC.
    :param identifiers: optional additional identifiers (MODS, Islandora, etc.).
    :param encrypted: optional AIP encrypted boolean (defaults to `False`).
    :param printfn: optional print funtion.
    :return: 0 if succeeded, 1 otherwise.
    """
    # Avoid the shared-state pitfall of a mutable default argument.
    if identifiers is None:
        identifiers = []

    # Stop if METS file is not at staging path.
    error_message = None
    if not os.path.exists(mets_staging_path):
        error_message = "METS file does not exist at: " + mets_staging_path
    if error_message:
        logger.error(error_message)
        printfn(error_message, file=sys.stderr)
        return 1

    printfn("AIP UUID: " + uuid)
    printfn("Indexing AIP ...")

    tree = ElementTree.parse(mets_staging_path)
    _remove_tool_output_from_mets(tree)
    root = tree.getroot()

    # Extract AIC identifier, other specially-indexed information
    aic_identifier = None
    is_part_of = None
    dublincore = ns.xml_find_premis(
        root, "mets:dmdSec/mets:mdWrap/mets:xmlData/dcterms:dublincore")
    if dublincore is not None:
        aip_type = ns.xml_findtext_premis(
            dublincore, "dc:type") or ns.xml_findtext_premis(
                dublincore, "dcterms:type")
        if aip_type == "Archival Information Collection":
            aic_identifier = ns.xml_findtext_premis(
                dublincore, "dc:identifier") or ns.xml_findtext_premis(
                    dublincore, "dcterms:identifier")
        is_part_of = ns.xml_findtext_premis(dublincore, "dcterms:isPartOf")

    # Convert METS XML to dict
    xml = ElementTree.tostring(root)
    mets_data = _rename_dict_keys_with_child_dicts(
        _normalize_dict_values(xmltodict.parse(xml)))

    # Pull the create time from the METS header.
    # Old METS did not use `metsHdr`, so fall back to "now".
    created = time.time()
    mets_hdr = ns.xml_find_premis(root, "mets:metsHdr")
    if mets_hdr is not None:
        mets_created_attr = mets_hdr.get("CREATEDATE")
        if mets_created_attr:
            try:
                created = calendar.timegm(
                    time.strptime(mets_created_attr, "%Y-%m-%dT%H:%M:%S"))
            except ValueError:
                printfn("Failed to parse METS CREATEDATE: %s" %
                        (mets_created_attr))

    aip_data = {
        "uuid": uuid,
        "name": name,
        "filePath": aip_stored_path,
        # Size in megabytes.
        "size": aip_size / (1024 * 1024),
        "mets": mets_data,
        "origin": get_dashboard_uuid(),
        "created": created,
        "AICID": aic_identifier,
        "isPartOf": is_part_of,
        "countAIPsinAIC": aips_in_aic,
        "identifiers": identifiers,
        "transferMetadata": _extract_transfer_metadata(root),
        "encrypted": encrypted,
    }
    _wait_for_cluster_yellow_status(client)
    _try_to_index(client, aip_data, "aips", printfn=printfn)
    printfn("Done.")

    printfn("Indexing AIP files ...")
    files_indexed = _index_aip_files(
        client=client,
        uuid=uuid,
        mets_path=mets_staging_path,
        name=name,
        identifiers=identifiers,
        printfn=printfn,
    )
    printfn("Files indexed: " + str(files_indexed))

    return 0