def test_record_backlog_event(tmp_path):
    """A 'placement in backlog' PREMIS event should be written to the METS."""
    transfer = Transfer.objects.create(uuid="756db89c-1380-459d-83bc-d3772f1e7dd8")
    user = User.objects.create(id=1)
    transfer.update_active_agent(user_id=user.id)
    file_obj = File.objects.create(
        uuid="3c567bc8-0847-4d12-a77d-0ed3a0361c0a", transfer=transfer
    )

    # ``_record_backlog_event`` expects the METS file to exist already, so
    # build one containing a single file entry.
    (tmp_path / "metadata/submissionDocumentation").mkdir(parents=True)
    mets_path = str(tmp_path / "metadata/submissionDocumentation/METS.xml")
    document = metsrw.METSDocument()
    document.append_file(
        metsrw.FSEntry(
            path="foobar.jpg", label="foobar", type="Item", file_uuid=file_obj.uuid
        )
    )
    document.write(mets_path, pretty_print=True)

    move_to_backlog._record_backlog_event(transfer.uuid, str(tmp_path), "2019-03-12")

    # Reload the METS document and confirm the file gained a PREMIS event.
    reloaded = metsrw.METSDocument().fromfile(mets_path)
    entry = next(iter(reloaded.all_files()))
    premis_events = entry.get_premis_events()
    assert len(premis_events) == 1
    assert premis_events[0].event_type == "placement in backlog"
    assert premis_events[0].event_date_time == "2019-03-12"
def test_remove_file(self):
    """Removing an entry should prune it (and any subtree) everywhere."""
    # Build: root/{dir1/{dir2/level3.txt, level2.txt}, level1.txt}
    f3_uuid = str(uuid.uuid4())
    f3 = metsrw.FSEntry('dir1/dir2/level3.txt', file_uuid=f3_uuid)
    d2 = metsrw.FSEntry('dir1/dir2', type='Directory', children=[f3])
    f2_uuid = str(uuid.uuid4())
    f2 = metsrw.FSEntry('dir1/level2.txt', file_uuid=f2_uuid)
    d1 = metsrw.FSEntry('dir1', type='Directory', children=[d2, f2])
    f1_uuid = str(uuid.uuid4())
    f1 = metsrw.FSEntry('level1.txt', file_uuid=f1_uuid)
    root = metsrw.FSEntry('root', type='Directory', children=[d1, f1])
    document = metsrw.METSDocument()
    document.append_file(root)
    assert len(document.all_files()) == 6

    # Removing a leaf file drops it from its parent and from lookups.
    document.remove_entry(f3)
    assert len(document.all_files()) == 5
    assert document.get_file(file_uuid=f3_uuid) is None
    assert f3 not in d2.children
    assert f3 not in document.all_files()

    # Removing a directory drops its entire subtree, siblings untouched.
    document.remove_entry(d1)
    assert len(document.all_files()) == 2
    assert document.get_file(path='dir1') is None
    assert d1 not in root.children
    assert d1 not in document.all_files()
    assert f2 not in document.all_files()
    assert d2 not in document.all_files()
    assert f1 in root.children

    # Removing the root empties the document.
    document.remove_entry(root)
    assert len(document.all_files()) == 0
def test_files(self):
    """all_files should collect every entry, however deeply nested."""
    f3 = metsrw.FSEntry('level3.txt', file_uuid=str(uuid.uuid4()))
    d2 = metsrw.FSEntry('dir2', type='Directory', children=[f3])
    f2 = metsrw.FSEntry('level2.txt', file_uuid=str(uuid.uuid4()))
    d1 = metsrw.FSEntry('dir1', type='Directory', children=[d2, f2])
    f1 = metsrw.FSEntry('level1.txt', file_uuid=str(uuid.uuid4()))
    root = metsrw.FSEntry('root', type='Directory', children=[d1, f1])
    document = metsrw.METSDocument()
    document.append_file(root)

    collected = document.all_files()
    assert collected
    assert len(collected) == 6
    for entry in (root, f1, d1, f2, d2, f3):
        assert entry in collected

    # A file appended afterwards shows up in subsequent collections.
    f4_uuid = str(uuid.uuid4())
    f4 = metsrw.FSEntry('file4.txt', file_uuid=f4_uuid)
    document.append_file(f4)
    collected = document.all_files()
    assert len(collected) == 7
    assert f4 in collected
def create_test_pointer_file(self):
    """Build and return a METSDocument representing an AIP pointer file."""
    # 1. Get the PREMIS events and object as premisrw class instances.
    compression_event = premisrw.PREMISEvent(data=c.EX_COMPR_EVT)
    events = [compression_event]
    _, compression_program_version, archive_tool = (
        compression_event.compression_details)
    premis_object = premisrw.PREMISObject(
        xsi_type=c.EX_PTR_XSI_TYPE,
        identifier_value=c.EX_PTR_IDENTIFIER_VALUE,
        message_digest_algorithm=c.EX_PTR_MESSAGE_DIGEST_ALGORITHM,
        message_digest=c.EX_PTR_MESSAGE_DIGEST,
        size=c.EX_PTR_SIZE,
        format_name=c.EX_PTR_FORMAT_NAME,
        format_registry_key=c.EX_PTR_FORMAT_REGISTRY_KEY,
        creating_application_name=archive_tool,
        creating_application_version=compression_program_version,
        date_created_by_application=c.EX_PTR_DATE_CREATED_BY_APPLICATION)
    transform_files = compression_event.get_decompression_transform_files()

    # 2. Construct the METS pointer file around a single AIP entry.
    document = metsrw.METSDocument()
    entry = metsrw.FSEntry(
        path=c.EX_PTR_PATH,
        file_uuid=c.EX_PTR_IDENTIFIER_VALUE,
        use=c.EX_PTR_PACKAGE_TYPE,
        type=c.EX_PTR_PACKAGE_TYPE,
        transform_files=transform_files,
        mets_div_type=c.EX_PTR_AIP_SUBTYPE)
    entry.add_premis_object(premis_object.serialize())
    for event in events:
        entry.add_premis_event(event.serialize())
    for agent in (c.EX_AGT_1, c.EX_AGT_2):
        entry.add_premis_agent(premisrw.data_to_premis(agent))
    document.append_file(entry)
    return document
def write_mets_to_file(sip, unit_path, output_md_path, output_md_name):
    """Serialize ``sip`` to a METS XML file on disk.

    Args:
        sip: metsrw.FSEntry tree root to serialize.
        unit_path: Unit directory; used to derive the default metadata dir.
        output_md_path: Output directory; defaults to ``<unit_path>/metadata``.
        output_md_name: Output file name; defaults to ``METS.xml``.
    """
    metadata_path = output_md_path
    if metadata_path is None:
        metadata_path = os.path.join(unit_path, "metadata")
    if not os.path.exists(metadata_path):
        os.makedirs(metadata_path)
    metadata_name = output_md_name
    if metadata_name is None:
        metadata_name = "METS.xml"
    mets_path = os.path.join(metadata_path, metadata_name)
    # Write the data structure out to a file and ensure that the encoding is
    # purposely set to UTF-8. This pattern is used in ```create_mets_v2.py```.
    # Given the opportunity we should add an encoding feature to the metsrw
    # package.
    mets_f = metsrw.METSDocument()
    mets_f.append_file(sip)
    # BUGFIX: ``etree.tostring`` returns *bytes* when an explicit encoding
    # is given, so the file must be opened in binary mode ("wb"); text mode
    # ("w") raises ``TypeError: write() argument must be str, not bytes``.
    with open(mets_path, "wb") as xml_file:
        xml_file.write(
            etree.tostring(
                mets_f.serialize(),
                pretty_print=True,
                encoding="utf-8",
                xml_declaration=True,
            )
        )
def _record_backlog_event(transfer_id, transfer_path, created_at):
    """Record backlog event in both the database and the transfer METS.

    Args:
        transfer_id: UUID of the transfer whose files are being backlogged.
        transfer_path: Filesystem path of the transfer; the METS file is
            expected at ``<transfer_path>/metadata/submissionDocumentation/METS.xml``.
        created_at: Event date-time string recorded on each PREMIS event.
    """
    mets_path = os.path.join(
        transfer_path, "metadata", "submissionDocumentation", "METS.xml"
    )
    mets = metsrw.METSDocument().fromfile(mets_path)
    # Run all_files once, convert into a dict for faster lookups.
    fsentries = {entry.file_uuid: entry for entry in mets.all_files()}
    # Assuming the same agents apply to all files.
    agents = _transfer_agents(transfer_id)
    for file_obj in File.objects.filter(transfer_id=transfer_id).iterator():
        try:
            fsentry = fsentries[file_obj.uuid]
        except KeyError:
            # Database file with no matching METS entry: skip it.
            continue
        event_id, event_type = str(uuid.uuid4()), "placement in backlog"
        # Record the event in the METS...
        fsentry.add_premis_event(
            _premis_event_data(event_id, event_type, created_at, agents)
        )
        # ...and in the database, sharing the same event identifier.
        insertIntoEvents(
            fileUUID=file_obj.uuid,
            eventIdentifierUUID=event_id,
            eventType=event_type,
            eventDateTime=created_at,
            agents=agents,
        )
    mets.write(mets_path, pretty_print=True)
def test_full_mets(self):
    """A complete SIP tree with metadata should serialize and write cleanly."""
    document = metsrw.METSDocument()
    file1 = metsrw.FSEntry('objects/object1.ext', file_uuid=str(uuid.uuid4()))
    file2 = metsrw.FSEntry('objects/object2.ext', file_uuid=str(uuid.uuid4()))
    file1p = metsrw.FSEntry(
        'objects/object1-preservation.ext', use='preservation',
        file_uuid=str(uuid.uuid4()), derived_from=file1)
    file2p = metsrw.FSEntry(
        'objects/object2-preservation.ext', use='preservation',
        file_uuid=str(uuid.uuid4()), derived_from=file2)
    objects = metsrw.FSEntry(
        'objects', type='Directory',
        children=[file1, file2, file1p, file2p])
    metadata = metsrw.FSEntry(
        'metadata', type='Directory',
        children=[
            metsrw.FSEntry('transfers', type='Directory', children=[]),
            metsrw.FSEntry('metadata/metadata.csv', use='metadata',
                           file_uuid=str(uuid.uuid4())),
        ])
    sub_doc = metsrw.FSEntry(
        'submissionDocumentation', type='Directory',
        children=[
            metsrw.FSEntry('submissionDocumentation/METS.xml',
                           use='submissionDocumentation',
                           file_uuid=str(uuid.uuid4())),
        ])
    sip = metsrw.FSEntry('sipname-uuid', type='Directory',
                         children=[objects, metadata, sub_doc])
    sip.add_dublin_core('<dublincore>sip metadata</dublincore>')
    file1.add_premis_object('<premis>object</premis>')
    file1.add_premis_event('<premis>event</premis>')
    file1.add_premis_agent('<premis>agent</premis>')
    # Metadata sections can be superseded in place via replace_with.
    rights = file1.add_premis_rights('<premis>rights</premis>')
    rights.replace_with(file1.add_premis_rights('<premis>newer rights</premis>'))
    dc = file1.add_dublin_core('<dublincore>metadata</dublincore>')
    dc.replace_with(file1.add_dublin_core('<dublincore>newer metadata</dublincore>'))
    document.append_file(sip)
    document.write('full_metsrw.xml', fully_qualified=True, pretty_print=True)
    os.remove('full_metsrw.xml')
def test_write(self):
    """write() should emit the serialized tree to the given path."""
    document = metsrw.METSDocument()
    # Stub out serialize() with a pre-built fixture tree.
    parser = etree.XMLParser(remove_blank_text=True)
    fixture_root = etree.parse('fixtures/complete_mets.xml', parser=parser).getroot()
    document.serialize = lambda fully_qualified=True: fixture_root
    document.write('test_write.xml', pretty_print=True)
    # The written file should be byte-identical to the fixture.
    assert filecmp.cmp('fixtures/complete_mets.xml', 'test_write.xml', shallow=False)
    os.remove('test_write.xml')
def test_parse(self):
    """
    It should set the correct createdate.
    It should create FSEntrys for every file and directory.
    It should associate amdSec and dmdSec with the FSEntry.
    It should associated derived files.
    """
    mw = metsrw.METSDocument()
    parser = etree.XMLParser(remove_blank_text=True)
    root = etree.parse('fixtures/complete_mets.xml', parser=parser)
    mw.tree = root
    mw._parse_tree()
    assert mw.createdate == '2014-07-23T21:48:33'
    assert len(mw.all_files()) == 14
    assert mw.get_file(
        type='Directory', label='csv-48accdf3-e425-4874-aad3-67ade019a214') is not None
    parent = mw.get_file(type='Directory', label='objects')
    assert parent is not None
    # Original file: carries both amdSec and dmdSec references.
    f = mw.get_file(type='Item', label='Landing_zone.jpg')
    assert f is not None
    assert f.path == 'objects/Landing_zone.jpg'
    assert f.use == 'original'
    assert f.parent == parent
    assert f.children == []
    assert f.file_uuid == 'ab5c67fc-8f80-4e46-9f20-8d5ae29c43f2'
    assert f.derived_from is None
    assert f.admids == ['amdSec_1']
    assert f.dmdids == ['dmdSec_1']
    assert f.file_id() == 'file-ab5c67fc-8f80-4e46-9f20-8d5ae29c43f2'
    assert f.group_id() == 'Group-ab5c67fc-8f80-4e46-9f20-8d5ae29c43f2'
    # Preservation derivative: linked to the original via derived_from and
    # sharing the original's group ID.
    f = mw.get_file(
        type='Item', label='Landing_zone-fc33fc0e-40ef-4ad9-ba52-860368e8ce5a.tif')
    assert f is not None
    assert f.path == 'objects/Landing_zone-fc33fc0e-40ef-4ad9-ba52-860368e8ce5a.tif'
    assert f.use == 'preservation'
    assert f.parent == parent
    assert f.children == []
    assert f.file_uuid == 'e284d015-cfb0-45dd-961d-512bf0f47cf6'
    assert f.derived_from == mw.get_file(type='Item', label='Landing_zone.jpg')
    assert f.admids == ['amdSec_2']
    assert f.dmdids == []
    assert f.file_id() == 'file-e284d015-cfb0-45dd-961d-512bf0f47cf6'
    assert f.group_id() == 'Group-ab5c67fc-8f80-4e46-9f20-8d5ae29c43f2'
    # Remaining directories from the fixture are all present.
    assert mw.get_file(type='Directory', label='metadata') is not None
    assert mw.get_file(type='Directory', label='transfers') is not None
    assert mw.get_file(
        type='Directory', label='csv-55599568-90bd-46ac-b1be-d1a538793cae') is not None
    assert mw.get_file(type='Directory', label='submissionDocumentation') is not None
    assert mw.get_file(
        type='Directory', label='transfer-csv-55599568-90bd-46ac-b1be-d1a538793cae'
    ) is not None
def test_mets_header_lastmoddate(self):
    """An existing createdate is kept and LASTMODDATE is added after it."""
    document = metsrw.METSDocument()
    created = '2014-07-16T22:52:02.480108'
    modified = '3014-07-16T22:52:02.480108'
    document.createdate = created
    header = document._mets_header(modified)
    assert header.tag == '{http://www.loc.gov/METS/}metsHdr'
    assert header.attrib['CREATEDATE'] == created
    assert header.attrib['LASTMODDATE'] == modified
    # ISO-8601 strings compare chronologically as plain strings.
    assert header.attrib['CREATEDATE'] < header.attrib['LASTMODDATE']
def test_parse_tree_no_createdate(self):
    """Parsing a METS whose header lacks CREATEDATE yields createdate None."""
    document = metsrw.METSDocument()
    mets_string = b"""<?xml version='1.0' encoding='ASCII'?>
<mets xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:xlink="http://www.w3.org/1999/xlink" xmlns="http://www.loc.gov/METS/" xsi:schemaLocation="http://www.loc.gov/METS/ http://www.loc.gov/standards/mets/version18/mets.xsd">
  <metsHdr/><structMap TYPE="physical"></structMap>
</mets>
"""
    document.tree = etree.fromstring(mets_string)
    document._parse_tree()
    assert document.createdate is None
def test_mets_root(self):
    """The document root should be a mets element with the expected nsmap."""
    document = metsrw.METSDocument()
    root = document._document_root()
    location = ("http://www.loc.gov/METS/ "
                "http://www.loc.gov/standards/mets/version18/mets.xsd")
    assert root.tag == '{http://www.loc.gov/METS/}mets'
    assert root.attrib[metsrw.lxmlns('xsi') + 'schemaLocation'] == location
    expected_nsmap = {
        'mets': "http://www.loc.gov/METS/",
        'xsi': "http://www.w3.org/2001/XMLSchema-instance",
        'xlink': "http://www.w3.org/1999/xlink",
    }
    assert root.nsmap == expected_nsmap
def test_filesec(self):
    """_filesec should group files into one fileGrp per USE value."""
    original = metsrw.FSEntry('objects/file1.txt', file_uuid=str(uuid.uuid4()))
    # NOTE: 'preservaton' (sic) is deliberate test data; asserted verbatim below.
    preservation = metsrw.FSEntry('objects/file1-preservation.txt',
                                  use='preservaton',
                                  file_uuid=str(uuid.uuid4()))
    original2 = metsrw.FSEntry('objects/file2.txt', file_uuid=str(uuid.uuid4()))
    document = metsrw.METSDocument()
    element = document._filesec([original, preservation, original2])
    assert isinstance(element, etree._Element)
    assert element.tag == '{http://www.loc.gov/METS/}fileSec'
    assert len(element) == 2  # one group per distinct USE
    assert element[0].tag == '{http://www.loc.gov/METS/}fileGrp'
    assert element[0].get('USE') == 'original'
    assert element[1].tag == '{http://www.loc.gov/METS/}fileGrp'
    assert element[1].get('USE') == 'preservaton'
def test_collect_mdsec_elements(self):
    """dmdSecs should be collected before amdSecs across all entries."""
    entry1 = metsrw.FSEntry('file1.txt', file_uuid=str(uuid.uuid4()))
    entry1.amdsecs.append(metsrw.AMDSec())
    entry1.dmdsecs.append(metsrw.SubSection('dmdSec', None))
    entry2 = metsrw.FSEntry('file2.txt', file_uuid=str(uuid.uuid4()))
    entry2.dmdsecs.append(metsrw.SubSection('dmdSec', None))
    document = metsrw.METSDocument()
    elements = document._collect_mdsec_elements([entry1, entry2])
    # Ordering matters: both dmdSecs precede the single amdSec.
    assert isinstance(elements, list)
    assert len(elements) == 3
    assert isinstance(elements[0], metsrw.SubSection)
    assert elements[0].subsection == 'dmdSec'
    assert isinstance(elements[1], metsrw.SubSection)
    assert elements[1].subsection == 'dmdSec'
    assert isinstance(elements[2], metsrw.AMDSec)
def write_mets(mets_path, transfer_dir_path, base_path_placeholder, transfer_uuid):
    """
    Writes a METS XML file to disk, containing all the data we can find.

    Args:
        mets_path: Output path for METS XML output
        transfer_dir_path: Location of the files on disk
        base_path_placeholder: The placeholder string for the base path, e.g.
            'transferDirectory'
        transfer_uuid: The UUID for the transfer

    Raises:
        Transfer.DoesNotExist: If no transfer row matches ``transfer_uuid``.
    """
    transfer_dir_path = os.path.expanduser(transfer_dir_path)
    transfer_dir_path = os.path.normpath(transfer_dir_path)
    # Placeholder form stored in the database, e.g. "%transferDirectory%".
    db_base_path = rf"%{base_path_placeholder}%"
    mets = metsrw.METSDocument()
    mets.objid = str(transfer_uuid)
    instance_id = django_settings.INSTANCE_ID
    if instance_id:
        # Identify the originating dashboard instance as a creator agent.
        agent = metsrw.Agent(
            "CREATOR",
            type="SOFTWARE",
            name=str(instance_id),
            notes=["Archivematica dashboard UUID"],
        )
        mets.agents.append(agent)
    try:
        transfer = Transfer.objects.get(uuid=transfer_uuid)
    except Transfer.DoesNotExist:
        # Log for diagnosis, then propagate — caller decides how to recover.
        logger.debug("No record in database for transfer: %s", transfer_uuid)
        raise
    if transfer.accessionid:
        alt_record_id = metsrw.AltRecordID(transfer.accessionid, type="Accession ID")
        mets.alternate_ids.append(alt_record_id)
    # Walk the transfer directory and graft the resulting tree into the METS.
    fsentry_tree = FSEntriesTree(transfer_dir_path, db_base_path, transfer)
    fsentry_tree.scan()
    mets.append_file(fsentry_tree.root_node)
    mets.write(mets_path, pretty_print=True)
def test_get_file(self):
    """get_file should find entries by UUID, path, label, or combinations."""
    # Build: root/{dir1/{dir2/level3.txt, level2.txt}, level1.txt}
    f3_uuid = str(uuid.uuid4())
    f3 = metsrw.FSEntry('dir1/dir2/level3.txt', file_uuid=f3_uuid)
    d2 = metsrw.FSEntry('dir1/dir2', type='Directory', children=[f3])
    f2_uuid = str(uuid.uuid4())
    f2 = metsrw.FSEntry('dir1/level2.txt', file_uuid=f2_uuid)
    d1 = metsrw.FSEntry('dir1', type='Directory', children=[d2, f2])
    f1_uuid = str(uuid.uuid4())
    f1 = metsrw.FSEntry('level1.txt', file_uuid=f1_uuid)
    root = metsrw.FSEntry('root', type='Directory', children=[d1, f1])
    document = metsrw.METSDocument()
    document.append_file(root)
    # Lookup by UUID.
    assert document.get_file(file_uuid=f3_uuid) == f3
    assert document.get_file(file_uuid=f2_uuid) == f2
    assert document.get_file(file_uuid=f1_uuid) == f1
    assert document.get_file(file_uuid='does not exist') is None
    # Lookup by path.
    assert document.get_file(path='dir1/dir2/level3.txt') == f3
    assert document.get_file(path='dir1/dir2') == d2
    assert document.get_file(path='dir1/level2.txt') == f2
    assert document.get_file(path='dir1') == d1
    assert document.get_file(path='level1.txt') == f1
    assert document.get_file(path='does not exist') is None
    # Lookup by label.
    assert document.get_file(label='level3.txt') == f3
    assert document.get_file(label='dir2') == d2
    assert document.get_file(label='level2.txt') == f2
    assert document.get_file(label='dir1') == d1
    assert document.get_file(label='level1.txt') == f1
    assert document.get_file(label='does not exist') is None
    # Lookup by several criteria at once.
    assert document.get_file(label='level3.txt', path='dir1/dir2/level3.txt') == f3
    assert document.get_file(label='dir2', type='Directory') == d2
    assert document.get_file(label='level2.txt', type='Item') == f2
    assert document.get_file(file_uuid=None, type='Item') is None
    # Entries appended later are found too.
    f4_uuid = str(uuid.uuid4())
    f4 = metsrw.FSEntry('file4.txt', file_uuid=f4_uuid)
    document.append_file(f4)
    assert document.get_file(file_uuid=f4_uuid) == f4
    assert document.get_file(path='file4.txt') == f4
def test_add_file_to_child(self):
    """Children added to a nested directory should appear in all_files."""
    f2 = metsrw.FSEntry('level2.txt', file_uuid=str(uuid.uuid4()))
    d1 = metsrw.FSEntry('dir1', type='Directory', children=[f2])
    f1 = metsrw.FSEntry('level1.txt', file_uuid=str(uuid.uuid4()))
    root = metsrw.FSEntry('root', type='Directory', children=[d1, f1])
    document = metsrw.METSDocument()
    document.append_file(root)
    collected = document.all_files()
    assert collected
    assert len(collected) == 4
    for entry in (root, f1, d1, f2):
        assert entry in collected
    # Adding a child to an existing directory is picked up on re-collection.
    f3 = metsrw.FSEntry('level3.txt', file_uuid=str(uuid.uuid4()))
    d1.add_child(f3)
    collected = document.all_files()
    assert len(collected) == 5
    assert f3 in collected
def test_structmap(self):
    """
    It should create a structMap tag.
    It should have a div tag for the directory.
    It should have div tags for the children beneath the directory.
    It should not have div tags for deleted files (without label).
    """
    children = [
        metsrw.FSEntry('objects/file1.txt', file_uuid=str(uuid.uuid4())),
        metsrw.FSEntry('objects/file2.txt', file_uuid=str(uuid.uuid4())),
    ]
    objects_dir = metsrw.FSEntry('objects', type='Directory', children=children)
    deleted_entry = metsrw.FSEntry(use='deletion', file_uuid=str(uuid.uuid4()))
    document = metsrw.METSDocument()
    document.append_file(objects_dir)
    document.append_file(deleted_entry)
    structmap = document._structmap()
    assert structmap.tag == '{http://www.loc.gov/METS/}structMap'
    assert structmap.attrib['TYPE'] == 'physical'
    assert structmap.attrib['ID'] == 'structMap_1'
    assert structmap.attrib['LABEL'] == 'Archivematica default'
    assert len(structmap.attrib) == 3
    # The label-less deleted entry gets no div: only one top-level div.
    assert len(structmap) == 1
    directory_div = structmap[0]
    assert directory_div.tag == '{http://www.loc.gov/METS/}div'
    assert directory_div.attrib['LABEL'] == 'objects'
    assert directory_div.attrib['TYPE'] == 'Directory'
    assert len(directory_div.attrib) == 2
    assert len(directory_div) == 2
    for index, label in enumerate(('file1.txt', 'file2.txt')):
        child_div = directory_div[index]
        assert child_div.attrib['LABEL'] == label
        assert child_div.attrib['TYPE'] == 'Item'
        assert len(child_div.attrib) == 2
        assert child_div.find('{http://www.loc.gov/METS/}fptr') is not None
def test_dependency_injection(self):
    """Test the dependency injection (DI) infrastructure for metsrw plugins.

    - client: metsrw.FSEntry
    - services: classes for reading and writing metadata elements, e.g., the
      PREMISObject class of metsrw.plugins.premisrw or other classes exposing
      the same interface.
    - injector: this test code or the code in metsrw/di.py which calls
      ``provide`` on the ``feature_broker`` singleton.

    The ``FSEntry`` class declares its dependency on the class attributes
    ``premis_object_class``, ``premis_event_class``, and
    ``premis_agent_class`` and further requires that these return classes
    with ``fromtree`` and ``serialize`` methods::

        >>> premis_object_class = Dependency(
        ...     has_methods('serialize'),
        ...     has_class_methods('fromtree'),
        ...     is_class)
    """
    # Clear the feature broker and then register/provide the premisrw
    # plugin classes (services) with the feature broker.
    feature_broker = metsrw.feature_broker
    assert len(feature_broker) == 3
    feature_broker.clear()
    assert not feature_broker
    feature_broker.provide('premis_object_class', premisrw.PREMISObject)
    feature_broker.provide('premis_event_class', premisrw.PREMISEvent)
    feature_broker.provide('premis_agent_class', premisrw.PREMISAgent)
    assert len(feature_broker) == 3

    # Create premisrw instances.
    compression_premis_event = premisrw.PREMISEvent(data=EX_COMPR_EVT)
    premis_events = [compression_premis_event]
    premis_agents = [
        premisrw.PREMISAgent(data=x) for x in [EX_AGT_1, EX_AGT_2]
    ]
    _, compression_program_version, archive_tool = (
        compression_premis_event.compression_details)
    premis_object = premisrw.PREMISObject(
        xsi_type=EX_PTR_XSI_TYPE,
        identifier_value=EX_PTR_IDENTIFIER_VALUE,
        message_digest_algorithm=EX_PTR_MESSAGE_DIGEST_ALGORITHM,
        message_digest=EX_PTR_MESSAGE_DIGEST,
        size=EX_PTR_SIZE,
        format_name=EX_PTR_FORMAT_NAME,
        format_registry_key=EX_PTR_FORMAT_REGISTRY_KEY,
        creating_application_name=archive_tool,
        creating_application_version=compression_program_version,
        date_created_by_application=EX_PTR_DATE_CREATED_BY_APPLICATION)
    transform_files = compression_premis_event.get_decompression_transform_files()

    # Create metsrw ``METSDocument`` and ``FSEntry`` instances.
    mets_doc = metsrw.METSDocument()
    fs_entry = metsrw.FSEntry(path=EX_PTR_PATH,
                              file_uuid=EX_PTR_IDENTIFIER_VALUE,
                              use=EX_PTR_PACKAGE_TYPE,
                              type=EX_PTR_PACKAGE_TYPE,
                              transform_files=transform_files,
                              mets_div_type=EX_PTR_AIP_SUBTYPE)
    mets_doc.append_file(fs_entry)

    # Use the ``add_premis_...`` methods to add the PREMIS metadata
    # elements to the ``FSEntry`` instance. This will assert that each
    # PREMIS instance is of the correct type (e.g., that ``premis_object``
    # is an instance of ``FSEntry().premis_object_class``) and will call the
    # instance's ``serialize`` method and incorporate the resulting
    # ``lxml.etree._ElementTree`` instance into the ``FSEntry`` instance
    # appropriately.
    fs_entry.add_premis_object(premis_object)
    for premis_event in premis_events:
        fs_entry.add_premis_event(premis_event)
    for premis_agent in premis_agents:
        fs_entry.add_premis_agent(premis_agent)

    # Assert that the instances returned by the ``FSEntry().get_premis_...``
    # methods are of the anticipated type, equal to the originals, but not
    # the same objects (they are re-built from the serialized XML).
    new_premis_agents = fs_entry.get_premis_agents()
    for new_premis_agent in new_premis_agents:
        assert isinstance(new_premis_agent, premisrw.PREMISAgent)
        assert new_premis_agent in premis_agents
        assert id(new_premis_agent) not in [id(pa) for pa in premis_agents]
    new_premis_events = fs_entry.get_premis_events()
    for new_premis_event in new_premis_events:
        assert isinstance(new_premis_event, premisrw.PREMISEvent)
        assert new_premis_event in premis_events
        assert id(new_premis_event) not in [id(pa) for pa in premis_events]
    new_premis_objects = fs_entry.get_premis_objects()
    for new_premis_object in new_premis_objects:
        assert isinstance(new_premis_object, premisrw.PREMISObject)
        assert new_premis_object == premis_object
        # BUGFIX: was ``id(new_premis_object) is not premis_object``, which
        # compares an int to an object and is vacuously true. The intended
        # check is that the returned object is a distinct instance.
        assert new_premis_object is not premis_object

    # Assert that the resulting mets XML contains a
    # premis:objectIdentifierValue in the anticipated location in the
    # structure with the anticipated value.
    mets_doc_el = mets_doc.serialize()
    xpath = ('mets:amdSec/mets:techMD/mets:mdWrap[@MDTYPE="PREMIS:OBJECT"]'
             '/mets:xmlData/premis:object/premis:objectIdentifier/'
             'premis:objectIdentifierValue')
    a = mets_doc_el.find(xpath, namespaces=metsrw.NAMESPACES)
    assert a.text == EX_PTR_IDENTIFIER_VALUE

    # Now change the feature broker so that ``FSEntry``'s dependency on a
    # ``premis_object_class`` class attribute is being fulfilled by a new
    # class: ``BetterPREMISObject``.
    feature_broker.provide('premis_object_class', BetterPREMISObject)
    # Now create a new PREMIS object.
    premis_object_tree = premis_object.serialize()
    better_premis_object = BetterPREMISObject.fromtree(premis_object_tree)
    # And re-create the ``METSDocument`` and ``FSEntry`` instances.
    mets_doc = metsrw.METSDocument()
    fs_entry = metsrw.FSEntry(path=EX_PTR_PATH,
                              file_uuid=EX_PTR_IDENTIFIER_VALUE,
                              use=EX_PTR_PACKAGE_TYPE,
                              type=EX_PTR_PACKAGE_TYPE,
                              transform_files=transform_files,
                              mets_div_type=EX_PTR_AIP_SUBTYPE)
    mets_doc.append_file(fs_entry)
    # Add the PREMIS metadata again, but this time use the instance of
    # ``BetterPREMISObject``.
    fs_entry.add_premis_object(better_premis_object)
    for premis_event in premis_events:
        fs_entry.add_premis_event(premis_event)
    for premis_agent in premis_agents:
        fs_entry.add_premis_agent(premis_agent)

    # Assert that the instances returned by the ``FSEntry().get_premis_...``
    # methods are of the anticipated (new) type.
    new_premis_objects = fs_entry.get_premis_objects()
    for new_premis_object in new_premis_objects:
        assert isinstance(new_premis_object, BetterPREMISObject)
    # Make sure we can still find the PREMIS object id value.
    mets_doc_el = mets_doc.serialize()
    assert (mets_doc_el.find(
        xpath, namespaces=metsrw.NAMESPACES).text == EX_PTR_IDENTIFIER_VALUE)

    # Reset the feature broker to its default state so subsequent tests
    # don't break.
    metsrw.set_feature_broker_to_default_state(feature_broker)
def test_pointer_file(self):
    """Test the creation of pointer files."""
    # Mocks of the AIP, its compression event, and other details.
    aip_uuid = str(uuid.uuid4())
    aip = {
        'current_path': '/path/to/myaip-{}.7z'.format(aip_uuid),
        'uuid': aip_uuid,
        'package_type': 'Archival Information Package',
        'checksum_algorithm': 'sha256',
        'checksum': '78e4509313928d2964fe877a6a82f1ba728c171eedf696e3f5b0aed61ec547f6',
        'size': '11854',
        'extension': '.7z',
        'archive_tool': '7-Zip',
        'archive_tool_version': '9.20',
        'transform_files': [
            {'algorithm': 'bzip2', 'order': '2', 'type': 'decompression'},
            {'algorithm': 'gpg', 'order': '1', 'type': 'decryption'}
        ]
    }
    compression_event = {
        'uuid': str(uuid.uuid4()),
        'detail': (
            'program=7z; version=p7zip Version 9.20'
            ' (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,2 CPUs)'),
        'outcome': '',  # This should be the output from 7-zip or other...
        'outcome_detail_note': '',
        'agents': [
            {'name': 'Archivematica', 'type': 'software',
             'identifier_type': 'preservation system',
             'identifier_value': 'Archivematica-1.6.1'},
            {'name': 'test', 'type': 'organization',
             'identifier_type': 'repository code',
             'identifier_value': 'test'}
        ]
    }
    now = datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S")
    # Maps package extension to its PRONOM format registry entry.
    pronom_conversion = {
        '.7z': {'puid': 'fmt/484', 'name': '7Zip format'},
        '.bz2': {'puid': 'x-fmt/268', 'name': 'BZIP2 Compressed Archive'},
    }
    # Create the METS using metsrw
    mw = metsrw.METSDocument()
    # TODO: metsrw will prefix "file-" to the AIP UUID when creating <mets:fptr
    # FILEID> and <mets:file ID> attr vals. However, we want "file-" to be
    # replaced by the AIP's (i.e., the transfer's) name.
    aip_fs_entry = metsrw.FSEntry(
        path=aip['current_path'],
        file_uuid=aip['uuid'],
        use=aip['package_type'],
        type=aip['package_type'],
        transform_files=aip['transform_files'])
    premis_schema_location = (
        'info:lc/xmlns/premis-v2'
        ' http://www.loc.gov/standards/premis/v2/premis-v2-2.xsd')
    nsmap = {
        'mets': metsrw.NAMESPACES['mets'],
        'xsi': metsrw.NAMESPACES['xsi'],
        'xlink': metsrw.NAMESPACES['xlink'],
    }
    # ElementMaker factory for building premis-namespaced elements.
    E_P = ElementMaker(namespace=metsrw.NAMESPACES['premis'],
                       nsmap={'premis': metsrw.NAMESPACES['premis']})
    # Create the AIP's PREMIS:OBJECT using raw lxml
    aip_premis_object = E_P.object(
        E_P.objectIdentifier(
            E_P.objectIdentifierType('UUID'),
            E_P.objectIdentifierValue(aip['uuid']),
        ),
        E_P.objectCharacteristics(
            E_P.compositionLevel('1'),
            E_P.fixity(
                E_P.messageDigestAlgorithm(aip['checksum_algorithm']),
                E_P.messageDigest(aip['checksum']),
            ),
            E_P.size(str(aip['size'])),
            E_P.format(
                E_P.formatDesignation(
                    E_P.formatName(
                        pronom_conversion[aip['extension']]['name']),
                    E_P.formatVersion(),
                ),
                E_P.formatRegistry(
                    E_P.formatRegistryName('PRONOM'),
                    E_P.formatRegistryKey(
                        pronom_conversion[aip['extension']]['puid'])
                ),
            ),
            E_P.creatingApplication(
                E_P.creatingApplicationName(aip['archive_tool']),
                E_P.creatingApplicationVersion(aip['archive_tool_version']),
                E_P.dateCreatedByApplication(now),
            ),
        ),
        version='2.2',
    )
    aip_premis_object.attrib['{' + nsmap['xsi'] + '}type'] = 'premis:file'
    aip_premis_object.attrib['{' + nsmap['xsi'] + '}schemaLocation'] = (
        premis_schema_location)
    aip_fs_entry.add_premis_object(aip_premis_object)
    # Create the AIP's PREMIS:EVENT for the compression using raw lxml
    aip_premis_compression_event = E_P.event(
        E_P.eventIdentifier(
            E_P.eventIdentifierType('UUID'),
            E_P.eventIdentifierValue(compression_event['uuid']),
        ),
        E_P.eventType('compression'),
        E_P.eventDateTime(now),
        E_P.eventDetail(compression_event['detail']),
        E_P.eventOutcomeInformation(
            E_P.eventOutcome(compression_event['outcome']),
            E_P.eventOutcomeDetail(
                E_P.eventOutcomeDetailNote(
                    compression_event['outcome_detail_note'])
            ),
        ),
        # One linkingAgentIdentifier per agent recorded on the event.
        *[E_P.linkingAgentIdentifier(
            E_P.linkingAgentIdentifierType(ag['identifier_type']),
            E_P.linkingAgentIdentifierValue(ag['identifier_value']))
          for ag in compression_event['agents']],
        version='2.2'
    )
    aip_premis_compression_event.attrib[
        '{' + nsmap['xsi'] + '}schemaLocation'] = (premis_schema_location)
    aip_fs_entry.add_premis_event(aip_premis_compression_event)
    # Create the AIP's PREMIS:AGENTs using raw lxml
    for agent in compression_event['agents']:
        agent_el = E_P.agent(
            E_P.agentIdentifier(
                E_P.agentIdentifierType(agent['identifier_type']),
                E_P.agentIdentifierValue(agent['identifier_value'])
            ),
            E_P.agentName(agent['name']),
            E_P.agentType(agent['type'])
        )
        agent_el.attrib['{' + nsmap['xsi'] + '}schemaLocation'] = (
            premis_schema_location)
        aip_fs_entry.add_premis_agent(agent_el)
    # TODO: we need metsrw to be able to set transformFile elements.
    # compression - 7z or tar.bz2
    """
    if extension == '.7z':
        etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                         TRANSFORMORDER='1',
                         TRANSFORMTYPE='decompression',
                         TRANSFORMALGORITHM=algorithm)
    elif extension == '.bz2':
        etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                         TRANSFORMORDER='1',
                         TRANSFORMTYPE='decompression',
                         TRANSFORMALGORITHM='bzip2')
        etree.SubElement(file_, namespaces.metsBNS + "transformFile",
                         TRANSFORMORDER='2',
                         TRANSFORMTYPE='decompression',
                         TRANSFORMALGORITHM='tar')
    """
    mw.append_file(aip_fs_entry)
    self.assert_pointer_valid(mw.serialize())
def test_parse_no_groupid(self):
    """
    It should handle files with no GROUPID.
    """
    document = metsrw.METSDocument().fromfile(
        'fixtures/mets_without_groupid_in_file.xml')
    assert document.get_file(
        file_uuid='db653873-d0ab-4bc1-9edb-2b6d2d84ab5a') is not None
def test_write_normative_structmap(self):
    """It should be able to write a normative logical structural map. """
    # Create the empty directory as an FSEntry and give it a simple PREMIS
    # object.
    d_empty = metsrw.FSEntry('EMPTY_DIR', type='Directory')
    d_empty_id = str(uuid.uuid4())
    d_empty_premis_object = PREMISObject(
        identifier_value=d_empty_id,
        premis_version='3.0',
        xsi_type='premis:intellectualEntity')
    d_empty.add_premis_object(d_empty_premis_object)
    # Create the parent directory of the empty directory and give it a
    # simple PREMIS object also.
    f3 = metsrw.FSEntry('level3.txt', file_uuid=str(uuid.uuid4()))
    d2 = metsrw.FSEntry('dir2', type='Directory', children=[f3, d_empty])
    d2_id = str(uuid.uuid4())
    d2_premis_object = PREMISObject(identifier_value=d2_id)
    d2.add_premis_object(d2_premis_object)
    # Create more directories and files and add the root of the dir
    # structure to a metsrw METSDocument instance.
    f2 = metsrw.FSEntry('level2.txt', file_uuid=str(uuid.uuid4()))
    d1 = metsrw.FSEntry('dir1', type='Directory', children=[d2, f2])
    f1 = metsrw.FSEntry('level1.txt', file_uuid=str(uuid.uuid4()))
    d = metsrw.FSEntry('root', type='Directory', children=[d1, f1])
    mw = metsrw.METSDocument()
    mw.append_file(d)
    # Expect to find all of our files and directories in the return value
    # of ``all_files``, including the empty directory.
    files = mw.all_files()
    assert files
    assert len(files) == 7
    assert d in files
    assert f1 in files
    assert d1 in files
    assert f2 in files
    assert d2 in files
    assert f3 in files
    assert d_empty in files
    # Get XPaths for the empty directory and its sister file. The file
    # should exist in both structmaps, the empty directory should only
    # exist in the normative logical structmap.
    exists_in_both_path = ('mets:div[@LABEL="root"]/'
                           'mets:div[@LABEL="dir1"]/'
                           'mets:div[@LABEL="dir2"]/'
                           'mets:div[@LABEL="level3.txt"]')
    exists_in_normative_only_path = ('mets:div[@LABEL="root"]/'
                                     'mets:div[@LABEL="dir1"]/'
                                     'mets:div[@LABEL="dir2"]/'
                                     'mets:div[@LABEL="EMPTY_DIR"]')
    # Expect that the empty directory is not in the physical structmap.
    structmap_el = mw._structmap()
    assert structmap_el.find(exists_in_both_path,
                             metsrw.NAMESPACES) is not None
    assert structmap_el.find(exists_in_normative_only_path,
                             metsrw.NAMESPACES) is None
    # Expect that the empty directory is in the normative logical structmap.
    normative_structmap_el = mw._normative_structmap()
    assert normative_structmap_el.find(
        exists_in_both_path, metsrw.NAMESPACES) is not None
    empty_div_el = normative_structmap_el.find(
        exists_in_normative_only_path, metsrw.NAMESPACES)
    assert empty_div_el is not None
    # Expect that the empty directory in the normative logical structmap
    # has a DMDID that references a dmdSec in the METS document and that
    # this dmdSec contains a PREMIS object.
    dmdid = empty_div_el.get('DMDID')
    assert dmdid.startswith('dmdSec_')
    doc = mw.serialize()
    empty_dir_dmd_sec = doc.find(
        'mets:dmdSec[@ID="{}"]'.format(dmdid), metsrw.NAMESPACES)
    assert empty_dir_dmd_sec is not None
    xml_data_el = empty_dir_dmd_sec.find(
        'mets:mdWrap/mets:xmlData', metsrw.NAMESPACES)
    premis_object_el_retrieved = xml_data_el.find(
        'premis:object', PREMIS_3_0_NAMESPACES)
    d_empty_id_retrieved = premis_object_el_retrieved.find(
        'premis:objectIdentifier/premis:objectIdentifierValue',
        PREMIS_3_0_NAMESPACES).text
    assert d_empty_id_retrieved == d_empty_id
    # The PREMIS 3.0 xsi:type set above should round-trip intact.
    xsi_type = premis_object_el_retrieved.get(
        '{}type'.format(lxmlns('xsi', premis_version='3.0')))
    assert xsi_type == 'premis:intellectualEntity'
def test_mets_header(self):
    """_mets_header should stamp CREATEDATE on the metsHdr element."""
    document = metsrw.METSDocument()
    created = '2014-07-16T22:52:02.480108'
    header = document._mets_header(created)
    assert header.tag == '{http://www.loc.gov/METS/}metsHdr'
    assert header.attrib['CREATEDATE'] == created
def test_parse_tree_createdate_too_new(self):
    """A CREATEDATE in the future should raise a ParseError."""
    document = metsrw.METSDocument()
    document.tree = etree.parse('fixtures/createdate_too_new.xml')
    with pytest.raises(metsrw.ParseError):
        document._parse_tree()