def test_dmdsec_from_csv_parsed_metadata_no_data(self):
    """Empty parsed metadata should produce an empty list of dmdSecs."""
    parsed_metadata = {}
    # Test
    dmdsecs = archivematicaCreateMETS2.createDmdSecsFromCSVParsedMetadata(
        parsed_metadata)
    # Verify
    assert dmdsecs == []
def test_dmdsec_from_csv_parsed_metadata_other_only(self):
    """It should create a single OTHER/CUSTOM dmdSec for non-DC metadata."""
    parsed_metadata = collections.OrderedDict()
    parsed_metadata["Title"] = ["Yamani Weapons"]
    parsed_metadata["Contributor"] = [u"雪 ユキ".encode('utf8')]
    parsed_metadata["Long Description"] = [
        'This is about how glaives are used in the Yamani Islands']

    # Test
    sections = archivematicaCreateMETS2.createDmdSecsFromCSVParsedMetadata(
        parsed_metadata)

    # Verify
    assert sections
    assert len(sections) == 1
    section = sections[0]
    assert section.tag == '{http://www.loc.gov/METS/}dmdSec'
    assert 'ID' in section.attrib

    wrapper = section[0]
    assert wrapper.tag == '{http://www.loc.gov/METS/}mdWrap'
    # Check presence before value, for each expected mdWrap attribute
    for attribute, value in (('MDTYPE', 'OTHER'), ('OTHERMDTYPE', 'CUSTOM')):
        assert attribute in wrapper.attrib
        assert wrapper.attrib[attribute] == value

    payload = wrapper[0]
    assert payload.tag == '{http://www.loc.gov/METS/}xmlData'
    # Elements are direct children of xmlData
    expected_children = [
        ('title', 'Yamani Weapons'),
        ('contributor', u'雪 ユキ'),
        ('long_description',
         'This is about how glaives are used in the Yamani Islands'),
    ]
    assert len(payload) == 3
    for position, (tag, text) in enumerate(expected_children):
        assert payload[position].tag == tag
        assert payload[position].text == text
def test_dmdsec_from_csv_parsed_metadata_both(self):
    """Mixed DC and non-DC input should yield a DC dmdSec and an OTHER dmdSec."""
    parsed_metadata = collections.OrderedDict()
    parsed_metadata["dc.title"] = ["Yamani Weapons"]
    parsed_metadata["dc.contributor"] = [u"雪 ユキ".encode('utf8')]
    parsed_metadata["dcterms.isPartOf"] = ["AIC#42"]
    parsed_metadata["Title"] = ["Yamani Weapons"]
    parsed_metadata["Contributor"] = [u"雪 ユキ".encode('utf8')]
    parsed_metadata["Long Description"] = [
        'This is about how glaives are used in the Yamani Islands']

    # Test
    sections = archivematicaCreateMETS2.createDmdSecsFromCSVParsedMetadata(
        parsed_metadata)

    # Verify
    assert sections
    assert len(sections) == 2
    # Return can be DC or OTHER first, but in this case DC should be first
    dc_section = sections[0]
    assert dc_section.tag == '{http://www.loc.gov/METS/}dmdSec'
    assert 'ID' in dc_section.attrib
    dc_wrapper = dc_section[0]
    assert dc_wrapper.tag == '{http://www.loc.gov/METS/}mdWrap'
    assert 'MDTYPE' in dc_wrapper.attrib
    assert dc_wrapper.attrib['MDTYPE'] == 'DC'
    dc_payload = dc_wrapper[0]
    assert dc_payload.tag == '{http://www.loc.gov/METS/}xmlData'
    # Elements are children of dublincore tag
    dc_root = dc_payload[0]
    assert dc_root.tag == '{http://purl.org/dc/terms/}dublincore'
    expected_dc = [
        ('{http://purl.org/dc/elements/1.1/}title', 'Yamani Weapons'),
        ('{http://purl.org/dc/elements/1.1/}contributor', u'雪 ユキ'),
        ('{http://purl.org/dc/terms/}isPartOf', 'AIC#42'),
    ]
    assert len(dc_root) == 3
    for position, (tag, text) in enumerate(expected_dc):
        assert dc_root[position].tag == tag
        assert dc_root[position].text == text

    other_section = sections[1]
    assert other_section.tag == '{http://www.loc.gov/METS/}dmdSec'
    assert 'ID' in other_section.attrib
    other_wrapper = other_section[0]
    assert other_wrapper.tag == '{http://www.loc.gov/METS/}mdWrap'
    # Check presence before value, for each expected mdWrap attribute
    for attribute, value in (('MDTYPE', 'OTHER'), ('OTHERMDTYPE', 'CUSTOM')):
        assert attribute in other_wrapper.attrib
        assert other_wrapper.attrib[attribute] == value
    other_payload = other_wrapper[0]
    assert other_payload.tag == '{http://www.loc.gov/METS/}xmlData'
    # Elements are direct children of xmlData
    expected_other = [
        ('title', 'Yamani Weapons'),
        ('contributor', u'雪 ユキ'),
        ('long_description',
         'This is about how glaives are used in the Yamani Islands'),
    ]
    assert len(other_payload) == 3
    for position, (tag, text) in enumerate(expected_other):
        assert other_payload[position].tag == tag
        assert other_payload[position].text == text
def test_dmdsec_from_csv_parsed_metadata_repeats(self):
    """A field with several values should produce one element per value."""
    parsed_metadata = collections.OrderedDict()
    parsed_metadata["dc.contributor"] = ["Yuki", u"雪 ユキ".encode('utf8')]
    parsed_metadata["Contributor"] = ["Yuki", u"雪 ユキ".encode('utf8')]

    # Test
    sections = archivematicaCreateMETS2.createDmdSecsFromCSVParsedMetadata(
        parsed_metadata)

    # Verify
    assert sections
    assert len(sections) == 2
    # Return can be DC or OTHER first, but in this case DC should be first
    dc_section = sections[0]
    assert dc_section.tag == '{http://www.loc.gov/METS/}dmdSec'
    assert 'ID' in dc_section.attrib
    dc_wrapper = dc_section[0]
    assert dc_wrapper.tag == '{http://www.loc.gov/METS/}mdWrap'
    assert 'MDTYPE' in dc_wrapper.attrib
    assert dc_wrapper.attrib['MDTYPE'] == 'DC'
    dc_payload = dc_wrapper[0]
    assert dc_payload.tag == '{http://www.loc.gov/METS/}xmlData'
    # Elements are children of dublincore tag
    dc_root = dc_payload[0]
    assert dc_root.tag == '{http://purl.org/dc/terms/}dublincore'
    assert len(dc_root) == 2
    for position, text in enumerate(['Yuki', u'雪 ユキ']):
        assert dc_root[position].tag == (
            '{http://purl.org/dc/elements/1.1/}contributor')
        assert dc_root[position].text == text

    other_section = sections[1]
    assert other_section.tag == '{http://www.loc.gov/METS/}dmdSec'
    assert 'ID' in other_section.attrib
    other_wrapper = other_section[0]
    assert other_wrapper.tag == '{http://www.loc.gov/METS/}mdWrap'
    # Check presence before value, for each expected mdWrap attribute
    for attribute, value in (('MDTYPE', 'OTHER'), ('OTHERMDTYPE', 'CUSTOM')):
        assert attribute in other_wrapper.attrib
        assert other_wrapper.attrib[attribute] == value
    other_payload = other_wrapper[0]
    assert other_payload.tag == '{http://www.loc.gov/METS/}xmlData'
    # Elements are direct children of xmlData
    assert len(other_payload) == 2
    for position, text in enumerate(['Yuki', u'雪 ユキ']):
        assert other_payload[position].tag == 'contributor'
        assert other_payload[position].text == text
def update_metadata_csv(mets, metadata_csv, sip_uuid, sip_dir):
    """Update per-file Dublin Core dmdSecs in ``mets`` from a new metadata.csv.

    The CSV is parsed into a mapping of file path to metadata. For every path
    that matches a File row of the SIP (by original or current location), the
    matching entry is looked up in ``mets``, dmdSecs are generated from the
    metadata, and the Dublin Core part of each one is added to the entry. When
    the entry then has more than one dmdSec, the previously last one is
    replaced with the newly added one.

    :param mets: Document object providing ``get_file(file_uuid=...)``
        (presumably a metsrw METSDocument -- confirm against the caller).
    :param metadata_csv: Object whose ``currentlocation`` is the path of the
        metadata.csv, possibly starting with ``%SIPDirectory%``.
    :param sip_uuid: UUID of the SIP used to filter File rows.
    :param sip_dir: Path substituted for ``%SIPDirectory%``.
    :return: The ``mets`` object that was passed in.
    """
    print('Parse new metadata.csv')
    full_path = metadata_csv.currentlocation.replace('%SIPDirectory%', sip_dir, 1)
    csvmetadata = createmetscsv.parseMetadataCSV(full_path)

    # FIXME This doesn't support having both DC and non-DC metadata in dmdSecs
    # Only dmdSecs that contain a dcterms:dublincore element are applied;
    # any other dmdSec returned by createDmdSecsFromCSVParsedMetadata is
    # skipped (see below).
    for f, md in csvmetadata.items():
        # Verify file is in AIP
        print('Looking for', f, 'from metadata.csv in SIP')
        # Find File with original or current location matching metadata.csv
        # Prepend % to match the end of %SIPDirectory% or %transferDirectory%
        try:
            file_obj = models.File.objects.get(
                sip_id=sip_uuid, originallocation__endswith='%' + f)
        except models.File.DoesNotExist:
            try:
                file_obj = models.File.objects.get(
                    sip_id=sip_uuid, currentlocation__endswith='%' + f)
            except models.File.DoesNotExist:
                print(f, 'not found in database')
                continue
        print(f, 'found in database')

        fsentry = mets.get_file(file_uuid=file_obj.uuid)
        # NOTE(review): assumes get_file signals "no such file" by returning
        # None -- confirm; without this guard the attribute access below would
        # raise AttributeError in that case.
        if fsentry is None:
            print(f, 'not found in METS')
            continue
        print(f, 'was associated with', fsentry.dmdids)

        # Create dmdSec
        new_dmdsecs = createmets2.createDmdSecsFromCSVParsedMetadata(md)
        # Add both
        for new_dmdsec in new_dmdsecs:
            # need to strip new_d to just the DC part
            new_dc = new_dmdsec.find('.//dcterms:dublincore', namespaces=ns.NSMAP)
            if new_dc is None:
                # Non-DC dmdSec: there is no Dublin Core element to add, and
                # passing None on to add_dublin_core is not meaningful.
                print(f, 'skipping dmdSec', new_dmdsec.get('ID'), 'with no Dublin Core')
                continue
            new_metsrw_dmdsec = fsentry.add_dublin_core(new_dc)
            if len(fsentry.dmdsecs) > 1:
                # The dmdSec that was last before the addition is superseded
                fsentry.dmdsecs[-2].replace_with(new_metsrw_dmdsec)

        print(f, 'now associated with', fsentry.dmdids)

    return mets
def test_dmdsec_from_csv_parsed_metadata_no_data(self):
    """An empty metadata mapping should return an empty list."""
    empty_metadata = {}
    # Test
    created = archivematicaCreateMETS2.createDmdSecsFromCSVParsedMetadata(
        empty_metadata)
    # Verify
    assert created == []
def test_dmdsec_from_csv_parsed_metadata_dc_only(self):
    """It should create a single DC dmdSec when only DC metadata is given."""
    parsed_metadata = collections.OrderedDict()
    parsed_metadata["dc.title"] = ["Yamani Weapons"]
    parsed_metadata["dc.creator"] = ["Keladry of Mindelan"]
    parsed_metadata["dc.subject"] = ["Glaives"]
    parsed_metadata["dc.description"] = ["Glaives are cool"]
    parsed_metadata["dc.publisher"] = ["Tortall Press"]
    parsed_metadata["dc.contributor"] = [u"雪 ユキ".encode('utf8')]
    parsed_metadata["dc.date"] = ["2015"]
    parsed_metadata["dc.type"] = ["Archival Information Package"]
    parsed_metadata["dc.format"] = ["parchement"]
    parsed_metadata["dc.identifier"] = ["42/1"]
    parsed_metadata["dc.source"] = ["Numair's library"]
    parsed_metadata["dc.relation"] = ["None"]
    parsed_metadata["dc.language"] = ["en"]
    parsed_metadata["dc.rights"] = ["Public Domain"]
    parsed_metadata["dcterms.isPartOf"] = ["AIC#42"]

    # Test
    sections = archivematicaCreateMETS2.createDmdSecsFromCSVParsedMetadata(
        parsed_metadata)

    # Verify
    assert sections
    assert len(sections) == 1
    section = sections[0]
    assert section.tag == '{http://www.loc.gov/METS/}dmdSec'
    assert 'ID' in section.attrib
    wrapper = section[0]
    assert wrapper.tag == '{http://www.loc.gov/METS/}mdWrap'
    assert 'MDTYPE' in wrapper.attrib
    assert wrapper.attrib['MDTYPE'] == 'DC'
    payload = wrapper[0]
    assert payload.tag == '{http://www.loc.gov/METS/}xmlData'
    # Elements are children of dublincore tag
    dc_root = payload[0]
    assert dc_root.tag == '{http://purl.org/dc/terms/}dublincore'
    # Expected (tag, text) of each child, in document order
    expected_children = [
        ('{http://purl.org/dc/elements/1.1/}title', 'Yamani Weapons'),
        ('{http://purl.org/dc/elements/1.1/}creator', 'Keladry of Mindelan'),
        ('{http://purl.org/dc/elements/1.1/}subject', 'Glaives'),
        ('{http://purl.org/dc/elements/1.1/}description', 'Glaives are cool'),
        ('{http://purl.org/dc/elements/1.1/}publisher', 'Tortall Press'),
        ('{http://purl.org/dc/elements/1.1/}contributor', u'雪 ユキ'),
        ('{http://purl.org/dc/elements/1.1/}date', '2015'),
        ('{http://purl.org/dc/elements/1.1/}type',
         'Archival Information Package'),
        ('{http://purl.org/dc/elements/1.1/}format', 'parchement'),
        ('{http://purl.org/dc/elements/1.1/}identifier', '42/1'),
        ('{http://purl.org/dc/elements/1.1/}source', "Numair's library"),
        ('{http://purl.org/dc/elements/1.1/}relation', 'None'),
        ('{http://purl.org/dc/elements/1.1/}language', 'en'),
        ('{http://purl.org/dc/elements/1.1/}rights', 'Public Domain'),
        ('{http://purl.org/dc/terms/}isPartOf', 'AIC#42'),
    ]
    assert len(dc_root) == 15
    for position, (tag, text) in enumerate(expected_children):
        assert dc_root[position].tag == tag
        assert dc_root[position].text == text
def update_metadata_csv(root, metadata_csv, sip_uuid, sip_dir, now):
    """Add dmdSecs generated from a new metadata.csv to the METS tree ``root``.

    For each path in the parsed CSV that matches a File row of the SIP and a
    div in the physical structMap, new dmdSecs are created, their IDs are
    appended to the div's DMDID attribute, and the dmdSecs are inserted after
    the existing dmdSecs (or after the metsHdr if there are none). If the div
    already had DMDIDs, its existing dmdSecs without a STATUS are marked
    'original' and the new ones 'updated'; otherwise the new ones are marked
    'original'.

    :param root: METS root element, modified in place (queried with
        find/findall/xpath; presumably an lxml element -- confirm).
    :param metadata_csv: Object whose ``currentlocation`` is the path of the
        metadata.csv, possibly starting with ``%SIPDirectory%``.
    :param sip_uuid: UUID of the SIP used to filter File rows.
    :param sip_dir: Path substituted for ``%SIPDirectory%``.
    :param now: Value stored in the CREATED attribute of each new dmdSec.
    :return: ``root``.
    """
    print('Parse new metadata.csv')
    full_path = metadata_csv.currentlocation.replace('%SIPDirectory%', sip_dir, 1)
    csvmetadata = createmetscsv.parseMetadataCSV(full_path)

    # Set globalDmdSecCounter so createDmdSecsFromCSVParsedMetadata will work
    createmets2.globalDmdSecCounter = int(
        root.xpath('count(mets:dmdSec)', namespaces=ns.NSMAP))

    # dmdSecs added after existing dmdSecs or metsHdr if none
    try:
        add_after = root.findall('mets:dmdSec', namespaces=ns.NSMAP)[-1]
    except IndexError:
        add_after = root.find('mets:metsHdr', namespaces=ns.NSMAP)

    aip_div = root.find('mets:structMap[@TYPE="physical"]/mets:div',
                        namespaces=ns.NSMAP)

    # FIXME Does this have to support having non DC metadata in the CSV? Assuming not
    # items() instead of iteritems(): available on both Python 2 and 3
    for f, md in csvmetadata.items():
        # Verify file is in AIP
        print('Looking for', f, 'from metadata.csv in SIP')
        # Find File with original or current location matching metadata.csv
        # Prepend % to match the end of %SIPDirectory% or %transferDirectory%
        try:
            file_obj = models.File.objects.get(
                sip_id=sip_uuid, originallocation__endswith='%' + f)
        except models.File.DoesNotExist:
            try:
                file_obj = models.File.objects.get(
                    sip_id=sip_uuid, currentlocation__endswith='%' + f)
            except models.File.DoesNotExist:
                print(f, 'not found in database')
                continue
        print(f, 'found in database')

        # Find structMap div to associate with
        split_path = file_obj.currentlocation.replace(
            '%SIPDirectory%', '', 1).split('/')
        obj_div = aip_div
        for label in split_path:
            # NOTE(review): a label containing a double quote would produce
            # an invalid path expression here.
            obj_div = obj_div.find('mets:div[@LABEL="' + label + '"]',
                                   namespaces=ns.NSMAP)
            if obj_div is None:
                print(f, 'not in structMap')
                break
        if obj_div is None:
            # A path component was missing: skip the file rather than attach
            # its metadata to the deepest ancestor div that did match.
            continue
        ids = obj_div.get('DMDID', '')
        print(f, 'was associated with', ids)

        # Create dmdSec
        new_dmdsecs = createmets2.createDmdSecsFromCSVParsedMetadata(md)
        # Add DMDIDs
        new_ids = ids.split() + [d.get('ID') for d in new_dmdsecs]
        print(f, 'now associated with', ' '.join(new_ids))
        obj_div.set('DMDID', ' '.join(new_ids))

        # Update old dmdSecs if needed
        # With no previous DMDIDs, the newly generated dmdSec is the original
        new = not ids
        if not new:
            # Find the dmdSec with no status and mark it original
            search_ids = ' or '.join('@ID="%s"' % x for x in ids.split())
            dmdsecs = root.xpath('mets:dmdSec[%s][not(@STATUS)]' % search_ids,
                                 namespaces=ns.NSMAP)
            for d in dmdsecs:
                d.set('STATUS', 'original')
                print(d.get('ID'), 'STATUS is original')

        # Add dmdSecs to document
        for d in new_dmdsecs:
            d.set('CREATED', now)
            d.set('STATUS', 'original' if new else 'updated')
            print(d.get('ID'), 'STATUS is', d.get('STATUS'))
            add_after.addnext(d)
            # Keep insertion order: the next dmdSec goes after this one
            add_after = d
    return root