def test_parse_metadata_csv_repeated_columns(self):
    """It should put repeated elements into a list of values."""
    # Write a metadata.csv whose header repeats the dc.type column.
    rows = [
        ["Filename", "dc.title", "dc.type", "dc.type", "dc.type"],
        ["objects/foo.jpg", "Foo", "Photograph", "Still image", "Picture"],
    ]
    with self.metadata_file.open("wb") as stream:
        csv.writer(stream).writerows(rows)
    # Parse the CSV under test.
    dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV(
        Job("stub", "stub", []), str(self.metadata_file)
    )
    # Every value of the repeated column must be collected into one list,
    # and the header order must be preserved.
    assert dc
    assert "objects/foo.jpg" in dc
    assert "dc.title" in dc["objects/foo.jpg"]
    assert dc["objects/foo.jpg"]["dc.title"] == ["Foo"]
    assert "dc.type" in dc["objects/foo.jpg"]
    assert dc["objects/foo.jpg"]["dc.type"] == ["Photograph", "Still image", "Picture"]
    assert list(dc["objects/foo.jpg"].keys()) == ["dc.title", "dc.type"]
def test_parse_metadata_csv(self):
    """It should parse metadata.csv into a dict keyed by file/dir path.

    Trailing slashes on directory paths (``objects/bar/``) are expected to
    be stripped by the parser, and each cell value wrapped in a list.
    """
    # Create metadata.csv
    data = [
        ['Filename', 'dc.title', 'dc.date', 'Other metadata'],
        ['objects/foo.jpg', 'Foo', '2000', 'Taken on a sunny day'],
        ['objects/bar/', 'Bar', '2000', 'All taken on a rainy day'],
    ]
    with open('metadata.csv', 'wb') as f:
        writer = csv.writer(f)
        for row in data:
            writer.writerow(row)
    # Run test
    dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV('metadata.csv')
    # Verify
    assert dc
    assert 'objects/foo.jpg' in dc
    assert 'dc.title' in dc['objects/foo.jpg']
    assert dc['objects/foo.jpg']['dc.title'] == ['Foo']
    assert 'dc.date' in dc['objects/foo.jpg']
    assert dc['objects/foo.jpg']['dc.date'] == ['2000']
    assert 'Other metadata' in dc['objects/foo.jpg']
    assert dc['objects/foo.jpg']['Other metadata'] == ['Taken on a sunny day']
    # Fix: compare through list() so the assertion also holds on Python 3,
    # where dict.keys() returns a view that never equals a list. This matches
    # the sibling repeated-columns test's idiom.
    assert list(dc['objects/foo.jpg'].keys()) == ['dc.title', 'dc.date', 'Other metadata']
    assert 'objects/bar' in dc
    assert 'dc.title' in dc['objects/bar']
    assert dc['objects/bar']['dc.title'] == ['Bar']
    assert 'dc.date' in dc['objects/bar']
    assert dc['objects/bar']['dc.date'] == ['2000']
    assert 'Other metadata' in dc['objects/bar']
    assert dc['objects/bar']['Other metadata'] == ['All taken on a rainy day']
    assert list(dc['objects/bar'].keys()) == ['dc.title', 'dc.date', 'Other metadata']
def test_parse_metadata_csv_repeated_columns(self):
    """It should put repeated elements into a list of values."""
    # Build a metadata.csv where the dc.type header appears three times.
    rows = [
        ['Filename', 'dc.title', 'dc.type', 'dc.type', 'dc.type'],
        ['objects/foo.jpg', 'Foo', 'Photograph', 'Still image', 'Picture'],
    ]
    with open('metadata.csv', 'wb') as stream:
        csv.writer(stream).writerows(rows)
    # Parse the CSV under test.
    dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV('metadata.csv')
    # All three dc.type cells must be gathered into a single list.
    assert dc
    assert 'objects/foo.jpg' in dc
    assert 'dc.title' in dc['objects/foo.jpg']
    assert dc['objects/foo.jpg']['dc.title'] == ['Foo']
    assert 'dc.type' in dc['objects/foo.jpg']
    assert dc['objects/foo.jpg']['dc.type'] == ['Photograph', 'Still image', 'Picture']
    assert list(dc['objects/foo.jpg'].keys()) == ['dc.title', 'dc.type']
def test_parse_metadata_csv_blank_rows(self):
    """It should skip blank rows in metadata.csv."""
    # Include an entirely empty row after the data row.
    rows = [
        ['Filename', 'dc.title', 'dc.type', 'dc.type', 'dc.type'],
        ['objects/foo.jpg', 'Foo', 'Photograph', 'Still image', 'Picture'],
        [],
    ]
    with open('metadata.csv', 'wb') as stream:
        csv.writer(stream).writerows(rows)
    # Parse the CSV under test.
    dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV('metadata.csv')
    # The blank row must not produce an entry.
    assert dc
    assert len(dc) == 1
    assert 'objects/foo.jpg' in dc
def test_parse_metadata_csv_non_ascii(self):
    """It should round-trip non-ASCII metadata values."""
    # Write a title containing UTF-8 encoded Japanese text.
    rows = [
        ['Filename', 'dc.title'],
        ['objects/foo.jpg', u'元気です'.encode('utf8')],
    ]
    with open('metadata.csv', 'wb') as stream:
        csv.writer(stream).writerows(rows)
    # Parse the CSV under test.
    dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV('metadata.csv')
    # The UTF-8 value must come back unchanged.
    assert dc
    assert 'objects/foo.jpg' in dc
    assert 'dc.title' in dc['objects/foo.jpg']
    assert dc['objects/foo.jpg']['dc.title'] == [u'元気です'.encode('utf8')]
def test_parse_metadata_csv_non_ascii(self):
    """It should parse unicode."""
    # Write a title containing UTF-8 encoded Japanese text.
    rows = [
        ["Filename", "dc.title"],
        ["objects/foo.jpg", "元気です".encode("utf8")],
    ]
    with self.metadata_file.open("wb") as stream:
        csv.writer(stream).writerows(rows)
    # Parse the CSV under test.
    dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV(
        Job("stub", "stub", []), str(self.metadata_file)
    )
    # The UTF-8 value must come back unchanged.
    assert dc
    assert "objects/foo.jpg" in dc
    assert "dc.title" in dc["objects/foo.jpg"]
    assert dc["objects/foo.jpg"]["dc.title"] == ["元気です".encode("utf8")]
def test_parse_metadata_csv(self):
    """It should parse the metadata.csv into a dict."""
    # Two rows: a file entry and a directory entry (with trailing slash).
    rows = [
        ["Filename", "dc.title", "dc.date", "Other metadata"],
        ["objects/foo.jpg", "Foo", "2000", "Taken on a sunny day"],
        ["objects/bar/", "Bar", "2000", "All taken on a rainy day"],
    ]
    with self.metadata_file.open("wb") as stream:
        csv.writer(stream).writerows(rows)
    # Parse the CSV under test.
    dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV(
        Job("stub", "stub", []), str(self.metadata_file)
    )
    # File entry: each cell becomes a single-item list, header order kept.
    assert dc
    assert "objects/foo.jpg" in dc
    assert "dc.title" in dc["objects/foo.jpg"]
    assert dc["objects/foo.jpg"]["dc.title"] == ["Foo"]
    assert "dc.date" in dc["objects/foo.jpg"]
    assert dc["objects/foo.jpg"]["dc.date"] == ["2000"]
    assert "Other metadata" in dc["objects/foo.jpg"]
    assert dc["objects/foo.jpg"]["Other metadata"] == ["Taken on a sunny day"]
    assert list(dc["objects/foo.jpg"].keys()) == ["dc.title", "dc.date", "Other metadata"]
    # Directory entry: trailing slash stripped from the key.
    assert "objects/bar" in dc
    assert "dc.title" in dc["objects/bar"]
    assert dc["objects/bar"]["dc.title"] == ["Bar"]
    assert "dc.date" in dc["objects/bar"]
    assert dc["objects/bar"]["dc.date"] == ["2000"]
    assert "Other metadata" in dc["objects/bar"]
    assert dc["objects/bar"]["Other metadata"] == ["All taken on a rainy day"]
    assert list(dc["objects/bar"].keys()) == ["dc.title", "dc.date", "Other metadata"]
def update_metadata_csv(job, mets, metadata_csv, sip_uuid, sip_dir, state):
    """Re-parse metadata.csv and attach its metadata as dmdSecs in the METS.

    For each file path in the CSV, look the file up in the database by its
    original (then current) location, find its metsrw FSEntry, and add a new
    Dublin Core dmdSec built from the CSV row. Returns the updated ``mets``
    document.

    NOTE(review): assumes ``mets`` is a metsrw METSDocument-like object
    (``get_file``, FSEntry ``add_dublin_core``/``dmdsecs``) — confirm.
    """
    job.pyprint('Parse new metadata.csv')
    # Resolve the placeholder to a real filesystem path (first occurrence only).
    full_path = metadata_csv.currentlocation.replace('%SIPDirectory%', sip_dir, 1)
    csvmetadata = createmetscsv.parseMetadataCSV(job, full_path)
    # FIXME This doesn't support having both DC and non-DC metadata in dmdSecs
    # If createDmdSecsFromCSVParsedMetadata returns more than 1 dmdSec, behaviour is undefined
    for f, md in csvmetadata.items():
        # Verify file is in AIP
        job.pyprint('Looking for', f, 'from metadata.csv in SIP')
        # Find File with original or current locationg matching metadata.csv
        # Prepend % to match the end of %SIPDirectory% or %transferDirectory%
        try:
            file_obj = models.File.objects.get(sip_id=sip_uuid, originallocation__endswith='%' + f)
        except models.File.DoesNotExist:
            # Fall back to the current location before giving up on this row.
            try:
                file_obj = models.File.objects.get(
                    sip_id=sip_uuid, currentlocation__endswith='%' + f)
            except models.File.DoesNotExist:
                job.pyprint(f, 'not found in database')
                continue
        job.pyprint(f, 'found in database')
        fsentry = mets.get_file(file_uuid=file_obj.uuid)
        job.pyprint(f, 'was associated with', fsentry.dmdids)
        # Create dmdSec
        new_dmdsecs = createmets2.createDmdSecsFromCSVParsedMetadata(
            job, md, state)
        # Add both
        for new_dmdsec in new_dmdsecs:
            # need to strip new_d to just the DC part
            new_dc = new_dmdsec.find('.//dcterms:dublincore', namespaces=ns.NSMAP)
            new_metsrw_dmdsec = fsentry.add_dublin_core(new_dc)
            # If the entry already had a dmdSec, the newly appended one
            # supersedes the previous one (now at index -2).
            if len(fsentry.dmdsecs) > 1:
                fsentry.dmdsecs[-2].replace_with(new_metsrw_dmdsec)
        job.pyprint(f, 'now associated with', fsentry.dmdids)
    return mets
def test_parse_metadata_csv_blank_rows(self):
    """It should skip blank rows."""
    # Include an entirely empty row after the data row.
    rows = [
        ["Filename", "dc.title", "dc.type", "dc.type", "dc.type"],
        ["objects/foo.jpg", "Foo", "Photograph", "Still image", "Picture"],
        [],
    ]
    with self.metadata_file.open("wb") as stream:
        csv.writer(stream).writerows(rows)
    # Parse the CSV under test.
    dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV(
        Job("stub", "stub", []), str(self.metadata_file)
    )
    # The blank row must not produce an entry.
    assert dc
    assert len(dc) == 1
    assert "objects/foo.jpg" in dc
def update_metadata_csv(root, metadata_csv, sip_uuid, sip_dir, now):
    """Parse metadata.csv and insert updated dmdSecs into an lxml METS tree.

    For every CSV row, find the matching File in the database, locate its
    div in the physical structMap, mark existing dmdSecs as ``original``,
    and append new dmdSecs (STATUS ``original`` or ``updated``) after the
    existing ones. Returns the modified ``root`` element.

    NOTE(review): uses dict.iteritems(), so this version targets Python 2.
    """
    print('Parse new metadata.csv')
    # Resolve the placeholder to a real filesystem path (first occurrence only).
    full_path = metadata_csv.currentlocation.replace('%SIPDirectory%', sip_dir, 1)
    csvmetadata = createmetscsv.parseMetadataCSV(full_path)
    # Set globalDmdSecCounter so createDmdSecsFromCSVParsedMetadata will work
    createmets2.globalDmdSecCounter = int(
        root.xpath('count(mets:dmdSec)', namespaces=ns.NSMAP))
    # dmdSecs added after existing dmdSecs or metsHdr if none
    try:
        add_after = root.findall('mets:dmdSec', namespaces=ns.NSMAP)[-1]
    except IndexError:
        add_after = root.find('mets:metsHdr', namespaces=ns.NSMAP)
    aip_div = root.find('mets:structMap[@TYPE="physical"]/mets:div', namespaces=ns.NSMAP)
    # FIXME Does this have to support having non DC metadata in the CSV? Assuming not
    for f, md in csvmetadata.iteritems():
        # Verify file is in AIP
        print('Looking for', f, 'from metadata.csv in SIP')
        # Find File with original or current locationg matching metadata.csv
        # Prepend % to match the end of %SIPDirectory% or %transferDirectory%
        try:
            file_obj = models.File.objects.get(sip_id=sip_uuid, originallocation__endswith='%' + f)
        except models.File.DoesNotExist:
            # Fall back to the current location before giving up on this row.
            try:
                file_obj = models.File.objects.get(
                    sip_id=sip_uuid, currentlocation__endswith='%' + f)
            except models.File.DoesNotExist:
                print(f, 'not found in database')
                continue
        print(f, 'found in database')
        # Find structMap div to associate with
        split_path = file_obj.currentlocation.replace('%SIPDirectory%', '', 1).split('/')
        # Walk the structMap one path component at a time, matching on LABEL.
        obj_div = aip_div
        for label in split_path:
            child = obj_div.find('mets:div[@LABEL="' + label + '"]', namespaces=ns.NSMAP)
            if child is None:
                print(f, 'not in structMap')
                break
            obj_div = child
        if obj_div is None:
            continue
        # Existing DMDID attribute (space-separated IDs), if any.
        ids = obj_div.get('DMDID', '')
        print(f, 'was associated with', ids)
        # Create dmdSec
        new_dmdsecs = createmets2.createDmdSecsFromCSVParsedMetadata(md)
        # Add DMDIDs
        new_ids = [d.get('ID') for d in new_dmdsecs]
        new_ids = ids.split() + new_ids
        print(f, 'now associated with', ' '.join(new_ids))
        obj_div.set('DMDID', ' '.join(new_ids))
        # Update old dmdSecs if needed
        new = False
        if not ids:
            # Newly generated dmdSec is the original
            new = True
        else:
            # Find the dmdSec with no status and mark it original
            search_ids = ' or '.join(['@ID="%s"' % x for x in ids.split()])
            dmdsecs = root.xpath('mets:dmdSec[%s][not(@STATUS)]' % search_ids, namespaces=ns.NSMAP)
            for d in dmdsecs:
                d.set('STATUS', 'original')
                print(d.get('ID'), 'STATUS is original')
        # Add dmdSecs to document
        for d in new_dmdsecs:
            d.set('CREATED', now)
            if new:
                d.set('STATUS', 'original')
            else:
                d.set('STATUS', 'updated')
            print(d.get('ID'), 'STATUS is', d.get('STATUS'))
            # Keep document order: chain each new dmdSec after the last one.
            add_after.addnext(d)
            add_after = d
    return root
def update_metadata_csv(job, mets, metadata_csv, sip_uuid, sip_dir, state):
    """Re-parse metadata.csv and update DC and non-DC dmdSecs in the METS.

    For each CSV row, find the matching File (database) or directory (METS
    file), split its existing dmdSecs into Dublin Core and CUSTOM groups,
    then add new dmdSecs built from the row, replacing the original of the
    matching group. Returns the updated ``mets`` document.

    NOTE(review): relies on module helpers ``_get_directory_fsentry`` and
    ``_replace_original_dmdsec`` defined elsewhere in this file.
    """
    job.pyprint("Parse new metadata.csv")
    # Resolve the placeholder to a real filesystem path (first occurrence only).
    full_path = metadata_csv.currentlocation.replace("%SIPDirectory%", sip_dir, 1)
    csvmetadata = createmetscsv.parseMetadataCSV(job, full_path)
    # FIXME This doesn't support having both DC and non-DC metadata in dmdSecs
    # If createDmdSecsFromCSVParsedMetadata returns more than 1 dmdSec, behaviour is undefined
    for f, md in csvmetadata.items():
        # Verify file is in AIP
        job.pyprint("Looking for", f, "from metadata.csv in SIP")
        # Find File with original or current locationg matching metadata.csv
        # Prepend % to match the end of %SIPDirectory% or %transferDirectory%
        file_obj = None
        try:
            file_obj = models.File.objects.get(
                sip_id=sip_uuid, originallocation__endswith="%" + f
            )
        except models.File.DoesNotExist:
            # Fall back to the current location; a miss is handled below by
            # trying the METS file itself (directories are not File rows).
            try:
                file_obj = models.File.objects.get(
                    sip_id=sip_uuid, currentlocation__endswith="%" + f
                )
            except models.File.DoesNotExist:
                pass
        if file_obj is not None:
            fsentry = mets.get_file(file_uuid=file_obj.uuid)
        else:
            fsentry = _get_directory_fsentry(mets, f)
        if fsentry is None:
            job.pyprint(f, "not found in database or METS file")
            continue
        job.pyprint(f, "found in database or METS file")
        job.pyprint(f, "was associated with", fsentry.dmdids)
        # Save existing dmdSecs
        dc_dmdsecs = []
        non_dc_dmdsecs = []
        for dmdsec in fsentry.dmdsecs:
            mdwrap = dmdsec.contents
            if mdwrap.mdtype == "DC":
                dc_dmdsecs.append(dmdsec)
            elif (
                mdwrap.mdtype == "OTHER"
                and getattr(mdwrap, "othermdtype", None) == "CUSTOM"
            ):
                non_dc_dmdsecs.append(dmdsec)
        # Create dmdSec
        new_dmdsecs = createmets2.createDmdSecsFromCSVParsedMetadata(job, md, state)
        # Add both
        for new_dmdsec in new_dmdsecs:
            # need to strip new_d to just the DC part
            new_dc = new_dmdsec.find(".//dcterms:dublincore", namespaces=ns.NSMAP)
            if new_dc is not None:
                new_metsrw_dmdsec = fsentry.add_dublin_core(new_dc)
                _replace_original_dmdsec(dc_dmdsecs, new_metsrw_dmdsec)
            else:
                # Not DC: extract the CUSTOM OTHER mdWrap payload instead.
                new_non_dc = new_dmdsec.find(
                    './/mets:mdWrap[@MDTYPE="OTHER"][@OTHERMDTYPE="CUSTOM"]/mets:xmlData',
                    namespaces=ns.NSMAP,
                )
                if new_non_dc is not None:
                    new_metsrw_dmdsec = fsentry.add_dmdsec(
                        new_non_dc, "OTHER", othermdtype="CUSTOM"
                    )
                    _replace_original_dmdsec(non_dc_dmdsecs, new_metsrw_dmdsec)
        job.pyprint(f, "now associated with", fsentry.dmdids)
    return mets