Example #1
0
 def test_dmdsec_from_csv_parsed_metadata_no_data(self):
     """Empty parsed metadata must produce an empty list of dmdSecs."""
     # Test: call with a mapping that has no columns at all
     dmdsecs = archivematicaCreateMETS2.createDmdSecsFromCSVParsedMetadata({})
     # Verify: nothing was generated
     assert dmdsecs == []
 def test_dmdsec_from_csv_parsed_metadata_other_only(self):
     """It should create a single OTHER/CUSTOM dmdSec for non-DC metadata."""
     mets_ns = '{http://www.loc.gov/METS/}'
     long_description = 'This is about how glaives are used in the Yamani Islands'
     parsed = collections.OrderedDict()
     parsed["Title"] = ["Yamani Weapons"]
     parsed["Contributor"] = [u"雪 ユキ".encode('utf8')]
     parsed["Long Description"] = [long_description]
     # Test
     dmdsecs = archivematicaCreateMETS2.createDmdSecsFromCSVParsedMetadata(parsed)
     # Verify: exactly one dmdSec wrapping an OTHER/CUSTOM mdWrap
     assert dmdsecs
     assert len(dmdsecs) == 1
     only_dmdsec = dmdsecs[0]
     assert only_dmdsec.tag == mets_ns + 'dmdSec'
     assert 'ID' in only_dmdsec.attrib
     md_wrap = only_dmdsec[0]
     assert md_wrap.tag == mets_ns + 'mdWrap'
     wrap_attrs = md_wrap.attrib
     assert 'MDTYPE' in wrap_attrs
     assert wrap_attrs['MDTYPE'] == 'OTHER'
     assert 'OTHERMDTYPE' in wrap_attrs
     assert wrap_attrs['OTHERMDTYPE'] == 'CUSTOM'
     xml_data = md_wrap[0]
     assert xml_data.tag == mets_ns + 'xmlData'
     # Elements are direct children of xmlData, in input order
     expected_children = [
         ('title', 'Yamani Weapons'),
         ('contributor', u'雪 ユキ'),
         ('long_description', long_description),
     ]
     assert len(xml_data) == 3
     for position, (tag, text) in enumerate(expected_children):
         assert xml_data[position].tag == tag
         assert xml_data[position].text == text
    def test_dmdsec_from_csv_parsed_metadata_both(self):
        """Mixed DC and non-DC columns should yield a DC dmdSec and an OTHER dmdSec."""
        mets_ns = '{http://www.loc.gov/METS/}'
        dc_ns = '{http://purl.org/dc/elements/1.1/}'
        dcterms_ns = '{http://purl.org/dc/terms/}'
        long_description = 'This is about how glaives are used in the Yamani Islands'
        parsed = collections.OrderedDict([
            ("dc.title", ["Yamani Weapons"]),
            ("dc.contributor", [u"雪 ユキ".encode('utf8')]),
            ("dcterms.isPartOf", ["AIC#42"]),
            ("Title", ["Yamani Weapons"]),
            ("Contributor", [u"雪 ユキ".encode('utf8')]),
            ("Long Description", [long_description]),
        ])
        # Test
        dmdsecs = archivematicaCreateMETS2.createDmdSecsFromCSVParsedMetadata(parsed)
        # Verify
        assert dmdsecs
        assert len(dmdsecs) == 2

        # Return can be DC or OTHER first, but in this case DC should be first
        first_dmdsec = dmdsecs[0]
        assert first_dmdsec.tag == mets_ns + 'dmdSec'
        assert 'ID' in first_dmdsec.attrib
        dc_wrap = first_dmdsec[0]
        assert dc_wrap.tag == mets_ns + 'mdWrap'
        assert 'MDTYPE' in dc_wrap.attrib
        assert dc_wrap.attrib['MDTYPE'] == 'DC'
        dc_xml_data = dc_wrap[0]
        assert dc_xml_data.tag == mets_ns + 'xmlData'
        # Elements are children of the dublincore container
        dublincore = dc_xml_data[0]
        assert dublincore.tag == dcterms_ns + 'dublincore'
        expected_dc = [
            (dc_ns + 'title', 'Yamani Weapons'),
            (dc_ns + 'contributor', u'雪 ユキ'),
            (dcterms_ns + 'isPartOf', 'AIC#42'),
        ]
        assert len(dublincore) == 3
        for position, (tag, text) in enumerate(expected_dc):
            assert dublincore[position].tag == tag
            assert dublincore[position].text == text

        second_dmdsec = dmdsecs[1]
        assert second_dmdsec.tag == mets_ns + 'dmdSec'
        assert 'ID' in second_dmdsec.attrib
        other_wrap = second_dmdsec[0]
        assert other_wrap.tag == mets_ns + 'mdWrap'
        assert 'MDTYPE' in other_wrap.attrib
        assert other_wrap.attrib['MDTYPE'] == 'OTHER'
        assert 'OTHERMDTYPE' in other_wrap.attrib
        assert other_wrap.attrib['OTHERMDTYPE'] == 'CUSTOM'
        other_xml_data = other_wrap[0]
        assert other_xml_data.tag == mets_ns + 'xmlData'
        # Elements are direct children of xmlData
        expected_other = [
            ('title', 'Yamani Weapons'),
            ('contributor', u'雪 ユキ'),
            ('long_description', long_description),
        ]
        assert len(other_xml_data) == 3
        for position, (tag, text) in enumerate(expected_other):
            assert other_xml_data[position].tag == tag
            assert other_xml_data[position].text == text
    def test_dmdsec_from_csv_parsed_metadata_repeats(self):
        """Repeated values for one column should each become their own element."""
        mets_ns = '{http://www.loc.gov/METS/}'
        dc_contributor = '{http://purl.org/dc/elements/1.1/}contributor'
        expected_names = ['Yuki', u'雪 ユキ']
        parsed = collections.OrderedDict()
        parsed["dc.contributor"] = ["Yuki", u"雪 ユキ".encode('utf8')]
        parsed["Contributor"] = ["Yuki", u"雪 ユキ".encode('utf8')]
        # Test
        dmdsecs = archivematicaCreateMETS2.createDmdSecsFromCSVParsedMetadata(parsed)
        # Verify
        assert dmdsecs
        assert len(dmdsecs) == 2

        # Return can be DC or OTHER first, but in this case DC should be first
        first_dmdsec = dmdsecs[0]
        assert first_dmdsec.tag == mets_ns + 'dmdSec'
        assert 'ID' in first_dmdsec.attrib
        dc_wrap = first_dmdsec[0]
        assert dc_wrap.tag == mets_ns + 'mdWrap'
        assert 'MDTYPE' in dc_wrap.attrib
        assert dc_wrap.attrib['MDTYPE'] == 'DC'
        dc_xml_data = dc_wrap[0]
        assert dc_xml_data.tag == mets_ns + 'xmlData'
        # Elements are children of the dublincore container
        dublincore = dc_xml_data[0]
        assert dublincore.tag == '{http://purl.org/dc/terms/}dublincore'
        assert len(dublincore) == 2
        for position, name in enumerate(expected_names):
            assert dublincore[position].tag == dc_contributor
            assert dublincore[position].text == name

        second_dmdsec = dmdsecs[1]
        assert second_dmdsec.tag == mets_ns + 'dmdSec'
        assert 'ID' in second_dmdsec.attrib
        other_wrap = second_dmdsec[0]
        assert other_wrap.tag == mets_ns + 'mdWrap'
        assert 'MDTYPE' in other_wrap.attrib
        assert other_wrap.attrib['MDTYPE'] == 'OTHER'
        assert 'OTHERMDTYPE' in other_wrap.attrib
        assert other_wrap.attrib['OTHERMDTYPE'] == 'CUSTOM'
        other_xml_data = other_wrap[0]
        assert other_xml_data.tag == mets_ns + 'xmlData'
        # Elements are direct children of xmlData
        assert len(other_xml_data) == 2
        for position, name in enumerate(expected_names):
            assert other_xml_data[position].tag == 'contributor'
            assert other_xml_data[position].text == name
Example #5
0
def update_metadata_csv(mets, metadata_csv, sip_uuid, sip_dir):
    """Attach Dublin Core metadata parsed from a new metadata.csv to METS file entries.

    For every file listed in the CSV that can be matched to a ``models.File``
    row of this SIP, a dmdSec is generated from the parsed metadata and its
    Dublin Core payload is added to the matching entry returned by
    ``mets.get_file``. When the entry already had a dmdSec, the previous one
    is marked as replaced by the new one.

    :param mets: METS document object exposing ``get_file(file_uuid=...)``.
    :param metadata_csv: Object whose ``currentlocation`` attribute holds the
        path of metadata.csv, possibly starting with ``%SIPDirectory%``.
    :param sip_uuid: UUID of the SIP, used to look up ``models.File`` rows.
    :param sip_dir: Path substituted for the ``%SIPDirectory%`` placeholder.
    :return: The same ``mets`` object that was passed in.
    """
    print('Parse new metadata.csv')
    full_path = metadata_csv.currentlocation.replace('%SIPDirectory%', sip_dir,
                                                     1)
    csvmetadata = createmetscsv.parseMetadataCSV(full_path)

    # FIXME This doesn't support having both DC and non-DC metadata in dmdSecs.
    # Only dmdSecs that contain a dcterms:dublincore element are applied;
    # anything else returned by createDmdSecsFromCSVParsedMetadata is skipped.
    for f, md in csvmetadata.items():
        # Verify file is in AIP
        print('Looking for', f, 'from metadata.csv in SIP')
        # Find File with original or current location matching metadata.csv
        # Prepend % to match the end of %SIPDirectory% or %transferDirectory%
        try:
            file_obj = models.File.objects.get(sip_id=sip_uuid,
                                               originallocation__endswith='%' +
                                               f)
        except models.File.DoesNotExist:
            try:
                file_obj = models.File.objects.get(
                    sip_id=sip_uuid, currentlocation__endswith='%' + f)
            except models.File.DoesNotExist:
                print(f, 'not found in database')
                continue
        print(f, 'found in database')

        fsentry = mets.get_file(file_uuid=file_obj.uuid)
        print(f, 'was associated with', fsentry.dmdids)

        # Create dmdSec
        new_dmdsecs = createmets2.createDmdSecsFromCSVParsedMetadata(md)
        for new_dmdsec in new_dmdsecs:
            # Strip the generated dmdSec down to just the DC part
            new_dc = new_dmdsec.find('.//dcterms:dublincore',
                                     namespaces=ns.NSMAP)
            if new_dc is None:
                # Non-DC dmdSec: find() found no dublincore element, and
                # passing None on to add_dublin_core would register an empty
                # dmdSec and supersede the real one.
                print(f, 'has non-DC metadata that is not supported; skipping')
                continue
            new_metsrw_dmdsec = fsentry.add_dublin_core(new_dc)
            # If there was an earlier dmdSec, the new one supersedes it
            if len(fsentry.dmdsecs) > 1:
                fsentry.dmdsecs[-2].replace_with(new_metsrw_dmdsec)

        print(f, 'now associated with', fsentry.dmdids)

    return mets
 def test_dmdsec_from_csv_parsed_metadata_no_data(self):
     """It should not create dmdSecs with no parsed metadata."""
     # Test: call with a mapping that has no columns at all
     dmdsecs = archivematicaCreateMETS2.createDmdSecsFromCSVParsedMetadata({})
     # Verify: nothing was generated
     assert dmdsecs == []
 def test_dmdsec_from_csv_parsed_metadata_dc_only(self):
     """It should create a single DC dmdSec holding every DC/DCTERMS element."""
     mets_ns = '{http://www.loc.gov/METS/}'
     dc_ns = '{http://purl.org/dc/elements/1.1/}'
     dcterms_ns = '{http://purl.org/dc/terms/}'
     # (CSV column, value fed in, expected element tag, expected element text)
     rows = [
         ("dc.title", "Yamani Weapons", dc_ns + 'title', 'Yamani Weapons'),
         ("dc.creator", "Keladry of Mindelan", dc_ns + 'creator', 'Keladry of Mindelan'),
         ("dc.subject", "Glaives", dc_ns + 'subject', 'Glaives'),
         ("dc.description", "Glaives are cool", dc_ns + 'description', 'Glaives are cool'),
         ("dc.publisher", "Tortall Press", dc_ns + 'publisher', 'Tortall Press'),
         ("dc.contributor", u"雪 ユキ".encode('utf8'), dc_ns + 'contributor', u'雪 ユキ'),
         ("dc.date", "2015", dc_ns + 'date', '2015'),
         ("dc.type", "Archival Information Package", dc_ns + 'type', 'Archival Information Package'),
         ("dc.format", "parchement", dc_ns + 'format', 'parchement'),
         ("dc.identifier", "42/1", dc_ns + 'identifier', '42/1'),
         ("dc.source", "Numair's library", dc_ns + 'source', "Numair's library"),
         ("dc.relation", "None", dc_ns + 'relation', 'None'),
         ("dc.language", "en", dc_ns + 'language', 'en'),
         ("dc.rights", "Public Domain", dc_ns + 'rights', 'Public Domain'),
         ("dcterms.isPartOf", "AIC#42", dcterms_ns + 'isPartOf', 'AIC#42'),
     ]
     parsed = collections.OrderedDict(
         (column, [value]) for column, value, _tag, _text in rows)
     # Test
     dmdsecs = archivematicaCreateMETS2.createDmdSecsFromCSVParsedMetadata(parsed)
     # Verify
     assert dmdsecs
     assert len(dmdsecs) == 1
     only_dmdsec = dmdsecs[0]
     assert only_dmdsec.tag == mets_ns + 'dmdSec'
     assert 'ID' in only_dmdsec.attrib
     md_wrap = only_dmdsec[0]
     assert md_wrap.tag == mets_ns + 'mdWrap'
     assert 'MDTYPE' in md_wrap.attrib
     assert md_wrap.attrib['MDTYPE'] == 'DC'
     xml_data = md_wrap[0]
     assert xml_data.tag == mets_ns + 'xmlData'
     # Elements are children of the dublincore container, in input order
     dublincore = xml_data[0]
     assert dublincore.tag == dcterms_ns + 'dublincore'
     assert len(dublincore) == 15
     for position, (_column, _value, tag, text) in enumerate(rows):
         assert dublincore[position].tag == tag
         assert dublincore[position].text == text
Example #8
0
def update_metadata_csv(root, metadata_csv, sip_uuid, sip_dir, now):
    """Add dmdSecs generated from a new metadata.csv to a METS element tree.

    For every file listed in the CSV that can be matched both to a
    ``models.File`` row of this SIP and to a div in the physical structMap,
    new dmdSecs are generated, their IDs are appended to the div's ``DMDID``
    attribute, and the dmdSecs are inserted after the existing dmdSecs (or
    after ``metsHdr`` when there are none). Pre-existing dmdSecs of the div
    that have no ``STATUS`` are marked ``original``; new ones are marked
    ``original`` when the div had no dmdSec before, otherwise ``updated``.

    :param root: Root element of the METS document; modified in place.
    :param metadata_csv: Object whose ``currentlocation`` attribute holds the
        path of metadata.csv, possibly starting with ``%SIPDirectory%``.
    :param sip_uuid: UUID of the SIP, used to look up ``models.File`` rows.
    :param sip_dir: Path substituted for the ``%SIPDirectory%`` placeholder.
    :param now: Value written to the ``CREATED`` attribute of new dmdSecs.
    :return: The same ``root`` element that was passed in.
    """
    print('Parse new metadata.csv')
    full_path = metadata_csv.currentlocation.replace('%SIPDirectory%', sip_dir,
                                                     1)
    csvmetadata = createmetscsv.parseMetadataCSV(full_path)

    # Set globalDmdSecCounter so createDmdSecsFromCSVParsedMetadata will work
    # (XPath count() yields a float, hence the int() conversion)
    createmets2.globalDmdSecCounter = int(
        root.xpath('count(mets:dmdSec)', namespaces=ns.NSMAP))

    # dmdSecs added after existing dmdSecs or metsHdr if none
    try:
        add_after = root.findall('mets:dmdSec', namespaces=ns.NSMAP)[-1]
    except IndexError:
        add_after = root.find('mets:metsHdr', namespaces=ns.NSMAP)

    aip_div = root.find('mets:structMap[@TYPE="physical"]/mets:div',
                        namespaces=ns.NSMAP)

    # FIXME Does this have to support having non DC metadata in the CSV?  Assuming not
    for f, md in csvmetadata.items():
        # Verify file is in AIP
        print('Looking for', f, 'from metadata.csv in SIP')
        # Find File with original or current location matching metadata.csv
        # Prepend % to match the end of %SIPDirectory% or %transferDirectory%
        try:
            file_obj = models.File.objects.get(sip_id=sip_uuid,
                                               originallocation__endswith='%' +
                                               f)
        except models.File.DoesNotExist:
            try:
                file_obj = models.File.objects.get(
                    sip_id=sip_uuid, currentlocation__endswith='%' + f)
            except models.File.DoesNotExist:
                print(f, 'not found in database')
                continue
        print(f, 'found in database')

        # Find structMap div to associate with by walking one div per path
        # component. obj_div becomes None as soon as a component is missing,
        # so a partially matched path can never leave us pointing at a
        # parent div.
        split_path = file_obj.currentlocation.replace('%SIPDirectory%', '',
                                                      1).split('/')
        obj_div = aip_div
        for label in split_path:
            if obj_div is None:
                break
            obj_div = obj_div.find('mets:div[@LABEL="' + label + '"]',
                                   namespaces=ns.NSMAP)
        if obj_div is None:
            print(f, 'not in structMap')
            continue
        ids = obj_div.get('DMDID', '')
        print(f, 'was associated with', ids)

        # Create dmdSec
        new_dmdsecs = createmets2.createDmdSecsFromCSVParsedMetadata(md)

        # Add DMDIDs: keep the existing ones and append the new ones
        new_ids = [d.get('ID') for d in new_dmdsecs]
        new_ids = ids.split() + new_ids
        print(f, 'now associated with', ' '.join(new_ids))
        obj_div.set('DMDID', ' '.join(new_ids))

        # Update old dmdSecs if needed
        new = False
        if not ids:
            # Newly generated dmdSec is the original
            new = True
        else:
            # Find the dmdSec with no status and mark it original
            search_ids = ' or '.join(['@ID="%s"' % x for x in ids.split()])
            dmdsecs = root.xpath('mets:dmdSec[%s][not(@STATUS)]' % search_ids,
                                 namespaces=ns.NSMAP)
            for d in dmdsecs:
                d.set('STATUS', 'original')
                print(d.get('ID'), 'STATUS is original')

        # Add dmdSecs to document
        for d in new_dmdsecs:
            d.set('CREATED', now)
            if new:
                d.set('STATUS', 'original')
            else:
                d.set('STATUS', 'updated')
            print(d.get('ID'), 'STATUS is', d.get('STATUS'))

            # Insert right after the previous dmdSec and advance the anchor
            # so document order matches generation order.
            add_after.addnext(d)
            add_after = d
    return root