Example #1
    def test_parse_metadata_csv_repeated_columns(self):
        """It should put repeated elements into a list of values."""
        # Create metadata.csv
        data = [
            ["Filename", "dc.title", "dc.type", "dc.type", "dc.type"],
            ["objects/foo.jpg", "Foo", "Photograph", "Still image", "Picture"],
        ]
        with self.metadata_file.open("wb") as f:
            writer = csv.writer(f)
            for row in data:
                writer.writerow(row)

        # Run test
        dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV(
            Job("stub", "stub", []), str(self.metadata_file)
        )
        # Verify
        assert dc
        assert "objects/foo.jpg" in dc
        assert "dc.title" in dc["objects/foo.jpg"]
        assert dc["objects/foo.jpg"]["dc.title"] == ["Foo"]
        assert "dc.type" in dc["objects/foo.jpg"]
        assert dc["objects/foo.jpg"]["dc.type"] == [
            "Photograph",
            "Still image",
            "Picture",
        ]
        assert list(dc["objects/foo.jpg"].keys()) == ["dc.title", "dc.type"]
Example #2
    def test_parse_metadata_csv(self):
        """It should parse the metadata.csv into a dict."""
        # Create metadata.csv
        data = [
            ['Filename', 'dc.title', 'dc.date', 'Other metadata'],
            ['objects/foo.jpg', 'Foo', '2000', 'Taken on a sunny day'],
            ['objects/bar/', 'Bar', '2000', 'All taken on a rainy day'],
        ]
        with open('metadata.csv', 'wb') as f:
            writer = csv.writer(f)
            for row in data:
                writer.writerow(row)

        # Run test
        dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV('metadata.csv')
        # Verify
        assert dc
        assert 'objects/foo.jpg' in dc
        assert 'dc.title' in dc['objects/foo.jpg']
        assert dc['objects/foo.jpg']['dc.title'] == ['Foo']
        assert 'dc.date' in dc['objects/foo.jpg']
        assert dc['objects/foo.jpg']['dc.date'] == ['2000']
        assert 'Other metadata' in dc['objects/foo.jpg']
        assert dc['objects/foo.jpg']['Other metadata'] == ['Taken on a sunny day']
        assert dc['objects/foo.jpg'].keys() == ['dc.title', 'dc.date', 'Other metadata']

        assert 'objects/bar' in dc
        assert 'dc.title' in dc['objects/bar']
        assert dc['objects/bar']['dc.title'] == ['Bar']
        assert 'dc.date' in dc['objects/bar']
        assert dc['objects/bar']['dc.date'] == ['2000']
        assert 'Other metadata' in dc['objects/bar']
        assert dc['objects/bar']['Other metadata'] == ['All taken on a rainy day']
        assert dc['objects/bar'].keys() == ['dc.title', 'dc.date', 'Other metadata']
Example #3
    def test_parse_metadata_csv_repeated_columns(self):
        """It should put repeated elements into a list of values."""
        # Create metadata.csv
        data = [
            ['Filename', 'dc.title', 'dc.type', 'dc.type', 'dc.type'],
            ['objects/foo.jpg', 'Foo', 'Photograph', 'Still image', 'Picture'],
        ]
        with open('metadata.csv', 'wb') as f:
            writer = csv.writer(f)
            for row in data:
                writer.writerow(row)

        # Run test
        dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV(
            'metadata.csv')
        # Verify
        assert dc
        assert 'objects/foo.jpg' in dc
        assert 'dc.title' in dc['objects/foo.jpg']
        assert dc['objects/foo.jpg']['dc.title'] == ['Foo']
        assert 'dc.type' in dc['objects/foo.jpg']
        assert dc['objects/foo.jpg']['dc.type'] == [
            'Photograph', 'Still image', 'Picture'
        ]
        assert list(dc['objects/foo.jpg'].keys()) == ['dc.title', 'dc.type']
Example #4
    def test_parse_metadata_csv_blank_rows(self):
        """It should skip blank rows."""
        # Create metadata.csv
        data = [
            ['Filename', 'dc.title', 'dc.type', 'dc.type', 'dc.type'],
            ['objects/foo.jpg', 'Foo', 'Photograph', 'Still image', 'Picture'],
            [],
        ]
        with open('metadata.csv', 'wb') as f:
            writer = csv.writer(f)
            for row in data:
                writer.writerow(row)

        # Run test
        dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV('metadata.csv')
        # Verify
        assert dc
        assert len(dc) == 1
        assert 'objects/foo.jpg' in dc
Example #5
    def test_parse_metadata_csv_non_ascii(self):
        """It should parse unicode."""
        # Create metadata.csv
        data = [
            ['Filename', 'dc.title'],
            ['objects/foo.jpg', u'元気です'.encode('utf8')],
        ]
        with open('metadata.csv', 'wb') as f:
            writer = csv.writer(f)
            for row in data:
                writer.writerow(row)

        # Run test
        dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV('metadata.csv')
        # Verify
        assert dc
        assert 'objects/foo.jpg' in dc
        assert 'dc.title' in dc['objects/foo.jpg']
        assert dc['objects/foo.jpg']['dc.title'] == [u'元気です'.encode('utf8')]
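Read together, these tests pin down the contract of parseMetadataCSV: it returns an ordered mapping from each Filename cell (trailing slash stripped for directories) to an ordered mapping of column headers to lists of values, with repeated headers accumulating into one list and blank rows skipped. A minimal sketch of that contract, not the actual Archivematica implementation (parse_metadata_csv_sketch is a hypothetical name):

import csv
from collections import OrderedDict


def parse_metadata_csv_sketch(path):
    """Sketch of the behaviour the tests above assert: rows keyed by the
    Filename column, repeated headers collected into lists of values,
    blank rows skipped, trailing slashes stripped from directory paths."""
    metadata = OrderedDict()
    with open(path) as f:
        reader = csv.reader(f)
        header = next(reader)
        for row in reader:
            if not row:
                continue  # skip blank rows such as a trailing newline
            entry = metadata.setdefault(row[0].rstrip('/'), OrderedDict())
            for column, value in zip(header[1:], row[1:]):
                entry.setdefault(column, []).append(value)
    return metadata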
Example #6
    def test_parse_metadata_csv_non_ascii(self):
        """It should parse unicode."""
        # Create metadata.csv
        data = [["Filename", "dc.title"], ["objects/foo.jpg", "元気です".encode("utf8")]]
        with self.metadata_file.open("wb") as f:
            writer = csv.writer(f)
            for row in data:
                writer.writerow(row)

        # Run test
        dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV(
            Job("stub", "stub", []), str(self.metadata_file)
        )
        # Verify
        assert dc
        assert "objects/foo.jpg" in dc
        assert "dc.title" in dc["objects/foo.jpg"]
        assert dc["objects/foo.jpg"]["dc.title"] == ["元気です".encode("utf8")]
Example #7
    def test_parse_metadata_csv(self):
        """It should parse the metadata.csv into a dict."""
        # Create metadata.csv
        data = [
            ["Filename", "dc.title", "dc.date", "Other metadata"],
            ["objects/foo.jpg", "Foo", "2000", "Taken on a sunny day"],
            ["objects/bar/", "Bar", "2000", "All taken on a rainy day"],
        ]
        with self.metadata_file.open("wb") as f:
            writer = csv.writer(f)
            for row in data:
                writer.writerow(row)

        # Run test
        dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV(
            Job("stub", "stub", []), str(self.metadata_file)
        )
        # Verify
        assert dc
        assert "objects/foo.jpg" in dc
        assert "dc.title" in dc["objects/foo.jpg"]
        assert dc["objects/foo.jpg"]["dc.title"] == ["Foo"]
        assert "dc.date" in dc["objects/foo.jpg"]
        assert dc["objects/foo.jpg"]["dc.date"] == ["2000"]
        assert "Other metadata" in dc["objects/foo.jpg"]
        assert dc["objects/foo.jpg"]["Other metadata"] == ["Taken on a sunny day"]
        assert list(dc["objects/foo.jpg"].keys()) == [
            "dc.title",
            "dc.date",
            "Other metadata",
        ]

        assert "objects/bar" in dc
        assert "dc.title" in dc["objects/bar"]
        assert dc["objects/bar"]["dc.title"] == ["Bar"]
        assert "dc.date" in dc["objects/bar"]
        assert dc["objects/bar"]["dc.date"] == ["2000"]
        assert "Other metadata" in dc["objects/bar"]
        assert dc["objects/bar"]["Other metadata"] == ["All taken on a rainy day"]
        assert list(dc["objects/bar"].keys()) == [
            "dc.title",
            "dc.date",
            "Other metadata",
        ]
Example #8
def update_metadata_csv(job, mets, metadata_csv, sip_uuid, sip_dir, state):
    job.pyprint('Parse new metadata.csv')
    full_path = metadata_csv.currentlocation.replace('%SIPDirectory%', sip_dir,
                                                     1)
    csvmetadata = createmetscsv.parseMetadataCSV(job, full_path)

    # FIXME This doesn't support having both DC and non-DC metadata in dmdSecs
    # If createDmdSecsFromCSVParsedMetadata returns more than 1 dmdSec, behaviour is undefined
    for f, md in csvmetadata.items():
        # Verify file is in AIP
        job.pyprint('Looking for', f, 'from metadata.csv in SIP')
        # Find File with original or current location matching metadata.csv
        # Prepend % to match the end of %SIPDirectory% or %transferDirectory%
        try:
            file_obj = models.File.objects.get(sip_id=sip_uuid,
                                               originallocation__endswith='%' +
                                               f)
        except models.File.DoesNotExist:
            try:
                file_obj = models.File.objects.get(
                    sip_id=sip_uuid, currentlocation__endswith='%' + f)
            except models.File.DoesNotExist:
                job.pyprint(f, 'not found in database')
                continue
        job.pyprint(f, 'found in database')

        fsentry = mets.get_file(file_uuid=file_obj.uuid)
        job.pyprint(f, 'was associated with', fsentry.dmdids)

        # Create dmdSec
        new_dmdsecs = createmets2.createDmdSecsFromCSVParsedMetadata(
            job, md, state)
        # Add both
        for new_dmdsec in new_dmdsecs:
            # need to strip new_dmdsec to just the DC part
            new_dc = new_dmdsec.find('.//dcterms:dublincore',
                                     namespaces=ns.NSMAP)
            new_metsrw_dmdsec = fsentry.add_dublin_core(new_dc)
            if len(fsentry.dmdsecs) > 1:
                fsentry.dmdsecs[-2].replace_with(new_metsrw_dmdsec)

        job.pyprint(f, 'now associated with', fsentry.dmdids)

    return mets
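A note on the '%' prefix in the lookups above: File locations are stored with a placeholder prefix such as '%SIPDirectory%objects/foo.jpg', so matching on endswith('%' + f) anchors the metadata.csv path right at the placeholder boundary instead of anywhere inside the string. A quick illustration with hypothetical values:

'%SIPDirectory%objects/foo.jpg'.endswith('%' + 'objects/foo.jpg')        # True
'%SIPDirectory%extra/objects/foo.jpg'.endswith('%' + 'objects/foo.jpg')  # False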
Example #9
    def test_parse_metadata_csv_blank_rows(self):
        """It should skip blank rows."""
        # Create metadata.csv
        data = [
            ["Filename", "dc.title", "dc.type", "dc.type", "dc.type"],
            ["objects/foo.jpg", "Foo", "Photograph", "Still image", "Picture"],
            [],
        ]
        with self.metadata_file.open("wb") as f:
            writer = csv.writer(f)
            for row in data:
                writer.writerow(row)

        # Run test
        dc = archivematicaCreateMETSMetadataCSV.parseMetadataCSV(
            Job("stub", "stub", []), str(self.metadata_file)
        )
        # Verify
        assert dc
        assert len(dc) == 1
        assert "objects/foo.jpg" in dc
Example #10
def update_metadata_csv(root, metadata_csv, sip_uuid, sip_dir, now):
    print('Parse new metadata.csv')
    full_path = metadata_csv.currentlocation.replace('%SIPDirectory%', sip_dir,
                                                     1)
    csvmetadata = createmetscsv.parseMetadataCSV(full_path)

    # Set globalDmdSecCounter so createDmdSecsFromCSVParsedMetadata will work
    createmets2.globalDmdSecCounter = int(
        root.xpath('count(mets:dmdSec)', namespaces=ns.NSMAP))

    # dmdSecs added after existing dmdSecs or metsHdr if none
    try:
        add_after = root.findall('mets:dmdSec', namespaces=ns.NSMAP)[-1]
    except IndexError:
        add_after = root.find('mets:metsHdr', namespaces=ns.NSMAP)

    aip_div = root.find('mets:structMap[@TYPE="physical"]/mets:div',
                        namespaces=ns.NSMAP)

    # FIXME Does this have to support having non DC metadata in the CSV?  Assuming not
    for f, md in csvmetadata.iteritems():
        # Verify file is in AIP
        print('Looking for', f, 'from metadata.csv in SIP')
        # Find File with original or current location matching metadata.csv
        # Prepend % to match the end of %SIPDirectory% or %transferDirectory%
        try:
            file_obj = models.File.objects.get(sip_id=sip_uuid,
                                               originallocation__endswith='%' +
                                               f)
        except models.File.DoesNotExist:
            try:
                file_obj = models.File.objects.get(
                    sip_id=sip_uuid, currentlocation__endswith='%' + f)
            except models.File.DoesNotExist:
                print(f, 'not found in database')
                continue
        print(f, 'found in database')

        # Find structMap div to associate with
        split_path = file_obj.currentlocation.replace('%SIPDirectory%', '',
                                                      1).split('/')
        obj_div = aip_div
        for label in split_path:
            child = obj_div.find('mets:div[@LABEL="' + label + '"]',
                                 namespaces=ns.NSMAP)
            if child is None:
                print(f, 'not in structMap')
                break
            obj_div = child
        if obj_div is None:
            continue
        ids = obj_div.get('DMDID', '')
        print(f, 'was associated with', ids)

        # Create dmdSec
        new_dmdsecs = createmets2.createDmdSecsFromCSVParsedMetadata(md)

        # Add DMDIDs
        new_ids = [d.get('ID') for d in new_dmdsecs]
        new_ids = ids.split() + new_ids
        print(f, 'now associated with', ' '.join(new_ids))
        obj_div.set('DMDID', ' '.join(new_ids))

        # Update old dmdSecs if needed
        new = False
        if not ids:
            # Newly generated dmdSec is the original
            new = True
        else:
            # Find the dmdSec with no status and mark it original
            search_ids = ' or '.join(['@ID="%s"' % x for x in ids.split()])
            dmdsecs = root.xpath('mets:dmdSec[%s][not(@STATUS)]' % search_ids,
                                 namespaces=ns.NSMAP)
            for d in dmdsecs:
                d.set('STATUS', 'original')
                print(d.get('ID'), 'STATUS is original')

        # Add dmdSecs to document
        for d in new_dmdsecs:
            d.set('CREATED', now)
            if new:
                d.set('STATUS', 'original')
            else:
                d.set('STATUS', 'updated')
            print(d.get('ID'), 'STATUS is', d.get('STATUS'))

            add_after.addnext(d)
            add_after = d
    return root
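The addnext/add_after pattern at the end keeps the new dmdSecs in document order behind the dmdSec (or metsHdr) they were anchored to: each inserted element becomes the anchor for the next. A toy lxml demonstration of just that mechanic, with hypothetical IDs rather than real METS content:

from lxml import etree

root = etree.fromstring('<mets><metsHdr/><dmdSec ID="a"/></mets>')
add_after = root.findall('dmdSec')[-1]
for new_id in ('b', 'c'):
    d = etree.Element('dmdSec', ID=new_id)
    add_after.addnext(d)  # insert as the following sibling of the anchor
    add_after = d  # the next element goes after this one
print([el.get('ID') for el in root.findall('dmdSec')])  # ['a', 'b', 'c']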
Example #11
def update_metadata_csv(job, mets, metadata_csv, sip_uuid, sip_dir, state):
    job.pyprint("Parse new metadata.csv")
    full_path = metadata_csv.currentlocation.replace("%SIPDirectory%", sip_dir, 1)
    csvmetadata = createmetscsv.parseMetadataCSV(job, full_path)

    # FIXME This doesn't support having both DC and non-DC metadata in dmdSecs
    # If createDmdSecsFromCSVParsedMetadata returns more than 1 dmdSec, behaviour is undefined
    for f, md in csvmetadata.items():
        # Verify file is in AIP
        job.pyprint("Looking for", f, "from metadata.csv in SIP")
        # Find File with original or current location matching metadata.csv
        # Prepend % to match the end of %SIPDirectory% or %transferDirectory%
        file_obj = None
        try:
            file_obj = models.File.objects.get(
                sip_id=sip_uuid, originallocation__endswith="%" + f
            )
        except models.File.DoesNotExist:
            try:
                file_obj = models.File.objects.get(
                    sip_id=sip_uuid, currentlocation__endswith="%" + f
                )
            except models.File.DoesNotExist:
                pass
        if file_obj is not None:
            fsentry = mets.get_file(file_uuid=file_obj.uuid)
        else:
            fsentry = _get_directory_fsentry(mets, f)
        if fsentry is None:
            job.pyprint(f, "not found in database or METS file")
            continue

        job.pyprint(f, "found in database or METS file")
        job.pyprint(f, "was associated with", fsentry.dmdids)

        # Save existing dmdSecs
        dc_dmdsecs = []
        non_dc_dmdsecs = []
        for dmdsec in fsentry.dmdsecs:
            mdwrap = dmdsec.contents
            if mdwrap.mdtype == "DC":
                dc_dmdsecs.append(dmdsec)
            elif (
                mdwrap.mdtype == "OTHER"
                and getattr(mdwrap, "othermdtype", None) == "CUSTOM"
            ):
                non_dc_dmdsecs.append(dmdsec)

        # Create dmdSec
        new_dmdsecs = createmets2.createDmdSecsFromCSVParsedMetadata(job, md, state)
        # Add both
        for new_dmdsec in new_dmdsecs:
            # need to strip new_dmdsec to just the DC part
            new_dc = new_dmdsec.find(".//dcterms:dublincore", namespaces=ns.NSMAP)
            if new_dc is not None:
                new_metsrw_dmdsec = fsentry.add_dublin_core(new_dc)
                _replace_original_dmdsec(dc_dmdsecs, new_metsrw_dmdsec)
            else:
                new_non_dc = new_dmdsec.find(
                    './/mets:mdWrap[@MDTYPE="OTHER"][@OTHERMDTYPE="CUSTOM"]/mets:xmlData',
                    namespaces=ns.NSMAP,
                )
                if new_non_dc is not None:
                    new_metsrw_dmdsec = fsentry.add_dmdsec(
                        new_non_dc, "OTHER", othermdtype="CUSTOM"
                    )
                    _replace_original_dmdsec(non_dc_dmdsecs, new_metsrw_dmdsec)
        job.pyprint(f, "now associated with", fsentry.dmdids)

    return mets
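_get_directory_fsentry and _replace_original_dmdsec are module-private helpers that this snippet does not include. Judging only from how they are called here, and from the older version in Example #8 that called replace_with directly, plausible sketches might look like this (hypothetical bodies, not the actual implementations):

def _get_directory_fsentry(mets, path):
    # Hypothetical: when no File row matches in the database, fall back to
    # finding a directory entry in the metsrw file tree by its path.
    for fsentry in mets.all_files():
        if fsentry.type == "Directory" and fsentry.path == path:
            return fsentry
    return None


def _replace_original_dmdsec(existing_dmdsecs, new_dmdsec):
    # Hypothetical: mark the most recent existing dmdSec of the same kind
    # as superseded by the new one, mirroring the replace_with call the
    # Example #8 version made on fsentry.dmdsecs directly.
    if existing_dmdsecs:
        existing_dmdsecs[-1].replace_with(new_dmdsec)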