Example #1
    def test_object_data(self):
        # mock API to read export data from a local fixture file
        response = self.session.get("file://%s" % FIXTURES["sync1_export"])
        mockapi = Mock()

        def mock_upload(data, *args, **kwargs):
            list(data)  # consume the generator so datastream processing happens
            return "uploaded://1"

        mockapi.upload = mock_upload
        mockapi.export.return_value = response
        mockapi.base_url = "http://fedora.example.co/fedora"
        self.obj.api = self.repo.api = mockapi
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None, "object data should be valid xml")
        self.assert_(
            b"foxml:binaryContent" not in foxml, "object data for ingest should not include binaryContent tags"
        )
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            "object data for ingest should include upload id as content location",
        )

        # other tests?

        # set read block size artificially low to test chunked handling
        self.archex = ArchiveExport(self.obj, self.repo)
        self.archex.read_block_size = 1024
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None, "object data should be valid xml")
        self.assert_(
            b"foxml:binaryContent" not in foxml, "object data for ingest should not include binaryContent tags"
        )
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            "object data for ingest should include upload id as content location",
        )

        # test with second fixture - multiple small encoded datastreams
        self.archex = ArchiveExport(self.obj, self.repo)
        self.archex.read_block_size = 1024
        response = self.session.get("file://%s" % FIXTURES["sync2_export"])
        mockapi.export.return_value = response
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None, "object data should be valid xml")
        self.assert_(
            b"foxml:binaryContent" not in foxml, "object data for ingest should not include binaryContent tags"
        )
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            "object data for ingest should include upload id as content location",
        )
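A note on the mock_upload helper above: it replaces mockapi.upload with a
real function rather than leaving it a plain Mock, because object_data
streams datastream content to upload as a generator. A bare Mock would hand
back its return_value without ever iterating that generator, so none of the
chunked datastream processing under test would actually run. A minimal
illustration of the difference (names here are illustrative only):

from unittest.mock import Mock

def chunks():
    for i in range(3):
        print('processing chunk', i)  # the side effect we want to trigger
        yield b'data'

api = Mock()
api.upload(chunks())  # plain Mock: generator passed in, never consumed

def upload(data, *args, **kwargs):
    list(data)  # iterating forces each chunk to be processed
    return 'uploaded://1'

api.upload = upload
api.upload(chunks())  # prints 'processing chunk 0' through '2'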
Example #2
    def setUp(self):
        # todo: use mocks?
        self.repo = Mock(spec=Repository)
        self.obj = Mock()  # spec=DigitalObject
        self.obj.pid = 'synctest:1'
        self.archex = ArchiveExport(self.obj, self.repo)

        # set up a requests session that can load file:// URIs, so
        # fixtures can be used as export data
        self.session = requests.Session()
        self.session.mount('file://', LocalFileAdapter())
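LocalFileAdapter itself is imported from elsewhere and not shown in these
examples. A minimal transport adapter along these lines (a sketch, not the
project's actual implementation) is enough to let a requests session answer
file:// URLs from the local filesystem:

import os
import urllib.request

import requests
from requests.adapters import BaseAdapter

class LocalFileAdapter(BaseAdapter):
    '''Sketch: serve file:// URLs from the local filesystem.'''

    def send(self, request, **kwargs):
        # strip the file:// scheme and convert to a local path
        path = urllib.request.url2pathname(request.url[len('file://'):])
        response = requests.Response()
        response.url = request.url
        if os.path.isfile(path):
            response.status_code = 200
            response.raw = open(path, 'rb')  # iter_content() reads from .raw
        else:
            response.status_code = 404
        return response

    def close(self):
        pass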
Example #3
    def test_object_data_split_bincontent(self):
        # explicitly test handling of a binary content tag split across
        # chunk boundaries

        response = self.session.get("file://%s" % FIXTURES["sync1_export"])
        mockapi = Mock()

        def mock_upload(data, *args, **kwargs):
            list(data)  # consume the generator so datastream processing happens
            return "uploaded://1"

        mockapi.upload = mock_upload
        mockapi.export.return_value = response
        self.obj.api = self.repo.api = mockapi

        # test binary content tag split across chunks
        self.archex = ArchiveExport(self.obj, self.repo)
        # use a block size that will split the fixture in the middle of
        # the first binary content tag
        self.archex.read_block_size = 2688
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None, "object data should be valid xml")
        self.assert_(
            b"foxml:binaryContent" not in foxml, "object data for ingest should not include binaryContent tags"
        )

        self.archex = ArchiveExport(self.obj, self.repo)
        # this blocksize ends with just the < in foxml:binaryContent
        self.archex.read_block_size = 2680
        data = self.archex.object_data()
        foxml = data.getvalue()
        self.assert_(etree.XML(foxml) is not None, "object data should be valid xml")
        self.assert_(
            b"foxml:binaryContent" not in foxml, "object data for ingest should not include binaryContent tags"
        )

        self.archex = ArchiveExport(self.obj, self.repo)
        # this blocksize ends with an unrelated close tag </
        self.archex.read_block_size = 1526
        data = self.archex.object_data()
        foxml = data.getvalue()
        self.assert_(etree.XML(foxml) is not None, "object data should be valid xml")
        self.assert_(
            b"foxml:binaryContent" not in foxml, "object data for ingest should not include binaryContent tags"
        )
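The block sizes used in this test (2688, 2680, 1526) are tuned to the
sync1_export fixture so the <foxml:binaryContent> tag lands in different
awkward positions relative to a chunk boundary. The general technique being
exercised, sketched here as an assumption about what ArchiveExport's
end_of_last_chunk bookkeeping accomplishes, is to carry the tail of the
previous chunk forward and search the concatenation, so a tag split by
chunking is still matched:

def find_across_chunks(chunks, tag=b'<foxml:binaryContent>'):
    # keep the last len(tag) - 1 bytes of the previous chunk so a tag
    # straddling a boundary can still be found
    carry = b''
    offset = 0
    for chunk in chunks:
        window = carry + chunk
        idx = window.find(tag)
        if idx != -1:
            return offset - len(carry) + idx  # position in the full stream
        carry = window[-(len(tag) - 1):]
        offset += len(chunk)
    return -1

# the tag split as ...<foxml:binary | Content>... is still found
assert find_across_chunks([b'xx<foxml:binary', b'Content>yy']) == 2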
Example #4
    def setUp(self):
        # todo: use mocks?
        self.repo = Mock(spec=Repository)
        self.obj = Mock()  # spec=DigitalObject
        self.obj.pid = 'synctest:1'
        self.archex = ArchiveExport(self.obj, self.repo)

        # set up a requests session that can load file:// URIs, so
        # fixtures can be used as export data
        self.session = requests.Session()
        self.session.mount('file://', LocalFileAdapter())
Example #5
    def test_object_data_split_bincontent(self):
        # explicitly test handling of a binary content tag split across
        # chunk boundaries

        response = self.session.get('file://%s' % FIXTURES['sync1_export'])
        mockapi = Mock()

        def mock_upload(data, *args, **kwargs):
            list(data)  # consume the generator so datastream processing happens
            return 'uploaded://1'

        mockapi.upload = mock_upload
        mockapi.export.return_value = response
        self.obj.api = self.repo.api = mockapi

        # test binary content tag split across chunks
        self.archex = ArchiveExport(self.obj, self.repo)
        # use a block size that will split the fixture in the middle of
        # the first binary content tag
        self.archex.read_block_size = 2688
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')

        self.archex = ArchiveExport(self.obj, self.repo)
        # this blocksize ends with just the < in foxml:binaryContent
        self.archex.read_block_size = 2680
        data = self.archex.object_data()
        foxml = data.getvalue()
        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')

        self.archex = ArchiveExport(self.obj, self.repo)
        # this blocksize ends with an unrelated close tag </
        self.archex.read_block_size = 1526
        data = self.archex.object_data()
        foxml = data.getvalue()
        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
Example #6
class ArchiveExportTest(unittest.TestCase):
    def setUp(self):
        # todo: use mocks?
        self.repo = Mock(spec=Repository)
        self.obj = Mock()  # spec=DigitalObject
        self.obj.pid = 'synctest:1'
        self.archex = ArchiveExport(self.obj, self.repo)

        # set up a requests session that can load file:// URIs, so
        # fixtures can be used as export data
        self.session = requests.Session()
        self.session.mount('file://', LocalFileAdapter())

    def test_get_datastream_info(self):
        dsinfo = self.archex.get_datastream_info('''<foxml:datastreamVersion ID="DC.2" LABEL="Dublin Core" CREATED="2012-10-11T14:13:03.658Z" MIMETYPE="text/xml" FORMAT_URI="http://www.openarchives.org/OAI/2.0/oai_dc/" SIZE="771">
<foxml:contentDigest TYPE="MD5" DIGEST="f53aec07f2607f536bac7ee03dbbfe7c"/>''')
        self.assertEqual('DC.2', dsinfo['id'])
        self.assertEqual('text/xml', dsinfo['mimetype'])
        self.assertEqual('771', dsinfo['size'])
        self.assertEqual('MD5', dsinfo['type'])
        self.assertEqual('f53aec07f2607f536bac7ee03dbbfe7c', dsinfo['digest'])
        self.assertEqual('2012-10-11T14:13:03.658Z', dsinfo['created'])

        # datastream info split across chunks
        self.archex.end_of_last_chunk = '''<foxml:datastreamVersion ID="DC.2" LABEL="Dublin Core" CREATED="2012-10-11T14:13:03.658Z" MIMETYPE="te'''
        dsinfo = self.archex.get_datastream_info('''xt/xml" FORMAT_URI="http://www.openarchives.org/OAI/2.0/oai_dc/" SIZE="771">
<foxml:contentDigest TYPE="MD5" DIGEST="f53aec07f2607f536bac7ee03dbbfe7c"/>''')
        self.assertEqual('DC.2', dsinfo['id'])
        self.assertEqual('text/xml', dsinfo['mimetype'])
        self.assertEqual('f53aec07f2607f536bac7ee03dbbfe7c', dsinfo['digest'])

        # sample etd record with longer datastream info
        etd_ds = '''</foxml:datastreamVersion><foxml:datastreamVersion ID="RELS-EXT.9" LABEL="Relationships to other objects" CREATED="2009-09-18T19:36:04.235Z" MIMETYPE="application/rdf+xml" FORMAT_URI="info:fedora/fedora-system:FedoraRELSExt-1.0" SIZE="716">
<foxml:contentDigest TYPE="MD5" DIGEST="168fb675e5fcded1a3b8cc7251877744"/>'''

        self.archex.end_of_last_chunk = ''
        dsinfo = self.archex.get_datastream_info(etd_ds)
        self.assertEqual('RELS-EXT.9', dsinfo['id'])
        self.assertEqual('application/rdf+xml', dsinfo['mimetype'])
        self.assertEqual('716', dsinfo['size'])
        self.assertEqual('MD5', dsinfo['type'])
        self.assertEqual('168fb675e5fcded1a3b8cc7251877744', dsinfo['digest'])

        # getting audit record id instead of datastream id
        audit_trail_dsinfo = '''<audit:record ID="AUDREC48">
<audit:process type="Fedora API-M"/>
<audit:action>modifyDatastreamByValue</audit:action>
<audit:componentID>RELS-EXT</audit:componentID>
<audit:responsibility>fedoraAdmin</audit:responsibility>
<audit:date>2016-07-09T10:31:50.971Z</audit:date>
<audit:justification>datastream fixity check</audit:justification>
</audit:record>
</audit:auditTrail>
</foxml:xmlContent>
</foxml:datastreamVersion>
</foxml:datastream>
<foxml:datastream ID="content" STATE="A" CONTROL_GROUP="M" VERSIONABLE="false">
<foxml:datastreamVersion ID="content.0" LABEL="1199disk3" CREATED="2014-04-30T17:30:06.949Z" MIMETYPE="application/x-aff" SIZE="1277">
<foxml:contentDigest TYPE="MD5" DIGEST="2271e4a2678f69ce3f4a97ab07c06cbe"/>
<foxml:binaryContent>'''
        dsinfo = self.archex.get_datastream_info(audit_trail_dsinfo)
        self.assertEqual('content.0', dsinfo['id'])
        self.assertEqual('application/x-aff', dsinfo['mimetype'])
        self.assertEqual('1277', dsinfo['size'])
        self.assertEqual('MD5', dsinfo['type'])
        self.assertEqual('2271e4a2678f69ce3f4a97ab07c06cbe', dsinfo['digest'])

        # chunk contains multiple ds ids
        multi_dsinfo = '''</foxml:binaryContent>
</foxml:datastreamVersion>
<foxml:datastreamVersion ID="MODS.1" LABEL="MODS Metadata" CREATED="2015-06-24T17:26:39.154Z" MIMETYPE="text/xml" FORMAT_URI="http://www.loc.gov/mods/v3" SIZE="345">
<foxml:contentDigest TYPE="MD5" DIGEST="3d866951c5d2f4e665fd518a8d9433f2"/>
<foxml:binaryContent>

</foxml:datastreamVersion>
</foxml:datastream>
<foxml:datastream ID="VIDEO" STATE="A" CONTROL_GROUP="M" VERSIONABLE="true">
<foxml:datastreamVersion ID="VIDEO.0" LABEL="tape3" CREATED="2015-06-23T16:43:39.107Z" MIMETYPE="video/quicktime" SIZE="433279317">
<foxml:contentDigest TYPE="MD5" DIGEST="4fb5a23ee4c5d17cbd8b6d1f73fc6b8e"/>
'''
        dsinfo = self.archex.get_datastream_info(multi_dsinfo)
        self.assertEqual('VIDEO.0', dsinfo['id'])
        self.assertEqual('video/quicktime', dsinfo['mimetype'])
        self.assertEqual('433279317', dsinfo['size'])
        self.assertEqual('MD5', dsinfo['type'])
        self.assertEqual('4fb5a23ee4c5d17cbd8b6d1f73fc6b8e', dsinfo['digest'])

    def test_object_data(self):
        # mock API to read export data from a local fixture file
        response = self.session.get('file://%s' % FIXTURES['sync1_export'])
        mockapi = Mock()
        def mock_upload(data, *args, **kwargs):
            list(data)  # consume the generator so datastream processing happens
            return 'uploaded://1'

        mockapi.upload = mock_upload
        mockapi.export.return_value = response
        mockapi.base_url = 'http://fedora.example.co/fedora'
        self.obj.api = self.repo.api = mockapi
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None,
            'object data should be valid xml')
        self.assert_(b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location')

        # other tests?

        # set read block size artificially low to test chunked handling
        self.archex = ArchiveExport(self.obj, self.repo)
        self.archex.read_block_size = 1024
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None,
            'object data should be valid xml')
        self.assert_(b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location')

        # test with second fixture - multiple small encoded datastreams
        self.archex = ArchiveExport(self.obj, self.repo)
        self.archex.read_block_size = 1024
        response = self.session.get('file://%s' % FIXTURES['sync2_export'])
        mockapi.export.return_value = response
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None,
            'object data should be valid xml')
        self.assert_(b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location')

    def test_object_data_split_bincontent(self):
        # explicitly test handling of a binary content tag split across
        # chunk boundaries

        response = self.session.get('file://%s' % FIXTURES['sync1_export'])
        mockapi = Mock()
        def mock_upload(data, *args, **kwargs):
            list(data)  # consume the generator so datastream processing happens
            return 'uploaded://1'

        mockapi.upload = mock_upload
        mockapi.export.return_value = response
        self.obj.api = self.repo.api = mockapi

        # test binary content tag split across chunks
        self.archex = ArchiveExport(self.obj, self.repo)
        # use a block size that will split the fixture in the middle of
        # the first binary content tag
        self.archex.read_block_size = 2688
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None,
            'object data should be valid xml')
        self.assert_(b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')

        self.archex = ArchiveExport(self.obj, self.repo)
        # this blocksize ends with just the < in foxml:binaryContent
        self.archex.read_block_size = 2680
        data = self.archex.object_data()
        foxml = data.getvalue()
        self.assert_(etree.XML(foxml) is not None,
            'object data should be valid xml')
        self.assert_(b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')

        self.archex = ArchiveExport(self.obj, self.repo)
        # this blocksize ends with an unrelated close tag </
        self.archex.read_block_size = 1526
        data = self.archex.object_data()
        foxml = data.getvalue()
        self.assert_(etree.XML(foxml) is not None,
            'object data should be valid xml')
        self.assert_(b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')


    def test_encoded_datastream(self):
        # data content within a single chunk of data
        mockapi = Mock()
        mockapi.export.return_value = self.session.get('file://%s' % FIXTURES['sync1_export'])
        mockapi.upload.return_value = 'uploaded://1'
        self.obj.api = self.repo.api = mockapi

        section = self.archex.get_next_section()
        # get binary datastream info from first section
        dsinfo = self.archex.get_datastream_info(section)
        # fixture only has one binary content block
        # get the binaryContent opening tag out of the way
        self.archex.get_next_section()
        # next section will be file contents
        self.archex.within_file = True
        dscontent = b''.join(self.archex.encoded_datastream())
        # check decoded size and MD5 match data from fixture
        self.assertEqual(int(dsinfo['size']), len(dscontent))
        self.assertEqual(dsinfo['digest'], md5sum(dscontent))

        # data content across multiple chunks
        mockapi.export.return_value = self.session.get('file://%s' % FIXTURES['sync1_export'])
        self.obj.api = self.repo.api = mockapi
        # set read block size artificially low to ensure
        # datastream content is spread across multiple chunks
        self.archex.read_block_size = 1024

        finished = False
        # iterate through the data, similar to object_data method,
        # but only handle binary content
        while not finished:
            try:
                section = self.archex.get_next_section()
            except StopIteration:
                finished = True

            # find the section with starting binary content
            if section == '<foxml:binaryContent>':
                # then decode the subsequent content
                self.archex.within_file = True
                dscontent = b''.join(self.archex.encoded_datastream())

                self.assertEqual(int(dsinfo['size']), len(dscontent))
                self.assertEqual(dsinfo['digest'], md5sum(dscontent))

                # stop processing
                finished = True
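md5sum is imported from elsewhere; for the checks above it only needs to
return the hex MD5 digest of the decoded bytes, matching the DIGEST
attribute of foxml:contentDigest. A drop-in equivalent under that
assumption:

import hashlib

def md5sum(data):
    # hex digest comparable to foxml:contentDigest DIGEST values
    return hashlib.md5(data).hexdigest()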
Example #7
class ArchiveExportTest(unittest.TestCase):
    def setUp(self):
        # todo: use mocks?
        self.repo = Mock(spec=Repository)
        self.obj = Mock()  # spec=DigitalObject
        self.obj.pid = 'synctest:1'
        self.archex = ArchiveExport(self.obj, self.repo)

        # set up a requests session that can load file:// URIs, so
        # fixtures can be used as export data
        self.session = requests.Session()
        self.session.mount('file://', LocalFileAdapter())

    def test_get_datastream_info(self):
        dsinfo = self.archex.get_datastream_info(
            '''<foxml:datastreamVersion ID="DC.2" LABEL="Dublin Core" CREATED="2012-10-11T14:13:03.658Z" MIMETYPE="text/xml" FORMAT_URI="http://www.openarchives.org/OAI/2.0/oai_dc/" SIZE="771">
<foxml:contentDigest TYPE="MD5" DIGEST="f53aec07f2607f536bac7ee03dbbfe7c"/>''')
        self.assertEqual('DC.2', dsinfo['id'])
        self.assertEqual('text/xml', dsinfo['mimetype'])
        self.assertEqual('771', dsinfo['size'])
        self.assertEqual('MD5', dsinfo['type'])
        self.assertEqual('f53aec07f2607f536bac7ee03dbbfe7c', dsinfo['digest'])
        self.assertEqual('2012-10-11T14:13:03.658Z', dsinfo['created'])

        # datastream info split across chunks
        self.archex.end_of_last_chunk = '''<foxml:datastreamVersion ID="DC.2" LABEL="Dublin Core" CREATED="2012-10-11T14:13:03.658Z" MIMETYPE="te'''
        dsinfo = self.archex.get_datastream_info(
            '''xt/xml" FORMAT_URI="http://www.openarchives.org/OAI/2.0/oai_dc/" SIZE="771">
<foxml:contentDigest TYPE="MD5" DIGEST="f53aec07f2607f536bac7ee03dbbfe7c"/>''')
        self.assertEqual('DC.2', dsinfo['id'])
        self.assertEqual('text/xml', dsinfo['mimetype'])
        self.assertEqual('f53aec07f2607f536bac7ee03dbbfe7c', dsinfo['digest'])

        # sample etd record with longer datastream info
        etd_ds = '''</foxml:datastreamVersion><foxml:datastreamVersion ID="RELS-EXT.9" LABEL="Relationships to other objects" CREATED="2009-09-18T19:36:04.235Z" MIMETYPE="application/rdf+xml" FORMAT_URI="info:fedora/fedora-system:FedoraRELSExt-1.0" SIZE="716">
<foxml:contentDigest TYPE="MD5" DIGEST="168fb675e5fcded1a3b8cc7251877744"/>'''

        self.archex.end_of_last_chunk = ''
        dsinfo = self.archex.get_datastream_info(etd_ds)
        self.assertEqual('RELS-EXT.9', dsinfo['id'])
        self.assertEqual('application/rdf+xml', dsinfo['mimetype'])
        self.assertEqual('716', dsinfo['size'])
        self.assertEqual('MD5', dsinfo['type'])
        self.assertEqual('168fb675e5fcded1a3b8cc7251877744', dsinfo['digest'])

        # getting audit record id instead of datastream id
        audit_trail_dsinfo = '''<audit:record ID="AUDREC48">
<audit:process type="Fedora API-M"/>
<audit:action>modifyDatastreamByValue</audit:action>
<audit:componentID>RELS-EXT</audit:componentID>
<audit:responsibility>fedoraAdmin</audit:responsibility>
<audit:date>2016-07-09T10:31:50.971Z</audit:date>
<audit:justification>datastream fixity check</audit:justification>
</audit:record>
</audit:auditTrail>
</foxml:xmlContent>
</foxml:datastreamVersion>
</foxml:datastream>
<foxml:datastream ID="content" STATE="A" CONTROL_GROUP="M" VERSIONABLE="false">
<foxml:datastreamVersion ID="content.0" LABEL="1199disk3" CREATED="2014-04-30T17:30:06.949Z" MIMETYPE="application/x-aff" SIZE="1277">
<foxml:contentDigest TYPE="MD5" DIGEST="2271e4a2678f69ce3f4a97ab07c06cbe"/>
<foxml:binaryContent>'''
        dsinfo = self.archex.get_datastream_info(audit_trail_dsinfo)
        self.assertEqual('content.0', dsinfo['id'])
        self.assertEqual('application/x-aff', dsinfo['mimetype'])
        self.assertEqual('1277', dsinfo['size'])
        self.assertEqual('MD5', dsinfo['type'])
        self.assertEqual('2271e4a2678f69ce3f4a97ab07c06cbe', dsinfo['digest'])

        # chunk contains multiple ds ids
        multi_dsinfo = '''</foxml:binaryContent>
</foxml:datastreamVersion>
<foxml:datastreamVersion ID="MODS.1" LABEL="MODS Metadata" CREATED="2015-06-24T17:26:39.154Z" MIMETYPE="text/xml" FORMAT_URI="http://www.loc.gov/mods/v3" SIZE="345">
<foxml:contentDigest TYPE="MD5" DIGEST="3d866951c5d2f4e665fd518a8d9433f2"/>
<foxml:binaryContent>

</foxml:datastreamVersion>
</foxml:datastream>
<foxml:datastream ID="VIDEO" STATE="A" CONTROL_GROUP="M" VERSIONABLE="true">
<foxml:datastreamVersion ID="VIDEO.0" LABEL="tape3" CREATED="2015-06-23T16:43:39.107Z" MIMETYPE="video/quicktime" SIZE="433279317">
<foxml:contentDigest TYPE="MD5" DIGEST="4fb5a23ee4c5d17cbd8b6d1f73fc6b8e"/>
'''
        dsinfo = self.archex.get_datastream_info(multi_dsinfo)
        self.assertEqual('VIDEO.0', dsinfo['id'])
        self.assertEqual('video/quicktime', dsinfo['mimetype'])
        self.assertEqual('433279317', dsinfo['size'])
        self.assertEqual('MD5', dsinfo['type'])
        self.assertEqual('4fb5a23ee4c5d17cbd8b6d1f73fc6b8e', dsinfo['digest'])

    def test_object_data(self):
        # mock API to read export data from a local fixture file
        response = self.session.get('file://%s' % FIXTURES['sync1_export'])
        mockapi = Mock()

        def mock_upload(data, *args, **kwargs):
            list(data)  # consume the generator so datastream processing happens
            return 'uploaded://1'

        mockapi.upload = mock_upload
        mockapi.export.return_value = response
        mockapi.base_url = 'http://fedora.example.co/fedora'
        self.obj.api = self.repo.api = mockapi
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location'
        )

        # other tests?

        # set read block size artificially low to test chunked handling
        self.archex = ArchiveExport(self.obj, self.repo)
        self.archex.read_block_size = 1024
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location'
        )

        # test with second fixture - multiple small encoded datastreams
        self.archex = ArchiveExport(self.obj, self.repo)
        self.archex.read_block_size = 1024
        response = self.session.get('file://%s' % FIXTURES['sync2_export'])
        mockapi.export.return_value = response
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location'
        )

    def test_object_data_split_bincontent(self):
        # explicitly test handling of a binary content tag split across
        # chunk boundaries

        response = self.session.get('file://%s' % FIXTURES['sync1_export'])
        mockapi = Mock()

        def mock_upload(data, *args, **kwargs):
            list(data)  # consume the generator so datastream processing happens
            return 'uploaded://1'

        mockapi.upload = mock_upload
        mockapi.export.return_value = response
        self.obj.api = self.repo.api = mockapi

        # test binary content tag split across chunks
        self.archex = ArchiveExport(self.obj, self.repo)
        # use a block size that will split the fixture in the middle of
        # the first binary content tag
        self.archex.read_block_size = 2688
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')

        self.archex = ArchiveExport(self.obj, self.repo)
        # this blocksize ends with just the < in foxml:binaryContent
        self.archex.read_block_size = 2680
        data = self.archex.object_data()
        foxml = data.getvalue()
        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')

        self.archex = ArchiveExport(self.obj, self.repo)
        # this blocksize ends with an unrelated close tag </
        self.archex.read_block_size = 1526
        data = self.archex.object_data()
        foxml = data.getvalue()
        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')

    def test_encoded_datastream(self):
        # data content within a single chunk of data
        mockapi = Mock()
        mockapi.export.return_value = self.session.get(
            'file://%s' % FIXTURES['sync1_export'])
        mockapi.upload.return_value = 'uploaded://1'
        self.obj.api = self.repo.api = mockapi

        section = self.archex.get_next_section()
        # get binary datastream info from first section
        dsinfo = self.archex.get_datastream_info(section)
        # fixture only has one binary content block
        # get the binaryContent opening tag out of the way
        self.archex.get_next_section()
        # next section will be file contents
        self.archex.within_file = True
        dscontent = b''.join(self.archex.encoded_datastream())
        # check decoded size and MD5 match data from fixture
        self.assertEqual(int(dsinfo['size']), len(dscontent))
        self.assertEqual(dsinfo['digest'], md5sum(dscontent))

        # data content across multiple chunks
        mockapi.export.return_value = self.session.get(
            'file://%s' % FIXTURES['sync1_export'])
        self.obj.api = self.repo.api = mockapi
        # set read block size artificially low to ensure
        # datastream content is spread across multiple chunks
        self.archex.read_block_size = 1024

        finished = False
        # iterate through the data, similar to object_data method,
        # but only handle binary content
        while not finished:
            try:
                section = self.archex.get_next_section()
            except StopIteration:
                finished = True

            # find the section with starting binary content
            if section == '<foxml:binaryContent>':
                # then decode the subsequent content
                self.archex.within_file = True
                dscontent = b''.join(self.archex.encoded_datastream())

                self.assertEqual(int(dsinfo['size']), len(dscontent))
                self.assertEqual(dsinfo['digest'], md5sum(dscontent))

                # stop processing
                finished = True
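The assertions in test_get_datastream_info above pin down the behaviour
expected of get_datastream_info: it reads ID, CREATED, MIMETYPE, and SIZE
from foxml:datastreamVersion and TYPE and DIGEST from foxml:contentDigest;
it prepends end_of_last_chunk so attributes split across a read boundary
still parse; it ignores audit:record IDs; and when a chunk holds several
datastream versions, the last complete one wins. A hypothetical regex-based
reconstruction of that behaviour (the real eulfedora implementation may
differ):

import re

DSINFO_RE = re.compile(
    r'<foxml:datastreamVersion[^>]* ID="(?P<id>[^"]+)"'
    r'[^>]* CREATED="(?P<created>[^"]+)"'
    r'[^>]* MIMETYPE="(?P<mimetype>[^"]+)"'
    r'[^>]* SIZE="(?P<size>\d+)"[^>]*>\s*'
    r'<foxml:contentDigest[^>]* TYPE="(?P<type>[^"]+)"'
    r'[^>]* DIGEST="(?P<digest>[^"]+)"')

def get_datastream_info(chunk, end_of_last_chunk=''):
    # prepend the tail of the previous chunk so attributes split across
    # a read boundary can still be matched
    matches = list(DSINFO_RE.finditer(end_of_last_chunk + chunk))
    if matches:
        # with several datastream versions in one chunk, the last one
        # is the datastream whose binary content follows
        return matches[-1].groupdict()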
Example #8
    def test_object_data(self):
        # mock API to read export data from a local fixture file
        response = self.session.get('file://%s' % FIXTURES['sync1_export'])
        mockapi = Mock()

        def mock_upload(data, *args, **kwargs):
            list(data)  # consume the generator so datastream processing happens
            return 'uploaded://1'

        mockapi.upload = mock_upload
        mockapi.export.return_value = response
        mockapi.base_url = 'http://fedora.example.co/fedora'
        self.obj.api = self.repo.api = mockapi
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location'
        )

        # other tests?

        # set read block size artificially low to test chunked handling
        self.archex = ArchiveExport(self.obj, self.repo)
        self.archex.read_block_size = 1024
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location'
        )

        # test with second fixture - multiple small encoded datastreams
        self.archex = ArchiveExport(self.obj, self.repo)
        self.archex.read_block_size = 1024
        response = self.session.get('file://%s' % FIXTURES['sync2_export'])
        mockapi.export.return_value = response
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location'
        )
Example #9
class ArchiveExportTest(unittest.TestCase):
    def setUp(self):
        # todo: use mocks?
        self.repo = Mock(spec=Repository)
        self.obj = Mock()  # spec=DigitalObject
        self.obj.pid = "synctest:1"
        self.archex = ArchiveExport(self.obj, self.repo)

        # set up a requests session that can load file:// URIs, so
        # fixtures can be used as export data
        self.session = requests.Session()
        self.session.mount("file://", LocalFileAdapter())

    def test_get_datastream_info(self):
        dsinfo = self.archex.get_datastream_info(
            """<foxml:datastreamVersion ID="DC.2" LABEL="Dublin Core" CREATED="2012-10-11T14:13:03.658Z" MIMETYPE="text/xml" FORMAT_URI="http://www.openarchives.org/OAI/2.0/oai_dc/" SIZE="771">
<foxml:contentDigest TYPE="MD5" DIGEST="f53aec07f2607f536bac7ee03dbbfe7c"/>"""
        )
        self.assertEqual("DC.2", dsinfo["id"])
        self.assertEqual("text/xml", dsinfo["mimetype"])
        self.assertEqual("771", dsinfo["size"])
        self.assertEqual("MD5", dsinfo["type"])
        self.assertEqual("f53aec07f2607f536bac7ee03dbbfe7c", dsinfo["digest"])
        self.assertEqual("2012-10-11T14:13:03.658Z", dsinfo["created"])

        # datastream info split across chunks
        self.archex.end_of_last_chunk = (
            """<foxml:datastreamVersion ID="DC.2" LABEL="Dublin Core" CREATED="2012-10-11T14:13:03.658Z" MIMETYPE="te"""
        )
        dsinfo = self.archex.get_datastream_info(
            """xt/xml" FORMAT_URI="http://www.openarchives.org/OAI/2.0/oai_dc/" SIZE="771">
<foxml:contentDigest TYPE="MD5" DIGEST="f53aec07f2607f536bac7ee03dbbfe7c"/>"""
        )
        self.assertEqual("DC.2", dsinfo["id"])
        self.assertEqual("text/xml", dsinfo["mimetype"])
        self.assertEqual("f53aec07f2607f536bac7ee03dbbfe7c", dsinfo["digest"])

        # sample etd record with longer datastream info
        etd_ds = """</foxml:datastreamVersion><foxml:datastreamVersion ID="RELS-EXT.9" LABEL="Relationships to other objects" CREATED="2009-09-18T19:36:04.235Z" MIMETYPE="application/rdf+xml" FORMAT_URI="info:fedora/fedora-system:FedoraRELSExt-1.0" SIZE="716">
<foxml:contentDigest TYPE="MD5" DIGEST="168fb675e5fcded1a3b8cc7251877744"/>"""

        self.archex.end_of_last_chunk = ""
        dsinfo = self.archex.get_datastream_info(etd_ds)
        self.assertEqual("RELS-EXT.9", dsinfo["id"])
        self.assertEqual("application/rdf+xml", dsinfo["mimetype"])
        self.assertEqual("716", dsinfo["size"])
        self.assertEqual("MD5", dsinfo["type"])
        self.assertEqual("168fb675e5fcded1a3b8cc7251877744", dsinfo["digest"])

    def test_object_data(self):
        # mock API to read export data from a local fixture file
        response = self.session.get("file://%s" % FIXTURES["sync1_export"])
        mockapi = Mock()

        def mock_upload(data, *args, **kwargs):
            list(data)  # consume the generator so datastream processing happens
            return "uploaded://1"

        mockapi.upload = mock_upload
        mockapi.export.return_value = response
        mockapi.base_url = "http://fedora.example.co/fedora"
        self.obj.api = self.repo.api = mockapi
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None, "object data should be valid xml")
        self.assert_(
            b"foxml:binaryContent" not in foxml, "object data for ingest should not include binaryContent tags"
        )
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            "object data for ingest should include upload id as content location",
        )

        # other tests?

        # set read block size artificially low to test chunked handling
        self.archex = ArchiveExport(self.obj, self.repo)
        self.archex.read_block_size = 1024
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None, "object data should be valid xml")
        self.assert_(
            b"foxml:binaryContent" not in foxml, "object data for ingest should not include binaryContent tags"
        )
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            "object data for ingest should include upload id as content location",
        )

        # test with second fixture - multiple small encoded datastreams
        self.archex = ArchiveExport(self.obj, self.repo)
        self.archex.read_block_size = 1024
        response = self.session.get("file://%s" % FIXTURES["sync2_export"])
        mockapi.export.return_value = response
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None, "object data should be valid xml")
        self.assert_(
            b"foxml:binaryContent" not in foxml, "object data for ingest should not include binaryContent tags"
        )
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            "object data for ingest should include upload id as content location",
        )

    def test_object_data_split_bincontent(self):
        # explicitly test handling of a binary content tag split across
        # chunk boundaries

        response = self.session.get("file://%s" % FIXTURES["sync1_export"])
        mockapi = Mock()

        def mock_upload(data, *args, **kwargs):
            list(data)  # consume the generator so datastream processing happens
            return "uploaded://1"

        mockapi.upload = mock_upload
        mockapi.export.return_value = response
        self.obj.api = self.repo.api = mockapi

        # test binary content tag split across chunks
        self.archex = ArchiveExport(self.obj, self.repo)
        # use a block size that will split the fixture in the middle of
        # the first binary content tag
        self.archex.read_block_size = 2688
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None, "object data should be valid xml")
        self.assert_(
            b"foxml:binaryContent" not in foxml, "object data for ingest should not include binaryContent tags"
        )

        self.archex = ArchiveExport(self.obj, self.repo)
        # this blocksize ends with just the < in foxml:binaryContent
        self.archex.read_block_size = 2680
        data = self.archex.object_data()
        foxml = data.getvalue()
        self.assert_(etree.XML(foxml) is not None, "object data should be valid xml")
        self.assert_(
            b"foxml:binaryContent" not in foxml, "object data for ingest should not include binaryContent tags"
        )

        self.archex = ArchiveExport(self.obj, self.repo)
        # this blocksize ends with an unrelated close tag </
        self.archex.read_block_size = 1526
        data = self.archex.object_data()
        foxml = data.getvalue()
        self.assert_(etree.XML(foxml) is not None, "object data should be valid xml")
        self.assert_(
            b"foxml:binaryContent" not in foxml, "object data for ingest should not include binaryContent tags"
        )

    def test_encoded_datastream(self):
        # data content within a single chunk of data
        mockapi = Mock()
        mockapi.export.return_value = self.session.get("file://%s" % FIXTURES["sync1_export"])
        mockapi.upload.return_value = "uploaded://1"
        self.obj.api = self.repo.api = mockapi

        section = self.archex.get_next_section()
        # get binary datastream info from first section
        dsinfo = self.archex.get_datastream_info(section)
        # fixture only has one binary content block
        # get the binaryContent opening tag out of the way
        self.archex.get_next_section()
        # next section will be file contents
        self.archex.within_file = True
        dscontent = b"".join(self.archex.encoded_datastream())
        # check decoded size and MD5 match data from fixture
        self.assertEqual(int(dsinfo["size"]), len(dscontent))
        self.assertEqual(dsinfo["digest"], md5sum(dscontent))

        # data content across multiple chunks
        mockapi.export.return_value = self.session.get("file://%s" % FIXTURES["sync1_export"])
        self.obj.api = self.repo.api = mockapi
        # set read block size artificially low to ensure
        # datastream content is spread across multiple chunks
        self.archex.read_block_size = 1024

        finished = False
        # iterate through the data, similar to object_data method,
        # but only handle binary content
        while not finished:
            try:
                section = self.archex.get_next_section()
            except StopIteration:
                finished = True

            # find the section with starting binary content
            if section == "<foxml:binaryContent>":
                # then decode the subsequent content
                self.archex.within_file = True
                dscontent = "".join(self.archex.encoded_datastream())

                self.assertEqual(int(dsinfo["size"]), len(dscontent))
                self.assertEqual(dsinfo["digest"], md5sum(dscontent))

                # stop processing
                finished = True
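The loop above drives the same streaming decode that object_data relies on:
encoded_datastream yields decoded bytes while the base64 payload arrives in
arbitrary chunk sizes. The core trick such a decoder needs, shown as a
sketch (assuming whitespace is already stripped out; not eulfedora's actual
code), is to decode only in multiples of four characters and carry the
remainder into the next chunk:

import base64

def decode_chunks(chunks):
    leftover = b''
    for chunk in chunks:
        data = leftover + chunk
        usable = len(data) - (len(data) % 4)  # base64 quantum is 4 chars
        leftover = data[usable:]
        if usable:
            yield base64.b64decode(data[:usable])
    if leftover:
        yield base64.b64decode(leftover)  # raises if padding is broken

content = b''.join(decode_chunks([b'aGVsbG8g', b'd29y', b'bGQ=']))
assert content == b'hello world'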
Example #10
class ArchiveExportTest(unittest.TestCase):


    def setUp(self):
        # todo: use mocks?
        self.repo = Mock(spec=Repository)
        self.obj = Mock()  # spec=DigitalObject
        self.obj.pid = 'synctest:1'
        self.archex = ArchiveExport(self.obj, self.repo)

        # set up a requests session that can load file:// URIs, so
        # fixtures can be used as export data
        self.session = requests.Session()
        self.session.mount('file://', LocalFileAdapter())

    def test_get_datastream_info(self):
        dsinfo = self.archex.get_datastream_info('''<foxml:datastreamVersion ID="DC.2" LABEL="Dublin Core" CREATED="2012-10-11T14:13:03.658Z" MIMETYPE="text/xml" FORMAT_URI="http://www.openarchives.org/OAI/2.0/oai_dc/" SIZE="771">
<foxml:contentDigest TYPE="MD5" DIGEST="f53aec07f2607f536bac7ee03dbbfe7c"/>''')
        self.assertEqual('DC.2', dsinfo['id'])
        self.assertEqual('text/xml', dsinfo['mimetype'])
        self.assertEqual('771', dsinfo['size'])
        self.assertEqual('MD5', dsinfo['type'])
        self.assertEqual('f53aec07f2607f536bac7ee03dbbfe7c', dsinfo['digest'])
        self.assertEqual('2012-10-11T14:13:03.658Z', dsinfo['created'])

        # datastream info split across chunks
        self.archex.end_of_last_chunk = '''<foxml:datastreamVersion ID="DC.2" LABEL="Dublin Core" CREATED="2012-10-11T14:13:03.658Z" MIMETYPE="te'''
        dsinfo = self.archex.get_datastream_info('''xt/xml" FORMAT_URI="http://www.openarchives.org/OAI/2.0/oai_dc/" SIZE="771">
<foxml:contentDigest TYPE="MD5" DIGEST="f53aec07f2607f536bac7ee03dbbfe7c"/>''')
        self.assertEqual('DC.2', dsinfo['id'])
        self.assertEqual('text/xml', dsinfo['mimetype'])
        self.assertEqual('f53aec07f2607f536bac7ee03dbbfe7c', dsinfo['digest'])

    def test_object_data(self):
        # mock API to read export data from a local fixture file
        response = self.session.get('file://%s' % FIXTURES['sync1_export'])
        mockapi = Mock()
        def mock_upload(data, *args, **kwargs):
            list(data)  # consume the generator so datastream processing happens
            return 'uploaded://1'

        mockapi.upload = mock_upload
        mockapi.export.return_value = response
        mockapi.base_url = 'http://fedora.example.co/fedora'
        self.obj.api = self.repo.api = mockapi
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None,
            'object data should be valid xml')
        self.assert_(b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location')

        # other tests?

        # set read block size artificially low to test chunked handling
        self.archex = ArchiveExport(self.obj, self.repo)
        self.archex.read_block_size = 1024
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None,
            'object data should be valid xml')
        self.assert_(b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location')

        # test with second fixture - multiple small encoded datastreams
        self.archex = ArchiveExport(self.obj, self.repo)
        self.archex.read_block_size = 1024
        response = self.session.get('file://%s' % FIXTURES['sync2_export'])
        mockapi.export.return_value = response
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None,
            'object data should be valid xml')
        self.assert_(b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location')

    def test_object_data_split_bincontent(self):
        # explicitly test handling of a binary content tag split across
        # chunk boundaries

        response = self.session.get('file://%s' % FIXTURES['sync1_export'])
        mockapi = Mock()
        def mock_upload(data, *args, **kwargs):
            list(data)  # consume the generator so datastream processing happens
            return 'uploaded://1'

        mockapi.upload = mock_upload
        mockapi.export.return_value = response
        self.obj.api = self.repo.api = mockapi

        # test binary content tag split across chunks
        self.archex = ArchiveExport(self.obj, self.repo)
        # use a block size that will split the fixture in the middle of
        # the first binary content tag
        self.archex.read_block_size = 2688
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(etree.XML(foxml) is not None,
            'object data should be valid xml')
        self.assert_(b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')

        self.archex = ArchiveExport(self.obj, self.repo)
        # this blocksize ends with just the < in foxml:binaryContent
        self.archex.read_block_size = 2680
        data = self.archex.object_data()
        foxml = data.getvalue()
        self.assert_(etree.XML(foxml) is not None,
            'object data should be valid xml')
        self.assert_(b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')

        self.archex = ArchiveExport(self.obj, self.repo)
        # this blocksize ends with an unrelated close tag </
        self.archex.read_block_size = 1526
        data = self.archex.object_data()
        foxml = data.getvalue()
        self.assert_(etree.XML(foxml) is not None,
            'object data should be valid xml')
        self.assert_(b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')


    def test_encoded_datastream(self):
        # data content within a single chunk of data
        mockapi = Mock()
        mockapi.export.return_value = self.session.get('file://%s' % FIXTURES['sync1_export'])
        mockapi.upload.return_value = 'uploaded://1'
        self.obj.api = self.repo.api = mockapi

        section = self.archex.get_next_section()
        # get binary datastream info from first section
        dsinfo = self.archex.get_datastream_info(section)
        # fixture only has one binary content block
        # get the binaryContent opening tag out of the way
        self.archex.get_next_section()
        # next section will be file contents
        self.archex.within_file = True
        dscontent = b''.join(self.archex.encoded_datastream())
        # check decoded size and MD5 match data from fixture
        self.assertEqual(int(dsinfo['size']), len(dscontent))
        self.assertEqual(dsinfo['digest'], md5sum(dscontent))

        # data content across multiple chunks
        mockapi.export.return_value = self.session.get('file://%s' % FIXTURES['sync1_export'])
        self.obj.api = self.repo.api = mockapi
        # set read block size artificially low to ensure
        # datastream content is spread across multiple chunks
        self.archex.read_block_size = 1024

        finished = False
        # iterate through the data, similar to object_data method,
        # but only handle binary content
        while not finished:
            try:
                section = self.archex.get_next_section()
            except StopIteration:
                finished = True

            # find the section with starting binary content
            if section == '<foxml:binaryContent>':
                # then decode the subsequent content
                self.archex.within_file = True
                dscontent = b''.join(self.archex.encoded_datastream())

                self.assertEqual(int(dsinfo['size']), len(dscontent))
                self.assertEqual(dsinfo['digest'], md5sum(dscontent))

                # stop processing
                finished = True
Example #11
class ArchiveExportTest(unittest.TestCase):
    def setUp(self):
        # todo: use mocks?
        self.repo = Mock(spec=Repository)
        self.obj = Mock()  # spec=DigitalObject
        self.obj.pid = 'synctest:1'
        self.archex = ArchiveExport(self.obj, self.repo)

        # set up a requests session that can load file:// URIs, so
        # fixtures can be used as export data
        self.session = requests.Session()
        self.session.mount('file://', LocalFileAdapter())

    def test_get_datastream_info(self):
        dsinfo = self.archex.get_datastream_info(
            '''<foxml:datastreamVersion ID="DC.2" LABEL="Dublin Core" CREATED="2012-10-11T14:13:03.658Z" MIMETYPE="text/xml" FORMAT_URI="http://www.openarchives.org/OAI/2.0/oai_dc/" SIZE="771">
<foxml:contentDigest TYPE="MD5" DIGEST="f53aec07f2607f536bac7ee03dbbfe7c"/>''')
        self.assertEqual('DC.2', dsinfo['id'])
        self.assertEqual('text/xml', dsinfo['mimetype'])
        self.assertEqual('771', dsinfo['size'])
        self.assertEqual('MD5', dsinfo['type'])
        self.assertEqual('f53aec07f2607f536bac7ee03dbbfe7c', dsinfo['digest'])
        self.assertEqual('2012-10-11T14:13:03.658Z', dsinfo['created'])

        # datastream info split across chunks
        self.archex.end_of_last_chunk = '''<foxml:datastreamVersion ID="DC.2" LABEL="Dublin Core" CREATED="2012-10-11T14:13:03.658Z" MIMETYPE="te'''
        dsinfo = self.archex.get_datastream_info(
            '''xt/xml" FORMAT_URI="http://www.openarchives.org/OAI/2.0/oai_dc/" SIZE="771">
<foxml:contentDigest TYPE="MD5" DIGEST="f53aec07f2607f536bac7ee03dbbfe7c"/>''')
        self.assertEqual('DC.2', dsinfo['id'])
        self.assertEqual('text/xml', dsinfo['mimetype'])
        self.assertEqual('f53aec07f2607f536bac7ee03dbbfe7c', dsinfo['digest'])

        # sample etd record with longer datastream info
        etd_ds = '''</foxml:datastreamVersion><foxml:datastreamVersion ID="RELS-EXT.9" LABEL="Relationships to other objects" CREATED="2009-09-18T19:36:04.235Z" MIMETYPE="application/rdf+xml" FORMAT_URI="info:fedora/fedora-system:FedoraRELSExt-1.0" SIZE="716">
<foxml:contentDigest TYPE="MD5" DIGEST="168fb675e5fcded1a3b8cc7251877744"/>'''

        self.archex.end_of_last_chunk = ''
        dsinfo = self.archex.get_datastream_info(etd_ds)
        self.assertEqual('RELS-EXT.9', dsinfo['id'])
        self.assertEqual('application/rdf+xml', dsinfo['mimetype'])
        self.assertEqual('716', dsinfo['size'])
        self.assertEqual('MD5', dsinfo['type'])
        self.assertEqual('168fb675e5fcded1a3b8cc7251877744', dsinfo['digest'])

    def test_object_data(self):
        # mock API to read export data from a local fixture file
        response = self.session.get('file://%s' % FIXTURES['sync1_export'])
        mockapi = Mock()

        def mock_upload(data, *args, **kwargs):
            list(data)  # consume the generator so datastream processing happens
            return 'uploaded://1'

        mockapi.upload = mock_upload
        mockapi.export.return_value = response
        mockapi.base_url = 'http://fedora.example.co/fedora'
        self.obj.api = self.repo.api = mockapi
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location'
        )

        # other tests?

        # set read block size artificially low to test chunked handling
        self.archex = ArchiveExport(self.obj, self.repo)
        self.archex.read_block_size = 1024
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location'
        )

        # test with second fixture - multiple small encoded datastreams
        self.archex = ArchiveExport(self.obj, self.repo)
        self.archex.read_block_size = 1024
        response = self.session.get('file://%s' % FIXTURES['sync2_export'])
        mockapi.export.return_value = response
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')
        self.assert_(
            b'<foxml:contentLocation REF="uploaded://1" TYPE="URL"/>' in foxml,
            'object data for ingest should include upload id as content location'
        )

    def test_object_data_split_bincontent(self):
        # explicitly test handling of a binary content tag split across
        # chunk boundaries

        response = self.session.get('file://%s' % FIXTURES['sync1_export'])
        mockapi = Mock()

        def mock_upload(data, *args, **kwargs):
            list(data)  # consume the generator so datastream processing happens
            return 'uploaded://1'

        mockapi.upload = mock_upload
        mockapi.export.return_value = response
        self.obj.api = self.repo.api = mockapi

        # test binary content tag split across chunks
        self.archex = ArchiveExport(self.obj, self.repo)
        # use a block size that will split the fixture in the middle of
        # the first binary content tag
        self.archex.read_block_size = 2688
        data = self.archex.object_data()
        foxml = data.getvalue()

        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')

        self.archex = ArchiveExport(self.obj, self.repo)
        # this blocksize ends with just the < in foxml:binaryContent
        self.archex.read_block_size = 2680
        data = self.archex.object_data()
        foxml = data.getvalue()
        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')

        self.archex = ArchiveExport(self.obj, self.repo)
        # this blocksize ends with an unrelated close tag </
        self.archex.read_block_size = 1526
        data = self.archex.object_data()
        foxml = data.getvalue()
        self.assert_(
            etree.XML(foxml) is not None, 'object data should be valid xml')
        self.assert_(
            b'foxml:binaryContent' not in foxml,
            'object data for ingest should not include binaryContent tags')

    def test_encoded_datastream(self):
        # data content within a single chunk of data
        mockapi = Mock()
        mockapi.export.return_value = self.session.get(
            'file://%s' % FIXTURES['sync1_export'])
        mockapi.upload.return_value = 'uploaded://1'
        self.obj.api = self.repo.api = mockapi

        section = self.archex.get_next_section()
        # get binary datastream info from first section
        dsinfo = self.archex.get_datastream_info(section)
        # fixture only has one binary content block
        # get the binaryContent opening tag out of the way
        self.archex.get_next_section()
        # next section will be file contents
        self.archex.within_file = True
        dscontent = b''.join(self.archex.encoded_datastream())
        # check decoded size and MD5 match data from fixture
        self.assertEqual(int(dsinfo['size']), len(dscontent))
        self.assertEqual(dsinfo['digest'], md5sum(dscontent))

        # data content across multiple chunks
        mockapi.export.return_value = self.session.get(
            'file://%s' % FIXTURES['sync1_export'])
        self.obj.api = self.repo.api = mockapi
        # set read block size artificially low to ensure
        # datastream content is spread across multiple chunks
        self.archex.read_block_size = 1024

        finished = False
        # iterate through the data, similar to object_data method,
        # but only handle binary content
        while not finished:
            try:
                section = self.archex.get_next_section()
            except StopIteration:
                finished = True

            # find the section with starting binary content
            if section == '<foxml:binaryContent>':
                # then decode the subsequent content
                self.archex.within_file = True
                dscontent = b''.join(self.archex.encoded_datastream())

                self.assertEqual(int(dsinfo['size']), len(dscontent))
                self.assertEqual(dsinfo['digest'], md5sum(dscontent))

                # stop processing
                finished = True