def test_build_ex_18(self):
    """Build a two-entry Resource Dump Manifest with metadata and compare its XML.

    The expected document is the 'resourcesync_ex_18' example obtained
    through self._open_ex.
    """
    manifest = ResourceDumpManifest()
    manifest.up = 'http://example.com/dataset1/capabilitylist.xml'
    manifest.md_at = "2013-01-03T09:00:00Z"
    manifest.md_completed = "2013-01-03T09:02:00Z"

    # First entry: HTML resource carrying only an md5 digest.
    html_resource = Resource(
        uri='http://example.com/res1',
        lastmod='2013-01-02T13:00:00Z',
        md5='1584abdf8ebdc9802ac0c6a7402c03b6',
        length=8876,
        mime_type='text/html',
        path='/resources/res1',
    )
    manifest.add(html_resource)

    # Second entry: PDF resource carrying both md5 and sha256 digests.
    pdf_resource = Resource(
        uri='http://example.com/res2',
        lastmod='2013-01-02T14:00:00Z',
        md5='1e0d5cb8ef6ba40c99b14c0237be735e',
        sha256='854f61290e2e197a11bc91063afce22e43f8ccc655237050ace766adc68dc784',
        length=14599,
        mime_type='application/pdf',
        path='/resources/res2',
    )
    manifest.add(pdf_resource)

    expected_xml = self._open_ex('resourcesync_ex_18').read()
    self._assert_xml_equal(manifest.as_xml(), expected_xml)
Example #2
0
    def write_zip(self, resources=None, dumpfile=None):
        """Write a ZIP format dump file.

        Writes a ZIP file containing the resources in the iterable resources along with
        a manifest file manifest.xml (written first). No checks on the size of files
        or total size are performed, this is expected to have been done beforehand.

        Side effect: the ``path`` attribute of every resource is replaced in
        place by the value returned from ``self.archive_path`` for it.

        :param resources: iterable of resources; it is iterated twice, so it
            must be re-iterable (not a one-shot generator).
        :param dumpfile: path of the ZIP file to create.
        """
        compression = ZIP_DEFLATED if self.compress else ZIP_STORED
        # The context manager guarantees the archive is closed even when
        # manifest generation or one of the file writes raises.
        with ZipFile(dumpfile,
                     mode="w",
                     compression=compression,
                     allowZip64=True) as zf:
            # The manifest goes in first, but it must list in-archive paths,
            # so rewrite each resource path before serializing.
            # NOTE(review): presumably the manifest references the same
            # resource objects, so the rewritten paths show up in as_xml()
            # -- confirm.
            rdm = ResourceDumpManifest(resources=resources)
            real_path = {}
            for resource in resources:
                archive_path = self.archive_path(resource.path)
                real_path[archive_path] = resource.path
                resource.path = archive_path
            zf.writestr('manifest.xml', rdm.as_xml())
            # Add all files in the resources, read from their original
            # locations and stored under their in-archive paths.
            for resource in resources:
                zf.write(real_path[resource.path], arcname=resource.path)
        zipsize = os.path.getsize(dumpfile)
        self.logger.info("Wrote ZIP file dump %s with size %d bytes" %
                         (dumpfile, zipsize))
Example #3
0
 def test01_as_xml(self):
     """Serialize a two-resource manifest and check the capability and first entry."""
     manifest = ResourceDumpManifest()
     for name, stamp in (('a.zip', 1), ('b.zip', 2)):
         manifest.add(Resource(name, timestamp=stamp))
     serialized = manifest.as_xml()

     # The manifest-level <rs:md> element must declare the capability.
     capability_match = re.search(
         r'<rs:md .*capability="resourcedump-manifest"', serialized)
     self.assertTrue(capability_match, 'XML has capability')

     # timestamp=1 is expected to serialize as one second past the epoch.
     resource_match = re.search(
         r'<url><loc>a.zip</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>',
         serialized)
     self.assertTrue(resource_match, 'XML has resource a')
 def test01_as_xml(self):
     """Check that as_xml() output names the capability and contains resource 'a.zip'."""
     dump_manifest = ResourceDumpManifest()
     first = Resource('a.zip', timestamp=1)
     dump_manifest.add(first)
     second = Resource('b.zip', timestamp=2)
     dump_manifest.add(second)
     output = dump_manifest.as_xml()

     capability_pattern = r'<rs:md .*capability="resourcedump-manifest"'
     self.assertTrue(re.search(capability_pattern, output),
                     'XML has capability')

     # Full <url> element expected for the first resource.
     entry_pattern = r'<url><loc>a.zip</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>'
     self.assertTrue(re.search(entry_pattern, output), 'XML has resource a')
Example #5
0
 def test00_dump_zip_resource_list(self):
     """Write a two-resource manifest to a ZIP dump and check the archive produced."""
     manifest = ResourceDumpManifest()
     manifest.add(
         Resource('http://ex.org/a', length=7, path='resync/test/testdata/a'))
     manifest.add(
         Resource('http://ex.org/b', length=21, path='resync/test/testdata/b'))
     dumper = Dump()
     zip_path = os.path.join(self.tmpdir, "test00_dump.zip")
     dumper.write_zip(resources=manifest, dumpfile=zip_path)  # named args

     self.assertTrue(os.path.exists(zip_path))
     self.assertTrue(zipfile.is_zipfile(zip_path))

     archive = zipfile.ZipFile(zip_path, 'r')
     # Three members expected -- presumably the two resources plus the manifest.
     member_count = len(archive.namelist())
     self.assertEqual(member_count, 3)
     archive.close()
     os.unlink(zip_path)
Example #6
0
 def test_ex_05(self):
     """Parse resourcesync_ex_5.xml and verify both listed resources.

     The example is a simple resource dump manifest with two files listed.
     """
     manifest = ResourceDumpManifest()
     manifest.parse('tests/testdata/examples_from_spec/resourcesync_ex_5.xml')
     self.assertEqual(len(manifest.resources), 2, '2 resources')
     self.assertEqual(
         sorted(manifest.uris()),
         ['http://example.com/res1', 'http://example.com/res2'])

     # (uri, lastmod, md5, path) expected for each entry, in check order.
     expected_entries = (
         ('http://example.com/res1', '2013-01-03T03:00:00Z',
          '1584abdf8ebdc9802ac0c6a7402c03b6', '/resources/res1'),
         ('http://example.com/res2', '2013-01-03T04:00:00Z',
          '1e0d5cb8ef6ba40c99b14c0237be735e', '/resources/res2'),
     )
     for uri, lastmod, md5, path in expected_entries:
         self.assertEqual(manifest.resources[uri].lastmod, lastmod)
         self.assertEqual(manifest.resources[uri].md5, md5)
         self.assertEqual(manifest.resources[uri].path, path)
    def test10_parse(self):
        xml='<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md at="2013-08-08" capability="resourcedump-manifest"/>\
<url><loc>http://example.com/res1</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="12" path="/res1" /></url>\
<url><loc>http://example.com/res2</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" path="/res2"/></url>\
</urlset>'
        rdm=ResourceDumpManifest()
        rdm.parse(fh=StringIO.StringIO(xml))
        self.assertEqual( len(rdm.resources), 2, 'got 2 resource dumps')
        self.assertEqual( rdm.capability, 'resourcedump-manifest', 'capability set' )
        self.assertEqual( rdm.md_at, '2013-08-08' )
        self.assertTrue( 'http://example.com/res1' in rdm.resources )
        self.assertTrue( rdm.resources['http://example.com/res1'].length, 12 )
        self.assertTrue( rdm.resources['http://example.com/res1'].path, '/res1' )
        self.assertTrue( 'http://example.com/res2' in rdm.resources )
        self.assertTrue( rdm.resources['http://example.com/res2'].path, '/res2' )
    def test10_parse(self):
        xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md at="2013-08-08" capability="resourcedump-manifest"/>\
<url><loc>http://example.com/res1</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="12" path="/res1" /></url>\
<url><loc>http://example.com/res2</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" path="/res2"/></url>\
</urlset>'

        rdm = ResourceDumpManifest()
        rdm.parse(fh=StringIO.StringIO(xml))
        self.assertEqual(len(rdm.resources), 2, 'got 2 resource dumps')
        self.assertEqual(rdm.capability, 'resourcedump-manifest',
                         'capability set')
        self.assertEqual(rdm.md_at, '2013-08-08')
        self.assertTrue('http://example.com/res1' in rdm.resources)
        self.assertTrue(rdm.resources['http://example.com/res1'].length, 12)
        self.assertTrue(rdm.resources['http://example.com/res1'].path, '/res1')
        self.assertTrue('http://example.com/res2' in rdm.resources)
        self.assertTrue(rdm.resources['http://example.com/res2'].path, '/res2')
    def test12_parse_bad_capability(self):
        """Parsing a manifest whose <rs:md> names a wrong capability must raise."""
        # Adjacent literals are concatenated into one document; only the XML
        # declaration is followed by a newline.
        document = (
            '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
            '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
            '<rs:md capability="bad_capability" from="2013-01-01"/>'
            '<url><loc>http://example.com/a.zip</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="12" /></url>'
            '</urlset>'
        )
        manifest = ResourceDumpManifest()
        source = io.StringIO(document)
        self.assertRaises(SitemapParseError, manifest.parse, fh=source)
    def test11_parse_no_capability(self):
        """Parsing a manifest whose <rs:md> has no capability attribute must raise."""
        # For a resource dump this should be an error.
        # Adjacent literals are concatenated into one document; only the XML
        # declaration is followed by a newline.
        document = (
            '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
            '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
            '<rs:md from="2013-01-01"/>'
            '<url><loc>http://example.com/res1</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="12" path="/res1" /></url>'
            '<url><loc>http://example.com/res2</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" path="/res2"/></url>'
            '</urlset>'
        )
        manifest = ResourceDumpManifest()
        source = io.StringIO(document)
        self.assertRaises(SitemapParseError, manifest.parse, fh=source)
Example #11
0
 def test_build_ex_05(self):
     """Build a simple two-entry Resource Dump Manifest and compare its XML.

     The expected document is the 'resourcesync_ex_5' example obtained
     through self._open_ex.
     """
     manifest = ResourceDumpManifest()
     manifest.md_at = '2013-01-03T09:00:00Z'

     first_entry = Resource(
         uri='http://example.com/res1',
         lastmod='2013-01-03T03:00:00Z',
         md5='1584abdf8ebdc9802ac0c6a7402c03b6',
         path='/resources/res1',
     )
     manifest.add(first_entry)

     second_entry = Resource(
         uri='http://example.com/res2',
         lastmod='2013-01-03T04:00:00Z',
         md5='1e0d5cb8ef6ba40c99b14c0237be735e',
         path='/resources/res2',
     )
     manifest.add(second_entry)

     expected_xml = self._open_ex('resourcesync_ex_5').read()
     self._assert_xml_equal(manifest.as_xml(), expected_xml)
Example #12
0
    def get_resource_dump_manifest(self, record_id):
        """
        Get resource dump manifest.

        Builds a manifest with one entry per file of the record. When no
        record is found for ``record_id`` the manifest is returned without
        entries.

        :param record_id: Identifier of record.
        :return: (xml) content of resourcedumpmanifest, or None when
            ``self.resource_dump_manifest`` is falsy or ``self._validation``
            rejects ``record_id``.
        """
        # Validation is always invoked first, as before, even when the
        # resource_dump_manifest flag is off.
        _validation = self._validation(record_id)
        if not (self.resource_dump_manifest and _validation):
            return None

        rdm = ResourceDumpManifest()
        rdm.up = '{}resync/{}/resourcedump.xml'.format(
            request.url_root, self.repository_id)
        record = WekoRecord.get_record_by_pid(record_id)
        if record:
            recid = record.get('recid')
            # `record_file` rather than `file`, to avoid shadowing the builtin name.
            for record_file in record.files:
                # Fetch the metadata once and reuse it for logging and the entry.
                file_info = record_file.info()
                current_app.logger.debug(file_info)
                key = file_info.get('key')
                path = 'recid_{}/{}'.format(recid, key)
                # Timezone-aware "now" in UTC; replaces the deprecated
                # utcnow().replace(tzinfo=...) idiom with the same result.
                lastmod = datetime.datetime.now(
                    datetime.timezone.utc).isoformat()
                rdm.add(
                    Resource(
                        '{}record/{}/files/{}'.format(
                            request.url_root, recid, key),
                        lastmod=lastmod,
                        # The checksum is read as '<prefix>:<digest>'; only the
                        # part after the first colon is passed on as sha256.
                        sha256=file_info.get('checksum').split(':')[1],
                        length=str(file_info.get('size')),
                        path=path))
        return rdm.as_xml()
Example #13
0
File: dump.py Project: EHRI/resync
    def write_zip(self, resources=None, dumpfile=None):
        """Write a ZIP format dump file

        Writes a ZIP file containing the resources in the iterable resources along with
        a manifest file manifest.xml (written first). No checks on the size of files
        or total size are performed, this is expected to have been done beforehand.

        Side effect: the ``path`` attribute of every resource is replaced in
        place by the value returned from ``self.archive_path`` for it.

        :param resources: iterable of resources; it is iterated twice, so it
            must be re-iterable (not a one-shot generator).
        :param dumpfile: path of the ZIP file to create.
        """
        compression = ZIP_DEFLATED if self.compress else ZIP_STORED
        # The context manager guarantees the archive is closed even when
        # manifest generation or one of the file writes raises.
        with ZipFile(dumpfile, mode="w", compression=compression, allowZip64=True) as zf:
            # The manifest goes in first, but it must list in-archive paths,
            # so rewrite each resource path before serializing.
            # NOTE(review): presumably the manifest references the same
            # resource objects, so the rewritten paths show up in as_xml()
            # -- confirm.
            rdm = ResourceDumpManifest(resources=resources)
            real_path = {}
            for resource in resources:
                archive_path = self.archive_path(resource.path)
                real_path[archive_path] = resource.path
                resource.path = archive_path
            zf.writestr('manifest.xml', rdm.as_xml())
            # Add all files in the resources, read from their original
            # locations and stored under their in-archive paths.
            for resource in resources:
                zf.write(real_path[resource.path], arcname=resource.path)
        zipsize = os.path.getsize(dumpfile)
        self.logger.info("Wrote ZIP file dump %s with size %d bytes" % (dumpfile, zipsize))
Example #14
0
 def test_build_ex_18(self):
     """Build a two-entry Resource Dump Manifest with metadata and compare its XML.

     The expected document is the 'resourcesync_ex_18' example obtained
     through self._open_ex.
     """
     manifest = ResourceDumpManifest()
     manifest.up = 'http://example.com/dataset1/capabilitylist.xml'
     manifest.md_at = "2013-01-03T09:00:00Z"
     manifest.md_completed = "2013-01-03T09:02:00Z"

     # First entry: HTML resource carrying only an md5 digest.
     html_resource = Resource(
         uri='http://example.com/res1',
         lastmod='2013-01-02T13:00:00Z',
         md5='1584abdf8ebdc9802ac0c6a7402c03b6',
         length=8876,
         mime_type='text/html',
         path='/resources/res1',
     )
     manifest.add(html_resource)

     # Second entry: PDF resource carrying both md5 and sha256 digests.
     pdf_resource = Resource(
         uri='http://example.com/res2',
         lastmod='2013-01-02T14:00:00Z',
         md5='1e0d5cb8ef6ba40c99b14c0237be735e',
         sha256='854f61290e2e197a11bc91063afce22e43f8ccc655237050ace766adc68dc784',
         length=14599,
         mime_type='application/pdf',
         path='/resources/res2',
     )
     manifest.add(pdf_resource)

     expected_xml = self._open_ex('resourcesync_ex_18').read()
     self._assert_xml_equal(manifest.as_xml(), expected_xml)
Example #15
0
 def test00_dump_zip_resource_list(self):
     """Write a two-resource manifest to a ZIP dump and check the archive produced."""
     manifest = ResourceDumpManifest()
     manifest.add(
         Resource('http://ex.org/a', length=7, path='tests/testdata/a'))
     manifest.add(
         Resource('http://ex.org/b', length=21, path='tests/testdata/b'))
     dumper = Dump()
     zip_path = os.path.join(self.tmpdir, "test00_dump.zip")
     dumper.write_zip(resources=manifest, dumpfile=zip_path)  # named args

     self.assertTrue(os.path.exists(zip_path))
     self.assertTrue(zipfile.is_zipfile(zip_path))

     archive = zipfile.ZipFile(zip_path, 'r')
     # Three members expected -- presumably the two resources plus the manifest.
     member_count = len(archive.namelist())
     self.assertEqual(member_count, 3)
     archive.close()
     os.unlink(zip_path)
    def create_zip(self, resourcelist, prefix, write_list=False, write_manifest=True):
        """
        Dump local resources in resourcelist to a zip file with the specified prefix. The index in the zip file name
        will be 1 higher than the last zip file index with the same prefix (0 when no such zip file exists yet in
        self.publish_dir). A manifest.xml will be included in the zip.
        --  The resync.Dump.write_zip method used in this method has the side effect of changing local paths in
            resourcelist into paths relative in zip.
        :param resourcelist: resources to zip
        :param prefix: prefix of the zip file
        :param write_list: True if resourcelist should be written to local disc. Default: False
        :param write_manifest: True if a separate manifest file should be written to disc, False otherwise. Default: True
        :return: the created zip as a resync.Resource.
        """

        # md_at / md_completed stay unset: the attribute gets lost in a read > write cycle with the resync
        # library (the intended value was w3cdt.datetime_to_str(no_fractions=True)).
        md_at = None
        index = -1
        zipfiles = sorted(glob(os.path.join(self.publish_dir, prefix + "*.zip")))
        if zipfiles:
            basename = os.path.basename(zipfiles[-1])
            # The running index is the LAST digit group of the name (it directly precedes ".zip");
            # taking the first group would pick up digits that belong to the prefix.
            index = int(re.findall(r"\d+", basename)[-1])

        zip_name = "%s%05d" % (prefix, index + 1)
        if write_list:
            # this is the given resourcelist with local paths. As such it is *not* the resourcedump_manifest.
            with open(os.path.join(self.publish_dir, zip_name + ".xml"), "w") as rl_file:
                rl_file.write(resourcelist.as_xml())

        zip_path = os.path.join(self.publish_dir, zip_name + ".zip")
        dump = Dump()
        dump.path_prefix = self.resource_dir
        dump.write_zip(resourcelist, zip_path)  # paths in resourcelist will be stripped.
        md_completed = None

        loc = self.publish_url + zip_name + ".zip"  # mandatory
        lastmod = self.last_modified(resourcelist)  # optional
        md_type = "application/zip"  # recommended
        md_length = os.stat(zip_path).st_size
        md5 = compute_md5_for_file(zip_path)

        zip_resource = Resource(
            uri=loc,
            lastmod=lastmod,
            length=md_length,
            md5=md5,
            mime_type=md_type,
            md_at=md_at,
            md_completed=md_completed,
        )
        if write_manifest:
            # Built after write_zip, so it is constructed from the resources as write_zip left them.
            rdm = ResourceDumpManifest(resources=resourcelist.resources)
            rdm_url = self.publish_url + PREFIX_MANIFEST + zip_name + ".xml"
            # Context manager so the file is closed even if serialization fails.
            with open(os.path.join(self.publish_dir, PREFIX_MANIFEST + zip_name + ".xml"), "w") as rdm_file:
                rdm_file.write(rdm.as_xml())
            zip_resource.link_set(rel="content", href=rdm_url)

        return zip_resource
    def create_zip(self,
                   resourcelist,
                   prefix,
                   write_list=False,
                   write_manifest=True):
        """
        Dump local resources in resourcelist to a zip file with the specified prefix. The index in the zip file name
        will be 1 higher than the last zip file index with the same prefix (0 when no such zip file exists yet in
        self.publish_dir). A manifest.xml will be included in the zip.
        --  The resync.Dump.write_zip method used in this method has the side effect of changing local paths in
            resourcelist into paths relative in zip.
        :param resourcelist: resources to zip
        :param prefix: prefix of the zip file
        :param write_list: True if resourcelist should be written to local disc. Default: False
        :param write_manifest: True if a separate manifest file should be written to disc, False otherwise. Default: True
        :return: the created zip as a resync.Resource.
        """

        # md_at / md_completed stay unset: the attribute gets lost in a read > write cycle with the resync
        # library (the intended value was w3cdt.datetime_to_str(no_fractions=True)).
        md_at = None
        index = -1
        zipfiles = sorted(
            glob(os.path.join(self.publish_dir, prefix + "*.zip")))
        if zipfiles:
            basename = os.path.basename(zipfiles[-1])
            # The running index is the LAST digit group of the name (it directly precedes ".zip");
            # taking the first group would pick up digits that belong to the prefix.
            index = int(re.findall(r'\d+', basename)[-1])

        zip_name = "%s%05d" % (prefix, index + 1)
        if write_list:
            # this is the given resourcelist with local paths. As such it is *not* the resourcedump_manifest.
            rl_path = os.path.join(self.publish_dir, zip_name + ".xml")
            with open(rl_path, "w") as rl_file:
                rl_file.write(resourcelist.as_xml())

        zip_path = os.path.join(self.publish_dir, zip_name + ".zip")
        dump = Dump()
        dump.path_prefix = self.resource_dir
        dump.write_zip(resourcelist,
                       zip_path)  # paths in resourcelist will be stripped.
        md_completed = None

        loc = self.publish_url + zip_name + ".zip"  # mandatory
        lastmod = self.last_modified(resourcelist)  # optional
        md_type = "application/zip"  # recommended
        md_length = os.stat(zip_path).st_size
        md5 = compute_md5_for_file(zip_path)

        zip_resource = Resource(uri=loc,
                                lastmod=lastmod,
                                length=md_length,
                                md5=md5,
                                mime_type=md_type,
                                md_at=md_at,
                                md_completed=md_completed)
        if write_manifest:
            # Built after write_zip, so it is constructed from the resources as write_zip left them.
            rdm = ResourceDumpManifest(resources=resourcelist.resources)
            rdm_path = os.path.join(self.publish_dir,
                                    PREFIX_MANIFEST + zip_name + ".xml")
            rdm_url = self.publish_url + PREFIX_MANIFEST + zip_name + ".xml"
            # Context manager so the file is closed even if serialization fails.
            with open(rdm_path, "w") as rdm_file:
                rdm_file.write(rdm.as_xml())
            zip_resource.link_set(rel="content", href=rdm_url)

        return zip_resource