def test_build_ex_18(self):
    """Build a two-entry manifest with metadata and compare it to resourcesync_ex_18."""
    manifest = ResourceDumpManifest()
    manifest.up = 'http://example.com/dataset1/capabilitylist.xml'
    manifest.md_at = "2013-01-03T09:00:00Z"
    manifest.md_completed = "2013-01-03T09:02:00Z"
    # Keyword arguments for each entry, listed in the order they are added.
    entries = (
        dict(uri='http://example.com/res1',
             lastmod='2013-01-02T13:00:00Z',
             md5='1584abdf8ebdc9802ac0c6a7402c03b6',
             length=8876,
             mime_type='text/html',
             path='/resources/res1'),
        dict(uri='http://example.com/res2',
             lastmod='2013-01-02T14:00:00Z',
             md5='1e0d5cb8ef6ba40c99b14c0237be735e',
             sha256='854f61290e2e197a11bc91063afce22e43f8ccc655237050ace766adc68dc784',
             length=14599,
             mime_type='application/pdf',
             path='/resources/res2'),
    )
    for fields in entries:
        manifest.add(Resource(**fields))
    expected_xml = self._open_ex('resourcesync_ex_18').read()
    self._assert_xml_equal(manifest.as_xml(), expected_xml)
def write_zip(self, resources=None, dumpfile=None):
    """Write a ZIP format dump file.

    Writes a ZIP file containing the resources in the iterable resources
    along with a manifest file manifest.xml (written first). No checks on
    the size of files or total size are performed, this is expected to
    have been done beforehand.

    Side effect: the ``path`` attribute of every resource is replaced by
    the value returned from ``self.archive_path()``. ``resources`` is
    iterated twice, so it must be re-iterable.

    :param resources: re-iterable collection of resources with a ``path``
    :param dumpfile: name of the ZIP file to create
    """
    compression = ZIP_DEFLATED if self.compress else ZIP_STORED
    # The context manager closes the archive even when serialising the
    # manifest or adding a file raises.
    with ZipFile(dumpfile, mode="w", compression=compression,
                 allowZip64=True) as zf:
        # The manifest is built before the paths are rewritten but is
        # serialised afterwards.
        # NOTE(review): presumably it holds the same resource objects and
        # so reports the in-archive paths -- confirm.
        rdm = ResourceDumpManifest(resources=resources)
        real_path = {}  # in-archive path -> original on-disk path
        for resource in resources:
            archive_path = self.archive_path(resource.path)
            real_path[archive_path] = resource.path
            resource.path = archive_path
        # Manifest goes in first, followed by the resource files.
        zf.writestr('manifest.xml', rdm.as_xml())
        for resource in resources:
            zf.write(real_path[resource.path], arcname=resource.path)
    zipsize = os.path.getsize(dumpfile)
    self.logger.info("Wrote ZIP file dump %s with size %d bytes" %
                     (dumpfile, zipsize))
def test01_as_xml(self):
    """Serialise a two-entry manifest and check the capability and the first entry."""
    manifest = ResourceDumpManifest()
    for name, stamp in (('a.zip', 1), ('b.zip', 2)):
        manifest.add(Resource(name, timestamp=stamp))
    serialized = manifest.as_xml()
    capability_pattern = r'<rs:md .*capability="resourcedump-manifest"'
    entry_a_pattern = r'<url><loc>a.zip</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>'
    self.assertTrue(re.search(capability_pattern, serialized),
                    'XML has capability')
    self.assertTrue(re.search(entry_a_pattern, serialized),
                    'XML has resource a')
def test01_as_xml(self):
    """Check that as_xml() emits the manifest capability and the entry for a.zip."""
    dump_manifest = ResourceDumpManifest()
    first = Resource('a.zip', timestamp=1)
    dump_manifest.add(first)
    second = Resource('b.zip', timestamp=2)
    dump_manifest.add(second)
    output = dump_manifest.as_xml()

    has_capability = re.search(
        r'<rs:md .*capability="resourcedump-manifest"', output)
    self.assertTrue(has_capability, 'XML has capability')

    has_resource_a = re.search(
        r'<url><loc>a.zip</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>',
        output)
    self.assertTrue(has_resource_a, 'XML has resource a')
def test00_dump_zip_resource_list(self):
    """Dump two resources to a ZIP file and check the archive has three members."""
    rl = ResourceDumpManifest()
    rl.add(Resource('http://ex.org/a', length=7, path='resync/test/testdata/a'))
    rl.add(Resource('http://ex.org/b', length=21, path='resync/test/testdata/b'))
    d = Dump()
    zipf = os.path.join(self.tmpdir, "test00_dump.zip")
    d.write_zip(resources=rl, dumpfile=zipf)  # named args
    self.assertTrue(os.path.exists(zipf))
    self.assertTrue(zipfile.is_zipfile(zipf))
    # Context manager so the archive handle is released even when the
    # member-count assertion fails.
    with zipfile.ZipFile(zipf, 'r') as zo:
        # Two resource files plus one more member (the manifest.xml that
        # write_zip is documented to add).
        self.assertEqual(len(zo.namelist()), 3)
    os.unlink(zipf)
def test_ex_05(self):
    """Parse resourcesync_ex_5.xml and check both listed resources field by field."""
    manifest = ResourceDumpManifest()
    manifest.parse('tests/testdata/examples_from_spec/resourcesync_ex_5.xml')
    self.assertEqual(len(manifest.resources), 2, '2 resources')
    self.assertEqual(sorted(manifest.uris()),
                     ['http://example.com/res1', 'http://example.com/res2'])
    # (uri, lastmod, md5, path) expected for each entry, in checking order.
    expected = (
        ('http://example.com/res1', '2013-01-03T03:00:00Z',
         '1584abdf8ebdc9802ac0c6a7402c03b6', '/resources/res1'),
        ('http://example.com/res2', '2013-01-03T04:00:00Z',
         '1e0d5cb8ef6ba40c99b14c0237be735e', '/resources/res2'),
    )
    for uri, lastmod, md5, path in expected:
        self.assertEqual(manifest.resources[uri].lastmod, lastmod)
        self.assertEqual(manifest.resources[uri].md5, md5)
        self.assertEqual(manifest.resources[uri].path, path)
def test10_parse(self):
    """Parse a two-entry manifest and check capability, timestamp and entries."""
    xml = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
        '<rs:md at="2013-08-08" capability="resourcedump-manifest"/>'
        '<url><loc>http://example.com/res1</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="12" path="/res1" /></url>'
        '<url><loc>http://example.com/res2</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" path="/res2"/></url>'
        '</urlset>'
    )
    rdm = ResourceDumpManifest()
    rdm.parse(fh=StringIO.StringIO(xml))
    self.assertEqual(len(rdm.resources), 2, 'got 2 resource dumps')
    self.assertEqual(rdm.capability, 'resourcedump-manifest', 'capability set')
    self.assertEqual(rdm.md_at, '2013-08-08')
    self.assertTrue('http://example.com/res1' in rdm.resources)
    # assertEqual, not assertTrue: assertTrue(value, 12) treats 12 as the
    # failure message and passes for any truthy value.
    # int() keeps the comparison valid whether the parser stores the
    # length as text or as a number.
    self.assertEqual(int(rdm.resources['http://example.com/res1'].length), 12)
    self.assertEqual(rdm.resources['http://example.com/res1'].path, '/res1')
    self.assertTrue('http://example.com/res2' in rdm.resources)
    self.assertEqual(rdm.resources['http://example.com/res2'].path, '/res2')
def test10_parse(self):
    """Parse a two-entry manifest and verify the parsed metadata and resources."""
    xml = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
        '<rs:md at="2013-08-08" capability="resourcedump-manifest"/>'
        '<url><loc>http://example.com/res1</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="12" path="/res1" /></url>'
        '<url><loc>http://example.com/res2</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" path="/res2"/></url>'
        '</urlset>'
    )
    rdm = ResourceDumpManifest()
    rdm.parse(fh=StringIO.StringIO(xml))
    self.assertEqual(len(rdm.resources), 2, 'got 2 resource dumps')
    self.assertEqual(rdm.capability, 'resourcedump-manifest',
                     'capability set')
    self.assertEqual(rdm.md_at, '2013-08-08')
    self.assertTrue('http://example.com/res1' in rdm.resources)
    # The original used assertTrue(value, expected), which only checks
    # truthiness (the second argument is the failure message); compare
    # the values for real. int() tolerates a text or numeric length.
    self.assertEqual(
        int(rdm.resources['http://example.com/res1'].length), 12)
    self.assertEqual(rdm.resources['http://example.com/res1'].path, '/res1')
    self.assertTrue('http://example.com/res2' in rdm.resources)
    self.assertEqual(rdm.resources['http://example.com/res2'].path, '/res2')
def test12_parse_bad_capability(self):
    """A document whose <rs:md capability="bad_capability"> must fail to parse."""
    document = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
        '<rs:md capability="bad_capability" from="2013-01-01"/>'
        '<url><loc>http://example.com/a.zip</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="12" /></url>'
        '</urlset>'
    )
    manifest = ResourceDumpManifest()
    # Build the handle outside the assertion so only parse() is expected
    # to raise.
    handle = io.StringIO(document)
    with self.assertRaises(SitemapParseError):
        manifest.parse(fh=handle)
def test11_parse_no_capability(self):
    """A document with no capability on <rs:md> must fail to parse as a dump manifest."""
    document = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
        '<rs:md from="2013-01-01"/>'
        '<url><loc>http://example.com/res1</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="12" path="/res1" /></url>'
        '<url><loc>http://example.com/res2</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" path="/res2"/></url>'
        '</urlset>'
    )
    manifest = ResourceDumpManifest()
    # Build the handle outside the assertion so only parse() is expected
    # to raise.
    handle = io.StringIO(document)
    with self.assertRaises(SitemapParseError):
        manifest.parse(fh=handle)
def test_build_ex_05(self):
    """Build a simple two-entry manifest and compare it to resourcesync_ex_5."""
    manifest = ResourceDumpManifest()
    manifest.md_at = '2013-01-03T09:00:00Z'
    # Keyword arguments for each entry, listed in the order they are added.
    entries = (
        dict(uri='http://example.com/res1',
             lastmod='2013-01-03T03:00:00Z',
             md5='1584abdf8ebdc9802ac0c6a7402c03b6',
             path='/resources/res1'),
        dict(uri='http://example.com/res2',
             lastmod='2013-01-03T04:00:00Z',
             md5='1e0d5cb8ef6ba40c99b14c0237be735e',
             path='/resources/res2'),
    )
    for fields in entries:
        manifest.add(Resource(**fields))
    expected_xml = self._open_ex('resourcesync_ex_5').read()
    self._assert_xml_equal(manifest.as_xml(), expected_xml)
def get_resource_dump_manifest(self, record_id):
    """
    Get resource dump manifest.

    One manifest entry is added for each file of the record. The manifest
    is still returned, without entries, when no record is found.

    :param record_id: Identifier of record.
    :return: (xml) content of resourcedumpmanifest, or None when
        ``self.resource_dump_manifest`` is falsy or ``self._validation``
        rejects the record.
    """
    _validation = self._validation(record_id)
    if not (self.resource_dump_manifest and _validation):
        return None

    rdm = ResourceDumpManifest()
    rdm.up = '{}resync/{}/resourcedump.xml'.format(
        request.url_root, self.repository_id)
    record = WekoRecord.get_record_by_pid(record_id)
    if record:
        recid = record.get('recid')
        for record_file in record.files:
            # Fetch the file metadata once and reuse it for logging and
            # for building the entry.
            file_info = record_file.info()
            current_app.logger.debug(file_info)
            key = file_info.get('key')
            # lastmod is the time the manifest is generated, not a
            # timestamp taken from the file. now(timezone.utc) yields the
            # same aware UTC value as the deprecated utcnow().replace().
            lastmod = datetime.datetime.now(
                datetime.timezone.utc).isoformat()
            # NOTE(review): assumes 'checksum' is present and shaped like
            # '<algorithm>:<hex>'; the digest is reported as sha256
            # whatever the algorithm part says -- confirm.
            rdm.add(
                Resource(
                    '{}record/{}/files/{}'.format(
                        request.url_root, recid, key),
                    lastmod=lastmod,
                    sha256=file_info.get('checksum').split(':')[1],
                    length=str(file_info.get('size')),
                    path='recid_{}/{}'.format(recid, key)))
    return rdm.as_xml()
def write_zip(self, resources=None, dumpfile=None):
    """Write a ZIP format dump file.

    Writes a ZIP file containing the resources in the iterable resources
    along with a manifest file manifest.xml (written first). No checks on
    the size of files or total size are performed, this is expected to
    have been done beforehand.

    Side effect: each resource's ``path`` is overwritten with the result
    of ``self.archive_path()``. ``resources`` is looped over twice, so a
    one-shot iterator will not work.

    :param resources: re-iterable collection of resources with a ``path``
    :param dumpfile: name of the ZIP file to create
    """
    compression = ZIP_DEFLATED if self.compress else ZIP_STORED
    # Use the archive as a context manager so it is closed on every exit
    # path, including an exception while adding members.
    with ZipFile(dumpfile, mode="w", compression=compression,
                 allowZip64=True) as zf:
        # Built now, serialised only after the paths below are rewritten.
        # NOTE(review): relies on the manifest sharing the resource
        # objects so that it reports in-archive paths -- confirm.
        rdm = ResourceDumpManifest(resources=resources)
        real_path = {}  # in-archive path -> original on-disk path
        for resource in resources:
            archive_path = self.archive_path(resource.path)
            real_path[archive_path] = resource.path
            resource.path = archive_path
        # Write the manifest first, then add all files in the resources.
        zf.writestr('manifest.xml', rdm.as_xml())
        for resource in resources:
            zf.write(real_path[resource.path], arcname=resource.path)
    zipsize = os.path.getsize(dumpfile)
    self.logger.info("Wrote ZIP file dump %s with size %d bytes" %
                     (dumpfile, zipsize))
def test_build_ex_18(self):
    """Assemble a manifest with two entries plus metadata; expect resourcesync_ex_18 XML."""
    dump_manifest = ResourceDumpManifest()
    dump_manifest.up = 'http://example.com/dataset1/capabilitylist.xml'
    dump_manifest.md_at = "2013-01-03T09:00:00Z"
    dump_manifest.md_completed = "2013-01-03T09:02:00Z"

    html_entry = Resource(
        uri='http://example.com/res1',
        lastmod='2013-01-02T13:00:00Z',
        md5='1584abdf8ebdc9802ac0c6a7402c03b6',
        length=8876,
        mime_type='text/html',
        path='/resources/res1')
    dump_manifest.add(html_entry)

    pdf_entry = Resource(
        uri='http://example.com/res2',
        lastmod='2013-01-02T14:00:00Z',
        md5='1e0d5cb8ef6ba40c99b14c0237be735e',
        sha256='854f61290e2e197a11bc91063afce22e43f8ccc655237050ace766adc68dc784',
        length=14599,
        mime_type='application/pdf',
        path='/resources/res2')
    dump_manifest.add(pdf_entry)

    reference = self._open_ex('resourcesync_ex_18').read()
    self._assert_xml_equal(dump_manifest.as_xml(), reference)
def test00_dump_zip_resource_list(self):
    """Write two resources to a ZIP dump and check the archive holds three members."""
    rl = ResourceDumpManifest()
    rl.add(Resource('http://ex.org/a', length=7, path='tests/testdata/a'))
    rl.add(Resource('http://ex.org/b', length=21, path='tests/testdata/b'))
    d = Dump()
    zipf = os.path.join(self.tmpdir, "test00_dump.zip")
    d.write_zip(resources=rl, dumpfile=zipf)  # named args
    self.assertTrue(os.path.exists(zipf))
    self.assertTrue(zipfile.is_zipfile(zipf))
    # Open with a context manager: the handle is closed even if the
    # assertion below fails.
    with zipfile.ZipFile(zipf, 'r') as zo:
        # Two resource files plus one more member (the manifest.xml that
        # write_zip is documented to add).
        self.assertEqual(len(zo.namelist()), 3)
    os.unlink(zipf)
def create_zip(self, resourcelist, prefix, write_list=False, write_manifest=True):
    """
    Dump local resources in resourcelist to a zip file with the specified prefix. The index in the zip file name
    will be 1 higher than the last zip file index with the same prefix. A manifest.xml will be included in the zip.
    --  The resync.Dump.write_zip method used in this method has the side effect of changing local paths in
        resourcelist into paths relative in zip.

    :param resourcelist: resources to zip
    :param prefix: prefix of the zip file
    :param write_list: True if resourcelist should be written to local disc. Default: False
    :param write_manifest: True if a separate manifest file should be written to disc, False otherwise.
        Default: True
    :return: the created zip as a resync.Resource.
    """
    # Left unset on purpose (would be w3cdt.datetime_to_str(no_fractions=True)):
    # the attribute gets lost in a read > write cycle with the resync library.
    md_at = None

    index = -1  # no existing zip -> the first file gets index 0
    zipfiles = sorted(glob(os.path.join(self.publish_dir, prefix + "*.zip")))
    if zipfiles:
        basename = os.path.basename(zipfiles[-1])
        # NOTE(review): this takes the first run of digits in the file name, so digits inside
        # the prefix itself would be read as the index -- confirm prefixes never contain digits.
        index = int(re.findall(r"\d+", basename)[0])

    zip_name = "%s%05d" % (prefix, index + 1)
    if write_list:
        # this is the given resourcelist with local paths. As such it is *not* the resourcedump_manifest.
        # Serialise before opening so a failure in as_xml() does not leave a truncated file behind.
        list_xml = resourcelist.as_xml()
        with open(os.path.join(self.publish_dir, zip_name + ".xml"), "w") as rl_file:
            rl_file.write(list_xml)

    zip_path = os.path.join(self.publish_dir, zip_name + ".zip")
    dump = Dump()
    dump.path_prefix = self.resource_dir
    dump.write_zip(resourcelist, zip_path)  # paths in resourcelist will be stripped.
    # Left unset for the same reason as md_at.
    md_completed = None

    loc = self.publish_url + zip_name + ".zip"  # mandatory
    lastmod = self.last_modified(resourcelist)  # optional
    md_type = "application/zip"  # recommended
    md_length = os.stat(zip_path).st_size
    md5 = compute_md5_for_file(zip_path)

    zip_resource = Resource(
        uri=loc,
        lastmod=lastmod,
        length=md_length,
        md5=md5,
        mime_type=md_type,
        md_at=md_at,
        md_completed=md_completed,
    )
    if write_manifest:
        # Built after write_zip, so it is created from the resources whose paths were stripped above.
        rdm = ResourceDumpManifest(resources=resourcelist.resources)
        manifest_name = PREFIX_MANIFEST + zip_name + ".xml"
        rdm_url = self.publish_url + manifest_name
        manifest_xml = rdm.as_xml()
        with open(os.path.join(self.publish_dir, manifest_name), "w") as rdm_file:
            rdm_file.write(manifest_xml)
        zip_resource.link_set(rel="content", href=rdm_url)

    return zip_resource
def create_zip(self, resourcelist, prefix, write_list=False, write_manifest=True):
    """
    Dump local resources in resourcelist to a zip file with the specified prefix. The index in the zip file name
    will be 1 higher than the last zip file index with the same prefix. A manifest.xml will be included in the zip.
    --  The resync.Dump.write_zip method used in this method has the side effect of changing local paths in
        resourcelist into paths relative in zip.

    :param resourcelist: resources to zip
    :param prefix: prefix of the zip file
    :param write_list: True if resourcelist should be written to local disc. Default: False
    :param write_manifest: True if a separate manifest file should be written to disc, False otherwise.
        Default: True
    :return: the created zip as a resync.Resource.
    """
    # Not set (would be w3cdt.datetime_to_str(no_fractions=True)): the attribute
    # gets lost in a read > write cycle with the resync library.
    md_at = None

    index = -1  # with no earlier zip the first file is numbered 0
    zipfiles = sorted(
        glob(os.path.join(self.publish_dir, prefix + "*.zip")))
    if zipfiles:
        basename = os.path.basename(zipfiles[-1])
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        # NOTE(review): the first digit run in the name is used, so digits in the prefix
        # would be mistaken for the index -- confirm prefixes are digit-free.
        index = int(re.findall(r'\d+', basename)[0])

    zip_name = "%s%05d" % (prefix, index + 1)
    if write_list:
        # this is the given resourcelist with local paths. As such it is *not* the resourcedump_manifest.
        # as_xml() runs before the file is opened, so a serialisation error leaves no empty file.
        list_xml = resourcelist.as_xml()
        with open(os.path.join(self.publish_dir, zip_name + ".xml"),
                  "w") as rl_file:
            rl_file.write(list_xml)

    zip_path = os.path.join(self.publish_dir, zip_name + ".zip")
    dump = Dump()
    dump.path_prefix = self.resource_dir
    dump.write_zip(resourcelist,
                   zip_path)  # paths in resourcelist will be stripped.
    # Not set, for the same reason as md_at.
    md_completed = None

    loc = self.publish_url + zip_name + ".zip"  # mandatory
    lastmod = self.last_modified(resourcelist)  # optional
    md_type = "application/zip"  # recommended
    md_length = os.stat(zip_path).st_size
    md5 = compute_md5_for_file(zip_path)

    zip_resource = Resource(uri=loc,
                            lastmod=lastmod,
                            length=md_length,
                            md5=md5,
                            mime_type=md_type,
                            md_at=md_at,
                            md_completed=md_completed)
    if write_manifest:
        # Created after write_zip, i.e. from the resources whose paths were stripped above.
        rdm = ResourceDumpManifest(resources=resourcelist.resources)
        manifest_name = PREFIX_MANIFEST + zip_name + ".xml"
        rdm_url = self.publish_url + manifest_name
        manifest_xml = rdm.as_xml()
        with open(os.path.join(self.publish_dir, manifest_name),
                  "w") as rdm_file:
            rdm_file.write(manifest_xml)
        zip_resource.link_set(rel="content", href=rdm_url)

    return zip_resource