def write_zip(self, resources=None, dumpfile=None):
    """Write a ZIP format dump file.

    Writes a ZIP file containing the resources in the iterable resources
    along with a manifest file manifest.xml (written first). No checks on
    the size of files or total size are performed, this is expected to
    have been done beforehand.

    Side effect: the ``path`` attribute of every resource is replaced by
    the value of ``self.archive_path(resource.path)``, i.e. the name the
    file is stored under inside the archive.

    :param resources: iterable of resources to dump; it is traversed
        twice, so it must be re-iterable
    :param dumpfile: path of the ZIP file to create
    """
    compression = ZIP_DEFLATED if self.compress else ZIP_STORED
    # The context manager finalises and closes the archive even when a
    # resource file is missing or the manifest cannot be serialised.
    with ZipFile(dumpfile, mode="w", compression=compression,
                 allowZip64=True) as zf:
        rdm = ResourceDumpManifest(resources=resources)
        # Switch every resource to its in-archive path, remembering where
        # the file really lives so that it can be read further down.
        real_path = {}
        for resource in resources:
            archive_path = self.archive_path(resource.path)
            real_path[archive_path] = resource.path
            resource.path = archive_path
        # The manifest goes in first and lists the in-archive paths.
        zf.writestr('manifest.xml', rdm.as_xml())
        # Add all files in the resources
        for resource in resources:
            zf.write(real_path[resource.path], arcname=resource.path)
    zipsize = os.path.getsize(dumpfile)
    self.logger.info("Wrote ZIP file dump %s with size %d bytes",
                     dumpfile, zipsize)
def test_build_ex_18(self):
    """Build a two-entry Resource Dump Manifest with metadata and compare
    its XML with the content read from 'resourcesync_ex_18'."""
    manifest = ResourceDumpManifest()
    manifest.up = 'http://example.com/dataset1/capabilitylist.xml'
    manifest.md_at = "2013-01-03T09:00:00Z"
    manifest.md_completed = "2013-01-03T09:02:00Z"

    html_entry = Resource(
        uri='http://example.com/res1',
        lastmod='2013-01-02T13:00:00Z',
        md5='1584abdf8ebdc9802ac0c6a7402c03b6',
        length=8876,
        mime_type='text/html',
        path='/resources/res1')
    manifest.add(html_entry)

    pdf_entry = Resource(
        uri='http://example.com/res2',
        lastmod='2013-01-02T14:00:00Z',
        md5='1e0d5cb8ef6ba40c99b14c0237be735e',
        sha256='854f61290e2e197a11bc91063afce22e43f8ccc655237050ace766adc68dc784',
        length=14599,
        mime_type='application/pdf',
        path='/resources/res2')
    manifest.add(pdf_entry)

    expected_xml = self._open_ex('resourcesync_ex_18').read()
    self._assert_xml_equal(manifest.as_xml(), expected_xml)
def get_resource_dump_manifest(self, record_id):
    """
    Get resource dump manifest.

    Adds one manifest entry per file of the record. Each entry points at
    ``<url_root>record/<recid>/files/<key>`` and carries the archive path
    ``recid_<recid>/<key>``.

    :param record_id: Identifier of record.
    :return: (xml) content of resourcedumpmanifest, or None when
        ``self.resource_dump_manifest`` is falsy or ``self._validation``
        rejects the record.
    """
    # Validation always runs, even when the feature flag is off.
    _validation = self._validation(record_id)
    if not (self.resource_dump_manifest and _validation):
        return None

    url_root = request.url_root
    rdm = ResourceDumpManifest()
    rdm.up = '{}resync/{}/resourcedump.xml'.format(
        url_root, self.repository_id)
    record = WekoRecord.get_record_by_pid(record_id)
    if record:
        recid = record.get('recid')
        for record_file in record.files:
            # Query the file metadata once and reuse it for logging.
            file_info = record_file.info()
            current_app.logger.debug(file_info)
            key = file_info.get('key')
            # Timezone-aware "now" in UTC; same value as the deprecated
            # utcnow().replace(tzinfo=utc) spelling.
            lastmod = datetime.datetime.now(
                datetime.timezone.utc).isoformat()
            rdm.add(
                Resource(
                    '{}record/{}/files/{}'.format(url_root, recid, key),
                    lastmod=lastmod,
                    # assumes checksum looks like '<algorithm>:<digest>'
                    # and that the algorithm is sha256 -- TODO confirm
                    sha256=file_info.get('checksum').split(':')[1],
                    length=str(file_info.get('size')),
                    path=path_for_file(recid, key)
                    if False else 'recid_{}/{}'.format(recid, key)))
    # NOTE(review): a record that cannot be loaded yields a manifest
    # without entries rather than None -- confirm this is intended.
    return rdm.as_xml()
def test01_as_xml(self):
    """Serialise a two-entry manifest and check that the XML carries the
    capability attribute and the entry for 'a.zip'."""
    manifest = ResourceDumpManifest()
    for name, stamp in (('a.zip', 1), ('b.zip', 2)):
        manifest.add(Resource(name, timestamp=stamp))
    serialised = manifest.as_xml()

    capability_match = re.search(
        r'<rs:md .*capability="resourcedump-manifest"', serialised)
    self.assertTrue(capability_match, 'XML has capability')

    entry_match = re.search(
        r'<url><loc>a.zip</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>',
        serialised)
    self.assertTrue(entry_match, 'XML has resource a')
def test01_as_xml(self):
    """Check the XML of a two-entry manifest against two patterns: the
    capability attribute and the <url> element for 'a.zip'."""
    dump_manifest = ResourceDumpManifest()
    dump_manifest.add(Resource('a.zip', timestamp=1))
    dump_manifest.add(Resource('b.zip', timestamp=2))
    xml_text = dump_manifest.as_xml()

    # (pattern, failure message) pairs, checked in order.
    expectations = (
        (r'<rs:md .*capability="resourcedump-manifest"',
         'XML has capability'),
        (r'<url><loc>a.zip</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>',
         'XML has resource a'),
    )
    for pattern, message in expectations:
        self.assertTrue(re.search(pattern, xml_text), message)
def test_build_ex_05(self):
    """Build a simple two-entry Resource Dump Manifest and compare its XML
    with the content read from 'resourcesync_ex_5'."""
    manifest = ResourceDumpManifest()
    manifest.md_at = '2013-01-03T09:00:00Z'

    # (uri, lastmod, md5, path) for each entry, in document order.
    entries = (
        ('http://example.com/res1', '2013-01-03T03:00:00Z',
         '1584abdf8ebdc9802ac0c6a7402c03b6', '/resources/res1'),
        ('http://example.com/res2', '2013-01-03T04:00:00Z',
         '1e0d5cb8ef6ba40c99b14c0237be735e', '/resources/res2'),
    )
    for uri, lastmod, md5, path in entries:
        manifest.add(Resource(uri=uri, lastmod=lastmod, md5=md5, path=path))

    expected_xml = self._open_ex('resourcesync_ex_5').read()
    self._assert_xml_equal(manifest.as_xml(), expected_xml)
def test_build_ex_18(self):
    """Assemble the manifest with two entries plus up/at/completed
    metadata and compare it with the XML read from 'resourcesync_ex_18'."""
    dump_manifest = ResourceDumpManifest()
    dump_manifest.up = 'http://example.com/dataset1/capabilitylist.xml'
    dump_manifest.md_at = "2013-01-03T09:00:00Z"
    dump_manifest.md_completed = "2013-01-03T09:02:00Z"

    # Keyword arguments for each Resource, in the order they are added.
    entry_fields = (
        dict(uri='http://example.com/res1',
             lastmod='2013-01-02T13:00:00Z',
             md5='1584abdf8ebdc9802ac0c6a7402c03b6',
             length=8876,
             mime_type='text/html',
             path='/resources/res1'),
        dict(uri='http://example.com/res2',
             lastmod='2013-01-02T14:00:00Z',
             md5='1e0d5cb8ef6ba40c99b14c0237be735e',
             sha256='854f61290e2e197a11bc91063afce22e43f8ccc655237050ace766adc68dc784',
             length=14599,
             mime_type='application/pdf',
             path='/resources/res2'),
    )
    for fields in entry_fields:
        dump_manifest.add(Resource(**fields))

    reference = self._open_ex('resourcesync_ex_18').read()
    self._assert_xml_equal(dump_manifest.as_xml(), reference)
def write_zip(self, resources=None, dumpfile=None):
    """Write a ZIP format dump file

    Writes a ZIP file containing the resources in the iterable resources
    along with a manifest file manifest.xml (written first). No checks on
    the size of files or total size are performed, this is expected to
    have been done beforehand.

    Note that each resource has its ``path`` overwritten with the result
    of ``self.archive_path(resource.path)``; the manifest is serialised
    after that rewrite.

    :param resources: iterable of resources; iterated twice, so a one-shot
        generator is not suitable
    :param dumpfile: path of the ZIP file to write
    """
    if self.compress:
        compression = ZIP_DEFLATED
    else:
        compression = ZIP_STORED
    zf = ZipFile(dumpfile, mode="w", compression=compression,
                 allowZip64=True)
    # "with" guarantees the archive is closed if any step below raises.
    with zf:
        rdm = ResourceDumpManifest(resources=resources)
        # Map in-archive path -> original path on disk.
        real_path = {}
        for resource in resources:
            archive_path = self.archive_path(resource.path)
            real_path[archive_path] = resource.path
            resource.path = archive_path
        # Manifest first, then the payload files.
        zf.writestr('manifest.xml', rdm.as_xml())
        for resource in resources:
            zf.write(real_path[resource.path], arcname=resource.path)
    zipsize = os.path.getsize(dumpfile)
    self.logger.info("Wrote ZIP file dump %s with size %d bytes",
                     dumpfile, zipsize)
def create_zip(self, resourcelist, prefix, write_list=False, write_manifest=True):
    """
    Dump local resources in resourcelist to a zip file with the specified prefix. The index in the zip file name
    will be 1 higher than the last zip file index with the same prefix. A manifest.xml will be included in the zip.
    -- The resync.Dump.write_zip method used in this method has the side effect of changing local paths in
    resourcelist into paths relative in zip.

    :param resourcelist: resources to zip
    :param prefix: prefix of the zip file
    :param write_list: True if resourcelist should be written to local disc. Default: False
    :param write_manifest: True if a separate manifest file should be written to disc, False otherwise. Default: True
    :return: the created zip as a resync.Resource.
    :raises IndexError: if the last existing zip file for this prefix has no digits in its name.
    """
    # Left unset on purpose: the attribute gets lost in a read > write
    # cycle with the resync library.
    md_at = None
    md_completed = None

    index = -1
    zipfiles = sorted(glob(os.path.join(self.publish_dir, prefix + "*.zip")))
    if zipfiles:
        basename = os.path.basename(zipfiles[-1])
        # Look for the index after the prefix, so that digits which are
        # part of the prefix are not mistaken for the sequence number.
        if basename.startswith(prefix):
            basename = basename[len(prefix):]
        index = int(re.findall(r'\d+', basename)[0])

    zip_name = "%s%05d" % (prefix, index + 1)
    if write_list:
        # this is the given resourcelist with local paths. As such it is *not* the resourcedump_manifest.
        with open(os.path.join(self.publish_dir, zip_name + ".xml"), "w") as rl_file:
            rl_file.write(resourcelist.as_xml())

    zip_path = os.path.join(self.publish_dir, zip_name + ".zip")
    dump = Dump()
    dump.path_prefix = self.resource_dir
    dump.write_zip(resourcelist, zip_path)  # paths in resourcelist will be stripped.

    loc = self.publish_url + zip_name + ".zip"  # mandatory
    lastmod = self.last_modified(resourcelist)  # optional
    md_type = "application/zip"  # recommended
    md_length = os.stat(zip_path).st_size
    md5 = compute_md5_for_file(zip_path)

    zip_resource = Resource(uri=loc, lastmod=lastmod, length=md_length, md5=md5, mime_type=md_type,
                            md_at=md_at, md_completed=md_completed)
    if write_manifest:
        # Written after write_zip, so it lists the stripped paths.
        rdm = ResourceDumpManifest(resources=resourcelist.resources)
        rdm_url = self.publish_url + PREFIX_MANIFEST + zip_name + ".xml"
        with open(os.path.join(self.publish_dir, PREFIX_MANIFEST + zip_name + ".xml"), "w") as rdm_file:
            rdm_file.write(rdm.as_xml())
        zip_resource.link_set(rel="content", href=rdm_url)

    return zip_resource
def create_zip(self, resourcelist, prefix, write_list=False, write_manifest=True):
    """
    Dump local resources in resourcelist to a zip file with the specified prefix. The index in the zip file name
    will be 1 higher than the last zip file index with the same prefix. A manifest.xml will be included in the zip.
    -- The resync.Dump.write_zip method used in this method has the side effect of changing local paths in
    resourcelist into paths relative in zip.

    :param resourcelist: resources to zip
    :param prefix: prefix of the zip file
    :param write_list: True if resourcelist should be written to local disc. Default: False
    :param write_manifest: True if a separate manifest file should be written to disc, False otherwise. Default: True
    :return: the created zip as a resync.Resource.
    :raises IndexError: if the last existing zip file for this prefix has no digits in its name.
    """
    publish_dir = self.publish_dir

    # Work out the next sequence number from the zips already published.
    index = -1
    zipfiles = sorted(glob(os.path.join(publish_dir, prefix + "*.zip")))
    if zipfiles:
        tail = os.path.basename(zipfiles[-1])
        # Drop the prefix first: digits inside the prefix must not be read
        # as the sequence number.
        if tail.startswith(prefix):
            tail = tail[len(prefix):]
        index = int(re.findall(r"\d+", tail)[0])
    zip_name = "%s%05d" % (prefix, index + 1)

    if write_list:
        # this is the given resourcelist with local paths. As such it is *not* the resourcedump_manifest.
        list_path = os.path.join(publish_dir, zip_name + ".xml")
        with open(list_path, "w") as rl_file:
            rl_file.write(resourcelist.as_xml())

    zip_path = os.path.join(publish_dir, zip_name + ".zip")
    dump = Dump()
    dump.path_prefix = self.resource_dir
    dump.write_zip(resourcelist, zip_path)  # paths in resourcelist will be stripped.

    zip_resource = Resource(
        uri=self.publish_url + zip_name + ".zip",  # mandatory
        lastmod=self.last_modified(resourcelist),  # optional
        length=os.stat(zip_path).st_size,
        md5=compute_md5_for_file(zip_path),
        mime_type="application/zip",  # recommended
        # md_at / md_completed stay None: the attribute gets lost in a
        # read > write cycle with the resync library.
        md_at=None,
        md_completed=None,
    )

    if write_manifest:
        manifest_name = PREFIX_MANIFEST + zip_name + ".xml"
        # Built after write_zip, so it lists the stripped paths.
        rdm = ResourceDumpManifest(resources=resourcelist.resources)
        with open(os.path.join(publish_dir, manifest_name), "w") as rdm_file:
            rdm_file.write(rdm.as_xml())
        zip_resource.link_set(rel="content", href=self.publish_url + manifest_name)

    return zip_resource