def test00_dump_zip_resource_list(self):
    """Write a zip dump from a ResourceDumpManifest, passing named arguments.

    Two resources are added and the archive is expected to list three
    entries (presumably the third is the manifest written by
    ``Dump.write_zip`` -- confirm against the resync library).
    """
    rl = ResourceDumpManifest()
    rl.add(Resource('http://ex.org/a', length=7, path='tests/testdata/a'))
    rl.add(Resource('http://ex.org/b', length=21, path='tests/testdata/b'))
    d = Dump()
    zipf = os.path.join(self.tmpdir, "test00_dump.zip")
    try:
        d.write_zip(resources=rl, dumpfile=zipf)  # named args
        self.assertTrue(os.path.exists(zipf))
        self.assertTrue(zipfile.is_zipfile(zipf))
        # The context manager closes the archive even when the assertion fails.
        with zipfile.ZipFile(zipf, 'r') as zo:
            self.assertEqual(len(zo.namelist()), 3)
    finally:
        # Clean up on failure as well, so a rerun never sees a stale archive.
        if os.path.exists(zipf):
            os.unlink(zipf)
def test01_dump_zip_change_list(self):
    """Write a zip dump from a ChangeDumpManifest, passing positional arguments.

    Two "updated" resources are added and the archive is expected to list
    three entries (presumably the third is the manifest written by
    ``Dump.write_zip`` -- confirm against the resync library).
    """
    cl = ChangeDumpManifest()
    cl.add(Resource('http://ex.org/a', length=7,
                    path='resync/test/testdata/a', change="updated"))
    cl.add(Resource('http://ex.org/b', length=21,
                    path='resync/test/testdata/b', change="updated"))
    d = Dump()
    zipf = os.path.join(self.tmpdir, "test01_dump.zip")
    try:
        d.write_zip(cl, zipf)  # positional args
        self.assertTrue(os.path.exists(zipf))
        self.assertTrue(zipfile.is_zipfile(zipf))
        # The context manager closes the archive even when the assertion fails.
        with zipfile.ZipFile(zipf, 'r') as zo:
            self.assertEqual(len(zo.namelist()), 3)
    finally:
        # Clean up on failure as well, so a rerun never sees a stale archive.
        if os.path.exists(zipf):
            os.unlink(zipf)
def test00_dump_zip_resource_list(self):
    """Write a zip dump from a ResourceDumpManifest, passing named arguments.

    Two resources are added and the archive is expected to list three
    entries (presumably the third is the manifest written by
    ``Dump.write_zip`` -- confirm against the resync library).
    """
    rl = ResourceDumpManifest()
    rl.add(Resource('http://ex.org/a', length=7,
                    path='resync/test/testdata/a'))
    rl.add(Resource('http://ex.org/b', length=21,
                    path='resync/test/testdata/b'))
    d = Dump()
    zipf = os.path.join(self.tmpdir, "test00_dump.zip")
    try:
        d.write_zip(resources=rl, dumpfile=zipf)  # named args
        self.assertTrue(os.path.exists(zipf))
        self.assertTrue(zipfile.is_zipfile(zipf))
        # The context manager closes the archive even when the assertion fails.
        with zipfile.ZipFile(zipf, 'r') as zo:
            self.assertEqual(len(zo.namelist()), 3)
    finally:
        # Clean up on failure as well, so a rerun never sees a stale archive.
        if os.path.exists(zipf):
            os.unlink(zipf)
def test01_dump_zip_change_list(self):
    """Write a zip dump from a ChangeDumpManifest, passing positional arguments.

    Two "updated" resources are added and the archive is expected to list
    three entries (presumably the third is the manifest written by
    ``Dump.write_zip`` -- confirm against the resync library).
    """
    cl = ChangeDumpManifest()
    cl.add(Resource('http://ex.org/a', length=7,
                    path='tests/testdata/a', change="updated"))
    cl.add(Resource('http://ex.org/b', length=21,
                    path='tests/testdata/b', change="updated"))
    d = Dump()
    zipf = os.path.join(self.tmpdir, "test01_dump.zip")
    try:
        d.write_zip(cl, zipf)  # positional args
        self.assertTrue(os.path.exists(zipf))
        self.assertTrue(zipfile.is_zipfile(zipf))
        # The context manager closes the archive even when the assertion fails.
        with zipfile.ZipFile(zipf, 'r') as zo:
            self.assertEqual(len(zo.namelist()), 3)
    finally:
        # Clean up on failure as well, so a rerun never sees a stale archive.
        if os.path.exists(zipf):
            os.unlink(zipf)
def generator() -> [SitemapData, Resource]:
    """Batch resources into zip dumps and yield one ``Resource`` per zip.

    Resources produced by ``self.resource_generator()`` are collected into a
    ``ResourceDump``. Whenever the running count reported by the resource
    generator is a multiple of ``self.param.max_items_in_list`` the batch is
    written to ``rd_<ordinal>.zip`` under the metadata path and a
    ``Resource`` whose uri is that path is yielded. A final, partially
    filled batch is flushed after the loop.

    Each zip is written *before* its ``Resource`` is yielded, so a consumer
    can rely on the file being present and the last zip is not lost when the
    consumer stops iterating early.

    NOTE(review): only a single ``Resource`` is yielded per batch although
    the return annotation names ``SitemapData`` as well -- confirm with
    callers.
    """
    resourcedump = None
    ordinal = self.find_ordinal(Capability.resourcedump.name)
    resource_generator = self.resource_generator()

    for resource_count, resource in resource_generator(resource_metadata):
        # Start a fresh batch lazily, stamping the moment it was opened.
        if resourcedump is None:
            resourcedump = ResourceDump()
            resourcedump.md_at = defaults.w3c_now()
        resourcedump.add(resource)

        # Batch is full: write it out and hand the zip to the consumer.
        if resource_count % self.param.max_items_in_list == 0:
            ordinal += 1
            resourcedump.md_completed = defaults.w3c_now()
            d = Dump(resources=resourcedump)
            zipf = self.param.abs_metadata_path("rd_" + str(ordinal) + ".zip")
            print(str(zipf))
            d.write_zip(resources=resourcedump, dumpfile=zipf)
            yield Resource(uri=str(zipf))
            resourcedump = None

    # Flush the last, partially filled batch.
    if resourcedump:
        ordinal += 1
        resourcedump.md_completed = defaults.w3c_now()
        d = Dump()
        zipf = self.param.abs_metadata_path("rd_" + str(ordinal) + ".zip")
        print(str(zipf))
        # Write first, then yield: the yielded uri must point at an existing file.
        d.write_zip(resources=resourcedump, dumpfile=zipf)
        yield Resource(uri=str(zipf))
def create_zip(self, resourcelist, prefix, write_list=False, write_manifest=True):
    """
    Dump local resources in resourcelist to a zip file with the specified prefix. The index in the zip file name
    will be 1 higher than the last zip file index with the same prefix. A manifest.xml will be included in the zip.
    --  The resync.Dump.write_zip method used in this method has the side effect of changing local paths in
        resourcelist into paths relative in zip.

    :param resourcelist: resources to zip
    :param prefix: prefix of the zip file
    :param write_list: True if resourcelist should be written to local disc. Default: False
    :param write_manifest: True if a separate manifest file should be written to disc, False otherwise. Default: True
    :return: the created zip as a resync.Resource.
    """
    # md_at / md_completed stay None on purpose: the attribute gets lost in a
    # read > write cycle with the resync library.
    md_at = None
    md_completed = None

    # Find the highest index already used for this prefix in publish_dir.
    index = -1
    zipfiles = sorted(glob(os.path.join(self.publish_dir, prefix + "*.zip")))
    if zipfiles:
        basename = os.path.basename(zipfiles[-1])
        # NOTE(review): this takes the first run of digits in the file name,
        # so a prefix that itself contains digits would be parsed instead of
        # the index -- confirm prefixes are digit-free.
        index = int(re.findall(r'\d+', basename)[0])

    zip_name = "%s%05d" % (prefix, index + 1)
    if write_list:
        # this is the given resourcelist with local paths. As such it is *not* the resourcedump_manifest.
        with open(os.path.join(self.publish_dir, zip_name + ".xml"), "w") as rl_file:
            rl_file.write(resourcelist.as_xml())

    zip_path = os.path.join(self.publish_dir, zip_name + ".zip")
    dump = Dump()
    dump.path_prefix = self.resource_dir
    dump.write_zip(resourcelist, zip_path)  # paths in resourcelist will be stripped.

    loc = self.publish_url + zip_name + ".zip"  # mandatory
    lastmod = self.last_modified(resourcelist)  # optional
    md_type = "application/zip"  # recommended
    md_length = os.stat(zip_path).st_size
    md5 = compute_md5_for_file(zip_path)

    zip_resource = Resource(uri=loc, lastmod=lastmod, length=md_length, md5=md5, mime_type=md_type,
                            md_at=md_at, md_completed=md_completed)
    if write_manifest:
        rdm = ResourceDumpManifest(resources=resourcelist.resources)
        rdm_url = self.publish_url + PREFIX_MANIFEST + zip_name + ".xml"
        with open(os.path.join(self.publish_dir, PREFIX_MANIFEST + zip_name + ".xml"), "w") as rdm_file:
            rdm_file.write(rdm.as_xml())
        zip_resource.link_set(rel="content", href=rdm_url)

    return zip_resource
def generator(changedump=None) -> [SitemapData, ChangeDump]:
    """Detect changes against the previous state and yield them as zip dumps.

    Current resources (from ``self.resource_generator()``) are compared with
    ``self.previous_resources`` by uri and md5 to classify them as created,
    updated, deleted or unchanged. Observers are informed of the counts.
    Created, updated and deleted resources are then added to a
    ``ChangeDump``; each time the running count reaches a multiple of
    ``self.param.max_items_in_list`` the batch is written to
    ``cd_<ordinal>.zip`` under the metadata path and a
    ``(sitemap_data, ChangeDump(uri=<zip path>))`` pair is yielded. A final,
    partially filled batch is flushed after the loop when there were any
    changes.

    Each zip is written *before* its pair is yielded, so a consumer can rely
    on the file being present and the last zip is not lost when the consumer
    stops iterating early.

    :param changedump: an existing change dump to continue filling, or None
        to start a new one. When it already holds ``max_items_in_list`` or
        more items it is discarded and a new one is started.
    """
    resource_generator = self.resource_generator()
    self.update_previous_state()
    prev_r = self.previous_resources
    curr_r = {resource.uri: resource for _, resource in resource_generator(resource_metadata)}

    created = [r for r in curr_r.values() if r.uri not in prev_r]
    updated = [r for r in curr_r.values() if r.uri in prev_r and r.md5 != prev_r[r.uri].md5]
    deleted = [r for r in prev_r.values() if r.uri not in curr_r]
    unchang = [r for r in curr_r.values() if r.uri in prev_r and r.md5 == prev_r[r.uri].md5]

    # remove lastmod from deleted resource metadata
    for resource in deleted:
        resource.lastmod = None

    num_created = len(created)
    num_updated = len(updated)
    num_deleted = len(deleted)
    tot_changes = num_created + num_updated + num_deleted
    self.observers_inform(self, ExecutorEvent.found_changes, created=num_created, updated=num_updated,
                          deleted=num_deleted, unchanged=len(unchang))
    all_changes = {"created": created, "updated": updated, "deleted": deleted}

    ordinal = self.find_ordinal(Capability.changedump.name)

    resource_count = 0
    if changedump:
        # Continue the given dump: step the ordinal back so the increment
        # on flush lands on the same number again.
        ordinal -= 1
        resource_count = len(changedump)
        if resource_count >= self.param.max_items_in_list:
            # The given dump is already full: start over with a new one.
            changedump = None
            ordinal += 1
            resource_count = 0

    for change_type, changed in all_changes.items():
        for resource in changed:
            if changedump is None:
                changedump = ChangeDump()
                changedump.md_from = self.date_changedump_from

            resource.change = change_type  # type of change: created, updated or deleted
            resource.md_datetime = self.date_start_processing
            changedump.add(resource)
            resource_count += 1

            # under conditions: yield the current changedump
            if resource_count % self.param.max_items_in_list == 0:
                ordinal += 1
                d = Dump(resources=changedump)
                zipf = self.param.abs_metadata_path("cd_" + str(ordinal) + ".zip")
                print(str(zipf))
                d.write_zip(resources=changedump, dumpfile=zipf)
                doc_end = defaults.w3c_now()
                sitemap_data = self.finish_sitemap(ordinal, changedump,
                                                   doc_start=self.date_start_processing, doc_end=doc_end)
                dumpResource = ChangeDump(uri=str(zipf))
                yield sitemap_data, dumpResource
                changedump = None

    # under conditions: yield the current and last changedump
    if changedump and tot_changes > 0:
        ordinal += 1
        doc_end = defaults.w3c_now()
        changedump.md_completed = doc_end
        d = Dump()
        zipf = self.param.abs_metadata_path("cd_" + str(ordinal) + ".zip")
        print(str(zipf))
        sitemap_data = self.finish_sitemap(ordinal, changedump,
                                           doc_start=self.date_start_processing, doc_end=doc_end)
        # finish_sitemap stays ahead of write_zip here, as before, because
        # write_zip may rewrite resource paths (NOTE(review): confirm). The
        # write itself now happens before the yield so the yielded uri
        # points at an existing file.
        d.write_zip(resources=changedump, dumpfile=zipf)
        dumpResource = ChangeDump(uri=str(zipf))
        yield sitemap_data, dumpResource
def create_zip(self, resourcelist, prefix, write_list=False, write_manifest=True):
    """
    Dump local resources in resourcelist to a zip file with the specified prefix. The index in the zip file name
    will be 1 higher than the last zip file index with the same prefix. A manifest.xml will be included in the zip.
    --  The resync.Dump.write_zip method used in this method has the side effect of changing local paths in
        resourcelist into paths relative in zip.

    :param resourcelist: resources to zip
    :param prefix: prefix of the zip file
    :param write_list: True if resourcelist should be written to local disc. Default: False
    :param write_manifest: True if a separate manifest file should be written to disc, False otherwise. Default: True
    :return: the created zip as a resync.Resource.
    """
    # md_at / md_completed stay None on purpose: the attribute gets lost in a
    # read > write cycle with the resync library.
    md_at = None
    md_completed = None

    # Find the highest index already used for this prefix in publish_dir.
    index = -1
    zipfiles = sorted(glob(os.path.join(self.publish_dir, prefix + "*.zip")))
    if zipfiles:
        basename = os.path.basename(zipfiles[-1])
        # NOTE(review): this takes the first run of digits in the file name,
        # so a prefix that itself contains digits would be parsed instead of
        # the index -- confirm prefixes are digit-free.
        index = int(re.findall(r"\d+", basename)[0])

    zip_name = "%s%05d" % (prefix, index + 1)
    if write_list:
        # this is the given resourcelist with local paths. As such it is *not* the resourcedump_manifest.
        with open(os.path.join(self.publish_dir, zip_name + ".xml"), "w") as rl_file:
            rl_file.write(resourcelist.as_xml())

    zip_path = os.path.join(self.publish_dir, zip_name + ".zip")
    dump = Dump()
    dump.path_prefix = self.resource_dir
    dump.write_zip(resourcelist, zip_path)  # paths in resourcelist will be stripped.

    loc = self.publish_url + zip_name + ".zip"  # mandatory
    lastmod = self.last_modified(resourcelist)  # optional
    md_type = "application/zip"  # recommended
    md_length = os.stat(zip_path).st_size
    md5 = compute_md5_for_file(zip_path)

    zip_resource = Resource(
        uri=loc,
        lastmod=lastmod,
        length=md_length,
        md5=md5,
        mime_type=md_type,
        md_at=md_at,
        md_completed=md_completed,
    )
    if write_manifest:
        rdm = ResourceDumpManifest(resources=resourcelist.resources)
        rdm_url = self.publish_url + PREFIX_MANIFEST + zip_name + ".xml"
        with open(os.path.join(self.publish_dir, PREFIX_MANIFEST + zip_name + ".xml"), "w") as rdm_file:
            rdm_file.write(rdm.as_xml())
        zip_resource.link_set(rel="content", href=rdm_url)

    return zip_resource