def publish(self):
    """
    Try to publish; if something went wrong, remove the zip end.
    :return: ( boolean indicating if change in sink directory or subdirectories,
               amount of resources definitively packaged,
               the difference of resources provisionally packaged)
    """
    if not os.path.isdir(self.resource_dir):
        os.makedirs(self.resource_dir)
        # print "Created %s" % self.resource_dir
    if not os.path.isdir(self.publish_dir):
        os.makedirs(self.publish_dir)
        # print "Created %s" % self.publish_dir
    try:
        return self.do_publish()
    except:
        # Something went wrong. Best we can do is clean up the end of the zip chain.
        zip_end_files = glob(os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
        for ze_file in zip_end_files:
            os.remove(ze_file)
            print "error recovery: removed %s" % ze_file

        zip_end_xmls = glob(os.path.join(self.publish_dir, PREFIX_END_PART + "*.xml"))
        for ze_xml in zip_end_xmls:
            os.remove(ze_xml)
            print "error recovery: removed %s" % ze_xml

        zip_end_manis = glob(os.path.join(self.publish_dir,
                                          PREFIX_MANIFEST + PREFIX_END_PART + "*.xml"))
        for ze_mani in zip_end_manis:
            os.remove(ze_mani)
            print "error recovery: removed %s" % ze_mani

        # Remove zip-end entries from resource-dump.xml
        rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
        rs_dump = ResourceDump()
        if os.path.isfile(rs_dump_path):
            with open(rs_dump_path, "r") as rs_dump_file:
                sm = Sitemap()
                sm.parse_xml(rs_dump_file, resources=rs_dump)
            prefix = self.publish_url + PREFIX_END_PART
            for uri in rs_dump.resources.keys():
                if uri.startswith(prefix):
                    del rs_dump.resources[uri]
                    print "error recovery: removed %s from %s" % (uri, rs_dump_path)
            with open(rs_dump_path, "w") as rs_dump_file:
                rs_dump_file.write(rs_dump.as_xml())

        print "error recovery: error recovery completed. Now raising ..."
        raise
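# A minimal usage sketch (not from the source): "publisher" stands in for an
# instance of the class that defines publish(); the variable names are
# illustrative assumptions, only the three-tuple return is documented above.
state_changed, count_definitive, diff_provisional = publisher.publish()
if state_changed:
    print("definitively packaged: %d, provisional difference: %+d"
          % (count_definitive, diff_provisional))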
def test_ex_04(self):
    """resourcesync_ex_4 is a simple resource dump with one ZIP listed"""
    rd = ResourceDump()
    rd.parse('tests/testdata/examples_from_spec/resourcesync_ex_4.xml')
    self.assertEqual(len(rd.resources), 1, '1 resource')
    self.assertTrue('http://example.com/resourcedump.zip' in rd.resources)
    self.assertEqual(rd.resources['http://example.com/resourcedump.zip'].lastmod,
                     '2013-01-03T09:00:00Z')
def test01_as_xml(self):
    rd = ResourceDump()
    rd.add(Resource('a.zip', timestamp=1))
    rd.add(Resource('b.zip', timestamp=2))
    xml = rd.as_xml()
    self.assertTrue(re.search(r'<rs:md .*capability="resourcedump"', xml),
                    'XML has capability')
    self.assertTrue(re.search(r'<url><loc>a.zip</loc><lastmod>1970-01-01T00:00:01Z</lastmod></url>', xml),
                    'XML has resource a')
def publish_metadata(self, new_zips, excluded_zip=None):
    """
    (Re)publish metadata with the addition of new_zips. An excluded zip will be
    removed from previously published metadata.
    :param new_zips: a resourcelist with newly created zip resources
    :param excluded_zip: local path to a zip file that will be removed from
        previously published metadata.
    """
    rs_dump_url = self.publish_url + RS_RESOURCE_DUMP_XML
    rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
    capa_list_url = self.publish_url + RS_CAPABILITY_LIST_XML
    capa_list_path = os.path.join(self.publish_dir, RS_CAPABILITY_LIST_XML)

    rs_dump = ResourceDump()

    # Load the existing resource-dump, if any. Else set the start time.
    if os.path.isfile(rs_dump_path):
        with open(rs_dump_path, "r") as rs_dump_file:
            sm = Sitemap()
            sm.parse_xml(rs_dump_file, resources=rs_dump)
    else:
        rs_dump.md_at = w3cdt.datetime_to_str(no_fractions=True)
        rs_dump.link_set(rel="up", href=capa_list_url)

    # Remove the excluded zip, if any
    if excluded_zip:
        loc = self.publish_url + os.path.basename(excluded_zip)
        if loc in rs_dump.resources:
            del rs_dump.resources[loc]
        else:
            raise RuntimeError("Could not find %s in %s" % (loc, rs_dump_path))

    # Add new zips
    for resource in new_zips:
        rs_dump.add(resource)

    # Write resource-dump.xml
    rs_dump.md_completed = w3cdt.datetime_to_str(no_fractions=True)
    with open(rs_dump_path, "w") as rs_dump_file:
        rs_dump_file.write(rs_dump.as_xml())

    # There are several ways to decode base64, among them
    # iri = base64.b64decode(os.path.basename(self.publish_dir)).rstrip('\n')
    # iri = base64.b64decode(os.path.basename(self.publish_dir), '-_').rstrip('\n')
    iri = base64.urlsafe_b64decode(os.path.basename(self.publish_dir)).rstrip('\n')
    print "New %s for graph %s" % (RS_RESOURCE_DUMP_XML, iri)
    print "See %s" % rs_dump_url

    # Write capability-list.xml, once
    if not os.path.isfile(capa_list_path):
        capa_list = CapabilityList()
        capa_list.link_set(rel="up", href=self.src_desc_url)
        capa_list.add_capability(rs_dump, rs_dump_url)
        with open(capa_list_path, "w") as capa_list_file:
            capa_list_file.write(capa_list.as_xml())
        print "New %s. See %s" % (RS_CAPABILITY_LIST_XML, capa_list_url)
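# A minimal, runnable sketch of the encoding publish_metadata relies on
# (assumption: the basename of publish_dir is the URL-safe base64 encoding of
# a graph IRI, which the code above decodes with urlsafe_b64decode; the IRI
# value here is illustrative only).
import base64

iri = "http://example.com/graph/dataset1"  # illustrative IRI
dirname = base64.urlsafe_b64encode(iri.encode("utf-8")).decode("ascii")
decoded = base64.urlsafe_b64decode(dirname).decode("utf-8").rstrip("\n")
assert decoded == iri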
def test02_parse(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md at="2013-01-01" capability="resourcedump"/>\
<url><loc>http://example.com/a.zip</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="12345" /></url>\
<url><loc>http://example.com/b.zip</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="56789" /></url>\
</urlset>'
    rd = ResourceDump()
    rd.parse(fh=io.StringIO(xml))
    self.assertEqual(len(rd.resources), 2, 'got 2 resource dumps')
    self.assertEqual(rd.capability, 'resourcedump', 'capability set')
    self.assertEqual(rd.md_at, '2013-01-01')
    self.assertTrue('http://example.com/a.zip' in rd.resources)
    self.assertEqual(rd.resources['http://example.com/a.zip'].length, 12345)
    self.assertTrue('http://example.com/b.zip' in rd.resources)
    self.assertEqual(rd.resources['http://example.com/b.zip'].length, 56789)
def test03_parse_no_capability(self):
    # For a resource dump this should be an error
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md from="2013-01-01"/>\
<url><loc>http://example.com/a.zip</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="12" /></url>\
</urlset>'
    rd = ResourceDump()
    self.assertRaises(SitemapParseError, rd.parse, fh=io.StringIO(xml))
def test04_parse_bad_capability(self):
    # The <rs:md capability="bad_capability" ... should give an error
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="bad_capability" from="2013-01-01"/>\
<url><loc>http://example.com/bad_res_1</loc><lastmod>2012-03-14T18:37:36Z</lastmod></url>\
</urlset>'
    rd = ResourceDump()
    self.assertRaises(SitemapParseError, rd.parse, fh=io.StringIO(xml))
def test_build_ex_13(self):
    """Capability List document with 4 entries"""
    cl = CapabilityList()
    cl.describedby = 'http://example.com/info_about_set1_of_resources.xml'
    cl.up = 'http://example.com/resourcesync_description.xml'
    cl.add_capability(capability=ResourceList(uri='http://example.com/dataset1/resourcelist.xml'))
    cl.add_capability(capability=ResourceDump(uri='http://example.com/dataset1/resourcedump.xml'))
    cl.add_capability(capability=ChangeList(uri='http://example.com/dataset1/changelist.xml'))
    cl.add_capability(capability=ChangeDump(uri='http://example.com/dataset1/changedump.xml'))
    ex_xml = self._open_ex('resourcesync_ex_13').read()
    self._assert_xml_equal(cl.as_xml(), ex_xml)
def test_build_ex_04(self):
    """Simple Resource Dump document"""
    rd = ResourceDump()
    rd.md_at = '2013-01-03T09:00:00Z'
    rd.add(Resource(uri='http://example.com/resourcedump.zip',
                    lastmod='2013-01-03T09:00:00Z'))
    ex_xml = self._open_ex('resourcesync_ex_4').read()
    self._assert_xml_equal(rd.as_xml(), ex_xml)
def get_resource_dump_xml(self, from_date=None, to_date=None):
    """
    Get the content of the resource dump.
    :return: (xml) resource dump content
    """
    if not self._validation():
        return None
    from .utils import parse_date
    if from_date:
        from_date = parse_date(from_date)
    if to_date:
        to_date = parse_date(to_date)
    r = get_items_by_index_tree(self.repository_id)
    rd = ResourceDump()
    rd.up = INVENIO_CAPABILITY_URL.format(request.url_root)
    for item in r:
        if item:
            resource_date = parse_date(item.get('_source').get('_updated'))
            if from_date and from_date > resource_date:
                continue
            if to_date and to_date < resource_date:
                continue
            id_item = item.get('_source').get('control_number')
            url = '{}resync/{}/{}/file_content.zip'.format(
                request.url_root, self.repository_id, str(id_item))
            rs = Resource(url,
                          lastmod=item.get('_source').get('_updated'),
                          ln=[])
            if self.resource_dump_manifest:
                href = '{}resync/{}/{}/resourcedump_manifest.xml'.format(
                    request.url_root, self.repository_id, str(id_item))
                rs.ln.append({
                    'rel': 'contents',
                    'href': href,
                    'type': 'application/xml'
                })
            rd.add(rs)
    return rd.as_xml()
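# For reference, get_resource_dump_xml produces a ResourceSync Resource Dump
# document; a sketch of its shape (values are illustrative only):
#
#   <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"
#           xmlns:rs="http://www.openarchives.org/rs/terms/">
#     <rs:md capability="resourcedump"/>
#     <url>
#       <loc>http://server/resync/1/42/file_content.zip</loc>
#       <lastmod>2013-01-03T09:00:00Z</lastmod>
#       <rs:ln rel="contents"
#              href="http://server/resync/1/42/resourcedump_manifest.xml"
#              type="application/xml"/>
#     </url>
#   </urlset>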
def do_publish(self):
    """
    Publish resources found in resource_dir in accordance with the
    ResourceSync Framework. Resources will be packaged in ZIP file format.
    The number of resources packaged in one zip file is bound by
    max_files_compressed. Successive packages will be created if more than
    max_files_compressed resources have to be published. Packages that reach
    the limit of max_files_compressed are marked as complete. Any remainder
    of resources is packaged in a zip file marked as zip end.

    WARNING: This method removes resources that are published in packages
    marked as complete from resource_dir.

    :return: ( boolean indicating if change in sink directory or subdirectories,
               amount of resources definitively packaged,
               the difference of resources provisionally packaged)
    """
    count_def_resources = 0
    diff_end_resources = 0

    path_zip_end_old, rl_end_old = self.get_state_published()
    new_zips = ResourceDump()
    state_changed = False
    exhausted = False

    while not exhausted:
        resourcelist, exhausted = self.list_resources_chunk()
        if len(resourcelist) == self.max_files_compressed:  # complete zip
            state_changed = True
            count_def_resources += len(resourcelist)
            zip_resource = self.create_zip(resourcelist, PREFIX_COMPLETED_PART,
                                           False, self.write_separate_manifest)
            new_zips.add(zip_resource)
            # Move or remove resources from resource_dir
            for resource in resourcelist:
                r_path = os.path.join(self.resource_dir, resource.path)
                if self.move_resources:
                    shutil.move(r_path, self.publish_dir)
                else:
                    os.remove(r_path)
        elif not self.is_same(resourcelist, rl_end_old):
            assert exhausted
            state_changed = True
            if len(resourcelist) > 0:
                diff_end_resources += len(resourcelist)
                zip_resource = self.create_zip(resourcelist, PREFIX_END_PART,
                                               True, self.write_separate_manifest)
                new_zips.add(zip_resource)

    # Publish new metadata. Exclude zip_end_old.
    if state_changed:
        self.publish_metadata(new_zips, path_zip_end_old)

    # Remove the old zip end file, resource list and manifest;
    # account for the difference of resources provisionally packaged.
    if state_changed and path_zip_end_old:
        diff_end_resources -= len(rl_end_old)
        os.remove(path_zip_end_old)
        os.remove(os.path.splitext(path_zip_end_old)[0] + ".xml")
        manifest = PREFIX_MANIFEST + os.path.splitext(
            os.path.basename(path_zip_end_old))[0] + ".xml"
        manifest_file = os.path.join(self.publish_dir, manifest)
        if os.path.isfile(manifest_file):
            os.remove(manifest_file)

    return state_changed, count_def_resources, diff_end_resources
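# Worked example of the packaging arithmetic (a sketch; the helper below is
# not part of the source and the numbers are assumed): with
# max_files_compressed = 3, 7 resources in resource_dir and no previous zip
# end, do_publish() creates two complete zips and one zip end of one
# resource, returning (True, 6, 1).
def expected_counts(pending, max_files_compressed, old_end_count=0):
    # Resources definitively packaged in complete zips, and the change in
    # provisionally packaged resources (new zip end minus old zip end).
    complete = (pending // max_files_compressed) * max_files_compressed
    diff_end = (pending % max_files_compressed) - old_end_count
    return complete, diff_end

assert expected_counts(7, 3) == (6, 1)
assert expected_counts(6, 3, old_end_count=1) == (6, -1)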
def test05_write(self):
    rd = ResourceDump()
    rd.add(Resource('aa.zip', timestamp=1))
    rd.add(Resource('bb.zip', timestamp=2))
    dumpf = os.path.join(self.tmpdir, "test05_dump.xml")
    rd.write(basename=dumpf)
    self.assertTrue(os.path.exists(dumpf))
    # Now read that back
    rd2 = ResourceDump()
    rd2.parse(dumpf)
    self.assertEqual(len(rd2), 2)
    self.assertEqual(rd2.uris(), ['aa.zip', 'bb.zip'])
def test_build_ex_17(self):
    """Resource Dump with 3 entries and some metadata"""
    rd = ResourceDump()
    rd.up = 'http://example.com/dataset1/capabilitylist.xml'
    rd.md_at = "2013-01-03T09:00:00Z"
    rd.md_completed = "2013-01-03T09:04:00Z"
    z1 = Resource(uri='http://example.com/resourcedump-part1.zip',
                  mime_type="application/zip",
                  length=4765,
                  md_at="2013-01-03T09:00:00Z",
                  md_completed="2013-01-03T09:02:00Z")
    z1.link_set(rel="contents",
                href="http://example.com/resourcedump_manifest-part1.xml",
                mime_type="application/xml")
    rd.add(z1)
    z2 = Resource(uri='http://example.com/resourcedump-part2.zip',
                  mime_type="application/zip",
                  length=9875,
                  md_at="2013-01-03T09:01:00Z",
                  md_completed="2013-01-03T09:03:00Z")
    z2.link_set(rel="contents",
                href="http://example.com/resourcedump_manifest-part2.xml",
                mime_type="application/xml")
    rd.add(z2)
    z3 = Resource(uri='http://example.com/resourcedump-part3.zip',
                  mime_type="application/zip",
                  length=2298,
                  md_at="2013-01-03T09:03:00Z",
                  md_completed="2013-01-03T09:04:00Z")
    z3.link_set(rel="contents",
                href="http://example.com/resourcedump_manifest-part3.xml",
                mime_type="application/xml")
    rd.add(z3)
    ex_xml = self._open_ex('resourcesync_ex_17').read()
    self._assert_xml_equal(rd.as_xml(), ex_xml)