def get_state_published(self):
    """
    See if publish_dir has a zip end file. If so, return the path of the
    zip end file and the resourcelist (with local paths) of resources
    published in the zip end file.

    :return: - the path to the zip end file or None if there is no zip end file.
             - the resourcelist of resources published in zip end file or an
               empty list if there is no zip end file.
    :raises RuntimeError: if more than one zip end file is found.
    """
    path_zip_end_old = None
    rl_end_old = ResourceList()
    zip_end_files = glob(os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
    if len(zip_end_files) > 1:
        # More than one end-of-chain zip means the publish structure is corrupt.
        raise RuntimeError(
            "Found more than one %s*.zip files. Inconsistent structure of %s."
            % (PREFIX_END_PART, self.publish_dir))
    elif len(zip_end_files) == 1:
        path_zip_end_old = zip_end_files[0]

    if path_zip_end_old:
        # The sitemap describing the zip contents sits next to it as <name>.xml.
        # Use a context manager so the file is closed even when parse_xml raises
        # (the original leaked the handle on a parse error).
        with open(os.path.splitext(path_zip_end_old)[0] + ".xml", "r") as rl_file:
            sm = Sitemap()
            sm.parse_xml(rl_file, resources=rl_end_old)

    return path_zip_end_old, rl_end_old
def publish(self):
    """
    Try and publish or remove zip end if something went wrong.

    :return: ( boolean indicating if change in sink directory or subdirectories,
        amount of resources definitively packaged,
        the difference of resources provisionally packaged)
    """
    if not os.path.isdir(self.resource_dir):
        os.makedirs(self.resource_dir)
    if not os.path.isdir(self.publish_dir):
        os.makedirs(self.publish_dir)
    try:
        return self.do_publish()
    except:  # noqa: E722 -- deliberately broad: clean up, then always re-raise
        # Something went wrong. Best we can do is clean up end of zip chain.
        # Same removal order as before: end zips, end xmls, end manifests.
        for pattern in (PREFIX_END_PART + "*.zip",
                        PREFIX_END_PART + "*.xml",
                        PREFIX_MANIFEST + PREFIX_END_PART + "*.xml"):
            for ze_path in glob(os.path.join(self.publish_dir, pattern)):
                os.remove(ze_path)
                print("error recovery: removed %s" % ze_path)

        # remove zip-end entries from resource-dump.xml
        rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
        rs_dump = ResourceDump()
        if os.path.isfile(rs_dump_path):
            with open(rs_dump_path, "r") as rs_dump_file:
                sm = Sitemap()
                sm.parse_xml(rs_dump_file, resources=rs_dump)
            prefix = self.publish_url + PREFIX_END_PART
            # Iterate over a copy of the keys: deleting entries while
            # iterating the live dict view raises RuntimeError on Python 3.
            for uri in list(rs_dump.resources.keys()):
                if uri.startswith(prefix):
                    del rs_dump.resources[uri]
                    print("error recovery: removed %s from %s" % (uri, rs_dump_path))
            with open(rs_dump_path, "w") as rs_dump_file:
                rs_dump_file.write(rs_dump.as_xml())

        print("error recovery: walk through error recovery completed. Now raising ...")
        raise
def test33_write(self):
    # ResourceList: write a plain list, read it back, and check that it is
    # not flagged as a sitemapindex.
    resource_list = ResourceList()
    for suffix in ('a', 'b', 'c'):
        resource_list.add(
            Resource(uri='http://example.com/test/' + suffix, timestamp=1))
    list_path = os.path.join(self.tmpdir, 'test33_write_resourcelist.xml')
    resource_list.write(basename=list_path)
    with open(list_path, 'r') as fh:
        parser = Sitemap()
        parser.parse_xml(fh=fh)
        self.assertFalse(parser.parsed_index)
    # ResourceListIndex: the same round trip with sitemapindex set must be
    # recognized as an index on re-parse.
    index = ResourceList()
    for num in ('00000', '00001', '00002'):
        index.add(Resource(
            uri='http://example.com/test/resourcelist%s.xml' % num,
            timestamp=1))
    index.sitemapindex = True
    index_path = os.path.join(self.tmpdir, 'test33_write_resourcelist-index.xml')
    index.write(basename=index_path)
    with open(index_path, 'r') as fh:
        parser = Sitemap()
        parser.parse_xml(fh=fh)
        self.assertTrue(parser.parsed_index)
def read_sitemap(self, path, sitemap=None):
    """Parse the sitemap file at *path* into *sitemap* and return it.

    When no container is supplied, a fresh ListBaseWithIndex is used.
    """
    target = ListBaseWithIndex() if sitemap is None else sitemap
    with open(path, "r", encoding="utf-8") as source:
        Sitemap().parse_xml(source, resources=target)
    return target
def get_state_published(self):
    """
    See if publish_dir has a zip end file. If so, return the path of the
    zip end file and the resourcelist (with local paths) of resources
    published in the zip end file.

    :return: - the path to the zip end file or None if there is no zip end file.
             - the resourcelist of resources published in zip end file or an
               empty list if there is no zip end file.
    :raises RuntimeError: if more than one zip end file is found.
    """
    path_zip_end_old = None
    rl_end_old = ResourceList()
    zip_end_files = glob(
        os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
    if len(zip_end_files) > 1:
        # More than one end-of-chain zip means the publish structure is corrupt.
        raise RuntimeError(
            "Found more than one %s*.zip files. Inconsistent structure of %s."
            % (PREFIX_END_PART, self.publish_dir))
    elif len(zip_end_files) == 1:
        path_zip_end_old = zip_end_files[0]

    if path_zip_end_old:
        # The sitemap describing the zip contents sits next to it as <name>.xml.
        # Use a context manager so the file is closed even when parse_xml raises
        # (the original leaked the handle on a parse error).
        with open(os.path.splitext(path_zip_end_old)[0] + ".xml", "r") as rl_file:
            sm = Sitemap()
            sm.parse_xml(rl_file, resources=rl_end_old)

    return path_zip_end_old, rl_end_old
def publish_metadata(self, new_zips, exluded_zip=None):
    """
    (Re)publish metadata with addition of new_zips. An excluded zip will be
    removed from previously published metadata.

    :param new_zips: a resourcelist with newly created zip resources
    :param exluded_zip: local path to zip file that will be removed from
        previously published metadata.
    :raises RuntimeError: if exluded_zip is not listed in the current dump.
    """
    rs_dump_url = self.publish_url + RS_RESOURCE_DUMP_XML
    rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
    capa_list_url = self.publish_url + RS_CAPABILITY_LIST_XML
    capa_list_path = os.path.join(self.publish_dir, RS_CAPABILITY_LIST_XML)

    rs_dump = ResourceDump()
    # Load existing resource-dump, if any. Else set start time.
    if os.path.isfile(rs_dump_path):
        with open(rs_dump_path, "r") as rs_dump_file:
            sm = Sitemap()
            sm.parse_xml(rs_dump_file, resources=rs_dump)
    else:
        rs_dump.md_at = w3cdt.datetime_to_str(no_fractions=True)
        rs_dump.link_set(rel="up", href=capa_list_url)

    # Remove excluded zip, if any
    if exluded_zip:
        loc = self.publish_url + os.path.basename(exluded_zip)
        if loc in rs_dump.resources:
            del rs_dump.resources[loc]
        else:
            raise RuntimeError("Could not find %s in %s" % (loc, rs_dump_path))

    # Add new zips
    for resource in new_zips:
        rs_dump.add(resource)

    # Write resource-dump.xml
    rs_dump.md_completed = w3cdt.datetime_to_str(no_fractions=True)
    with open(rs_dump_path, "w") as rs_dump_file:
        rs_dump_file.write(rs_dump.as_xml())

    # The publish_dir name is the urlsafe-base64-encoded graph IRI.
    # NOTE(review): on Python 3 urlsafe_b64decode returns bytes, so
    # .rstrip('\n') with a str argument would raise TypeError there --
    # this assumes Python 2 str semantics; confirm before porting.
    iri = base64.urlsafe_b64decode(os.path.basename(
        self.publish_dir)).rstrip('\n')
    print("New %s for graph %s" % (RS_RESOURCE_DUMP_XML, iri))
    print("See %s" % rs_dump_url)

    # Write capability-list.xml only the first time; it does not change.
    if not os.path.isfile(capa_list_path):
        capa_list = CapabilityList()
        capa_list.link_set(rel="up", href=self.src_desc_url)
        capa_list.add_capability(rs_dump, rs_dump_url)
        with open(capa_list_path, "w") as capa_list_file:
            capa_list_file.write(capa_list.as_xml())
        print("New %s. See %s" % (RS_CAPABILITY_LIST_XML, capa_list_url))
def all_resources(self):
    """Return {uri: resource} reflecting the latest known state.

    Resourcelists are replayed first; changelists are then applied on top,
    honoring created/updated/deleted changes in file order.
    """
    state = {}

    def _parse(file_name, container):
        # One sitemap file -> populated container.
        with open(file_name, "r", encoding="utf-8") as xml_file:
            Sitemap().parse_xml(xml_file, resources=container)
        return container

    # search for resourcelists
    for rl_name in sorted(glob(self.paras.abs_metadata_path("resourcelist_*.xml"))):
        for res in _parse(rl_name, ResourceList()).resources:
            state[res.uri] = res

    # search for changelists
    for cl_name in sorted(glob(self.paras.abs_metadata_path("changelist_*.xml"))):
        for res in _parse(cl_name, ChangeList()).resources:
            if res.change in ("created", "updated"):
                state[res.uri] = res
            elif res.change == "deleted" and res.uri in state:
                del state[res.uri]

    return state
def publish_metadata(self, new_zips, exluded_zip=None):
    """
    (Re)publish metadata with addition of new_zips. An excluded zip will be
    removed from previously published metadata.

    :param new_zips: a resourcelist with newly created zip resources
    :param exluded_zip: local path to zip file that will be removed from
        previously published metadata.
    :raises RuntimeError: if exluded_zip is not listed in the current dump.
    """
    rs_dump_url = self.publish_url + RS_RESOURCE_DUMP_XML
    rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
    capa_list_url = self.publish_url + RS_CAPABILITY_LIST_XML
    capa_list_path = os.path.join(self.publish_dir, RS_CAPABILITY_LIST_XML)

    rs_dump = ResourceDump()
    # Load existing resource-dump, if any. Else set start time.
    if os.path.isfile(rs_dump_path):
        with open(rs_dump_path, "r") as rs_dump_file:
            sm = Sitemap()
            sm.parse_xml(rs_dump_file, resources=rs_dump)
    else:
        rs_dump.md_at = w3cdt.datetime_to_str(no_fractions=True)
        rs_dump.link_set(rel="up", href=capa_list_url)

    # Remove excluded zip, if any
    if exluded_zip:
        loc = self.publish_url + os.path.basename(exluded_zip)
        if loc in rs_dump.resources:
            del rs_dump.resources[loc]
        else:
            raise RuntimeError("Could not find %s in %s" % (loc, rs_dump_path))

    # Add new zips
    for resource in new_zips:
        rs_dump.add(resource)

    # Write resource-dump.xml
    rs_dump.md_completed = w3cdt.datetime_to_str(no_fractions=True)
    with open(rs_dump_path, "w") as rs_dump_file:
        rs_dump_file.write(rs_dump.as_xml())

    # The publish_dir name is the urlsafe-base64-encoded graph IRI.
    # NOTE(review): on Python 3 urlsafe_b64decode returns bytes, so
    # .rstrip("\n") with a str argument would raise TypeError there --
    # this assumes Python 2 str semantics; confirm before porting.
    iri = base64.urlsafe_b64decode(os.path.basename(self.publish_dir)).rstrip("\n")
    print("New %s for graph %s" % (RS_RESOURCE_DUMP_XML, iri))
    print("See %s" % rs_dump_url)

    # Write capability-list.xml only the first time; it does not change.
    if not os.path.isfile(capa_list_path):
        capa_list = CapabilityList()
        capa_list.link_set(rel="up", href=self.src_desc_url)
        capa_list.add_capability(rs_dump, rs_dump_url)
        with open(capa_list_path, "w") as capa_list_file:
            capa_list_file.write(capa_list.as_xml())
        print("New %s. See %s" % (RS_CAPABILITY_LIST_XML, capa_list_url))
def synchronize(self):
    """
    Publish the resources found in source_dir in accordance with the
    Resourcesync Framework in sink_dir.
    """
    if not os.path.isdir(self.source_dir):
        os.makedirs(self.source_dir)
        print("Created %s" % self.source_dir)
    if not os.path.isdir(self.sink_dir):
        os.makedirs(self.sink_dir)
        print("Created %s" % self.sink_dir)

    self.handshake = self.verify_handshake()
    if self.handshake is None:
        return

    ### initial resource description
    wellknown = os.path.join(self.sink_dir, RS_WELL_KNOWN)
    if not os.path.isdir(wellknown):
        os.makedirs(wellknown)

    src_desc = SourceDescription()
    new_src_desc = True
    # Load existing resource-description, if any.
    if os.path.isfile(self.src_desc_path):
        new_src_desc = False
        with open(self.src_desc_path, "r") as src_desc_file:
            sm = Sitemap()
            sm.parse_xml(src_desc_file, resources=src_desc)
    count_lists = len(src_desc.resources)

    ### resources in subdirectories or main directory
    ### the existence of FILE_INDEX indicates whether resources reside
    ### directly in source_dir or in subdirectories.
    index_file = os.path.join(self.source_dir, FILE_INDEX)
    if os.path.isfile(index_file):
        # next(os.walk(...)) works on Python 2 and 3; the generator method
        # .next() the original used is Python-2-only.
        for dirname in next(os.walk(self.source_dir))[1]:
            source = os.path.join(self.source_dir, dirname)
            sink = os.path.join(self.sink_dir, dirname)
            publish_url = self.publish_url + dirname + "/"
            self.__execute_sync__(source, sink, publish_url, src_desc)
    else:
        self.__execute_sync__(self.source_dir, self.sink_dir,
                              self.publish_url, src_desc)

    if new_src_desc or count_lists != len(src_desc.resources):
        ### publish resource description
        with open(self.src_desc_path, "w") as src_desc_file:
            src_desc_file.write(src_desc.as_xml())
        print("New resource description. See %s" % self.src_desc_url)

    self.report()
def publish(self):
    """
    Try and publish or remove zip end if something went wrong.

    :return: ( boolean indicating if change in sink directory or subdirectories,
        amount of resources definitively packaged,
        the difference of resources provisionally packaged)
    """
    if not os.path.isdir(self.resource_dir):
        os.makedirs(self.resource_dir)
    if not os.path.isdir(self.publish_dir):
        os.makedirs(self.publish_dir)
    try:
        return self.do_publish()
    except:  # noqa: E722 -- deliberately broad: clean up, then always re-raise
        # Something went wrong. Best we can do is clean up end of zip chain.
        # Same removal order as before: end zips, end xmls, end manifests.
        for pattern in (PREFIX_END_PART + "*.zip",
                        PREFIX_END_PART + "*.xml",
                        PREFIX_MANIFEST + PREFIX_END_PART + "*.xml"):
            for ze_path in glob(os.path.join(self.publish_dir, pattern)):
                os.remove(ze_path)
                print("error recovery: removed %s" % ze_path)

        # remove zip-end entries from resource-dump.xml
        rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
        rs_dump = ResourceDump()
        if os.path.isfile(rs_dump_path):
            with open(rs_dump_path, "r") as rs_dump_file:
                sm = Sitemap()
                sm.parse_xml(rs_dump_file, resources=rs_dump)
            prefix = self.publish_url + PREFIX_END_PART
            # Iterate over a copy of the keys: deleting entries while
            # iterating the live dict view raises RuntimeError on Python 3.
            for uri in list(rs_dump.resources.keys()):
                if uri.startswith(prefix):
                    del rs_dump.resources[uri]
                    print("error recovery: removed %s from %s" % (uri, rs_dump_path))
            with open(rs_dump_path, "w") as rs_dump_file:
                rs_dump_file.write(rs_dump.as_xml())

        print("error recovery: walk through error recovery completed. Now raising ...")
        raise
def synchronize(self):
    """
    Publish the resources found in source_dir in accordance with the
    Resourcesync Framework in sink_dir.
    """
    if not os.path.isdir(self.source_dir):
        os.makedirs(self.source_dir)
        print("Created %s" % self.source_dir)
    if not os.path.isdir(self.sink_dir):
        os.makedirs(self.sink_dir)
        print("Created %s" % self.sink_dir)

    self.handshake = self.verify_handshake()
    if self.handshake is None:
        return

    ### initial resource description
    wellknown = os.path.join(self.sink_dir, RS_WELL_KNOWN)
    if not os.path.isdir(wellknown):
        os.makedirs(wellknown)

    src_desc = SourceDescription()
    new_src_desc = True
    # Load existing resource-description, if any.
    if os.path.isfile(self.src_desc_path):
        new_src_desc = False
        with open(self.src_desc_path, "r") as src_desc_file:
            sm = Sitemap()
            sm.parse_xml(src_desc_file, resources=src_desc)
    count_lists = len(src_desc.resources)

    ### resources in subdirectories or main directory
    ### the existence of FILE_INDEX indicates whether resources reside
    ### directly in source_dir or in subdirectories.
    index_file = os.path.join(self.source_dir, FILE_INDEX)
    if os.path.isfile(index_file):
        # next(os.walk(...)) works on Python 2 and 3; the generator method
        # .next() the original used is Python-2-only.
        for dirname in next(os.walk(self.source_dir))[1]:
            source = os.path.join(self.source_dir, dirname)
            sink = os.path.join(self.sink_dir, dirname)
            publish_url = self.publish_url + dirname + "/"
            self.__execute_sync__(source, sink, publish_url, src_desc)
    else:
        self.__execute_sync__(self.source_dir, self.sink_dir,
                              self.publish_url, src_desc)

    if new_src_desc or count_lists != len(src_desc.resources):
        ### publish resource description
        with open(self.src_desc_path, "w") as src_desc_file:
            src_desc_file.write(src_desc.as_xml())
        print("New resource description. See %s" % self.src_desc_url)

    self.report()
def test_21_parse_sitemapindex(self):
    parser = Sitemap()
    first_xml = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                 '<sitemap><loc>aaa</loc></sitemap>'
                 '<sitemap><loc>bbb</loc></sitemap></sitemapindex>')
    index = parser.parse_xml(fh=io.StringIO(first_xml), sitemapindex=True)
    self.assertEqual(len(index.resources), 2, '2 sitemaps')
    self.assertEqual(sorted(index.uris()), ['aaa', 'bbb'])
    # add a couple more
    second_xml = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                  '<sitemap><loc>cc</loc></sitemap>'
                  '<sitemap><loc>dd</loc></sitemap></sitemapindex>')
    parser.parse_xml(fh=io.StringIO(second_xml), resources=index)
    self.assertTrue(parser.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(index.resources), 4, '4 sitemaps total')
    self.assertEqual(sorted(index.uris()), ['aaa', 'bbb', 'cc', 'dd'])
def explore_uri(self, uri, caps): """Interactive exploration of document at uri Will flag warnings if the document is not of type listed in caps """ s=Sitemap() print "Reading %s" % (uri) try: list = s.parse_xml(urllib.urlopen(uri)) except IOError as e: raise ClientFatalError("Cannot read %s (%s)" % (uri,str(e))) num_entries = len(list.resources) capability = '(unknown capability)' if ('capability' in list.md): capability = list.md['capability'] if (s.parsed_index): capability += 'index' print "Parsed %s document with %d entries:" % (capability,num_entries) if (caps is not None and capability not in caps): print "WARNING - expected a %s document" % (','.join(caps)) to_show = num_entries if (num_entries>21): to_show = 20 # What entries are allowed? # FIXME - not complete if (capability == 'capabilitylistindex'): entry_caps = ['capabilitylist'] elif (capability == 'capabilitylist'): entry_caps = ['resourcelist','changelist','resourcedump','changedump','changelistindex'] elif (capability == 'changelistindex'): entry_caps = ['changelist'] n = 0 options = {} for r in list.resources: if (n>=to_show): print "(not showing remaining %d entries)" % (num_entries-n) last n+=1 options[str(n)]=r print "[%d] %s" % (n,r.uri) if (r.capability is not None): warning = '' if (r.capability not in entry_caps): warning = " (EXPECTED %s)" % (' or '.join(entry_caps)) print " %s%s" % (r.capability,warning) elif (len(entry_caps)==1): r.capability=entry_caps[0] print " capability not specified, should be %s" % (r.capability) while (True): inp = raw_input( "Follow [number or q(uit)]?" ) if (inp in options.keys()): break if (inp == 'q'): return('','',inp) caps = [ options[inp].capability ] if (capability == 'capabilitylistindex'): # all links should be to capabilitylist documents if (caps is None): caps = ['capabilitylist'] return( options[inp].uri, caps, inp )
def parse_document(self): """Parse any ResourceSync document and show information Will use sitemap URI taken either from explicit self.sitemap_name or derived from the mappings supplied. """ s = Sitemap() self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap)) try: list = s.parse_xml(urllib.urlopen(self.sitemap)) except IOError as e: raise ClientFatalError("Cannot read document (%s)" % str(e)) num_entries = len(list.resources) capability = '(unknown capability)' if ('capability' in list.md): capability = list.md['capability'] print "Parsed %s document with %d entries" % (capability, num_entries) if (self.verbose): to_show = 100 override_str = ' (override with --max-sitemap-entries)' if (self.max_sitemap_entries): to_show = self.max_sitemap_entries override_str = '' if (num_entries > to_show): print "Showing first %d entries sorted by URI%s..." % ( to_show, override_str) n = 0 for resource in list: print '[%d] %s' % (n, str(resource)) n += 1 if (n >= to_show): break
def test_19_parse_with_bad_rs_ln(self):
    xmlstart = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="resourcelist"/>\
<url><loc>http://example.com/file_a</loc>'
    xmlend = '</url></urlset>'
    s = Sitemap()
    # Every malformed <rs:ln> variant must raise SitemapParseError.
    bad_ln_elements = [
        '<rs:ln rel="duplicate"/>',                                        # missing href
        '<rs:ln href="http://example.com/"/>',                             # missing rel
        '<rs:ln rel="duplicate" href="http://example.com/" length="a"/>',  # bad length
        '<rs:ln rel="duplicate" href="http://example.com/" pri="fff"/>',   # bad pri
        '<rs:ln rel="duplicate" href="http://example.com/" pri="0"/>',
        '<rs:ln rel="duplicate" href="http://example.com/" pri="1000000"/>',
    ]
    for ln in bad_ln_elements:
        self.assertRaises(SitemapParseError, s.parse_xml,
                          fh=io.StringIO(xmlstart + ln + xmlend))
    # and finally OK with errors fixes
    good = '<rs:ln rel="duplicate" href="http://example.com/" length="12345" pri="1" other="whatever"/>'
    rc = s.parse_xml(fh=io.StringIO(xmlstart + good + xmlend))
    self.assertEqual(len(rc.resources), 1, 'good at last, extra attribute ignored')
def sync_incremental(map, counter, base_url, from_date, to_date):
    """Run resync incremental.

    First tries a direct incremental sync; on MapperError retries with the
    parent path; on any other failure assumes base_url points at a list of
    changelists/changedumps and syncs each entry.
    """
    from .resync import ResourceSyncClient
    client = ResourceSyncClient()
    client.ignore_failures = True
    try:
        single_sync_incremental(map, counter, base_url, from_date, to_date)
        return True
    except MapperError as e:
        current_app.logger.info(e)
        paths = map[0].rsplit('/', 1)
        map[0] = paths[0]
    except Exception as e:
        # maybe url contain a list of changelist, instead of changelist
        current_app.logger.info(e)
        s = Sitemap()
        docs = s.parse_xml(url_or_file_open(base_url))
        if docs:
            for doc in docs:
                # make sure sub url is a changelist/ changedump
                capability = read_capability(doc.uri)
                # The original executed `raise ('...')`, i.e. raised a plain
                # string -- itself a TypeError on Python 3. Raise a proper
                # exception instead.
                if capability is None or capability not in ('changelist',
                                                            'changedump'):
                    raise ValueError('Bad URL, not a changelist/changedump,'
                                     ' cannot sync incremental')
                single_sync_incremental(map, counter, doc.uri,
                                        from_date, to_date)
            return True
        raise e
def parse_document(self): """Parse any ResourceSync document and show information Will use sitemap URI taken either from explicit self.sitemap_name or derived from the mappings supplied. """ s=Sitemap() self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap)) try: list = s.parse_xml(urllib.urlopen(self.sitemap)) except IOError as e: raise ClientFatalError("Cannot read document (%s)" % str(e)) num_entries = len(list.resources) capability = '(unknown capability)' if ('capability' in list.md): capability = list.md['capability'] print "Parsed %s document with %d entries" % (capability,num_entries) if (self.verbose): to_show = 100 override_str = ' (override with --max-sitemap-entries)' if (self.max_sitemap_entries): to_show = self.max_sitemap_entries override_str = '' if (num_entries>to_show): print "Showing first %d entries sorted by URI%s..." % (to_show,override_str) n=0 for resource in list: print '[%d] %s' % (n,str(resource)) n+=1 if ( n >= to_show ): break
def parse_xml(self, fh=None, etree=None, resources=None, capability=None, sitemapindex=None):
    """Parse XML Sitemap and add to resources object.

    Reads from fh or etree and adds resources to a resorces object
    (which must support the add method). Returns the resources object.

    Also sets self.resources_created to be the number of resources created.
    We adopt a very lax approach here. The parsing is properly namespace
    aware but we search just for the elements wanted and leave everything
    else alone.

    This method will read either sitemap or sitemapindex documents. Behavior
    depends on the sitemapindex parameter:
    - None - will read either
    - False - SitemapIndexError exception if sitemapindex detected
    - True - SitemapIndexError exception if sitemap detected

    Will set self.parsed_index based on whether a sitemap or sitemapindex
    document was read:
    - False - sitemap
    - True - sitemapindex
    """
    # Delegate to a fresh Sitemap instance and cache its result on
    # self.res_container so the last parsed container stays accessible.
    sitemap = Sitemap()
    self.res_container = sitemap.parse_xml(fh=fh, etree=etree, resources=resources, capability=capability, sitemapindex=sitemapindex)
    return self.res_container
def test_18_parse_with_rs_ln_on_resource(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="resourcelist"/>\
<url>\
<loc>http://example.com/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
<rs:md hash="md5:r2d2" length="12345" />\
<rs:ln rel="duplicate" href="http://mirror1.example.com/res1" modified="2013-01-02" pri="1" />\
<rs:ln rel="num2" href="http://m2.example.com/res1"/>\
<rs:ln rel="num3" href="http://m3.example.com/res1"/>\
</url>\
<url>\
<loc>http://example.com/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
<rs:md length="32" />\
</url>\
</urlset>'
    parser = Sitemap()
    parsed = parser.parse_xml(fh=io.StringIO(xml))
    self.assertFalse(parser.parsed_index, 'was a sitemap')
    self.assertEqual(parser.resources_created, 2, 'got 2 resources')
    first, second = list(parsed)[:2]
    self.assertEqual(first.uri, 'http://example.com/file_a')
    link = first.ln[0]
    self.assertEqual(link['rel'], 'duplicate')
    self.assertEqual(link['href'], 'http://mirror1.example.com/res1')
    self.assertEqual(link['modified'], '2013-01-02')
    self.assertEqual(link['pri'], 1)
    self.assertEqual(second.uri, 'http://example.com/file_b')
def test_19_parse_with_bad_rs_ln(self):
    xmlstart = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="resourcelist"/>\
<url><loc>http://example.com/file_a</loc>'
    xmlend = '</url></urlset>'
    s = Sitemap()
    # Every malformed <rs:ln> variant must raise SitemapParseError.
    bad_ln_elements = [
        '<rs:ln rel="duplicate"/>',                                        # missing href
        '<rs:ln href="http://example.com/"/>',                             # missing rel
        '<rs:ln rel="duplicate" href="http://example.com/" length="a"/>',  # bad length
        '<rs:ln rel="duplicate" href="http://example.com/" pri="fff"/>',   # bad pri
        '<rs:ln rel="duplicate" href="http://example.com/" pri="0"/>',
        '<rs:ln rel="duplicate" href="http://example.com/" pri="1000000"/>',
    ]
    for ln in bad_ln_elements:
        self.assertRaises(SitemapParseError, s.parse_xml,
                          fh=io.StringIO(xmlstart + ln + xmlend))
    # and finally OK with errors fixes
    good = '<rs:ln rel="duplicate" href="http://example.com/" length="12345" pri="1" other="whatever"/>'
    rc = s.parse_xml(fh=io.StringIO(xmlstart + good + xmlend))
    self.assertEqual(len(rc.resources), 1,
                     'good at last, extra attribute ignored')
def test_18_parse_with_rs_ln_on_resource(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="resourcelist"/>\
<url>\
<loc>http://example.com/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
<rs:md hash="md5:r2d2" length="12345" />\
<rs:ln rel="duplicate" href="http://mirror1.example.com/res1" modified="2013-01-02" pri="1" />\
<rs:ln rel="num2" href="http://m2.example.com/res1"/>\
<rs:ln rel="num3" href="http://m3.example.com/res1"/>\
</url>\
<url>\
<loc>http://example.com/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
<rs:md length="32" />\
</url>\
</urlset>'
    parser = Sitemap()
    parsed = parser.parse_xml(fh=io.StringIO(xml))
    self.assertFalse(parser.parsed_index, 'was a sitemap')
    self.assertEqual(parser.resources_created, 2, 'got 2 resources')
    first, second = list(parsed)[:2]
    self.assertEqual(first.uri, 'http://example.com/file_a')
    link = first.ln[0]
    self.assertEqual(link['rel'], 'duplicate')
    self.assertEqual(link['href'], 'http://mirror1.example.com/res1')
    self.assertEqual(link['modified'], '2013-01-02')
    self.assertEqual(link['pri'], 1)
    self.assertEqual(second.uri, 'http://example.com/file_b')
def test_20_parse_sitemapindex_empty(self):
    # An index with no <sitemap> children parses to zero resources but is
    # still flagged as an index.
    parser = Sitemap()
    empty_index = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                   ' </sitemapindex>')
    result = parser.parse_xml(fh=io.StringIO(empty_index), sitemapindex=True)
    self.assertTrue(parser.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(result.resources), 0, '0 sitemaps')
def test_22_parse_sitemapindex_file(self):
    # Read a sitemapindex from disk and verify the three member sitemaps.
    s = Sitemap()
    # Use a context manager: the original opened the file and never closed it.
    with open('tests/testdata/sitemapindex1/sitemap.xml', 'r') as fh:
        si = s.parse_xml(fh=fh, sitemapindex=True)
    self.assertTrue(s.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(si.resources), 3, '3 sitemaps')
    sms = sorted(si.uris())
    self.assertEqual(sms, ['http://localhost:8888/sitemap00000.xml',
                           'http://localhost:8888/sitemap00001.xml',
                           'http://localhost:8888/sitemap00002.xml'])
def test_21_parse_sitemapindex(self):
    parser = Sitemap()
    first_xml = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                 '<sitemap><loc>aaa</loc></sitemap>'
                 '<sitemap><loc>bbb</loc></sitemap></sitemapindex>')
    index = parser.parse_xml(fh=io.StringIO(first_xml), sitemapindex=True)
    self.assertEqual(len(index.resources), 2, '2 sitemaps')
    self.assertEqual(sorted(index.uris()), ['aaa', 'bbb'])
    # add a couple more
    second_xml = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                  '<sitemap><loc>cc</loc></sitemap>'
                  '<sitemap><loc>dd</loc></sitemap></sitemapindex>')
    parser.parse_xml(fh=io.StringIO(second_xml), resources=index)
    self.assertTrue(parser.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(index.resources), 4, '4 sitemaps total')
    self.assertEqual(sorted(index.uris()), ['aaa', 'bbb', 'cc', 'dd'])
def generator():
    # Walk the most recent sitemaps and yield every resource whose change
    # is not "deleted", together with its extracted local paths.
    for sitemap_path in self.paras.last_sitemaps:
        if not os.path.exists(sitemap_path):
            LOG.warning("Unable to read sitemap: %s" % sitemap_path)
            self.count_errors += 1
            self.observers_inform(
                self, ResourceAuditorEvent.site_map_not_found,
                file=sitemap_path)
            continue
        contents = ListBaseWithIndex()
        with open(sitemap_path, "r", encoding="utf-8") as sm_file:
            Sitemap().parse_xml(sm_file, resources=contents)
        for resource in contents.resources:
            # A change of None counts as "still present".
            if resource.change != "deleted":
                path, relpath = self.extract_paths(resource.uri)
                yield resource, path, relpath
def read_source(self):
    """
    Read the source_uri and parse it to source_document.

    Side effects on self: sets source_status, is_index, source_document,
    describedby_url, up_url, index_url and status.

    :return: True if the document was downloaded and parsed without exceptions, False otherwise.
    """
    session = requests.Session()
    try:
        response = session.get(self.source_uri)
        self.source_status = response.status_code
        self.logger.debug("Read %s, status %s" % (self.source_uri, str(self.source_status)))
        # NOTE(review): assert is stripped under `python -O`; these checks
        # rely on the AssertionError handler below firing.
        assert self.source_status == 200, "Invalid response status: %d" % self.source_status

        text = response.text
        root = ET.fromstring(text)
        # An index document has the sitemapindex root element.
        self.is_index = root.tag == SITEMAP_INDEX_ROOT
        etree = ET.ElementTree(root)
        sitemap = Sitemap()
        self.source_document = sitemap.parse_xml(etree=etree)
        # the source_document is a resync.resource_container.ResourceContainer
        capability = self.source_document.capability
        assert capability == self.capability, "Capability is not %s but %s" % (self.capability, capability)
        # anyone interested in sitemaps?
        for processor_listener in processor_listeners:
            processor_listener.event_sitemap_received(self.source_uri, capability, text)
        self.describedby_url = self.source_document.describedby
        self.up_url = self.source_document.up  # to a parent non-index document
        self.index_url = self.source_document.index  # to a parent index document
        self.status = Status.document
    # Each failure mode logs, marks read_error and reports -- the method
    # never raises; the boolean return reflects success.
    except requests.exceptions.ConnectionError as err:
        self.logger.debug("%s No connection: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except xml.etree.ElementTree.ParseError as err:
        self.logger.debug("%s ParseError: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except resync.sitemap.SitemapParseError as err:
        self.logger.debug("%s Unreadable source: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except AssertionError as err:
        self.logger.debug("%s Error: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    finally:
        # Always release the HTTP session, success or failure.
        session.close()
    return self.status == Status.document
def test_11_parse_2(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="12" /></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" /></url>\
</urlset>'
    parser = Sitemap()
    parser.parse_xml(fh=io.StringIO(xml))
    self.assertFalse(parser.parsed_index, 'was a sitemap')
    self.assertEqual(parser.resources_created, 2, 'got 2 resources')
def read_capability(url):
    """Read the ResourceSync capability of the document at an url.

    :param url: URL or file path of a sitemap-based document.
    :return: the value of the 'capability' metadata attribute,
        or None if the document carries no capability.
    :raises IOError: if the url cannot be opened (propagated unchanged;
        the original wrapped this in a pointless ``except IOError: raise e``).
    """
    document = Sitemap().parse_xml(url_or_file_open(url))
    # 'capability' lives in the document's rs:md metadata dict;
    # dict.get returns None when the key is absent, matching the
    # original's explicit membership test.
    return document.md.get('capability')
def test_11_parse_2(self):
    """Parse a urlset with two resources; expect 2 resources and no index."""
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length=\"12\" /></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length=\"32\" /></url>\
</urlset>'
    s = Sitemap()
    i = s.parse_xml(fh=io.StringIO(xml))
    # a <urlset> root must not be flagged as a sitemapindex
    self.assertFalse(s.parsed_index, 'was a sitemap')
    self.assertEqual(s.resources_created, 2, 'got 2 resources')
def test_22_parse_sitemapindex_file(self):
    """Parse a sitemapindex from a file and verify the three listed sitemaps."""
    s = Sitemap()
    # use a context manager so the test data file is closed even on
    # assertion failure (original leaked the handle)
    with open('tests/testdata/sitemapindex1/sitemap.xml', 'r') as fh:
        si = s.parse_xml(fh=fh, sitemapindex=True)
    self.assertTrue(s.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(si.resources), 3, '3 sitemaps')
    sms = sorted(si.uris())
    self.assertEqual(sms, [
        'http://localhost:8888/sitemap00000.xml',
        'http://localhost:8888/sitemap00001.xml',
        'http://localhost:8888/sitemap00002.xml'
    ])
def test_13_parse_multi_lastmod(self):
    """Two <lastmod> elements raise; empty <lastmod> parses to lastmod None."""
    xml_start='<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>uri:a</loc>'
    xml_end='</url></urlset>'
    s=Sitemap()
    # two <lastmod> elements for one <url> are ambiguous -> parse error
    two_lastmod='<lastmod>2013-01-01</lastmod><lastmod>2013-01-02</lastmod>'
    self.assertRaises( SitemapParseError, s.parse_xml, io.StringIO(xml_start+two_lastmod+xml_end))
    # While it not ideal to omit, <lastmod> is not required and
    # thus either empty lastmod or lastmod with just an attribute
    # and no content are not ambiguous and thus should be accepted
    # with resulting None for resource.lastmod
    mt_lastmod='<lastmod></lastmod>'
    i=s.parse_xml(fh=io.StringIO(xml_start+mt_lastmod+xml_end))
    self.assertEqual( s.resources_created, 1 )
    self.assertEqual( i.resources[0].lastmod, None )
    mt_lastmod_att='<lastmod att="value"/>'
    i=s.parse_xml(fh=io.StringIO(xml_start+mt_lastmod_att+xml_end))
    self.assertEqual( s.resources_created, 1 )
    self.assertEqual( i.resources[0].lastmod, None )
def test33_write(self):
    """Round-trip a ResourceList and a ResourceListIndex through disk."""
    # Plain ResourceList: the written file must NOT parse as an index.
    rl = ResourceList()
    for letter in ('a', 'b', 'c'):
        rl.add(Resource(uri='http://example.com/test/' + letter, timestamp=1))
    rl_filename = os.path.join(self.tmpdir, 'test33_write_resourcelist.xml')
    rl.write(basename=rl_filename)
    with open(rl_filename, 'r') as f:
        s = Sitemap()
        s.parse_xml(fh=f)
        self.assertFalse(s.parsed_index)
    # ResourceListIndex: same round trip, but parsed_index must be True.
    rli = ResourceList()
    for num in ('00000', '00001', '00002'):
        rli.add(Resource(uri='http://example.com/test/resourcelist%s.xml' % num,
                         timestamp=1))
    rli.sitemapindex = True
    rli_filename = os.path.join(self.tmpdir,
                                'test33_write_resourcelist-index.xml')
    rli.write(basename=rli_filename)
    with open(rli_filename, 'r') as f:
        s = Sitemap()
        s.parse_xml(fh=f)
        self.assertTrue(s.parsed_index)
def test_13_parse_multi_lastmod(self):
    """Two <lastmod> elements raise; empty <lastmod> parses to lastmod None."""
    xml_start = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>uri:a</loc>'
    xml_end = '</url></urlset>'
    s = Sitemap()
    # two <lastmod> elements for one <url> are ambiguous -> parse error
    two_lastmod = '<lastmod>2013-01-01</lastmod><lastmod>2013-01-02</lastmod>'
    self.assertRaises(SitemapParseError, s.parse_xml,
                      io.StringIO(xml_start + two_lastmod + xml_end))
    # While it not ideal to omit, <lastmod> is not required and
    # thus either empty lastmod or lastmod with just an attribute
    # and no content are not ambiguous and thus should be accepted
    # with resulting None for resource.lastmod
    mt_lastmod = '<lastmod></lastmod>'
    i = s.parse_xml(fh=io.StringIO(xml_start + mt_lastmod + xml_end))
    self.assertEqual(s.resources_created, 1)
    self.assertEqual(i.resources[0].lastmod, None)
    mt_lastmod_att = '<lastmod att="value"/>'
    i = s.parse_xml(fh=io.StringIO(xml_start + mt_lastmod_att + xml_end))
    self.assertEqual(s.resources_created, 1)
    self.assertEqual(i.resources[0].lastmod, None)
def get_from_date_from_url(url):
    """Get the smallest resource timestamp from url, formatted as a date string.

    :param url: URL or file path of a sitemap-based document.
    :return: 'YYYY-MM-DD' string for the earliest timestamp found,
        or None when no resource carries a timestamp.
    :raises IOError: if the url cannot be opened (propagated unchanged;
        the original wrapped this in a pointless ``except IOError: raise e``).
    """
    document = Sitemap().parse_xml(url_or_file_open(url))
    # collect truthy timestamps only, as the original loop did
    timestamps = [item.timestamp for item in document.resources if item.timestamp]
    if timestamps:
        return dt.fromtimestamp(min(timestamps)).strftime("%Y-%m-%d")
    # explicit None where the original fell off the end of the function
    return None
def update_previous_state(self):
    """Rebuild the uri -> resource map of previously published resources.

    Reads all resourcelist files first, then replays the changedump files
    on top of them: 'created'/'updated' entries add or replace a resource,
    'deleted' entries remove it. Also records when the resourcelists were
    completed (md_completed, falling back to md_at).

    Runs only once: a non-None self.previous_resources is left untouched.
    """
    if self.previous_resources is None:
        self.previous_resources = {}

        # search for resourcelists
        # BUG FIX: the original globbed "changedump_*.xml" here (a
        # copy-paste from the changedump section below), so the actual
        # resourcelist files were never read.
        self.resourcelist_files = sorted(
            glob(self.param.abs_metadata_path("resourcelist_*.xml")))
        for rl_file_name in self.resourcelist_files:
            resourcelist = ResourceList()
            with open(rl_file_name, "r", encoding="utf-8") as rl_file:
                sm = Sitemap()
                sm.parse_xml(rl_file, resources=resourcelist)

            # prefer md_completed; fall back to md_at when absent
            self.date_resourcelist_completed = resourcelist.md_completed
            if self.date_resourcelist_completed is None:
                self.date_resourcelist_completed = resourcelist.md_at

            self.previous_resources.update({
                resource.uri: resource
                for resource in resourcelist.resources
            })

        # search for changedumps
        self.changedump_files = sorted(
            glob(self.param.abs_metadata_path("changedump_*.xml")))
        for cl_file_name in self.changedump_files:
            changedump = ChangeDump()
            with open(cl_file_name, "r", encoding="utf-8") as cl_file:
                sm = Sitemap()
                sm.parse_xml(cl_file, resources=changedump)

            for resource in changedump.resources:
                if resource.change in ("created", "updated"):
                    self.previous_resources[resource.uri] = resource
                elif resource.change == "deleted" and resource.uri in self.previous_resources:
                    del self.previous_resources[resource.uri]
def explore_uri(self, uri, checks, caps, show_back=True):
    """Interactive exploration of document at uri

    Will flag warnings if the document is not of type listed in caps

    NOTE: legacy Python 2 code (print statement, raw_input, urllib.urlopen).

    :param uri: URI of the document to explore
    :param checks: dict of expected HTTP headers (content-length,
        last-modified) applied when uri is a plain resource
    :param caps: the string 'resource' for a plain resource, otherwise
        a list of acceptable capability names
    :param show_back: include the b(ack) option in the prompt when True
    :return: tuple (next_uri, checks, caps, input_char); the first three
        are empty strings when the user backs out or quits
    """
    s = Sitemap()
    print "Reading %s" % (uri)
    options = {}
    capability = None
    try:
        if (caps == 'resource'):
            # plain resource: just show/check the HTTP headers
            self.explore_show_head(uri, check_headers=checks)
        else:
            list = s.parse_xml(urllib.urlopen(uri))
            (options, capability) = self.explore_show_summary(
                list, s.parsed_index, caps)
    except IOError as e:
        print "Cannot read %s (%s)\nGoing back" % (uri, str(e))
        return ('', '', '', 'b')
    except Exception as e:
        print "Cannot parse %s (%s)\nGoing back" % (uri, str(e))
        return ('', '', '', 'b')
    # loop until the user picks a listed option, goes back, or quits
    while (True):
        # don't offer number option for no resources/capabilities
        num_prompt = '' if (len(options) == 0) else 'number, '
        up_prompt = 'b(ack), ' if (show_back) else ''
        inp = raw_input("Follow [%s%sq(uit)]?" % (num_prompt, up_prompt))
        if (inp in options.keys()):
            break
        if (inp == 'q' or inp == 'b'):
            return ('', '', '', inp)
    checks = {}
    if (options[inp].capability is None):
        if (capability == 'capabilitylistindex'):
            # all links should be to capabilitylist documents
            caps = ['capabilitylist']
        elif (capability in [
                'resourcelist', 'changelist', 'resourcedump', 'changedump'
        ]):
            # entries of list/dump documents are plain resources
            caps = 'resource'
    else:
        r = options[inp]
        caps = [r.capability]
        # carry known metadata forward so the next step can verify headers
        if (r.length is not None):
            checks['content-length'] = r.length
        if (r.lastmod is not None):
            checks['last-modified'] = r.lastmod
    # FIXME - could do sanity check here and issue warnings if odd
    return (options[inp].uri, checks, caps, inp)
def test_10_sitemap(self):
    """Parse a one-resource urlset; verify uri, lastmod, length and md5."""
    xml='<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>http://e.com/a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md hash="md5:Q2hlY2sgSW50ZWdyaXR5IQ==" length=\"12\" /></url>\
</urlset>'
    s=Sitemap()
    i=s.parse_xml(fh=io.StringIO(xml))
    # a <urlset> root must not be flagged as a sitemapindex
    self.assertFalse( s.parsed_index, 'was a sitemap')
    self.assertEqual( s.resources_created, 1, 'got 1 resources')
    for r in i.resources:
        self.assertTrue( r is not None, 'got the uri expected')
        self.assertEqual( r.uri, 'http://e.com/a' )
        self.assertEqual( r.lastmod, '2012-03-14T18:37:36Z' )
        self.assertEqual( r.length, 12 )
        # md5 is taken from the rs:md hash attribute, prefix stripped
        self.assertEqual( r.md5, 'Q2hlY2sgSW50ZWdyaXR5IQ==' )
def test_all_simple(self):
    """Just try to read each example file from the specification."""
    for ex in ("ex_2_1.xml", "ex_2_2.xml", "ex_2_3.xml", "ex_2_4.xml",
               "ex_2_5.xml", "ex_2_6.xml", "ex_2_7.xml",
               "ex_4_1.xml", "ex_4_2.xml", "ex_4_3.xml",
               "ex_5_1.xml", "ex_5_2.xml", "ex_5_3.xml",
               "ex_6_1.xml",
               "ex_7_1.xml", "ex_7_2.xml", "ex_7_3.xml",
               "ex_8_1.xml", "ex_8_2.xml", "ex_8_3.xml", "ex_8_4.xml",
               "ex_8_5.xml", "ex_8_6.xml", "ex_8_7.xml", "ex_8_8.xml",
               "ex_8_9.xml",
               "ex_9_1.xml", "ex_9_2.xml", "ex_9_3.xml",
               "ex_10_1.xml", "ex_10_2.xml"):
        s = Sitemap()
        # close each example file promptly; the original leaked one
        # open handle per iteration
        with open('resync/test/testdata/examples_from_spec/%s' % (ex), 'r') as fh:
            si = s.parse_xml(fh=fh)
def test_10_sitemap(self):
    """Parse a one-resource urlset; verify uri, lastmod, length and md5."""
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>http://e.com/a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md hash="md5:Q2hlY2sgSW50ZWdyaXR5IQ==" length=\"12\" /></url>\
</urlset>'
    s = Sitemap()
    i = s.parse_xml(fh=io.StringIO(xml))
    # a <urlset> root must not be flagged as a sitemapindex
    self.assertFalse(s.parsed_index, 'was a sitemap')
    self.assertEqual(s.resources_created, 1, 'got 1 resources')
    for r in i.resources:
        self.assertTrue(r is not None, 'got the uri expected')
        self.assertEqual(r.uri, 'http://e.com/a')
        self.assertEqual(r.lastmod, '2012-03-14T18:37:36Z')
        self.assertEqual(r.length, 12)
        # md5 is taken from the rs:md hash attribute, prefix stripped
        self.assertEqual(r.md5, 'Q2hlY2sgSW50ZWdyaXR5IQ==')
def explore_uri(self, uri, checks, caps, show_back=True):
    """Interactive exploration of document at uri

    Will flag warnings if the document is not of type listed in caps

    NOTE: legacy Python 2 code (print statement, raw_input, urllib.urlopen).

    :param uri: URI of the document to explore
    :param checks: dict of expected HTTP headers (content-length,
        last-modified) applied when uri is a plain resource
    :param caps: the string 'resource' for a plain resource, otherwise
        a list of acceptable capability names
    :param show_back: include the b(ack) option in the prompt when True
    :return: tuple (next_uri, checks, caps, input_char); the first three
        are empty strings when the user backs out or quits
    """
    s=Sitemap()
    print "Reading %s" % (uri)
    options={}
    capability=None
    try:
        if (caps=='resource'):
            # plain resource: just show/check the HTTP headers
            self.explore_show_head(uri,check_headers=checks)
        else:
            list = s.parse_xml(urllib.urlopen(uri))
            (options,capability)=self.explore_show_summary(list,s.parsed_index,caps)
    except IOError as e:
        print "Cannot read %s (%s)\nGoing back" % (uri,str(e))
        return('','','','b')
    except Exception as e:
        print "Cannot parse %s (%s)\nGoing back" % (uri,str(e))
        return('','','','b')
    # loop until the user picks a listed option, goes back, or quits
    while (True):
        # don't offer number option for no resources/capabilities
        num_prompt = '' if (len(options)==0) else 'number, '
        up_prompt = 'b(ack), ' if (show_back) else ''
        inp = raw_input( "Follow [%s%sq(uit)]?" % (num_prompt,up_prompt) )
        if (inp in options.keys()):
            break
        if (inp == 'q' or inp == 'b'):
            return('','','',inp)
    checks = {}
    if ( options[inp].capability is None ):
        if (capability == 'capabilitylistindex'):
            # all links should be to capabilitylist documents
            caps = ['capabilitylist']
        elif (capability in ['resourcelist','changelist',
                             'resourcedump','changedump']):
            # entries of list/dump documents are plain resources
            caps = 'resource'
    else:
        r = options[inp]
        caps = [r.capability]
        # carry known metadata forward so the next step can verify headers
        if (r.length is not None):
            checks['content-length']=r.length
        if (r.lastmod is not None):
            checks['last-modified']=r.lastmod
    # FIXME - could do sanity check here and issue warnings if odd
    return( options[inp].uri, checks, caps, inp )
def test_30_parse_change_list(self):
    """Change attribute is read per resource; missing change yields None."""
    xml='<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md change="updated" length="12" /></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" /></url>\
</urlset>'
    s=Sitemap()
    s.resource_class=Resource
    c=s.parse_xml(fh=io.StringIO(xml))
    self.assertEqual( s.resources_created, 2, 'got 2 resources')
    i = iter(c)
    r1 = next(i)
    self.assertEqual( r1.uri, '/tmp/rs_test/src/file_a' )
    # first entry carries change="updated" in its rs:md
    self.assertEqual( r1.change, 'updated' )
    r2 = next(i)
    self.assertEqual( r2.uri, '/tmp/rs_test/src/file_b' )
    # second entry has no change attribute -> None
    self.assertEqual( r2.change, None )
def test_30_parse_change_list(self):
    """Change attribute is read per resource; missing change yields None."""
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md change="updated" length="12" /></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" /></url>\
</urlset>'
    s = Sitemap()
    s.resource_class = Resource
    c = s.parse_xml(fh=io.StringIO(xml))
    self.assertEqual(s.resources_created, 2, 'got 2 resources')
    i = iter(c)
    r1 = next(i)
    self.assertEqual(r1.uri, '/tmp/rs_test/src/file_a')
    # first entry carries change="updated" in its rs:md
    self.assertEqual(r1.change, 'updated')
    r2 = next(i)
    self.assertEqual(r2.uri, '/tmp/rs_test/src/file_b')
    # second entry has no change attribute -> None
    self.assertEqual(r2.change, None)
def test_all_simple_read(self):
    """Just try to read each one"""
    for ex in ('archives_ex_2_1', 'archives_ex_2_2',
               'archives_ex_3_1', 'archives_ex_3_2',
               'archives_ex_4_1',
               'archives_ex_5_1',
               'archives_ex_6_1',
               'resourcesync_ex_1', 'resourcesync_ex_2', 'resourcesync_ex_3',
               'resourcesync_ex_4', 'resourcesync_ex_5', 'resourcesync_ex_6',
               'resourcesync_ex_7', 'resourcesync_ex_8', 'resourcesync_ex_12',
               'resourcesync_ex_13', 'resourcesync_ex_14', 'resourcesync_ex_15',
               'resourcesync_ex_16', 'resourcesync_ex_17', 'resourcesync_ex_18',
               'resourcesync_ex_19', 'resourcesync_ex_20', 'resourcesync_ex_21',
               'resourcesync_ex_22', 'resourcesync_ex_23', 'resourcesync_ex_24',
               'resourcesync_ex_25', 'resourcesync_ex_26', 'resourcesync_ex_27',
               'resourcesync_ex_28', 'resourcesync_ex_29', 'resourcesync_ex_30',
               'resourcesync_ex_31', 'resourcesync_ex_32', 'resourcesync_ex_33'):
        s = Sitemap()
        # _open_ex returns an open file handle; close it after parsing
        # instead of leaking one handle per example (original never closed)
        fh = self._open_ex(ex)
        try:
            si = s.parse_xml(fh=fh)
        finally:
            fh.close()
def test_21_parse_multi_sitemapindex(self):
    """Parse a sitemapindex file and count the listed sitemaps."""
    s = Sitemap()
    # use a context manager so the test data file is closed even on
    # assertion failure (original leaked the handle)
    with open('tests/testdata/sitemapindex2/sitemap.xml', 'r') as fh:
        si = s.parse_xml(fh=fh, sitemapindex=True)
    self.assertTrue(s.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(si.resources), 3, '3 sitemaps listed')
def read_source(self):
    """
    Read the source_uri and parse it to source_document.

    Downloads the document at self.source_uri, parses it as a sitemap or
    sitemapindex, and records the capability and the describedby/up/index
    links on self. Any failure (connection error, XML or sitemap parse
    error, unexpected HTTP status or capability) sets Status.read_error
    and is routed to self.__report__.

    :return: True if the document was downloaded and parsed without
        exceptions, False otherwise.
    """
    session = requests.Session()
    try:
        response = session.get(self.source_uri)
        self.source_status = response.status_code
        self.logger.debug("Read %s, status %s"
                          % (self.source_uri, str(self.source_status)))
        # NOTE(review): assert is stripped under `python -O`; the
        # AssertionError handler below relies on these firing in normal runs.
        assert self.source_status == 200, \
            "Invalid response status: %d" % self.source_status

        text = response.text

        root = ET.fromstring(text)
        # root tag tells index vs plain document before full sitemap parsing
        self.is_index = root.tag == SITEMAP_INDEX_ROOT

        etree = ET.ElementTree(root)
        sitemap = Sitemap()
        self.source_document = sitemap.parse_xml(etree=etree)
        # the source_document is a resync.resource_container.ResourceContainer
        capability = self.source_document.capability
        assert capability == self.capability, \
            "Capability is not %s but %s" % (self.capability, capability)
        # anyone interested in sitemaps?
        for processor_listener in processor_listeners:
            processor_listener.event_sitemap_received(
                self.source_uri, capability, text)

        self.describedby_url = self.source_document.describedby
        self.up_url = self.source_document.up  # to a parent non-index document
        self.index_url = self.source_document.index  # to a parent index document
        self.status = Status.document
    except requests.exceptions.ConnectionError as err:
        self.logger.debug("%s No connection: %s"
                          % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except xml.etree.ElementTree.ParseError as err:
        self.logger.debug("%s ParseError: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except resync.sitemap.SitemapParseError as err:
        self.logger.debug("%s Unreadable source: %s"
                          % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except AssertionError as err:
        self.logger.debug("%s Error: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    finally:
        # always release the HTTP session, success or failure
        session.close()

    return self.status == Status.document
def read_sitemap(self, path, sitemap_instance):
    """Parse the sitemap file at *path* into *sitemap_instance*.

    :param path: path of the sitemap xml file to read
    :param sitemap_instance: resource container to fill with parsed resources
    :return: the filled sitemap_instance
    """
    parser = Sitemap()
    with open(path, "r", encoding="utf-8") as source:
        parser.parse_xml(source, resources=sitemap_instance)
    return sitemap_instance
def base_line(self, unzipdir):
    """
    Synchronize the unzipped contents of a resource dump with the local resources

    NOTE(review): this method deliberately raises NotImplementedError after
    computing the compare; everything after that raise is unreachable sketch
    code kept for future implementation.

    :param unzipdir: the directory of the unzipped packed contents.
    :return:
    """
    manifest_file_name = os.path.join(unzipdir, "manifest.xml")
    try:
        sitemap = Sitemap()
        manifest_doc = sitemap.parse_xml(fh=manifest_file_name)
        # the manifest_doc is a resync.resource_container.ResourceContainer
        capability = manifest_doc.capability
        assert capability == CAPA_RESOURCEDUMP_MANIFEST, "Capability is not %s but %s" % (
            CAPA_RESOURCEDUMP_MANIFEST, capability)
        self.status = Status.parsed
        self.__inform_sitemap_received__(capability, manifest_file_name)

        config = Config()
        netloc = config.boolean_prop(Config.key_use_netloc, False)
        # map the pack uri to a local destination folder
        base_uri, destination = DestinationMap().find_destination(
            self.pack_uri, netloc=netloc)
        assert destination is not None, "Found no destination folder in DestinationMap"
        mapper = Mapper((base_uri, destination))
        rlb = ResourceListBuilder(mapper=mapper)
        dst_resource_list = rlb.from_disk()
        # Compares on uri
        same, updated, deleted, created = dst_resource_list.compare(
            manifest_doc)
        raise NotImplementedError("This class is not fully implemented.")
        # --- unreachable below this line: kept as implementation sketch ---
        print(len(same), len(updated), len(deleted), len(created))
        print("same")
        for resource in same:
            print(resource)
        print("updated")
        for resource in updated:
            print(resource)
        print("deleted")
        for resource in deleted:
            print(resource)
        print("created")
        for resource in created:
            print(resource)
            base_uri, local_path = DestinationMap().find_local_path(
                resource.uri)
            print(base_uri, local_path)
    except AssertionError as err:
        self.logger.debug("%s Error: %s" % (self.pack_uri, str(err)))
        self.status = Status.parse_error
        self.exceptions.append(err)
    except SitemapParseError as err:
        # NOTE(review): logs self.source_uri here while the handler above
        # logs self.pack_uri — probably should be pack_uri; confirm before
        # changing.
        self.logger.debug("%s Unreadable source: %s" %
                          (self.source_uri, str(err)))
        self.status = Status.parse_error
        self.exceptions.append(err)

    self.status = Status.processed_with_exceptions if self.has_exceptions(
    ) else Status.processed
def test_20_parse_sitemapindex_empty(self):
    """An empty <sitemapindex> parses as an index with zero sitemaps."""
    parser = Sitemap()
    index_doc = parser.parse_xml(
        fh=io.StringIO('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> </sitemapindex>'),
        sitemapindex=True)
    self.assertTrue(parser.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(index_doc.resources), 0, '0 sitemaps')
def test_21_parse_multi_sitemapindex(self):
    """Parse a sitemapindex file and count the listed sitemaps."""
    s = Sitemap()
    # use a context manager so the test data file is closed even on
    # assertion failure (original leaked the handle)
    with open('tests/testdata/sitemapindex2/sitemap.xml', 'r') as fh:
        si = s.parse_xml(fh=fh, sitemapindex=True)
    self.assertTrue(s.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(si.resources), 3, '3 sitemaps listed')
def explore_uri(self, explorer_resource, show_back=True): """INTERACTIVE exploration of capabilities document(s) starting at a given URI Will flag warnings if the document is not of type listed in caps """ uri = explorer_resource.uri caps = explorer_resource.acceptable_capabilities checks = explorer_resource.checks print "Reading %s" % (uri) options={} capability=None try: if (caps=='resource'): # Not expecting a capability document self.explore_show_head(uri,check_headers=checks) else: s=Sitemap() list = s.parse_xml(urllib.urlopen(uri)) (options,capability)=self.explore_show_summary(list,s.parsed_index,caps,context=uri) except IOError as e: print "Cannot read %s (%s)" % (uri,str(e)) except Exception as e: print "Cannot parse %s (%s)" % (uri,str(e)) # # Loop until we have some valide input # while (True): # don't offer number option for no resources/capabilities num_prompt = '' if (len(options)==0) else 'number, ' up_prompt = 'b(ack), ' if (show_back) else '' input = raw_input( "Follow [%s%sq(uit)]?" % (num_prompt,up_prompt) ) if (input in options.keys()): break if (input == 'q'): raise ExplorerQuit() if (input == 'b'): return(None) # # Got input that is one of the options # checks = {} r = options[input] if ( r.capability is None ): if (capability in ['resourcelist','changelist', 'resourcedump','changedump']): caps = 'resource' else: caps = self.allowed_entries(capability) elif (r.capability is 'resource'): caps = r.capability else: caps = [r.capability] # Record anything we know about the resource to check if (r.length is not None): checks['content-length']=r.length if (r.lastmod is not None): checks['last-modified']=r.lastmod if (r.mime_type is not None): checks['content-type']=r.mime_type # FIXME - could add fixity checks here too return( XResource(options[input].uri, caps, checks) )