def explore_uri(self, uri, caps):
    """Interactive exploration of document at uri

    Will flag warnings if the document is not of a type listed in caps.
    """
    s = Sitemap()
    print "Reading %s" % (uri)
    try:
        list = s.parse_xml(urllib.urlopen(uri))
    except IOError as e:
        raise ClientFatalError("Cannot read %s (%s)" % (uri, str(e)))
    num_entries = len(list.resources)
    capability = '(unknown capability)'
    if ('capability' in list.md):
        capability = list.md['capability']
    if (s.parsed_index):
        capability += 'index'
    print "Parsed %s document with %d entries:" % (capability, num_entries)
    if (caps is not None and capability not in caps):
        print "WARNING - expected a %s document" % (','.join(caps))
    to_show = num_entries
    if (num_entries > 21):
        to_show = 20
    # What entries are allowed?
    # FIXME - not complete
    entry_caps = []
    if (capability == 'capabilitylistindex'):
        entry_caps = ['capabilitylist']
    elif (capability == 'capabilitylist'):
        entry_caps = ['resourcelist', 'changelist', 'resourcedump',
                      'changedump', 'changelistindex']
    elif (capability == 'changelistindex'):
        entry_caps = ['changelist']
    n = 0
    options = {}
    for r in list.resources:
        if (n >= to_show):
            print "(not showing remaining %d entries)" % (num_entries - n)
            break
        n += 1
        options[str(n)] = r
        print "[%d] %s" % (n, r.uri)
        if (r.capability is not None):
            warning = ''
            if (r.capability not in entry_caps):
                warning = " (EXPECTED %s)" % (' or '.join(entry_caps))
            print "  %s%s" % (r.capability, warning)
        elif (len(entry_caps) == 1):
            r.capability = entry_caps[0]
            print "  capability not specified, should be %s" % (r.capability)
    while (True):
        inp = raw_input("Follow [number or q(uit)]?")
        if (inp in options.keys()):
            break
        if (inp == 'q'):
            return ('', '', inp)
    caps = [options[inp].capability]
    if (capability == 'capabilitylistindex'):
        # all links should be to capabilitylist documents
        if (caps is None):
            caps = ['capabilitylist']
    return (options[inp].uri, caps, inp)
def sync_incremental(map, counter, base_url, from_date, to_date):
    """Run resync incremental."""
    # init_logging(verbose=True)
    from .resync import ResourceSyncClient
    client = ResourceSyncClient()
    client.ignore_failures = True
    try:
        single_sync_incremental(map, counter, base_url, from_date, to_date)
        return True
    except MapperError as e:
        current_app.logger.info(e)
        paths = map[0].rsplit('/', 1)
        map[0] = paths[0]
    except Exception as e:
        # maybe the url points to a list of changelists instead of a single changelist
        current_app.logger.info(e)
        s = Sitemap()
        try:
            docs = s.parse_xml(url_or_file_open(base_url))
        except IOError as ioerror:
            raise ioerror
        if docs:
            for doc in docs:
                # make sure each sub url is a changelist / changedump
                capability = read_capability(doc.uri)
                if capability is None:
                    raise ValueError('Bad URL, not a changelist/changedump,'
                                     ' cannot sync incremental')
                if capability != 'changelist' and capability != 'changedump':
                    raise ValueError('Bad URL, not a changelist/changedump,'
                                     ' cannot sync incremental')
                single_sync_incremental(map, counter, doc.uri, from_date, to_date)
            return True
        raise e
def read_reference_sitemap(self, ref_sitemap, name="reference"): """Read reference sitemap and return the inventory name parameter just uses in output messages to say what type of sitemap is being read. """ sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper) self.logger.info("Reading %s sitemap(s) from %s ..." % (name, ref_sitemap)) i = sitemap.read(ref_sitemap) num_entries = len(i) self.logger.warning( "Read %s sitemap with %d entries in %d sitemaps" % (name, num_entries, sitemap.sitemaps_created) ) if self.verbose: to_show = 100 override_str = " (override with --max-sitemap-entries)" if self.max_sitemap_entries: to_show = self.max_sitemap_entries override_str = "" if num_entries > to_show: print "Showing first %d entries sorted by URI%s..." % (to_show, override_str) n = 0 for r in i: print r n += 1 if n >= to_show: break return i
def parse_document(self):
    """Parse any ResourceSync document and show information

    Will use sitemap URI taken either from explicit self.sitemap_name or
    derived from the mappings supplied.
    """
    s = Sitemap()
    self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
    try:
        list = s.parse_xml(urllib.urlopen(self.sitemap))
    except IOError as e:
        raise ClientFatalError("Cannot read document (%s)" % str(e))
    num_entries = len(list.resources)
    capability = '(unknown capability)'
    if ('capability' in list.md):
        capability = list.md['capability']
    print "Parsed %s document with %d entries" % (capability, num_entries)
    if (self.verbose):
        to_show = 100
        override_str = ' (override with --max-sitemap-entries)'
        if (self.max_sitemap_entries):
            to_show = self.max_sitemap_entries
            override_str = ''
        if (num_entries > to_show):
            print "Showing first %d entries sorted by URI%s..." % (to_show, override_str)
        n = 0
        for resource in list:
            print '[%d] %s' % (n, str(resource))
            n += 1
            if (n >= to_show):
                break
def read_reference_sitemap(self, ref_sitemap, name='reference'):
    """Read reference sitemap and return the inventory

    The name parameter is used only in output messages to say what type
    of sitemap is being read.
    """
    sitemap = Sitemap(allow_multifile=self.allow_multifile, mapper=self.mapper)
    self.logger.info("Reading %s sitemap(s) from %s ..." % (name, ref_sitemap))
    i = sitemap.read(ref_sitemap)
    num_entries = len(i)
    self.logger.warning("Read %s sitemap with %d entries in %d sitemaps" %
                        (name, num_entries, sitemap.sitemaps_created))
    if (self.verbose):
        to_show = 100
        override_str = ' (override with --max-sitemap-entries)'
        if (self.max_sitemap_entries):
            to_show = self.max_sitemap_entries
            override_str = ''
        if (num_entries > to_show):
            print "Showing first %d entries sorted by URI%s..." % (
                to_show, override_str)
        n = 0
        for r in i:
            print r
            n += 1
            if (n >= to_show):
                break
    return (i)
def test2_pretty_output(self):
    ib = InventoryBuilder()
    ib.mapper = Mapper(['http://example.org/t', 'resync/test/testdata/dir1'])
    i = ib.from_disk()
    s = Sitemap()
    s.pretty_xml = True
    self.assertEqual(
        s.inventory_as_xml(i),
        '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1">\n<url><loc>http://example.org/t/file_a</loc><lastmod>2012-03-14T17:46:04</lastmod><rs:size>20</rs:size></url>\n<url><loc>http://example.org/t/file_b</loc><lastmod>2012-03-14T17:46:25</lastmod><rs:size>45</rs:size></url>\n</urlset>')
def publish(self):
    """
    Try and publish, or remove the zip end if something went wrong.

    :return: ( boolean indicating if change in sink directory or subdirectories,
        amount of resources definitively packaged,
        the difference of resources provisionally packaged)
    """
    if not os.path.isdir(self.resource_dir):
        os.makedirs(self.resource_dir)
        # print "Created %s" % self.resource_dir
    if not os.path.isdir(self.publish_dir):
        os.makedirs(self.publish_dir)
        # print "Created %s" % self.publish_dir
    try:
        return self.do_publish()
    except:
        # Something went wrong. Best we can do is clean up end of zip chain.
        zip_end_files = glob(
            os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
        for ze_file in zip_end_files:
            os.remove(ze_file)
            print "error recovery: removed %s" % ze_file
        zip_end_xmls = glob(
            os.path.join(self.publish_dir, PREFIX_END_PART + "*.xml"))
        for ze_xml in zip_end_xmls:
            os.remove(ze_xml)
            print "error recovery: removed %s" % ze_xml
        zip_end_manis = glob(
            os.path.join(self.publish_dir,
                         PREFIX_MANIFEST + PREFIX_END_PART + "*.xml"))
        for ze_mani in zip_end_manis:
            os.remove(ze_mani)
            print "error recovery: removed %s" % ze_mani
        # remove zip-end entries from resource-dump.xml
        rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
        rs_dump = ResourceDump()
        if os.path.isfile(rs_dump_path):
            with open(rs_dump_path, "r") as rs_dump_file:
                sm = Sitemap()
                sm.parse_xml(rs_dump_file, resources=rs_dump)
            prefix = self.publish_url + PREFIX_END_PART
            for uri in rs_dump.resources.keys():
                if uri.startswith(prefix):
                    del rs_dump.resources[uri]
                    print "error recovery: removed %s from %s" % (uri, rs_dump_path)
            with open(rs_dump_path, "w") as rs_dump_file:
                rs_dump_file.write(rs_dump.as_xml())
        print "error recovery: walk through error recovery completed. Now raising ..."
        raise
def test_19_parse_with_bad_rs_ln(self):
    xmlstart = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="resourcelist"/>\
<url><loc>http://example.com/file_a</loc>'
    xmlend = '</url></urlset>'
    s = Sitemap()
    #
    # missing href
    xml = xmlstart + '<rs:ln rel="duplicate"/>' + xmlend
    self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
    # missing rel
    xml = xmlstart + '<rs:ln href="http://example.com/"/>' + xmlend
    self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
    # bad length
    xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" length="a"/>' + xmlend
    self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
    # bad pri
    xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" pri="fff"/>' + xmlend
    self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
    xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" pri="0"/>' + xmlend
    self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
    xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" pri="1000000"/>' + xmlend
    self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
    # and finally OK with errors fixed
    xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" length="12345" pri="1" other="whatever"/>' + xmlend
    rc = s.parse_xml(fh=io.StringIO(xml))
    self.assertEqual(len(rc.resources), 1, 'good at last, extra attribute ignored')
def test_18_parse_with_rs_ln_on_resource(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability=\"resourcelist\"/>\
<url>\
<loc>http://example.com/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
<rs:md hash=\"md5:r2d2\" length=\"12345\" />\
<rs:ln rel=\"duplicate\" href=\"http://mirror1.example.com/res1\" modified=\"2013-01-02\" pri=\"1\" />\
<rs:ln rel=\"num2\" href=\"http://m2.example.com/res1\"/>\
<rs:ln rel=\"num3\" href=\"http://m3.example.com/res1\"/>\
</url>\
<url>\
<loc>http://example.com/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
<rs:md length=\"32\" />\
</url>\
</urlset>'
    s = Sitemap()
    rc = s.parse_xml(fh=io.StringIO(xml))
    self.assertFalse(s.parsed_index, 'was a sitemap')
    self.assertEqual(s.resources_created, 2, 'got 2 resources')
    i = iter(rc)
    r1 = next(i)
    r2 = next(i)
    self.assertEqual(r1.uri, 'http://example.com/file_a')
    self.assertEqual(r1.ln[0]['rel'], 'duplicate')
    self.assertEqual(r1.ln[0]['href'], 'http://mirror1.example.com/res1')
    self.assertEqual(r1.ln[0]['modified'], '2013-01-02')
    self.assertEqual(r1.ln[0]['pri'], 1)
    self.assertEqual(r2.uri, 'http://example.com/file_b')
def parse_document(self):
    """Parse any ResourceSync document and show information

    Will use sitemap URI taken either from explicit self.sitemap_name or
    derived from the mappings supplied.
    """
    s = Sitemap()
    self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
    try:
        list = s.parse_xml(urllib.urlopen(self.sitemap))
    except IOError as e:
        raise ClientFatalError("Cannot read document (%s)" % str(e))
    num_entries = len(list.resources)
    capability = '(unknown capability)'
    if ('capability' in list.md):
        capability = list.md['capability']
    print "Parsed %s document with %d entries" % (capability, num_entries)
    if (self.verbose):
        to_show = 100
        override_str = ' (override with --max-sitemap-entries)'
        if (self.max_sitemap_entries):
            to_show = self.max_sitemap_entries
            override_str = ''
        if (num_entries > to_show):
            print "Showing first %d entries sorted by URI%s..." % (
                to_show, override_str)
        n = 0
        for resource in list:
            print '[%d] %s' % (n, str(resource))
            n += 1
            if (n >= to_show):
                break
def test_19_parse_with_bad_rs_ln(self):
    xmlstart = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability="resourcelist"/>\
<url><loc>http://example.com/file_a</loc>'
    xmlend = '</url></urlset>'
    s = Sitemap()
    #
    # missing href
    xml = xmlstart + '<rs:ln rel="duplicate"/>' + xmlend
    self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
    # missing rel
    xml = xmlstart + '<rs:ln href="http://example.com/"/>' + xmlend
    self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
    # bad length
    xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" length="a"/>' + xmlend
    self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
    # bad pri
    xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" pri="fff"/>' + xmlend
    self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
    xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" pri="0"/>' + xmlend
    self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
    xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" pri="1000000"/>' + xmlend
    self.assertRaises(SitemapParseError, s.parse_xml, fh=io.StringIO(xml))
    # and finally OK with errors fixed
    xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" length="12345" pri="1" other="whatever"/>' + xmlend
    rc = s.parse_xml(fh=io.StringIO(xml))
    self.assertEqual(len(rc.resources), 1, 'good at last, extra attribute ignored')
def get_state_published(self):
    """
    See if publish_dir has a zip end file. If so, return the path of the
    zip end file and the resourcelist (with local paths) of resources
    published in the zip end file.

    :return:
        - the path to the zip end file or None if there is no zip end file.
        - the resourcelist of resources published in the zip end file or an
          empty list if there is no zip end file.
    """
    path_zip_end_old = None
    rl_end_old = ResourceList()
    zip_end_files = glob(os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
    if len(zip_end_files) > 1:
        raise RuntimeError(
            "Found more than one %s*.zip files. Inconsistent structure of %s."
            % (PREFIX_END_PART, self.publish_dir)
        )
    elif len(zip_end_files) == 1:
        path_zip_end_old = zip_end_files[0]
    if path_zip_end_old:
        rl_file = open(os.path.splitext(path_zip_end_old)[0] + ".xml", "r")
        sm = Sitemap()
        sm.parse_xml(rl_file, resources=rl_end_old)
        rl_file.close()
    return path_zip_end_old, rl_end_old
def test2_pretty_output(self):
    ib = InventoryBuilder()
    ib.mapper = Mapper(['http://example.org/t', 'resync/test/testdata/dir1'])
    i = ib.from_disk()
    s = Sitemap()
    s.pretty_xml = True
    self.assertEqual(
        s.resources_as_xml(i),
        '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\n<url><loc>http://example.org/t/file_a</loc><lastmod>2012-07-25T17:13:46Z</lastmod><rs:size>20</rs:size></url>\n<url><loc>http://example.org/t/file_b</loc><lastmod>2001-09-09T01:46:40Z</lastmod><rs:size>45</rs:size></url>\n</urlset>')
def parse_xml(self, fh=None, etree=None, resources=None, capability=None,
              sitemapindex=None):
    """Parse XML Sitemap and add to resources object.

    Reads from fh or etree and adds resources to a resources object
    (which must support the add method). Returns the resources object.

    Also sets self.resources_created to be the number of resources created.
    We adopt a very lax approach here. The parsing is properly namespace
    aware but we search just for the elements wanted and leave everything
    else alone.

    This method will read either sitemap or sitemapindex documents. Behavior
    depends on the sitemapindex parameter:
    - None - will read either
    - False - SitemapIndexError exception if sitemapindex detected
    - True - SitemapIndexError exception if sitemap detected

    Will set self.parsed_index based on whether a sitemap or sitemapindex
    document was read:
    - False - sitemap
    - True - sitemapindex
    """
    sitemap = Sitemap()
    self.res_container = sitemap.parse_xml(fh=fh, etree=etree,
                                           resources=resources,
                                           capability=capability,
                                           sitemapindex=sitemapindex)
    return self.res_container
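A minimal usage sketch (not part of the original code) of the Sitemap.parse_xml API that the wrapper above delegates to; the XML string and variable names are illustrative assumptions.

# Hypothetical example, not from the original source: parse a small
# resourcelist held in memory and inspect what was read.
import io
from resync.sitemap import Sitemap

example_xml = (
    "<?xml version='1.0' encoding='UTF-8'?>"
    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
    'xmlns:rs="http://www.openarchives.org/rs/terms/">'
    '<rs:md capability="resourcelist"/>'
    '<url><loc>http://example.com/res1</loc></url>'
    '</urlset>'
)
s = Sitemap()
resource_list = s.parse_xml(fh=io.StringIO(example_xml))
print(s.parsed_index)                  # False - a sitemap, not a sitemapindex
print(resource_list.md['capability'])  # 'resourcelist'
print(len(resource_list.resources))    # 1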
def get_state_published(self):
    """
    See if publish_dir has a zip end file. If so, return the path of the
    zip end file and the resourcelist (with local paths) of resources
    published in the zip end file.

    :return:
        - the path to the zip end file or None if there is no zip end file.
        - the resourcelist of resources published in the zip end file or an
          empty list if there is no zip end file.
    """
    path_zip_end_old = None
    rl_end_old = ResourceList()
    zip_end_files = glob(
        os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
    if len(zip_end_files) > 1:
        raise RuntimeError(
            "Found more than one %s*.zip files. Inconsistent structure of %s."
            % (PREFIX_END_PART, self.publish_dir))
    elif len(zip_end_files) == 1:
        path_zip_end_old = zip_end_files[0]
    if path_zip_end_old:
        rl_file = open(os.path.splitext(path_zip_end_old)[0] + ".xml", "r")
        sm = Sitemap()
        sm.parse_xml(rl_file, resources=rl_end_old)
        rl_file.close()
    return path_zip_end_old, rl_end_old
def test_20_parse_sitemapindex_empty(self):
    s = Sitemap()
    si = s.sitemapindex_parse_xml(fh=StringIO.StringIO(
        '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> </sitemapindex>'))
    self.assertEqual(s.sitemaps_created, 0, "0 sitemaps in sitemapindex")
    self.assertEqual(len(si.resources), 0, "0 sitemaps")
def publish_metadata(self, new_zips, exluded_zip=None):
    """
    (Re)publish metadata with addition of new_zips. An excluded zip will be
    removed from previously published metadata.

    :param new_zips: a resourcelist with newly created zip resources
    :param exluded_zip: local path to zip file that will be removed from
        previously published metadata.
    """
    rs_dump_url = self.publish_url + RS_RESOURCE_DUMP_XML
    rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
    capa_list_url = self.publish_url + RS_CAPABILITY_LIST_XML
    capa_list_path = os.path.join(self.publish_dir, RS_CAPABILITY_LIST_XML)

    rs_dump = ResourceDump()

    # Load existing resource-dump, if any. Else set start time.
    if os.path.isfile(rs_dump_path):
        with open(rs_dump_path, "r") as rs_dump_file:
            sm = Sitemap()
            sm.parse_xml(rs_dump_file, resources=rs_dump)
    else:
        rs_dump.md_at = w3cdt.datetime_to_str(no_fractions=True)
        rs_dump.link_set(rel="up", href=capa_list_url)

    # Remove excluded zip, if any
    if exluded_zip:
        loc = self.publish_url + os.path.basename(exluded_zip)
        if loc in rs_dump.resources:
            del rs_dump.resources[loc]
        else:
            raise RuntimeError("Could not find %s in %s" % (loc, rs_dump_path))

    # Add new zips
    for resource in new_zips:
        rs_dump.add(resource)

    # Write resource-dump.xml
    rs_dump.md_completed = w3cdt.datetime_to_str(no_fractions=True)
    with open(rs_dump_path, "w") as rs_dump_file:
        rs_dump_file.write(rs_dump.as_xml())

    # There are several ways to decode base64, among them
    # iri = base64.b64decode(os.path.basename(self.publish_dir)).rstrip('\n')
    # iri = base64.b64decode(os.path.basename(self.publish_dir), '-_').rstrip('\n')
    iri = base64.urlsafe_b64decode(
        os.path.basename(self.publish_dir)).rstrip('\n')
    print "New %s for graph %s" % (RS_RESOURCE_DUMP_XML, iri)
    print "See %s" % rs_dump_url

    # Write capability-list.xml
    if not os.path.isfile(capa_list_path):
        capa_list = CapabilityList()
        capa_list.link_set(rel="up", href=self.src_desc_url)
        capa_list.add_capability(rs_dump, rs_dump_url)
        with open(capa_list_path, "w") as capa_list_file:
            capa_list_file.write(capa_list.as_xml())
        print "New %s. See %s" % (RS_CAPABILITY_LIST_XML, capa_list_url)
def test_18_parse_with_rs_ln_on_resource(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<rs:md capability=\"resourcelist\"/>\
<url>\
<loc>http://example.com/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
<rs:md hash=\"md5:r2d2\" length=\"12345\" />\
<rs:ln rel=\"duplicate\" href=\"http://mirror1.example.com/res1\" modified=\"2013-01-02\" pri=\"1\" />\
<rs:ln rel=\"num2\" href=\"http://m2.example.com/res1\"/>\
<rs:ln rel=\"num3\" href=\"http://m3.example.com/res1\"/>\
</url>\
<url>\
<loc>http://example.com/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod>\
<rs:md length=\"32\" />\
</url>\
</urlset>'
    s = Sitemap()
    rc = s.parse_xml(fh=io.StringIO(xml))
    self.assertFalse(s.parsed_index, 'was a sitemap')
    self.assertEqual(s.resources_created, 2, 'got 2 resources')
    i = iter(rc)
    r1 = next(i)
    r2 = next(i)
    self.assertEqual(r1.uri, 'http://example.com/file_a')
    self.assertEqual(r1.ln[0]['rel'], 'duplicate')
    self.assertEqual(r1.ln[0]['href'], 'http://mirror1.example.com/res1')
    self.assertEqual(r1.ln[0]['modified'], '2013-01-02')
    self.assertEqual(r1.ln[0]['pri'], 1)
    self.assertEqual(r2.uri, 'http://example.com/file_b')
def write_static_inventory(self):
    """Writes the inventory to the filesystem"""
    # Generate sitemap in temp directory
    then = time.time()
    self.ensure_temp_dir(Source.TEMP_FILE_PATH)
    inventory = self.generate()
    basename = Source.TEMP_FILE_PATH + "/sitemap.xml"
    s = Sitemap()
    s.max_sitemap_entries = self.config['max_sitemap_entries']
    s.mapper = Mapper([self.source.base_uri, Source.TEMP_FILE_PATH])
    s.write(inventory, basename)
    # Delete old sitemap files; move the new ones; delete the temp dir
    self.rm_sitemap_files(Source.STATIC_FILE_PATH)
    self.mv_sitemap_files(Source.TEMP_FILE_PATH, Source.STATIC_FILE_PATH)
    shutil.rmtree(Source.TEMP_FILE_PATH)
    now = time.time()
    # Log sitemap write end event
    sitemap_size = self.compute_sitemap_size(Source.STATIC_FILE_PATH)
    log_data = {'time': (now - then),
                'no_resources': self.source.resource_count}
    self.logger.info("Wrote static sitemap inventory. %s" % log_data)
    sm_write_end = ResourceChange(
        resource=ResourceChange(self.uri, size=sitemap_size, timestamp=then),
        changetype="UPDATED")
    self.source.notify_observers(sm_write_end)
def read_sitemap(self, path, sitemap=None):
    if sitemap is None:
        sitemap = ListBaseWithIndex()
    with open(path, "r", encoding="utf-8") as file:
        sm = Sitemap()
        sm.parse_xml(file, resources=sitemap)
    return sitemap
def test3_with_md5(self):
    ib = InventoryBuilder(do_md5=True)
    i = ib.from_disk('resync/test/testdata/dir1', 'http://example.org/t')
    s = Sitemap()
    xml = s.inventory_as_xml(i)
    self.assertNotEqual(
        None,
        re.search('<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+</lastmod><rs:size>20</rs:size><rs:md5>6bf26fd66601b528d2e0b47eaa87edfd</rs:md5>', xml),
        'size/checksum for file_a')
    self.assertNotEqual(
        None,
        re.search('<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+</lastmod><rs:size>45</rs:size><rs:md5>452e54bdae1626ac5d6e7be81b39de21</rs:md5>', xml),
        'size/checksum for file_b')
def test_20_parse_sitemapindex_empty(self):
    s = Sitemap()
    si = s.parse_xml(fh=io.StringIO(
        '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> </sitemapindex>'
    ), sitemapindex=True)
    self.assertTrue(s.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(si.resources), 0, '0 sitemaps')
def test3_with_md5(self):
    ib = InventoryBuilder(do_md5=True)
    ib.mapper = Mapper(['http://example.org/t', 'resync/test/testdata/dir1'])
    i = ib.from_disk()
    s = Sitemap()
    xml = s.resources_as_xml(i)
    # must escape + in md5
    self.assertNotEqual(
        None,
        re.search('<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>20</rs:size><rs:fixity type="md5">a/Jv1mYBtSjS4LR\+qoft/Q==</rs:fixity>', xml))
    self.assertNotEqual(
        None,
        re.search('<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>45</rs:size><rs:fixity type="md5">RS5Uva4WJqxdbnvoGzneIQ==</rs:fixity>', xml))
def test_22_parse_sitemapindex_file(self):
    s = Sitemap()
    fh = open('tests/testdata/sitemapindex1/sitemap.xml', 'r')
    si = s.parse_xml(fh=fh, sitemapindex=True)
    self.assertTrue(s.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(si.resources), 3, '3 sitemaps')
    sms = sorted(si.uris())
    self.assertEqual(sms, ['http://localhost:8888/sitemap00000.xml',
                           'http://localhost:8888/sitemap00001.xml',
                           'http://localhost:8888/sitemap00002.xml'])
def publish_metadata(self, new_zips, exluded_zip=None):
    """
    (Re)publish metadata with addition of new_zips. An excluded zip will be
    removed from previously published metadata.

    :param new_zips: a resourcelist with newly created zip resources
    :param exluded_zip: local path to zip file that will be removed from
        previously published metadata.
    """
    rs_dump_url = self.publish_url + RS_RESOURCE_DUMP_XML
    rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
    capa_list_url = self.publish_url + RS_CAPABILITY_LIST_XML
    capa_list_path = os.path.join(self.publish_dir, RS_CAPABILITY_LIST_XML)

    rs_dump = ResourceDump()

    # Load existing resource-dump, if any. Else set start time.
    if os.path.isfile(rs_dump_path):
        with open(rs_dump_path, "r") as rs_dump_file:
            sm = Sitemap()
            sm.parse_xml(rs_dump_file, resources=rs_dump)
    else:
        rs_dump.md_at = w3cdt.datetime_to_str(no_fractions=True)
        rs_dump.link_set(rel="up", href=capa_list_url)

    # Remove excluded zip, if any
    if exluded_zip:
        loc = self.publish_url + os.path.basename(exluded_zip)
        if loc in rs_dump.resources:
            del rs_dump.resources[loc]
        else:
            raise RuntimeError("Could not find %s in %s" % (loc, rs_dump_path))

    # Add new zips
    for resource in new_zips:
        rs_dump.add(resource)

    # Write resource-dump.xml
    rs_dump.md_completed = w3cdt.datetime_to_str(no_fractions=True)
    with open(rs_dump_path, "w") as rs_dump_file:
        rs_dump_file.write(rs_dump.as_xml())

    # There are several ways to decode base64, among them
    # iri = base64.b64decode(os.path.basename(self.publish_dir)).rstrip('\n')
    # iri = base64.b64decode(os.path.basename(self.publish_dir), '-_').rstrip('\n')
    iri = base64.urlsafe_b64decode(os.path.basename(self.publish_dir)).rstrip("\n")
    print "New %s for graph %s" % (RS_RESOURCE_DUMP_XML, iri)
    print "See %s" % rs_dump_url

    # Write capability-list.xml
    if not os.path.isfile(capa_list_path):
        capa_list = CapabilityList()
        capa_list.link_set(rel="up", href=self.src_desc_url)
        capa_list.add_capability(rs_dump, rs_dump_url)
        with open(capa_list_path, "w") as capa_list_file:
            capa_list_file.write(capa_list.as_xml())
        print "New %s. See %s" % (RS_CAPABILITY_LIST_XML, capa_list_url)
def test2_pretty_output(self):
    ib = InventoryBuilder()
    i = ib.from_disk('resync/test/testdata/dir1', 'http://example.org/t')
    s = Sitemap()
    s.pretty_xml = True
    self.assertEqual(
        s.inventory_as_xml(i),
        '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1">\n<url><loc>http://example.org/t/file_a</loc><lastmod>2012-03-14T17:46:04</lastmod><rs:size>20</rs:size></url>\n<url><loc>http://example.org/t/file_b</loc><lastmod>2012-03-14T17:46:25</lastmod><rs:size>45</rs:size></url>\n</urlset>')
def test_22_parse_sitemapindex_file(self):
    s = Sitemap()
    fh = open('resync/test/testdata/sitemapindex1/sitemap.xml')
    si = s.sitemapindex_parse_xml(fh=fh)
    self.assertEqual(s.sitemaps_created, 3, '3 sitemaps in sitemapindex')
    self.assertEqual(len(si.resources), 3, '3 sitemaps')
    sms = sorted(si.resources.keys())
    self.assertEqual(sms, ['http://localhost:8888/sitemap00000.xml',
                           'http://localhost:8888/sitemap00001.xml',
                           'http://localhost:8888/sitemap00002.xml'])
    self.assertEqual(si.resources['http://localhost:8888/sitemap00000.xml'].lastmod,
                     '2012-06-13T18:09:13Z')
def save_sitemap(self, sitemap, path):
    # writing the string sitemap.as_xml() to disk results in encoding=ASCII on some systems,
    # due to https://docs.python.org/3.4/library/xml.etree.elementtree.html#write
    sitemap.default_capability()
    with open(path, "wb") as f:
        s = Sitemap(pretty_xml=self.para.is_saving_pretty_xml)
        s.resources_as_xml(sitemap, sitemapindex=sitemap.sitemapindex, fh=f)
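A hedged companion sketch (assumed, not from the original source) of the same write path that save_sitemap uses, serializing a small ResourceList to a file with pretty-printed XML; the output path is a placeholder.

# Hypothetical example, not from the original source.
from resync.resource import Resource
from resync.resource_list import ResourceList
from resync.sitemap import Sitemap

rl = ResourceList()
rl.add(Resource(uri='http://example.com/res1', length=12))
with open('/tmp/resourcelist.xml', 'wb') as f:   # binary mode: the serializer writes bytes
    s = Sitemap(pretty_xml=True)
    s.resources_as_xml(rl, fh=f)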
def test_11_parse_2(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:size>12</rs:size></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:size>32</rs:size></url>\
</urlset>'
    s = Sitemap()
    i = s.resourcelist_parse_xml(fh=StringIO.StringIO(xml))
    self.assertEqual(s.resources_created, 2, 'got 2 resources')
def test_11_parse_2(self):
    xml = "<?xml version='1.0' encoding='UTF-8'?>\n\
<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://resourcesync.org/change/0.1\">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>12</rs:size></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>32</rs:size></url>\
</urlset>"
    s = Sitemap()
    i = s.inventory_parse_xml(fh=StringIO.StringIO(xml))
    self.assertEqual(s.resources_created, 2, "got 2 resources")
def test_22_parse_sitemapindex_file(self):
    s = Sitemap()
    fh = open('resync/test/testdata/sitemapindex1/sitemap.xml')
    si = s.sitemapindex_parse_xml(fh=fh)
    self.assertEqual(s.sitemaps_created, 3, '3 sitemaps in sitemapindex')
    self.assertEqual(len(si.resources), 3, '3 sitemaps')
    sms = sorted(si.resources.keys())
    self.assertEqual(sms, ['http://localhost:8888/sitemap00000.xml',
                           'http://localhost:8888/sitemap00001.xml',
                           'http://localhost:8888/sitemap00002.xml'])
    self.assertEqual(si.resources['http://localhost:8888/sitemap00000.xml'].lastmod,
                     '2012-06-13T18:09:13')
def test_11_parse_2(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>12</rs:size></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>32</rs:size></url>\
</urlset>'
    s = Sitemap()
    i = s.inventory_parse_xml(fh=StringIO.StringIO(xml))
    self.assertEqual(s.resources_created, 2, 'got 2 resources')
def test_ex2_1(self):
    """ex2_1 is a simple resourcelist with 2 resources, no metadata"""
    s = Sitemap()
    fh = open('resync/test/testdata/examples_from_spec/ex2_1.xml')
    si = s.resourcelist_parse_xml(fh=fh)
    self.assertEqual(len(si.resources), 2, '2 resources')
    sms = sorted(si.resources.keys())
    self.assertEqual(sms, ['http://example.com/res1', 'http://example.com/res2'])
    self.assertEqual(si.resources['http://example.com/res1'].lastmod, None)
def read_source(self):
    """
    Read the source_uri and parse it to source_document.

    :return: True if the document was downloaded and parsed without
        exceptions, False otherwise.
    """
    session = requests.Session()
    try:
        response = session.get(self.source_uri)
        self.source_status = response.status_code
        self.logger.debug("Read %s, status %s" % (self.source_uri, str(self.source_status)))
        assert self.source_status == 200, "Invalid response status: %d" % self.source_status

        text = response.text
        root = ET.fromstring(text)
        self.is_index = root.tag == SITEMAP_INDEX_ROOT
        etree = ET.ElementTree(root)
        sitemap = Sitemap()
        self.source_document = sitemap.parse_xml(etree=etree)
        # the source_document is a resync.resource_container.ResourceContainer
        capability = self.source_document.capability
        assert capability == self.capability, \
            "Capability is not %s but %s" % (self.capability, capability)
        # anyone interested in sitemaps?
        for processor_listener in processor_listeners:
            processor_listener.event_sitemap_received(self.source_uri, capability, text)
        self.describedby_url = self.source_document.describedby
        self.up_url = self.source_document.up        # to a parent non-index document
        self.index_url = self.source_document.index  # to a parent index document
        self.status = Status.document
    except requests.exceptions.ConnectionError as err:
        self.logger.debug("%s No connection: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except xml.etree.ElementTree.ParseError as err:
        self.logger.debug("%s ParseError: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except resync.sitemap.SitemapParseError as err:
        self.logger.debug("%s Unreadable source: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    except AssertionError as err:
        self.logger.debug("%s Error: %s" % (self.source_uri, str(err)))
        self.status = Status.read_error
        self.__report__(err)
    finally:
        session.close()
    return self.status == Status.document
def test_11_parse_2(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>12</rs:size></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>32</rs:size></url>\
</urlset>'
    fh = StringIO.StringIO(xml)
    s = Sitemap()
    i = s.inventory_parse_xml(fh)
    self.assertEqual(s.resources_added, 2, 'got 2 resources')
def synchronize(self):
    """
    Publish the resources found in source_dir in accordance with the
    ResourceSync Framework in sink_dir.
    """
    if not os.path.isdir(self.source_dir):
        os.makedirs(self.source_dir)
        print "Created %s" % self.source_dir
    if not os.path.isdir(self.sink_dir):
        os.makedirs(self.sink_dir)
        print "Created %s" % self.sink_dir

    self.handshake = self.verify_handshake()
    if self.handshake is None:
        return
    ####################
    # print "Synchronizing state as of %s" % self.handshake

    ### initial resource description
    wellknown = os.path.join(self.sink_dir, RS_WELL_KNOWN)
    if not os.path.isdir(wellknown):
        os.makedirs(wellknown)

    src_desc = SourceDescription()
    new_src_desc = True
    # Load existing resource-description, if any.
    if os.path.isfile(self.src_desc_path):
        new_src_desc = False
        with open(self.src_desc_path, "r") as src_desc_file:
            sm = Sitemap()
            sm.parse_xml(src_desc_file, resources=src_desc)

    count_lists = len(src_desc.resources)

    ### resources in subdirectories or main directory
    ### the existence of FILE_INDEX indicates whether resources reside
    ### directly in source_dir or in subdirectories.
    index_file = os.path.join(self.source_dir, FILE_INDEX)
    if os.path.isfile(index_file):
        for dirname in os.walk(self.source_dir).next()[1]:
            source = os.path.join(self.source_dir, dirname)
            sink = os.path.join(self.sink_dir, dirname)
            publish_url = self.publish_url + dirname + "/"
            self.__execute_sync__(source, sink, publish_url, src_desc)
    else:
        self.__execute_sync__(self.source_dir, self.sink_dir,
                              self.publish_url, src_desc)

    if new_src_desc or count_lists != len(src_desc.resources):
        ### publish resource description
        with open(self.src_desc_path, "w") as src_desc_file:
            src_desc_file.write(src_desc.as_xml())
        print "New resource description. See %s" % self.src_desc_url

    self.report()
def publish(self):
    """
    Try and publish, or remove the zip end if something went wrong.

    :return: ( boolean indicating if change in sink directory or subdirectories,
        amount of resources definitively packaged,
        the difference of resources provisionally packaged)
    """
    if not os.path.isdir(self.resource_dir):
        os.makedirs(self.resource_dir)
        # print "Created %s" % self.resource_dir
    if not os.path.isdir(self.publish_dir):
        os.makedirs(self.publish_dir)
        # print "Created %s" % self.publish_dir
    try:
        return self.do_publish()
    except:
        # Something went wrong. Best we can do is clean up end of zip chain.
        zip_end_files = glob(os.path.join(self.publish_dir, PREFIX_END_PART + "*.zip"))
        for ze_file in zip_end_files:
            os.remove(ze_file)
            print "error recovery: removed %s" % ze_file
        zip_end_xmls = glob(os.path.join(self.publish_dir, PREFIX_END_PART + "*.xml"))
        for ze_xml in zip_end_xmls:
            os.remove(ze_xml)
            print "error recovery: removed %s" % ze_xml
        zip_end_manis = glob(os.path.join(self.publish_dir,
                                          PREFIX_MANIFEST + PREFIX_END_PART + "*.xml"))
        for ze_mani in zip_end_manis:
            os.remove(ze_mani)
            print "error recovery: removed %s" % ze_mani
        # remove zip-end entries from resource-dump.xml
        rs_dump_path = os.path.join(self.publish_dir, RS_RESOURCE_DUMP_XML)
        rs_dump = ResourceDump()
        if os.path.isfile(rs_dump_path):
            with open(rs_dump_path, "r") as rs_dump_file:
                sm = Sitemap()
                sm.parse_xml(rs_dump_file, resources=rs_dump)
            prefix = self.publish_url + PREFIX_END_PART
            for uri in rs_dump.resources.keys():
                if uri.startswith(prefix):
                    del rs_dump.resources[uri]
                    print "error recovery: removed %s from %s" % (uri, rs_dump_path)
            with open(rs_dump_path, "w") as rs_dump_file:
                rs_dump_file.write(rs_dump.as_xml())
        print "error recovery: walk through error recovery completed. Now raising ..."
        raise
def test_11_parse_2(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length=\"12\" /></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length=\"32\" /></url>\
</urlset>'
    s = Sitemap()
    i = s.parse_xml(fh=io.StringIO(xml))
    self.assertFalse(s.parsed_index, 'was a sitemap')
    self.assertEqual(s.resources_created, 2, 'got 2 resources')
def read_capability(url):
    """Read the capability of a URL."""
    s = Sitemap()
    capability = None
    try:
        document = s.parse_xml(url_or_file_open(url))
    except IOError as e:
        raise e
    if 'capability' in document.md:
        capability = document.md['capability']
    return capability
def test2_pretty_output(self):
    ib = InventoryBuilder()
    ib.mapper = Mapper(['http://example.org/t', 'resync/test/testdata/dir1'])
    i = ib.from_disk()
    s = Sitemap()
    s.pretty_xml = True
    self.assertEqual(
        s.resources_as_xml(i),
        '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">\n<url><loc>http://example.org/t/file_a</loc><lastmod>2012-07-25T17:13:46Z</lastmod><rs:size>20</rs:size></url>\n<url><loc>http://example.org/t/file_b</loc><lastmod>2001-09-09T01:46:40Z</lastmod><rs:size>45</rs:size></url>\n</urlset>'
    )
def write_static_inventory(self):
    """Writes the inventory to the filesystem"""
    self.generate()
    self.delete_sitemap_files()
    basename = Source.STATIC_FILE_PATH + "/sitemap.xml"
    then = time.time()
    s = Sitemap()
    s.max_sitemap_entries = self.config['max_sitemap_entries']
    s.mapper = Mapper([self.source.base_uri, Source.STATIC_FILE_PATH])
    s.write(self, basename)
    now = time.time()
    print "Wrote static sitemap in %s seconds" % str(now - then)
def test_21_parse_sitemapindex(self):
    s = Sitemap()
    si = s.parse_xml(
        fh=io.StringIO('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><sitemap><loc>aaa</loc></sitemap><sitemap><loc>bbb</loc></sitemap></sitemapindex>'),
        sitemapindex=True)
    self.assertEqual(len(si.resources), 2, '2 sitemaps')
    sms = sorted(si.uris())
    self.assertEqual(sms, ['aaa', 'bbb'])
    # add a couple more
    s.parse_xml(
        fh=io.StringIO('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><sitemap><loc>cc</loc></sitemap><sitemap><loc>dd</loc></sitemap></sitemapindex>'),
        resources=si)
    self.assertTrue(s.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(si.resources), 4, '4 sitemaps total')
    sms = sorted(si.uris())
    self.assertEqual(sms, ['aaa', 'bbb', 'cc', 'dd'])
def write_changeset(self):
    """Writes all cached changes to a file; empties the cache"""
    then = time.time()
    changeset = self.generate()
    basename = Source.STATIC_FILE_PATH + "/" + self.current_changeset_file()
    s = Sitemap()
    s.max_sitemap_entries = self.config['max_sitemap_entries']
    s.mapper = Mapper([self.source.base_uri, Source.STATIC_FILE_PATH])
    s.write(changeset, basename)
    now = time.time()
    self.previous_changeset_id = self.previous_changeset_id + 1
    self.logger.info("Wrote static changeset..")
def test_ex2_2(self):
    """ex2_2 is a simple resourcelist with 2 resources, some metadata"""
    s = Sitemap()
    fh = open('resync/test/testdata/examples_from_spec/ex2_2.xml')
    si = s.resourcelist_parse_xml(fh=fh)
    self.assertEqual(len(si.resources), 2, '2 resources')
    sms = sorted(si.resources.keys())
    self.assertEqual(sms, ['http://example.com/res1', 'http://example.com/res2'])
    self.assertEqual(si.resources['http://example.com/res1'].lastmod, '2013-01-02T14:00:00Z')
    self.assertEqual(si.resources['http://example.com/res2'].lastmod, '2013-01-02T13:00:00Z')
    self.assertEqual(si.resources['http://example.com/res1'].md5, '1584abdf8ebdc9802ac0c6a7402c03b6')
    self.assertEqual(si.resources['http://example.com/res2'].md5, '1e0d5cb8ef6ba40c99b14c0237be735e')
def test_22_parse_sitemapindex_file(self):
    s = Sitemap()
    fh = open('tests/testdata/sitemapindex1/sitemap.xml', 'r')
    si = s.parse_xml(fh=fh, sitemapindex=True)
    self.assertTrue(s.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(si.resources), 3, '3 sitemaps')
    sms = sorted(si.uris())
    self.assertEqual(sms, [
        'http://localhost:8888/sitemap00000.xml',
        'http://localhost:8888/sitemap00001.xml',
        'http://localhost:8888/sitemap00002.xml'
    ])
def test_ex2_3(self):
    """ex2_3 is a simple changelist with 2 resources"""
    s = Sitemap()
    fh = open('resync/test/testdata/examples_from_spec/ex2_3.xml')
    si = s.resourcelist_parse_xml(fh=fh)
    self.assertEqual(len(si.resources), 2, '2 resources')
    sms = sorted(si.resources.keys())
    self.assertEqual(sms, ['http://example.com/res2.pdf', 'http://example.com/res3.tiff'])
    self.assertEqual(si.resources['http://example.com/res2.pdf'].lastmod, '2013-01-02T18:00:00Z')
    self.assertEqual(si.resources['http://example.com/res3.tiff'].lastmod, '2013-01-02T13:00:00Z')
    self.assertEqual(si.resources['http://example.com/res2.pdf'].change, 'updated')
    self.assertEqual(si.resources['http://example.com/res3.tiff'].change, 'deleted')
def test_21_parse_sitemapindex(self):
    s = Sitemap()
    si = s.sitemapindex_parse_xml(fh=StringIO.StringIO(
        '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><sitemap><loc>aaa</loc></sitemap><sitemap><loc>bbb</loc></sitemap></sitemapindex>'))
    self.assertEqual(s.sitemaps_created, 2, '2 sitemaps in sitemapindex')
    self.assertEqual(len(si.resources), 2, '2 sitemaps')
    sms = sorted(si.resources.keys())
    self.assertEqual(sms, ['aaa', 'bbb'])
    # add a couple more
    s.sitemapindex_parse_xml(fh=StringIO.StringIO(
        '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"><sitemap><loc>cc</loc></sitemap><sitemap><loc>dd</loc></sitemap></sitemapindex>'),
        sitemapindex=si)
    self.assertEqual(s.sitemaps_created, 2, '2 sitemaps created to sitemapindex')
    self.assertEqual(len(si.resources), 4, '4 sitemaps total')
    sms = sorted(si.resources.keys())
    self.assertEqual(sms, ['aaa', 'bbb', 'cc', 'dd'])
def explore_links_get(self, uri, seen=[]):
    # Check we haven't been here before
    if (uri in seen):
        self.logger.warning("Already seen %s, skipping" % (uri))
    s = Sitemap(allow_multifile=self.allow_multifile)
    self.logger.info("Reading sitemap from %s ..." % (uri))
    i = s.read(uri, index_only=True)
    self.logger.warning("Read %s from %s" % (s.read_type, uri))
    links = self.extract_links(i, verbose=True)
    if ('next' in links and links['next'] == uri):
        self.logger.warning("- self reference \"next\" link")
    seen[uri] = links
    return (s.changeset_read, links)
def explore_links_get(self, uri, seen=[]):
    # Check we haven't been here before
    if uri in seen:
        self.logger.warning("Already seen %s, skipping" % (uri))
    s = Sitemap(allow_multifile=self.allow_multifile)
    self.logger.info("Reading sitemap from %s ..." % (uri))
    i = s.read(uri, index_only=True)
    self.logger.warning("Read %s from %s" % (s.read_type, uri))
    links = self.extract_links(i, verbose=True)
    if "next" in links and links["next"] == uri:
        self.logger.warning('- self reference "next" link')
    seen[uri] = links
    return (s.changeset_read, links)
def changeset_sitemap(self, outfile=None, ref_sitemap=None, capabilities=None,
                      dump=None):
    # 1. Get and parse reference sitemap
    rs = Sitemap(verbose=self.verbose, allow_multifile=self.allow_multifile,
                 mapper=self.mapper)
    if (self.verbose):
        print "Reading sitemap(s) from %s ..." % (ref_sitemap)
    ri = rs.read(ref_sitemap)
    num_entries = len(ri)
    print "Read reference sitemap with %d entries in %d sitemaps" % (
        num_entries, rs.sitemaps_created)
    if (self.verbose):
        to_show = 100
        override_str = ' (override with --max-sitemap-entries)'
        if (self.max_sitemap_entries):
            to_show = self.max_sitemap_entries
            override_str = ''
        if (num_entries > to_show):
            print "Showing first %d entries sorted by URI%s..." % (
                to_show, override_str)
        n = 0
        for r in ri.resource_uris():
            print ri.resources[r]
            n += 1
            if (n >= to_show):
                break
    # 2. Set up base_path->base_uri mappings, get inventory from disk
    disk_inventory = self.inventory
    # 3. Calculate changeset
    (num_same, updated, deleted, created) = ri.compare(disk_inventory)
    changeset = Inventory()
    changeset.capabilities = capabilities
    changeset.add(disk_inventory.changeset(updated, changetype='updated'))
    changeset.add(ri.changeset(deleted, changetype='deleted'))
    changeset.add(disk_inventory.changeset(created, changetype='created'))
    # 4. Write out changeset
    s = Sitemap(verbose=self.verbose, pretty_xml=True,
                allow_multifile=self.allow_multifile, mapper=self.mapper)
    if (self.max_sitemap_entries is not None):
        s.max_sitemap_entries = self.max_sitemap_entries
    if (outfile is None):
        print s.inventory_as_xml(changeset)
    else:
        s.write(changeset, basename=outfile)
    self.write_dump_if_requested(changeset, dump)
def test_10_sitemap(self):
    xml = "<?xml version='1.0' encoding='UTF-8'?>\n\
<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://resourcesync.org/change/0.1\">\
<url><loc>http://e.com/a</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>12</rs:size><rs:md5>aabbccdd</rs:md5></url>\
</urlset>"
    s = Sitemap()
    i = s.inventory_parse_xml(fh=StringIO.StringIO(xml))
    self.assertEqual(s.resources_created, 1, "got 1 resources")
    r = i.resources["http://e.com/a"]
    self.assertTrue(r is not None, "got the uri expected")
    self.assertEqual(r.uri, "http://e.com/a")
    self.assertEqual(r.lastmod, "2012-03-14T18:37:36")
    self.assertEqual(r.size, 12)
    self.assertEqual(r.md5, "aabbccdd")
def get_from_date_from_url(url):
    """Get the smallest timestamp from url and parse it to a date string."""
    s = Sitemap()
    try:
        document = s.parse_xml(url_or_file_open(url))
    except IOError as e:
        raise e
    date_list = []
    for item in document.resources:
        if item.timestamp:
            date_list.append(item.timestamp)
    if len(date_list) > 0:
        from_date = dt.fromtimestamp(min(date_list))
        return from_date.strftime("%Y-%m-%d")
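A small illustration (assumed, not from the original source) of the timestamp-to-date conversion that get_from_date_from_url performs on the smallest resource timestamp; note that datetime.fromtimestamp interprets the value in local time.

# Hypothetical example with made-up timestamp values.
from datetime import datetime as dt

timestamps = [1335355200.0, 1335441600.0]   # e.g. resource.timestamp values
from_date = dt.fromtimestamp(min(timestamps))
print(from_date.strftime("%Y-%m-%d"))       # earliest timestamp as 'YYYY-MM-DD' (local time)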
def test_10_sitemap(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1">\
<url><loc>http://e.com/a</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>12</rs:size><rs:md5>aabbccdd</rs:md5></url>\
</urlset>'
    s = Sitemap()
    i = s.inventory_parse_xml(fh=StringIO.StringIO(xml))
    self.assertEqual(s.resources_created, 1, 'got 1 resources')
    r = i.resources['http://e.com/a']
    self.assertTrue(r is not None, 'got the uri expected')
    self.assertEqual(r.uri, 'http://e.com/a')
    self.assertEqual(r.lastmod, '2012-03-14T18:37:36')
    self.assertEqual(r.size, 12)
    self.assertEqual(r.md5, 'aabbccdd')
def all_resources(self):
    all_resources = {}
    # search for resourcelists
    resourcelist_files = sorted(
        glob(self.paras.abs_metadata_path("resourcelist_*.xml")))
    for rl_file_name in resourcelist_files:
        resourcelist = ResourceList()
        with open(rl_file_name, "r", encoding="utf-8") as rl_file:
            sm = Sitemap()
            sm.parse_xml(rl_file, resources=resourcelist)
        all_resources.update({
            resource.uri: resource for resource in resourcelist.resources
        })
    # search for changelists
    changelist_files = sorted(
        glob(self.paras.abs_metadata_path("changelist_*.xml")))
    for cl_file_name in changelist_files:
        changelist = ChangeList()
        with open(cl_file_name, "r", encoding="utf-8") as cl_file:
            sm = Sitemap()
            sm.parse_xml(cl_file, resources=changelist)
        for resource in changelist.resources:
            if resource.change == "created" or resource.change == "updated":
                all_resources.update({resource.uri: resource})
            elif resource.change == "deleted" and resource.uri in all_resources:
                del all_resources[resource.uri]
    return all_resources
def test_30_parse_changeset(self):
    xml = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n\
<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1">\
<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>12</rs:size><rs:changetype>UP</rs:changetype></url>\
<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>32</rs:size><rs:changeid>123</rs:changeid></url>\
</urlset>'
    s = Sitemap()
    s.resource_class = ResourceChange
    i = s.inventory_parse_xml(fh=StringIO.StringIO(xml))
    self.assertEqual(s.resources_created, 2, 'got 2 resources')
    self.assertEqual(i.resources['/tmp/rs_test/src/file_a'].changetype, 'UP')
    self.assertEqual(i.resources['/tmp/rs_test/src/file_a'].changeid, None)
    self.assertEqual(i.resources['/tmp/rs_test/src/file_b'].changetype, None)
    self.assertEqual(i.resources['/tmp/rs_test/src/file_b'].changeid, '123')
def explore_uri(self, uri, checks, caps, show_back=True):
    """Interactive exploration of document at uri

    Will flag warnings if the document is not of a type listed in caps.
    """
    s = Sitemap()
    print "Reading %s" % (uri)
    options = {}
    capability = None
    try:
        if (caps == 'resource'):
            self.explore_show_head(uri, check_headers=checks)
        else:
            list = s.parse_xml(urllib.urlopen(uri))
            (options, capability) = self.explore_show_summary(
                list, s.parsed_index, caps)
    except IOError as e:
        print "Cannot read %s (%s)\nGoing back" % (uri, str(e))
        return ('', '', '', 'b')
    except Exception as e:
        print "Cannot parse %s (%s)\nGoing back" % (uri, str(e))
        return ('', '', '', 'b')
    while (True):
        # don't offer number option for no resources/capabilities
        num_prompt = '' if (len(options) == 0) else 'number, '
        up_prompt = 'b(ack), ' if (show_back) else ''
        inp = raw_input("Follow [%s%sq(uit)]?" % (num_prompt, up_prompt))
        if (inp in options.keys()):
            break
        if (inp == 'q' or inp == 'b'):
            return ('', '', '', inp)
    checks = {}
    if (options[inp].capability is None):
        if (capability == 'capabilitylistindex'):
            # all links should be to capabilitylist documents
            caps = ['capabilitylist']
        elif (capability in ['resourcelist', 'changelist',
                             'resourcedump', 'changedump']):
            caps = 'resource'
    else:
        r = options[inp]
        caps = [r.capability]
        if (r.length is not None):
            checks['content-length'] = r.length
        if (r.lastmod is not None):
            checks['last-modified'] = r.lastmod
        # FIXME - could do sanity check here and issue warnings if odd
    return (options[inp].uri, checks, caps, inp)
def test3_with_md5(self):
    ib = InventoryBuilder(do_md5=True)
    i = ib.from_disk('resync/test/testdata/dir1', 'http://example.org/t')
    s = Sitemap()
    xml = s.inventory_as_xml(i)
    self.assertNotEqual(
        None,
        re.search(
            '<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+</lastmod><rs:size>20</rs:size><rs:md5>6bf26fd66601b528d2e0b47eaa87edfd</rs:md5>',
            xml),
        'size/checksum for file_a')
    self.assertNotEqual(
        None,
        re.search(
            '<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+</lastmod><rs:size>45</rs:size><rs:md5>452e54bdae1626ac5d6e7be81b39de21</rs:md5>',
            xml),
        'size/checksum for file_b')
def test_01_print(self):
    i = Inventory()
    i.add(Resource(uri='a', lastmod='2001-01-01', size=1234))
    i.capabilities['http://example.org/changeset1'] = \
        {"type": "changeset", "attributes": ["self next"]}
    self.assertEqual(len(i.capabilities), 1)
    self.assertEqual(
        Sitemap().resources_as_xml(i, capabilities=i.capabilities),
        '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/" xmlns:xhtml="http://www.w3.org/1999/xhtml"><xhtml:link href="http://example.org/changeset1" rel="self next" type="changeset" /><url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:size>1234</rs:size></url></urlset>')