def test_22_parse_sitemapindex_file(self):
    """Parse a sitemapindex from a test data file and check its entries."""
    s = Sitemap()
    # Use a context manager so the file handle is closed even when an
    # assertion fails (the original left it open — resource leak).
    with open('tests/testdata/sitemapindex1/sitemap.xml', 'r') as fh:
        si = s.parse_xml(fh=fh, sitemapindex=True)
    self.assertTrue(s.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(si.resources), 3, '3 sitemaps')
    sms = sorted(si.uris())
    self.assertEqual(sms, [
        'http://localhost:8888/sitemap00000.xml',
        'http://localhost:8888/sitemap00001.xml',
        'http://localhost:8888/sitemap00002.xml'
    ])
def test_16_parse_valid_xml_but_other(self):
    """Well-formed XML that is not a sitemap must raise SitemapParseError."""
    s = Sitemap()
    bad_documents = (
        # <urlset> root but in the wrong namespace
        '<urlset xmlns="http://example.org/other_namespace"> </urlset>',
        # correct sitemap namespace but wrong root element
        '<other xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> </other>',
    )
    for doc in bad_documents:
        self.assertRaises(SitemapParseError, s.parse_xml, io.StringIO(doc))
def test33_write(self):
    """Round-trip a ResourceList and a ResourceListIndex through write/parse."""
    # Plain ResourceList: the written file must parse as a sitemap, not an index.
    rl = ResourceList()
    for letter in ('a', 'b', 'c'):
        rl.add(Resource(uri='http://example.com/test/' + letter, timestamp=1))
    rl_filename = os.path.join(self.tmpdir, 'test33_write_resourcelist.xml')
    rl.write(basename=rl_filename)
    with open(rl_filename, 'r') as f:
        s = Sitemap()
        s.parse_xml(fh=f)
    self.assertFalse(s.parsed_index)
    # ResourceListIndex: the sitemapindex flag must survive the round trip.
    rli = ResourceList()
    for num in ('00000', '00001', '00002'):
        rli.add(Resource(uri='http://example.com/test/resourcelist%s.xml' % num,
                         timestamp=1))
    rli.sitemapindex = True
    rli_filename = os.path.join(self.tmpdir,
                                'test33_write_resourcelist-index.xml')
    rli.write(basename=rli_filename)
    with open(rli_filename, 'r') as f:
        s = Sitemap()
        s.parse_xml(fh=f)
    self.assertTrue(s.parsed_index)
def test_07_print(self):
    """Serialize a three-resource ResourceList and compare against exact XML."""
    m = ResourceList(md={'capability': 'resourcelist', 'modified': None})
    for uri, lastmod, length in (('a', '2001-01-01', 1234),
                                 ('b', '2002-02-02', 56789),
                                 ('c', '2003-03-03', 0)):
        m.add(Resource(uri=uri, lastmod=lastmod, length=length))
    expected = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        "<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" "
        "xmlns:rs=\"http://www.openarchives.org/rs/terms/\">"
        "<rs:md capability=\"resourcelist\" />"
        "<url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod>"
        "<rs:md length=\"1234\" /></url>"
        "<url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod>"
        "<rs:md length=\"56789\" /></url>"
        "<url><loc>c</loc><lastmod>2003-03-03T00:00:00Z</lastmod>"
        "<rs:md length=\"0\" /></url></urlset>")
    self.assertEqual(Sitemap().resources_as_xml(m), expected)
def explore_links_get(self, uri, seen=None):
    """Read the sitemap at uri, record its links in seen, and return them.

    Returns a tuple (changeset_read, links) where links is the dict
    extracted from the document. The seen mapping accumulates
    uri -> links across calls; pass the same dict in to detect repeats.
    """
    # BUG FIX: the default was a shared mutable list ([]), which both
    # leaked state between calls and crashed on the seen[uri] = links
    # assignment below (lists do not support key assignment).
    if seen is None:
        seen = {}
    # Check we haven't been here before
    # NOTE(review): this only warns, it does not actually skip — confirm
    # whether an early return was intended here.
    if (uri in seen):
        self.logger.warning("Already see %s, skipping" % (uri))
    s = Sitemap(allow_multifile=self.allow_multifile)
    self.logger.info("Reading sitemap from %s ..." % (uri))
    i = s.read(uri, index_only=True)
    self.logger.warning("Read %s from %s" % (s.read_type, uri))
    links = self.extract_links(i, verbose=True)
    if ('next' in links and links['next'] == uri):
        self.logger.warning("- self reference \"next\" link")
    seen[uri] = links
    return (s.changeset_read, links)
def test_09_print_subset(self):
    """inventory_as_xml with entries= must output only the named subset."""
    r1 = Resource(uri='a', lastmod='2001-01-01', size=1234)
    r2 = Resource(uri='b', lastmod='2002-02-02', size=56789)
    r3 = Resource(uri='c', lastmod='2003-03-03', size=0)
    # BUG FIX: the original rebound r3 here, silently discarding
    # resource 'c'; use a fourth name and add all four resources. The
    # expected XML is unchanged since entries=['d', 'b'] filters output.
    r4 = Resource(uri='d', lastmod='2003-03-04', size=444)
    m = Inventory()
    m.add(r1)
    m.add(r2)
    m.add(r3)
    m.add(r4)
    self.assertEqual(
        Sitemap().inventory_as_xml(m, entries=['d', 'b']),
        "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://resourcesync.org/change/0.1\"><url><loc>d</loc><lastmod>2003-03-04T00:00:00</lastmod><rs:size>444</rs:size></url><url><loc>b</loc><lastmod>2002-02-02T00:00:00</lastmod><rs:size>56789</rs:size></url></urlset>"
    )
def test_21_parse_sitemapindex(self):
    """sitemapindex_parse_xml parses entries and can extend an existing index."""
    s = Sitemap()
    ns = 'xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
    first = ('<sitemapindex ' + ns + '><sitemap><loc>aaa</loc></sitemap>'
             '<sitemap><loc>bbb</loc></sitemap></sitemapindex>')
    si = s.sitemapindex_parse_xml(fh=StringIO.StringIO(first))
    self.assertEqual(s.sitemaps_created, 2, '2 sitemaps in sitemapindex')
    self.assertEqual(len(si.resources), 2, '2 sitemaps')
    self.assertEqual(sorted(si.resources.keys()), ['aaa', 'bbb'])
    # add a couple more into the same sitemapindex object
    second = ('<sitemapindex ' + ns + '><sitemap><loc>cc</loc></sitemap>'
              '<sitemap><loc>dd</loc></sitemap></sitemapindex>')
    s.sitemapindex_parse_xml(fh=StringIO.StringIO(second), sitemapindex=si)
    self.assertEqual(s.sitemaps_created, 2, '2 sitemaps created to sitemapindex')
    self.assertEqual(len(si.resources), 4, '4 sitemaps total')
    self.assertEqual(sorted(si.resources.keys()), ['aaa', 'bbb', 'cc', 'dd'])
def test_04_resource_str(self):
    """resource_as_xml renders rs:ln links, singly and in sequence."""
    r1 = Resource(uri='4a', lastmod="2013-01-02", length=9999, md5='ab54de')
    r1.ln = [{
        'rel': 'duplicate',
        'pri': '1',
        'href': 'http://mirror1.example.com/res1',
        'modified': '2013-01-02T18:00:00Z'
    }]
    base = ('<url><loc>4a</loc><lastmod>2013-01-02T00:00:00Z</lastmod>'
            '<rs:md hash="md5:ab54de" length="9999" />')
    first_ln = ('<rs:ln href="http://mirror1.example.com/res1" '
                'modified="2013-01-02T18:00:00Z" pri="1" rel="duplicate" />')
    self.assertEqual(Sitemap().resource_as_xml(r1),
                     base + first_ln + '</url>')
    # add another two rs:ln's
    r1.ln.extend([{'rel': 'num2'}, {'rel': 'num3'}])
    self.assertEqual(
        Sitemap().resource_as_xml(r1),
        base + first_ln + '<rs:ln rel="num2" /><rs:ln rel="num3" /></url>')
def test_10_sitemap(self):
    """Parse a one-resource sitemap and check all fields on the resource."""
    xml = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
        'xmlns:rs="http://resourcesync.org/change/0.1">'
        '<url><loc>http://e.com/a</loc>'
        '<lastmod>2012-03-14T18:37:36</lastmod>'
        '<rs:size>12</rs:size><rs:md5>aabbccdd</rs:md5></url>'
        '</urlset>')
    s = Sitemap()
    i = s.inventory_parse_xml(fh=StringIO.StringIO(xml))
    self.assertEqual(s.resources_created, 1, 'got 1 resources')
    r = i.resources['http://e.com/a']
    self.assertTrue(r is not None, 'got the uri expected')
    self.assertEqual(r.uri, 'http://e.com/a')
    self.assertEqual(r.lastmod, '2012-03-14T18:37:36')
    self.assertEqual(r.size, 12)
    self.assertEqual(r.md5, 'aabbccdd')
def write_sitemap(self, outfile=None, capabilities=None, dump=None): # Set up base_path->base_uri mappings, get inventory from disk i = self.inventory i.capabilities = capabilities s = Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile, mapper=self.mapper) if (self.max_sitemap_entries is not None): s.max_sitemap_entries = self.max_sitemap_entries if (outfile is None): print s.resources_as_xml(i, capabilities=i.capabilities) else: s.write(i, basename=outfile) self.write_dump_if_requested(i, dump)
def get_from_date_from_url(url):
    """Get smallest timestamp from url and parse to string.

    Returns a "YYYY-MM-DD" string for the earliest resource timestamp
    found in the sitemap at url, or None when no resource carries a
    timestamp. IOError from opening/reading the url propagates.
    """
    s = Sitemap()
    # The original wrapped this in `except IOError as e: raise e`, which
    # only re-raised the exception — let it propagate naturally instead.
    document = s.parse_xml(url_or_file_open(url))
    timestamps = [item.timestamp for item in document.resources
                  if item.timestamp]
    if timestamps:
        return dt.fromtimestamp(min(timestamps)).strftime("%Y-%m-%d")
    return None
def test_30_parse_changeset(self):
    """Parse a changeset; rs:changetype and rs:changeid map onto resources."""
    xml = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
        'xmlns:rs="http://resourcesync.org/change/0.1">'
        '<url><loc>/tmp/rs_test/src/file_a</loc>'
        '<lastmod>2012-03-14T18:37:36</lastmod><rs:size>12</rs:size>'
        '<rs:changetype>UP</rs:changetype></url>'
        '<url><loc>/tmp/rs_test/src/file_b</loc>'
        '<lastmod>2012-03-14T18:37:36</lastmod><rs:size>32</rs:size>'
        '<rs:changeid>123</rs:changeid></url>'
        '</urlset>')
    s = Sitemap()
    s.resource_class = ResourceChange
    i = s.inventory_parse_xml(fh=StringIO.StringIO(xml))
    self.assertEqual(s.resources_created, 2, 'got 2 resources')
    file_a = i.resources['/tmp/rs_test/src/file_a']
    file_b = i.resources['/tmp/rs_test/src/file_b']
    self.assertEqual(file_a.changetype, 'UP')
    self.assertEqual(file_a.changeid, None)
    self.assertEqual(file_b.changetype, None)
    self.assertEqual(file_b.changeid, '123')
def update_previous_state(self):
    """Rebuild self.previous_resources from the metadata directory.

    Reads all resourcelist files to seed the uri -> resource map, then
    replays the changedump files: created/updated entries are upserted,
    deleted entries are removed. Also records
    self.date_resourcelist_completed (md_completed, falling back to md_at).
    """
    if self.previous_resources is None:
        self.previous_resources = {}
    # search for resourcelists
    # BUG FIX: this glob previously used "changedump_*.xml", so the
    # resourcelist scan re-read the changedump files instead of the
    # resourcelists (variable name, class, and comment all say resourcelist).
    self.resourcelist_files = sorted(
        glob(self.param.abs_metadata_path("resourcelist_*.xml")))
    for rl_file_name in self.resourcelist_files:
        resourcelist = ResourceList()
        with open(rl_file_name, "r", encoding="utf-8") as rl_file:
            sm = Sitemap()
            sm.parse_xml(rl_file, resources=resourcelist)
        self.date_resourcelist_completed = resourcelist.md_completed
        if self.date_resourcelist_completed is None:
            self.date_resourcelist_completed = resourcelist.md_at
        self.previous_resources.update({
            resource.uri: resource
            for resource in resourcelist.resources
        })
    # search for changedumps
    self.changedump_files = sorted(
        glob(self.param.abs_metadata_path("changedump_*.xml")))
    for cl_file_name in self.changedump_files:
        changedump = ChangeDump()
        with open(cl_file_name, "r", encoding="utf-8") as cl_file:
            sm = Sitemap()
            sm.parse_xml(cl_file, resources=changedump)
        for resource in changedump.resources:
            if resource.change == "created" or resource.change == "updated":
                self.previous_resources.update({resource.uri: resource})
            elif resource.change == "deleted" and resource.uri in self.previous_resources:
                del self.previous_resources[resource.uri]
def explore_uri(self, uri, checks, caps, show_back=True):
    """Interactive exploration of document at uri

    Will flag warnings if the document is not of type listed in caps
    """
    s = Sitemap()
    print "Reading %s" % (uri)
    options = {}
    capability = None
    try:
        if (caps == 'resource'):
            # Leaf resource: only show HTTP headers, nothing to parse
            self.explore_show_head(uri, check_headers=checks)
        else:
            # NOTE(review): 'list' shadows the builtin; renaming would be safer
            list = s.parse_xml(urllib.urlopen(uri))
            (options, capability) = self.explore_show_summary(
                list, s.parsed_index, caps)
    except IOError as e:
        print "Cannot read %s (%s)\nGoing back" % (uri, str(e))
        return ('', '', '', 'b')
    except Exception as e:
        print "Cannot parse %s (%s)\nGoing back" % (uri, str(e))
        return ('', '', '', 'b')
    # Prompt until the user picks a listed option or asks to quit/go back
    while (True):
        # don't offer number option for no resources/capabilities
        num_prompt = '' if (len(options) == 0) else 'number, '
        up_prompt = 'b(ack), ' if (show_back) else ''
        inp = raw_input("Follow [%s%sq(uit)]?" % (num_prompt, up_prompt))
        if (inp in options.keys()):
            break
        if (inp == 'q' or inp == 'b'):
            return ('', '', '', inp)
    # From here on, checks/caps are recomputed for the chosen option
    checks = {}
    if (options[inp].capability is None):
        if (capability == 'capabilitylistindex'):
            # all links should be to capabilitylist documents
            caps = ['capabilitylist']
        elif (capability in [
                'resourcelist', 'changelist', 'resourcedump', 'changedump'
        ]):
            # entries under these list types are plain resources
            caps = 'resource'
    else:
        r = options[inp]
        caps = [r.capability]
        # Record values to sanity-check against the next document's headers
        if (r.length is not None):
            checks['content-length'] = r.length
        if (r.lastmod is not None):
            checks['last-modified'] = r.lastmod
    # FIXME - could do sanity check here and issue warnings if odd
    return (options[inp].uri, checks, caps, inp)
def test3_with_md5(self):
    """InventoryBuilder with do_md5 emits rs:size and rs:md5 per resource."""
    ib = InventoryBuilder(do_md5=True)
    i = ib.from_disk('resync/test/testdata/dir1', 'http://example.org/t')
    xml = Sitemap().inventory_as_xml(i)
    expected = (
        ('file_a', '<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+</lastmod><rs:size>20</rs:size><rs:md5>6bf26fd66601b528d2e0b47eaa87edfd</rs:md5>'),
        ('file_b', '<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+</lastmod><rs:size>45</rs:size><rs:md5>452e54bdae1626ac5d6e7be81b39de21</rs:md5>'),
    )
    for name, pattern in expected:
        self.assertNotEqual(None, re.search(pattern, xml),
                            'size/checksum for ' + name)
def test_02_resource_deleted(self):
    """Serialize a DELETED ResourceChange, then round-trip via an Inventory."""
    # ResourceChange with deleted
    r1 = ResourceChange('http://example.org/r/1', 1234, 9999,
                        'Q2hlY2sgSW50ZWdyaXR5IQ==', changetype='DELETED')
    expected = ("<?xml version='1.0' encoding='UTF-8'?>\n"
                '<url><loc>http://example.org/r/1</loc>'
                '<expires>1970-01-01T00:20:34Z</expires>'
                '<rs:size>9999</rs:size>'
                '<rs:fixity type="md5">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity>'
                '</url>')
    self.assertEqual(Sitemap().resource_as_xml(r1), expected)
    # Round-trip: serialize an inventory holding r1 and parse it back
    inv = Inventory()
    inv.add(r1)
    inv_xml = Sitemap().resources_as_xml(inv)
    s = Sitemap()
    s.resource_class = ResourceChange
    parsed = s.inventory_parse_xml(fh=StringIO.StringIO(inv_xml))
    self.assertEqual(len(parsed), 1)
    r = iter(parsed).next()
    self.assertEqual(r.uri, 'http://example.org/r/1')
    self.assertEqual(r.timestamp, 1234)
    self.assertEqual(r.changetype, 'DELETED')
def test_10_sitemap(self):
    """Parse a one-resource ResourceSync sitemap and verify every field."""
    xml = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
        'xmlns:rs="http://www.openarchives.org/rs/terms/">'
        '<url><loc>http://e.com/a</loc>'
        '<lastmod>2012-03-14T18:37:36Z</lastmod>'
        '<rs:md hash="md5:Q2hlY2sgSW50ZWdyaXR5IQ==" length="12" /></url>'
        '</urlset>')
    s = Sitemap()
    i = s.parse_xml(fh=io.StringIO(xml))
    self.assertFalse(s.parsed_index, 'was a sitemap')
    self.assertEqual(s.resources_created, 1, 'got 1 resources')
    for r in i.resources:
        self.assertTrue(r is not None, 'got the uri expected')
        self.assertEqual(r.uri, 'http://e.com/a')
        self.assertEqual(r.lastmod, '2012-03-14T18:37:36Z')
        self.assertEqual(r.length, 12)
        self.assertEqual(r.md5, 'Q2hlY2sgSW50ZWdyaXR5IQ==')
def test_12_parse_multi_loc(self):
    """Duplicate, empty, or content-free <loc> elements must be rejected."""
    xml_start = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
        'xmlns:rs="http://www.openarchives.org/rs/terms/"><url>')
    xml_end = '<lastmod>2012-03-14T18:37:36Z</lastmod></url></urlset>'
    s = Sitemap()
    bad_locs = (
        # two <loc> elements inside a single <url>
        '<loc>/tmp/rs_test/src/file_a</loc><loc>/tmp/rs_test/src/file_b</loc>',
        # empty <loc>
        '<loc></loc>',
        # self-closing <loc> carrying only an attribute
        '<loc att="value"/>',
    )
    for loc in bad_locs:
        self.assertRaises(SitemapParseError, s.parse_xml,
                          io.StringIO(xml_start + loc + xml_end))
def convert_to_xml(self, resources, sitemap_index=False, fh=None):
    """Write or return XML for a set of resources in sitemap format.

    Arguments:
    - resources - either an iterable or iterator of Resource objects;
                  if there an md attribute this will go to <rs:md>
                  if there an ln attribute this will go to <rs:ln>
    - sitemap_index - set True to write sitemapindex instead of sitemap
      (the docstring previously named this parameter "sitemapindex",
      which did not match the signature)
    - fh - write to filehandle fh instead of returning string

    Returns None when there are no resources to serialize.
    """
    sitemap = Sitemap()
    self.res_container = resources
    # Truthiness instead of len(): the docstring promises iterators are
    # accepted, and len() would raise TypeError on a plain iterator.
    if not self.res_container:
        return None
    return sitemap.resources_as_xml(self.res_container,
                                    sitemapindex=sitemap_index,
                                    fh=fh)
def test3_with_md5(self):
    """from_disk with do_md5 emits base64 rs:fixity values (+ is escaped)."""
    ib = InventoryBuilder(do_md5=True)
    ib.mapper = Mapper(
        ['http://example.org/t', 'resync/test/testdata/dir1'])
    xml = Sitemap().resources_as_xml(ib.from_disk())
    patterns = (
        # must escape + in md5
        '<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>20</rs:size><rs:fixity type="md5">a/Jv1mYBtSjS4LR\+qoft/Q==</rs:fixity>',
        '<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>45</rs:size><rs:fixity type="md5">RS5Uva4WJqxdbnvoGzneIQ==</rs:fixity>',
    )
    for pattern in patterns:
        self.assertNotEqual(None, re.search(pattern, xml))
def generator():
    """Yield (resource, path, relpath) for each non-deleted resource
    found in the last known sitemaps; warn and count missing files."""
    for file_name in self.paras.last_sitemaps:
        listbase = ListBaseWithIndex()
        if not os.path.exists(file_name):
            # Sitemap file disappeared: record the error and tell observers
            LOG.warning("Unable to read sitemap: %s" % file_name)
            self.count_errors += 1
            self.observers_inform(
                self, ResourceAuditorEvent.site_map_not_found,
                file=file_name)
            continue
        with open(file_name, "r", encoding="utf-8") as lb_file:
            Sitemap().parse_xml(lb_file, resources=listbase)
        for resource in listbase.resources:
            # Skip resources explicitly marked deleted (None counts as live)
            if resource.change != "deleted":
                path, relpath = self.extract_paths(resource.uri)
                yield resource, path, relpath
def test_21_parse_sitemapindex(self):
    """Parse a sitemapindex, then parse more entries into the same object."""
    s = Sitemap()
    ns = 'xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"'
    first = ('<sitemapindex ' + ns + '><sitemap><loc>aaa</loc></sitemap>'
             '<sitemap><loc>bbb</loc></sitemap></sitemapindex>')
    si = s.parse_xml(fh=io.StringIO(first), sitemapindex=True)
    self.assertEqual(len(si.resources), 2, '2 sitemaps')
    self.assertEqual(sorted(si.uris()), ['aaa', 'bbb'])
    # add a couple more
    second = ('<sitemapindex ' + ns + '><sitemap><loc>cc</loc></sitemap>'
              '<sitemap><loc>dd</loc></sitemap></sitemapindex>')
    s.parse_xml(fh=io.StringIO(second), resources=si)
    self.assertTrue(s.parsed_index, 'was a sitemapindex')
    self.assertEqual(len(si.resources), 4, '4 sitemaps total')
    self.assertEqual(sorted(si.uris()), ['aaa', 'bbb', 'cc', 'dd'])
def test_30_parse_change_list(self):
    """Parse a change list; rs:md change attribute maps to resource.change."""
    xml = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
        'xmlns:rs="http://www.openarchives.org/rs/terms/">'
        '<url><loc>/tmp/rs_test/src/file_a</loc>'
        '<lastmod>2012-03-14T18:37:36Z</lastmod>'
        '<rs:md change="updated" length="12" /></url>'
        '<url><loc>/tmp/rs_test/src/file_b</loc>'
        '<lastmod>2012-03-14T18:37:36Z</lastmod>'
        '<rs:md length="32" /></url></urlset>')
    s = Sitemap()
    s.resource_class = Resource
    c = s.parse_xml(fh=io.StringIO(xml))
    self.assertEqual(s.resources_created, 2, 'got 2 resources')
    expectations = (('/tmp/rs_test/src/file_a', 'updated'),
                    ('/tmp/rs_test/src/file_b', None))
    for r, (uri, change) in zip(c, expectations):
        self.assertEqual(r.uri, uri)
        self.assertEqual(r.change, change)
def test_19_parse_with_bad_rs_ln(self):
    """Invalid rs:ln elements raise SitemapParseError; extras are ignored."""
    xmlstart = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
        'xmlns:rs="http://www.openarchives.org/rs/terms/">'
        '<rs:md capability="resourcelist"/>'
        '<url><loc>http://example.com/file_a</loc>')
    xmlend = '</url></urlset>'
    s = Sitemap()
    bad_links = (
        '<rs:ln rel="duplicate"/>',                                          # missing href
        '<rs:ln href="http://example.com/"/>',                               # missing rel
        '<rs:ln rel="duplicate" href="http://example.com/" length="a"/>',    # bad length
        '<rs:ln rel="duplicate" href="http://example.com/" pri="fff"/>',     # bad pri
        '<rs:ln rel="duplicate" href="http://example.com/" pri="0"/>',       # pri below range
        '<rs:ln rel="duplicate" href="http://example.com/" pri="1000000"/>', # pri above range
    )
    for ln in bad_links:
        self.assertRaises(SitemapParseError, s.parse_xml,
                          fh=StringIO.StringIO(xmlstart + ln + xmlend))
    # and finally OK with errors fixed
    good = ('<rs:ln rel="duplicate" href="http://example.com/" '
            'length="12345" pri="1" other="whatever"/>')
    rc = s.parse_xml(fh=StringIO.StringIO(xmlstart + good + xmlend))
    self.assertEqual(len(rc.resources), 1,
                     'good at last, extra attribute ignored')
def test_all_simple_read(self):
    """Just try to read each one"""
    # Same fixture set as before, generated instead of listed by hand:
    # archives examples plus resourcesync examples 1-8 and 12-33.
    examples = (
        ['archives_ex_%s' % n
         for n in ('2_1', '2_2', '3_1', '3_2', '4_1', '5_1', '6_1')] +
        ['resourcesync_ex_%d' % n
         for n in list(range(1, 9)) + list(range(12, 34))])
    for ex in examples:
        fh = self._open_ex(ex)
        Sitemap().parse_xml(fh=fh)
def test_13_parse_multi_lastmod(self):
    """Duplicate <lastmod> is an error; empty <lastmod> parses as None."""
    xml_start = (
        "<?xml version='1.0' encoding='UTF-8'?>\n"
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
        'xmlns:rs="http://www.openarchives.org/rs/terms/">'
        '<url><loc>uri:a</loc>')
    xml_end = '</url></urlset>'
    s = Sitemap()
    two_lastmod = '<lastmod>2013-01-01</lastmod><lastmod>2013-01-02</lastmod>'
    self.assertRaises(SitemapParseError, s.parse_xml,
                      io.StringIO(xml_start + two_lastmod + xml_end))
    # <lastmod> is optional, so an empty element or one carrying only an
    # attribute is unambiguous and must parse with resource.lastmod None.
    for empty_lastmod in ('<lastmod></lastmod>', '<lastmod att="value"/>'):
        i = s.parse_xml(fh=io.StringIO(xml_start + empty_lastmod + xml_end))
        self.assertEqual(s.resources_created, 1)
        self.assertEqual(i.resources[0].lastmod, None)
def parse_sitemap(self): s = Sitemap(allow_multifile=self.allow_multifile) self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap)) i = s.read(self.sitemap) num_entries = len(i) self.logger.warning("Read sitemap with %d entries in %d sitemaps" % (num_entries, s.sitemaps_created)) if (self.verbose): to_show = 100 override_str = ' (override with --max-sitemap-entries)' if (self.max_sitemap_entries): to_show = self.max_sitemap_entries override_str = '' if (num_entries > to_show): print "Showing first %d entries sorted by URI%s..." % ( to_show, override_str) n = 0 for r in i: print r n += 1 if (n >= to_show): break
def parse_sitemap(self): s = Sitemap(verbose=self.verbose, allow_multifile=self.allow_multifile) if (self.verbose): print "Reading sitemap(s) from %s ..." % (sitemap) i = s.read(sitemap) num_entries = len(i) print "Read sitemap with %d entries in %d sitemaps" % ( num_entries, s.sitemaps_created) if (self.verbose): to_show = 100 override_str = ' (override with --max-sitemap-entries)' if (self.max_sitemap_entries): to_show = self.max_sitemap_entries override_str = '' if (num_entries > to_show): print "Showing first %d entries sorted by URI%s..." % ( to_show, override_str) n = 0 for r in i.resource_uris(): print i.resources[r] n += 1 if (n >= to_show): break
def changeset_sitemap(self, outfile=None, ref_sitemap=None, newref_sitemap=None, empty=None, capabilities=None, dump=None): changeset = ChangeSet() changeset.capabilities = capabilities if (not empty): # 1. Get and parse reference sitemap old_inv = self.read_reference_sitemap(ref_sitemap) # 2. Depending on whether a newref_sitemap was specified, either read that # or build inventory from files on disk if (newref_sitemap is None): # Get inventory from disk new_inv = self.inventory else: new_inv = self.read_reference_sitemap(newref_sitemap, name='new reference') # 3. Calculate changeset (same, updated, deleted, created) = old_inv.compare(new_inv) changeset.add_changed_resources(updated, changetype='UPDATED') changeset.add_changed_resources(deleted, changetype='DELETED') changeset.add_changed_resources(created, changetype='CREATED') # 4. Write out changeset s = Sitemap(pretty_xml=True, allow_multifile=self.allow_multifile, mapper=self.mapper) if (self.max_sitemap_entries is not None): s.max_sitemap_entries = self.max_sitemap_entries if (outfile is None): print s.resources_as_xml(changeset, changeset=True) else: s.write(changeset, basename=outfile, changeset=True) self.write_dump_if_requested(changeset, dump)
def sitemap(self):
    """Creates a sitemap inventory"""
    # Regenerate the inventory, then serialize it as sitemap XML
    self.inventory.generate()
    xml_writer = Sitemap()
    return xml_writer.inventory_as_xml(self.inventory)