Ejemplo n.º 1
0
 def test_22_parse_sitemapindex_file(self):
     """Parse a sitemapindex from a file and check the three sitemap URIs.

     Verifies that parse_xml(..., sitemapindex=True) flags parsed_index
     and yields the expected child sitemap locations.
     """
     s = Sitemap()
     # FIX: the file handle used to be opened and never closed (leak);
     # a context manager closes it even if parsing raises.
     with open('tests/testdata/sitemapindex1/sitemap.xml', 'r') as fh:
         si = s.parse_xml(fh=fh, sitemapindex=True)
     self.assertTrue(s.parsed_index, 'was a sitemapindex')
     self.assertEqual(len(si.resources), 3, '3 sitemaps')
     sms = sorted(si.uris())
     self.assertEqual(sms, [
         'http://localhost:8888/sitemap00000.xml',
         'http://localhost:8888/sitemap00001.xml',
         'http://localhost:8888/sitemap00002.xml'
     ])
Ejemplo n.º 2
0
 def test_16_parse_valid_xml_but_other(self):
     """Well-formed XML that is not a sitemap must raise SitemapParseError.

     Covers a urlset in a foreign namespace and a foreign root element
     in the sitemap namespace.
     """
     parser = Sitemap()
     bad_documents = (
         '<urlset xmlns="http://example.org/other_namespace"> </urlset>',
         '<other xmlns="http://www.sitemaps.org/schemas/sitemap/0.9"> </other>',
     )
     for doc in bad_documents:
         self.assertRaises(SitemapParseError, parser.parse_xml,
                           io.StringIO(doc))
Ejemplo n.º 3
0
    def test33_write(self):
        """Round-trip a ResourceList and a ResourceList index through write()/parse_xml()."""
        # Plain ResourceList: write to disk, parse back, must NOT be an index.
        resource_list = ResourceList()
        for letter in ('a', 'b', 'c'):
            resource_list.add(
                Resource(uri='http://example.com/test/' + letter, timestamp=1))

        rl_filename = os.path.join(self.tmpdir,
                                   'test33_write_resourcelist.xml')
        resource_list.write(basename=rl_filename)

        with open(rl_filename, 'r') as f:
            parser = Sitemap()
            parser.parse_xml(fh=f)
            self.assertFalse(parser.parsed_index)

        # ResourceList flagged as sitemapindex: must parse back as an index.
        index = ResourceList()
        for num in ('00000', '00001', '00002'):
            index.add(
                Resource(uri='http://example.com/test/resourcelist%s.xml' % num,
                         timestamp=1))
        index.sitemapindex = True

        rli_filename = os.path.join(self.tmpdir,
                                    'test33_write_resourcelist-index.xml')
        index.write(basename=rli_filename)

        with open(rli_filename, 'r') as f:
            parser = Sitemap()
            parser.parse_xml(fh=f)
            self.assertTrue(parser.parsed_index)
Ejemplo n.º 4
0
 def test_07_print(self):
     """Serializing a three-resource list yields the expected urlset XML."""
     resources = ResourceList(md={'capability': 'resourcelist',
                                  'modified': None})
     for uri, lastmod, length in (('a', '2001-01-01', 1234),
                                  ('b', '2002-02-02', 56789),
                                  ('c', '2003-03-03', 0)):
         resources.add(Resource(uri=uri, lastmod=lastmod, length=length))
     # Exact serialization, including the rs:md capability element.
     self.assertEqual(
         Sitemap().resources_as_xml(resources),
         "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://www.openarchives.org/rs/terms/\"><rs:md capability=\"resourcelist\" /><url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:md length=\"1234\" /></url><url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod><rs:md length=\"56789\" /></url><url><loc>c</loc><lastmod>2003-03-03T00:00:00Z</lastmod><rs:md length=\"0\" /></url></urlset>"
     )
Ejemplo n.º 5
0
 def explore_links_get(self, uri, seen=None):
     """Read the sitemap at *uri*, record its links in *seen*, return them.

     Arguments:
     - uri -- location of the sitemap/capability document to read
     - seen -- dict mapping already-visited URIs to their extracted
       links; a fresh dict is created when omitted

     Returns a tuple (s.changeset_read, links).
     """
     # FIX: the default used to be a mutable list ([]), which is shared
     # across calls and cannot take the `seen[uri] = links` item
     # assignment below (TypeError); use a fresh dict per call.
     if seen is None:
         seen = {}
     # Check we haven't been here before
     # NOTE(review): the message says "skipping" but execution falls
     # through and re-reads the URI — confirm whether an early return
     # was intended here.
     if (uri in seen):
         self.logger.warning("Already see %s, skipping" % (uri))
     s = Sitemap(allow_multifile=self.allow_multifile)
     self.logger.info("Reading sitemap from %s ..." % (uri))
     i = s.read(uri, index_only=True)
     self.logger.warning("Read %s from %s" % (s.read_type, uri))
     links = self.extract_links(i, verbose=True)
     if ('next' in links and links['next'] == uri):
         self.logger.warning("- self reference \"next\" link")
     seen[uri] = links
     return (s.changeset_read, links)
Ejemplo n.º 6
0
 def test_09_print_subset(self):
     """Serialize only the entries named in entries=, in the order given."""
     r1 = Resource(uri='a', lastmod='2001-01-01', size=1234)
     r2 = Resource(uri='b', lastmod='2002-02-02', size=56789)
     r3 = Resource(uri='c', lastmod='2003-03-03', size=0)
     # FIX: this resource used to be assigned to r3 as well, shadowing
     # resource 'c' so it was never added to the inventory.
     r4 = Resource(uri='d', lastmod='2003-03-04', size=444)
     m = Inventory()
     m.add(r1)
     m.add(r2)
     m.add(r3)
     m.add(r4)
     # Only 'd' and 'b' are requested, and in that order.
     self.assertEqual(
         Sitemap().inventory_as_xml(m, entries=['d', 'b']),
         "<?xml version='1.0' encoding='UTF-8'?>\n<urlset xmlns=\"http://www.sitemaps.org/schemas/sitemap/0.9\" xmlns:rs=\"http://resourcesync.org/change/0.1\"><url><loc>d</loc><lastmod>2003-03-04T00:00:00</lastmod><rs:size>444</rs:size></url><url><loc>b</loc><lastmod>2002-02-02T00:00:00</lastmod><rs:size>56789</rs:size></url></urlset>"
     )
Ejemplo n.º 7
0
 def test_21_parse_sitemapindex(self):
     """Parse one sitemapindex, then merge a second into the same object."""
     s = Sitemap()
     index_a = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                '<sitemap><loc>aaa</loc></sitemap>'
                '<sitemap><loc>bbb</loc></sitemap></sitemapindex>')
     si = s.sitemapindex_parse_xml(fh=StringIO.StringIO(index_a))
     self.assertEqual(s.sitemaps_created, 2, '2 sitemaps in sitemapindex')
     self.assertEqual(len(si.resources), 2, '2 sitemaps')
     self.assertEqual(sorted(si.resources.keys()), ['aaa', 'bbb'])
     # add a couple more
     index_b = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                '<sitemap><loc>cc</loc></sitemap>'
                '<sitemap><loc>dd</loc></sitemap></sitemapindex>')
     s.sitemapindex_parse_xml(fh=StringIO.StringIO(index_b), sitemapindex=si)
     self.assertEqual(s.sitemaps_created, 2, '2 sitemaps created to sitemapindex')
     self.assertEqual(len(si.resources), 4, '4 sitemaps total')
     self.assertEqual(sorted(si.resources.keys()), ['aaa', 'bbb', 'cc', 'dd'])
Ejemplo n.º 8
0
 def test_04_resource_str(self):
     """Resource with rs:ln links serializes the links after rs:md."""
     res = Resource(uri='4a', lastmod="2013-01-02", length=9999, md5='ab54de')
     res.ln = [{'rel': 'duplicate',
                'pri': '1',
                'href': 'http://mirror1.example.com/res1',
                'modified': '2013-01-02T18:00:00Z'}]
     self.assertEqual(
         Sitemap().resource_as_xml(res),
         "<url><loc>4a</loc><lastmod>2013-01-02T00:00:00Z</lastmod><rs:md hash=\"md5:ab54de\" length=\"9999\" /><rs:ln href=\"http://mirror1.example.com/res1\" modified=\"2013-01-02T18:00:00Z\" pri=\"1\" rel=\"duplicate\" /></url>"
     )
     # add another two rs:ln's — they must appear after the first link.
     res.ln.append({'rel': 'num2'})
     res.ln.append({'rel': 'num3'})
     self.assertEqual(
         Sitemap().resource_as_xml(res),
         "<url><loc>4a</loc><lastmod>2013-01-02T00:00:00Z</lastmod><rs:md hash=\"md5:ab54de\" length=\"9999\" /><rs:ln href=\"http://mirror1.example.com/res1\" modified=\"2013-01-02T18:00:00Z\" pri=\"1\" rel=\"duplicate\" /><rs:ln rel=\"num2\" /><rs:ln rel=\"num3\" /></url>"
     )
Ejemplo n.º 9
0
    def test_10_sitemap(self):
        """Parse a one-entry urlset and verify every resource field."""
        xml = ("<?xml version='1.0' encoding='UTF-8'?>\n"
               '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1">'
               '<url><loc>http://e.com/a</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>12</rs:size><rs:md5>aabbccdd</rs:md5></url>'
               '</urlset>')
        s = Sitemap()
        inv = s.inventory_parse_xml(fh=StringIO.StringIO(xml))
        self.assertEqual(s.resources_created, 1, 'got 1 resources')
        res = inv.resources['http://e.com/a']
        self.assertTrue(res is not None, 'got the uri expected')
        self.assertEqual(res.uri, 'http://e.com/a')
        self.assertEqual(res.lastmod, '2012-03-14T18:37:36')
        self.assertEqual(res.size, 12)
        self.assertEqual(res.md5, 'aabbccdd')
Ejemplo n.º 10
0
 def write_sitemap(self, outfile=None, capabilities=None, dump=None):
     """Write the inventory as a sitemap.

     Prints the XML to stdout when *outfile* is None, otherwise writes
     to *outfile*; finally writes a dump if *dump* requests one.
     """
     # Get inventory (the attribute handles base_path->base_uri mapping).
     inventory = self.inventory
     inventory.capabilities = capabilities
     sitemap = Sitemap(pretty_xml=True,
                       allow_multifile=self.allow_multifile,
                       mapper=self.mapper)
     if self.max_sitemap_entries is not None:
         sitemap.max_sitemap_entries = self.max_sitemap_entries
     if outfile is None:
         # Parenthesized single expression: identical output under
         # Python 2's print statement.
         print(sitemap.resources_as_xml(inventory,
                                        capabilities=inventory.capabilities))
     else:
         sitemap.write(inventory, basename=outfile)
     self.write_dump_if_requested(inventory, dump)
Ejemplo n.º 11
0
def get_from_date_from_url(url):
    """Return the smallest resource timestamp at *url* as a 'YYYY-MM-DD' string.

    Parses the sitemap at *url* and scans every resource for a truthy
    timestamp. Returns None when no resource carries a timestamp.
    IOError from opening the URL propagates to the caller.
    """
    s = Sitemap()
    # FIX: this call used to be wrapped in `except IOError as e: raise e`,
    # a no-op re-raise; letting the error propagate naturally is equivalent
    # and keeps the original traceback.
    document = s.parse_xml(url_or_file_open(url))
    timestamps = [item.timestamp for item in document.resources
                  if item.timestamp]
    if timestamps:
        return dt.fromtimestamp(min(timestamps)).strftime("%Y-%m-%d")
    return None
Ejemplo n.º 12
0
    def test_30_parse_changeset(self):
        """Changeset parse: changetype and changeid are read per resource."""
        xml = ("<?xml version='1.0' encoding='UTF-8'?>\n"
               '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://resourcesync.org/change/0.1">'
               '<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>12</rs:size><rs:changetype>UP</rs:changetype></url>'
               '<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36</lastmod><rs:size>32</rs:size><rs:changeid>123</rs:changeid></url>'
               '</urlset>')
        s = Sitemap()
        s.resource_class = ResourceChange
        inv = s.inventory_parse_xml(fh=StringIO.StringIO(xml))
        self.assertEqual(s.resources_created, 2, 'got 2 resources')
        file_a = inv.resources['/tmp/rs_test/src/file_a']
        file_b = inv.resources['/tmp/rs_test/src/file_b']
        # file_a carries only a changetype, file_b only a changeid.
        self.assertEqual(file_a.changetype, 'UP')
        self.assertEqual(file_a.changeid, None)
        self.assertEqual(file_b.changetype, None)
        self.assertEqual(file_b.changeid, '123')
Ejemplo n.º 13
0
    def update_previous_state(self):
        """Build self.previous_resources from resourcelists and changedumps.

        Populates the dict only once (no-op when already set). Resources
        from resourcelists are loaded wholesale; changedump entries then
        create/update or delete individual resources on top of them.
        Also records the completion date of the last resourcelist read.
        """
        if self.previous_resources is None:
            self.previous_resources = {}

            # search for resourcelists
            # FIX: this glob used the changedump pattern
            # ("changedump_*.xml") — identical to the glob below — so the
            # dumps were parsed twice, once as resourcelists; match
            # resourcelist files instead.
            self.resourcelist_files = sorted(
                glob(self.param.abs_metadata_path("resourcelist_*.xml")))
            for rl_file_name in self.resourcelist_files:
                resourcelist = ResourceList()
                with open(rl_file_name, "r", encoding="utf-8") as rl_file:
                    sm = Sitemap()
                    sm.parse_xml(rl_file, resources=resourcelist)

                # Prefer md_completed; fall back to md_at when absent.
                self.date_resourcelist_completed = resourcelist.md_completed
                if self.date_resourcelist_completed is None:
                    self.date_resourcelist_completed = resourcelist.md_at

                self.previous_resources.update({
                    resource.uri: resource
                    for resource in resourcelist.resources
                })

            # search for changedumps
            self.changedump_files = sorted(
                glob(self.param.abs_metadata_path("changedump_*.xml")))
            for cl_file_name in self.changedump_files:
                changedump = ChangeDump()
                with open(cl_file_name, "r", encoding="utf-8") as cl_file:
                    sm = Sitemap()
                    sm.parse_xml(cl_file, resources=changedump)

                # Replay each change on the accumulated resource map.
                for resource in changedump.resources:
                    if resource.change == "created" or resource.change == "updated":
                        self.previous_resources.update(
                            {resource.uri: resource})
                    elif resource.change == "deleted" and resource.uri in self.previous_resources:
                        del self.previous_resources[resource.uri]
Ejemplo n.º 14
0
    def explore_uri(self, uri, checks, caps, show_back=True):
        """Interactive exploration of document at uri

        Will flag warnings if the document is not of type listed in caps.

        Arguments:
        - uri -- location of the document to show
        - checks -- headers to verify when showing a plain resource
        - caps -- either the string 'resource' or a list of expected
          capability names for the document at uri
        - show_back -- offer a b(ack) option in the prompt

        Returns a tuple (next_uri, checks, caps, input_char); the first
        three are empty strings when the user quits or goes back, or
        when reading/parsing fails.

        NOTE: Python 2 code (print statements, raw_input, urllib.urlopen).
        """
        s = Sitemap()
        print "Reading %s" % (uri)
        options = {}
        capability = None
        try:
            if (caps == 'resource'):
                # Plain resource: only show/verify HTTP headers.
                self.explore_show_head(uri, check_headers=checks)
            else:
                # Capability document: parse and summarize follow options.
                # (Shadows the `list` builtin for the rest of the try body.)
                list = s.parse_xml(urllib.urlopen(uri))
                (options, capability) = self.explore_show_summary(
                    list, s.parsed_index, caps)
        except IOError as e:
            print "Cannot read %s (%s)\nGoing back" % (uri, str(e))
            return ('', '', '', 'b')
        except Exception as e:
            print "Cannot parse %s (%s)\nGoing back" % (uri, str(e))
            return ('', '', '', 'b')
        # Prompt until the user picks a listed option, quits, or goes back.
        while (True):
            # don't offer number option for no resources/capabilities
            num_prompt = '' if (len(options) == 0) else 'number, '
            up_prompt = 'b(ack), ' if (show_back) else ''
            inp = raw_input("Follow [%s%sq(uit)]?" % (num_prompt, up_prompt))
            if (inp in options.keys()):
                break
            if (inp == 'q' or inp == 'b'):
                return ('', '', '', inp)
        # Build expectations for the chosen option.
        checks = {}
        if (options[inp].capability is None):
            # Capability unknown: infer what the child documents must be
            # from the capability of the current document.
            if (capability == 'capabilitylistindex'):
                # all links should be to capabilitylist documents
                caps = ['capabilitylist']
            elif (capability in [
                    'resourcelist', 'changelist', 'resourcedump', 'changedump'
            ]):
                caps = 'resource'
        else:
            # Capability given explicitly; carry over length/lastmod as
            # header checks for the next document.
            r = options[inp]
            caps = [r.capability]
            if (r.length is not None):
                checks['content-length'] = r.length
            if (r.lastmod is not None):
                checks['last-modified'] = r.lastmod
            # FIXME - could do sanity check here and issue warnings if odd
        return (options[inp].uri, checks, caps, inp)
Ejemplo n.º 15
0
 def test3_with_md5(self):
     """Inventory built with do_md5=True carries size and md5 per file."""
     builder = InventoryBuilder(do_md5=True)
     inventory = builder.from_disk('resync/test/testdata/dir1',
                                   'http://example.org/t')
     xml = Sitemap().inventory_as_xml(inventory)
     expectations = (
         ('<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+</lastmod><rs:size>20</rs:size><rs:md5>6bf26fd66601b528d2e0b47eaa87edfd</rs:md5>',
          'size/checksum for file_a'),
         ('<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+</lastmod><rs:size>45</rs:size><rs:md5>452e54bdae1626ac5d6e7be81b39de21</rs:md5>',
          'size/checksum for file_b'),
     )
     for pattern, label in expectations:
         self.assertNotEqual(None, re.search(pattern, xml), label)
Ejemplo n.º 16
0
 def test_02_resource_deleted(self):
     """A DELETED ResourceChange serializes with <expires> and round-trips."""
     deleted = ResourceChange('http://example.org/r/1',
                              1234,
                              9999,
                              'Q2hlY2sgSW50ZWdyaXR5IQ==',
                              changetype='DELETED')
     self.assertEqual(
         Sitemap().resource_as_xml(deleted),
         "<?xml version='1.0' encoding='UTF-8'?>\n<url><loc>http://example.org/r/1</loc><expires>1970-01-01T00:20:34Z</expires><rs:size>9999</rs:size><rs:fixity type=\"md5\">Q2hlY2sgSW50ZWdyaXR5IQ==</rs:fixity></url>"
     )
     # Round-trip: serialize an inventory holding it, then parse back.
     inv = Inventory()
     inv.add(deleted)
     inv_xml = Sitemap().resources_as_xml(inv)
     parser = Sitemap()
     parser.resource_class = ResourceChange
     parsed = parser.inventory_parse_xml(fh=StringIO.StringIO(inv_xml))
     self.assertEqual(len(parsed), 1)
     r = next(iter(parsed))
     self.assertEqual(r.uri, 'http://example.org/r/1')
     self.assertEqual(r.timestamp, 1234)
     self.assertEqual(r.changetype, 'DELETED')
Ejemplo n.º 17
0
    def test_10_sitemap(self):
        """Parse a urlset with rs:md hash/length and check every field."""
        xml = ("<?xml version='1.0' encoding='UTF-8'?>\n"
               '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
               '<url><loc>http://e.com/a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md hash="md5:Q2hlY2sgSW50ZWdyaXR5IQ==" length="12" /></url>'
               '</urlset>')
        s = Sitemap()
        parsed = s.parse_xml(fh=io.StringIO(xml))
        self.assertFalse(s.parsed_index, 'was a sitemap')
        self.assertEqual(s.resources_created, 1, 'got 1 resources')
        for res in parsed.resources:
            self.assertTrue(res is not None, 'got the uri expected')
            self.assertEqual(res.uri, 'http://e.com/a')
            self.assertEqual(res.lastmod, '2012-03-14T18:37:36Z')
            self.assertEqual(res.length, 12)
            self.assertEqual(res.md5, 'Q2hlY2sgSW50ZWdyaXR5IQ==')
Ejemplo n.º 18
0
    def test_12_parse_multi_loc(self):
        """Duplicate, empty, or content-less <loc> raises SitemapParseError."""
        xml_start = ("<?xml version='1.0' encoding='UTF-8'?>\n"
                     '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
                     '<url>')
        xml_end = '<lastmod>2012-03-14T18:37:36Z</lastmod></url></urlset>'
        s = Sitemap()
        bad_locs = (
            # two <loc> elements in a single <url>
            '<loc>/tmp/rs_test/src/file_a</loc><loc>/tmp/rs_test/src/file_b</loc>',
            # empty <loc>
            '<loc></loc>',
            # <loc> with an attribute but no content
            '<loc att="value"/>',
        )
        for loc in bad_locs:
            self.assertRaises(SitemapParseError, s.parse_xml,
                              io.StringIO(xml_start + loc + xml_end))
Ejemplo n.º 19
0
    def convert_to_xml(self, resources, sitemap_index=False, fh=None):
        """Write or return XML for a set of resources in sitemap format.

        Arguments:
        - resources - either an iterable or iterator of Resource objects;
                      if there an md attribute this will go to <rs:md>
                      if there an ln attribute this will go to <rs:ln>
        - sitemap_index - set True to write sitemapindex instead of sitemap
        - fh - write to filehandle fh instead of returning string

        Returns None — without writing anything to fh — when *resources*
        is empty; otherwise returns whatever Sitemap.resources_as_xml
        returns.
        """
        sitemap = Sitemap()
        # Keep a reference to the resources being serialized.
        self.res_container = resources
        # NOTE(review): len() requires a sized container, so a pure
        # iterator (as the docstring allows) would fail here — confirm.
        if len(self.res_container) == 0:
            return
        return sitemap.resources_as_xml(self.res_container,
                                        sitemapindex=sitemap_index,
                                        fh=fh)
Ejemplo n.º 20
0
 def test3_with_md5(self):
     """With do_md5=True every resource gets an rs:fixity md5 element."""
     builder = InventoryBuilder(do_md5=True)
     builder.mapper = Mapper(
         ['http://example.org/t', 'resync/test/testdata/dir1'])
     xml = Sitemap().resources_as_xml(builder.from_disk())
     # must escape + in md5
     checks = (
         '<loc>http://example.org/t/file_a</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>20</rs:size><rs:fixity type="md5">a/Jv1mYBtSjS4LR\+qoft/Q==</rs:fixity>',
         '<loc>http://example.org/t/file_b</loc><lastmod>[\w\:\-]+Z</lastmod><rs:size>45</rs:size><rs:fixity type="md5">RS5Uva4WJqxdbnvoGzneIQ==</rs:fixity>',
     )
     for pattern in checks:
         self.assertNotEqual(None, re.search(pattern, xml))
Ejemplo n.º 21
0
 def generator():
     """Yield (resource, path, relpath) for each non-deleted resource
     listed in the last sitemaps; report sitemap files that are missing."""
     for file_name in self.paras.last_sitemaps:
         # Guard clause: missing sitemap file — report and move on.
         if not os.path.exists(file_name):
             LOG.warning("Unable to read sitemap: %s" % file_name)
             self.count_errors += 1
             self.observers_inform(
                 self,
                 ResourceAuditorEvent.site_map_not_found,
                 file=file_name)
             continue
         listbase = ListBaseWithIndex()
         with open(file_name, "r", encoding="utf-8") as lb_file:
             sm = Sitemap()
             sm.parse_xml(lb_file, resources=listbase)
         for resource in listbase.resources:
             # A change of None counts as "not deleted" as well.
             if resource.change is None or not resource.change == "deleted":
                 path, relpath = self.extract_paths(resource.uri)
                 yield resource, path, relpath
Ejemplo n.º 22
0
 def test_21_parse_sitemapindex(self):
     """Parse a sitemapindex, then parse further entries into the same object."""
     s = Sitemap()
     index_a = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                '<sitemap><loc>aaa</loc></sitemap>'
                '<sitemap><loc>bbb</loc></sitemap></sitemapindex>')
     si = s.parse_xml(fh=io.StringIO(index_a), sitemapindex=True)
     self.assertEqual(len(si.resources), 2, '2 sitemaps')
     self.assertEqual(sorted(si.uris()), ['aaa', 'bbb'])
     # add a couple more
     index_b = ('<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'
                '<sitemap><loc>cc</loc></sitemap>'
                '<sitemap><loc>dd</loc></sitemap></sitemapindex>')
     s.parse_xml(fh=io.StringIO(index_b), resources=si)
     self.assertTrue(s.parsed_index, 'was a sitemapindex')
     self.assertEqual(len(si.resources), 4, '4 sitemaps total')
     self.assertEqual(sorted(si.uris()), ['aaa', 'bbb', 'cc', 'dd'])
Ejemplo n.º 23
0
    def test_30_parse_change_list(self):
        """rs:md change attribute is parsed; an absent change yields None."""
        xml = ("<?xml version='1.0' encoding='UTF-8'?>\n"
               '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
               '<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md change="updated" length="12" /></url>'
               '<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" /></url>'
               '</urlset>')
        s = Sitemap()
        s.resource_class = Resource
        resources = iter(s.parse_xml(fh=io.StringIO(xml)))
        self.assertEqual(s.resources_created, 2, 'got 2 resources')
        first = next(resources)
        self.assertEqual(first.uri, '/tmp/rs_test/src/file_a')
        self.assertEqual(first.change, 'updated')
        second = next(resources)
        self.assertEqual(second.uri, '/tmp/rs_test/src/file_b')
        self.assertEqual(second.change, None)
Ejemplo n.º 24
0
    def test_19_parse_with_bad_rs_ln(self):
        """Invalid rs:ln elements raise SitemapParseError; a valid one parses."""
        xmlstart = ("<?xml version='1.0' encoding='UTF-8'?>\n"
                    '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
                    '<rs:md capability="resourcelist"/>'
                    '<url><loc>http://example.com/file_a</loc>')
        xmlend = '</url></urlset>'
        s = Sitemap()
        bad_links = (
            # missing href
            '<rs:ln rel="duplicate"/>',
            # missing rel
            '<rs:ln href="http://example.com/"/>',
            # bad length
            '<rs:ln rel="duplicate" href="http://example.com/" length="a"/>',
            # bad pri (non-numeric, below range, above range)
            '<rs:ln rel="duplicate" href="http://example.com/" pri="fff"/>',
            '<rs:ln rel="duplicate" href="http://example.com/" pri="0"/>',
            '<rs:ln rel="duplicate" href="http://example.com/" pri="1000000"/>',
        )
        for ln in bad_links:
            self.assertRaises(SitemapParseError,
                              s.parse_xml,
                              fh=StringIO.StringIO(xmlstart + ln + xmlend))
        # and finally OK with errors fixes
        xml = xmlstart + '<rs:ln rel="duplicate" href="http://example.com/" length="12345" pri="1" other="whatever"/>' + xmlend
        rc = s.parse_xml(fh=StringIO.StringIO(xml))
        self.assertEqual(len(rc.resources), 1,
                         'good at last, extra attribute ignored')
Ejemplo n.º 25
0
 def test_all_simple_read(self):
     """Just try to read each one"""
     examples = ['archives_ex_2_1', 'archives_ex_2_2',
                 'archives_ex_3_1', 'archives_ex_3_2',
                 'archives_ex_4_1',
                 'archives_ex_5_1',
                 'archives_ex_6_1']
     # resourcesync examples 1-8 and 12-33
     examples.extend('resourcesync_ex_%d' % n
                     for n in list(range(1, 9)) + list(range(12, 34)))
     for ex in examples:
         # Success criterion is simply that parsing does not raise.
         s = Sitemap()
         fh = self._open_ex(ex)
         si = s.parse_xml(fh=fh)
Ejemplo n.º 26
0
    def test_13_parse_multi_lastmod(self):
        """Two <lastmod> elements are an error; empty lastmod parses as None."""
        xml_start = ("<?xml version='1.0' encoding='UTF-8'?>\n"
                     '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
                     '<url><loc>uri:a</loc>')
        xml_end = '</url></urlset>'
        s = Sitemap()
        two_lastmod = '<lastmod>2013-01-01</lastmod><lastmod>2013-01-02</lastmod>'
        self.assertRaises(SitemapParseError, s.parse_xml,
                          io.StringIO(xml_start + two_lastmod + xml_end))
        # While it not ideal to omit, <lastmod> is not required and
        # thus either empty lastmod or lastmod with just an attribute
        # and no content are not ambiguous and thus should be accepted
        # with resulting None for resource.lastmod
        for empty_lastmod in ('<lastmod></lastmod>', '<lastmod att="value"/>'):
            i = s.parse_xml(fh=io.StringIO(xml_start + empty_lastmod + xml_end))
            self.assertEqual(s.resources_created, 1)
            self.assertEqual(i.resources[0].lastmod, None)
Ejemplo n.º 27
0
 def parse_sitemap(self):
     """Read the sitemap(s) at self.sitemap and log a summary.

     When self.verbose is set, also prints up to self.max_sitemap_entries
     (default 100) entries. NOTE: Python 2 code (print statements).
     """
     s = Sitemap(allow_multifile=self.allow_multifile)
     self.logger.info("Reading sitemap(s) from %s ..." % (self.sitemap))
     i = s.read(self.sitemap)
     num_entries = len(i)
     self.logger.warning("Read sitemap with %d entries in %d sitemaps" %
                         (num_entries, s.sitemaps_created))
     if (self.verbose):
         # Cap the number of entries printed unless the user overrode it.
         to_show = 100
         override_str = ' (override with --max-sitemap-entries)'
         if (self.max_sitemap_entries):
             to_show = self.max_sitemap_entries
             override_str = ''
         if (num_entries > to_show):
             print "Showing first %d entries sorted by URI%s..." % (
                 to_show, override_str)
         n = 0
         for r in i:
             print r
             n += 1
             if (n >= to_show):
                 break
Ejemplo n.º 28
0
 def parse_sitemap(self):
     s = Sitemap(verbose=self.verbose, allow_multifile=self.allow_multifile)
     if (self.verbose):
         print "Reading sitemap(s) from %s ..." % (sitemap)
     i = s.read(sitemap)
     num_entries = len(i)
     print "Read sitemap with %d entries in %d sitemaps" % (
         num_entries, s.sitemaps_created)
     if (self.verbose):
         to_show = 100
         override_str = ' (override with --max-sitemap-entries)'
         if (self.max_sitemap_entries):
             to_show = self.max_sitemap_entries
             override_str = ''
         if (num_entries > to_show):
             print "Showing first %d entries sorted by URI%s..." % (
                 to_show, override_str)
         n = 0
         for r in i.resource_uris():
             print i.resources[r]
             n += 1
             if (n >= to_show):
                 break
Ejemplo n.º 29
0
 def changeset_sitemap(self,
                       outfile=None,
                       ref_sitemap=None,
                       newref_sitemap=None,
                       empty=None,
                       capabilities=None,
                       dump=None):
     """Build a changeset by comparing two inventories and write it out.

     Arguments:
     - outfile -- file to write the changeset to; printed to stdout
       when None
     - ref_sitemap -- reference (old) sitemap to compare against
     - newref_sitemap -- new reference sitemap; when None the new state
       is taken from the inventory on disk
     - empty -- when truthy, skip the comparison and write an empty
       changeset
     - capabilities -- capabilities to attach to the changeset
     - dump -- passed through to write_dump_if_requested

     NOTE: Python 2 code (print statement).
     """
     changeset = ChangeSet()
     changeset.capabilities = capabilities
     if (not empty):
         # 1. Get and parse reference sitemap
         old_inv = self.read_reference_sitemap(ref_sitemap)
         # 2. Depending on whether a newref_sitemap was specified, either read that
         # or build inventory from files on disk
         if (newref_sitemap is None):
             # Get inventory from disk
             new_inv = self.inventory
         else:
             new_inv = self.read_reference_sitemap(newref_sitemap,
                                                   name='new reference')
         # 3. Calculate changeset
         (same, updated, deleted, created) = old_inv.compare(new_inv)
         changeset.add_changed_resources(updated, changetype='UPDATED')
         changeset.add_changed_resources(deleted, changetype='DELETED')
         changeset.add_changed_resources(created, changetype='CREATED')
     # 4. Write out changeset
     s = Sitemap(pretty_xml=True,
                 allow_multifile=self.allow_multifile,
                 mapper=self.mapper)
     if (self.max_sitemap_entries is not None):
         s.max_sitemap_entries = self.max_sitemap_entries
     if (outfile is None):
         print s.resources_as_xml(changeset, changeset=True)
     else:
         s.write(changeset, basename=outfile, changeset=True)
     self.write_dump_if_requested(changeset, dump)
Ejemplo n.º 30
0
 def sitemap(self):
     """Creates a sitemap inventory.

     Regenerates self.inventory, then returns its sitemap XML
     serialization as a string.
     """
     self.inventory.generate()
     return Sitemap().inventory_as_xml(self.inventory)