Ejemplo n.º 1
0
 def test_08_print(self):
     """Serialize a list of three resources and compare the exact XML."""
     listing = ListBaseWithIndex()
     for uri, lastmod, length in (('a', '2001-01-01', 1234),
                                  ('b', '2002-02-02', 56789),
                                  ('c', '2003-03-03', 0)):
         listing.add(Resource(uri=uri, lastmod=lastmod, length=length))
     # Clear md['from'] so that "now" is not added to the output.
     listing.md['from'] = None
     expected = (
         '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
         '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
         'xmlns:rs="http://www.openarchives.org/rs/terms/">'
         '<rs:md capability="unknown" />'
         '<url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:md length="1234" /></url>'
         '<url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod><rs:md length="56789" /></url>'
         '<url><loc>c</loc><lastmod>2003-03-03T00:00:00Z</lastmod><rs:md length="0" /></url>'
         '</urlset>'
     )
     self.assertEqual(listing.as_xml(), expected)
Ejemplo n.º 2
0
    def test03_parse_2(self):
        """Parse a urlset document with two <url> entries and count them."""
        # Adjacent literals are concatenated without separators, giving the
        # same single-line document as a backslash-continued literal.
        document = (
            '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
            '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
            '<rs:md capability="unknown" from="2013-02-12T14:09:00Z" />'
            '<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="12" /></url>'
            '<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" /></url>'
            '</urlset>'
        )
        listing = ListBaseWithIndex()
        stream = io.StringIO(document)
        listing.parse(fh=stream)
        self.assertEqual(len(listing.resources), 2, 'got 2 resources')
Ejemplo n.º 3
0
    def test_11_parse_2(self):
        """Parse a urlset document with two <url> entries and count them."""
        # Adjacent literals are concatenated without separators, giving the
        # same single-line document as a backslash-continued literal.
        document = (
            '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
            '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" xmlns:rs="http://www.openarchives.org/rs/terms/">'
            '<rs:md capability="unknown" from="2013-02-12T14:09:00Z" />'
            '<url><loc>/tmp/rs_test/src/file_a</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="12" /></url>'
            '<url><loc>/tmp/rs_test/src/file_b</loc><lastmod>2012-03-14T18:37:36Z</lastmod><rs:md length="32" /></url>'
            '</urlset>'
        )
        listing = ListBaseWithIndex()
        stream = StringIO.StringIO(document)
        listing.parse(fh=stream)
        self.assertEqual(len(listing.resources), 2, 'got 2 resources')
Ejemplo n.º 4
0
 def test20_index_as_xml(self):
     """Check index_as_xml() for an empty list and after adding a resource."""
     listing = ListBaseWithIndex()
     # Empty case: only the XML header and the capability metadata.
     empty_index = (
         '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
         '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
         'xmlns:rs="http://www.openarchives.org/rs/terms/">'
         '<rs:md capability="unknown" />'
         '</sitemapindex>'
     )
     self.assertEqual(listing.index_as_xml(), empty_index)
     # After adding one resource its loc/lastmod pair must show up.
     listing.add(Resource(uri='a', lastmod='2001-01-01', length=1234))
     index_xml = listing.index_as_xml()
     found = re.search(
         r'<loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod>', index_xml)
     self.assertTrue(found)
Ejemplo n.º 5
0
 def test01_print(self):
     """Serialize a list of three resources and compare the exact XML."""
     listing = ListBaseWithIndex()
     for fields in (dict(uri='a', lastmod='2001-01-01', length=1234),
                    dict(uri='b', lastmod='2002-02-02', length=56789),
                    dict(uri='c', lastmod='2003-03-03', length=0)):
         listing.add(Resource(**fields))
     # Clear md['from'] so that "now" is not added to the output.
     listing.md['from'] = None
     expected = (
         '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
         '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
         'xmlns:rs="http://www.openarchives.org/rs/terms/">'
         '<rs:md capability="unknown" />'
         '<url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:md length="1234" /></url>'
         '<url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod><rs:md length="56789" /></url>'
         '<url><loc>c</loc><lastmod>2003-03-03T00:00:00Z</lastmod><rs:md length="0" /></url>'
         '</urlset>'
     )
     self.assertEqual(listing.as_xml(), expected)
Ejemplo n.º 6
0
def render_well_know_resourcesync():
    """Build the 'description' capability document and return it as XML.

    The document carries a 'describedby' link pointing at the request's URL
    root and lists one resource: '<url_root>resync/capability.xml', tagged
    with capability 'capability'.
    """
    describedby_link = {'href': request.url_root, 'rel': 'describedby'}
    description = ListBaseWithIndex(capability_name='description',
                                    ln=[describedby_link])
    capability_uri = '{}resync/capability.xml'.format(request.url_root)
    description.add(Resource(capability_uri, capability='capability'))
    return description.as_xml()
Ejemplo n.º 7
0
 def test17_as_xml_index(self):
     """Check as_xml_index() output and its refusal when no index is needed."""
     listing = ListBaseWithIndex(resources=[
         Resource(uri='a', lastmod='2006-01-01', length=12),
         Resource(uri='b', lastmod='2007-02-02', length=34),
         Resource(uri='c', lastmod='2008-03-03', length=56),
     ])
     # Three resources with at most two per sitemap.
     listing.max_sitemap_entries = 2
     index_xml = listing.as_xml_index()
     self.assertTrue(
         re.search(r'<loc>/tmp/sitemap00001.xml</loc>', index_xml))
     self.assertFalse(
         re.search(r'<loc>/tmp/sitemap00002.xml</loc>', index_xml))
     # All three resources fit in one sitemap: an index is not required.
     listing.max_sitemap_entries = 3
     with self.assertRaises(ListBaseIndexError):
         listing.as_xml_index()
Ejemplo n.º 8
0
 def test18_as_xml_part(self):
     """Check as_xml_part() error cases and the content of part 1."""
     listing = ListBaseWithIndex(resources=[
         Resource(uri='a', lastmod='2006-01-01', length=12),
         Resource(uri='b', lastmod='2007-02-02', length=34),
         Resource(uri='c', lastmod='2008-03-03', length=56),
     ])
     # Unlimited entries: asking for a part makes no sense.
     listing.max_sitemap_entries = None
     with self.assertRaises(ListBaseIndexError):
         listing.as_xml_part()
     # Part number past the end.
     listing.max_sitemap_entries = 1
     with self.assertRaises(ListBaseIndexError):
         listing.as_xml_part(part_number=9)
     # One entry per part: part 1 holds only 'b'.
     listing.max_sitemap_entries = 1
     part_xml = listing.as_xml_part(part_number=1)
     self.assertFalse(re.search(r'<loc>a</loc>', part_xml))
     self.assertTrue(re.search(r'<loc>b</loc>', part_xml))
     self.assertFalse(re.search(r'<loc>c</loc>', part_xml))
     # Two entries per part: part 1 is truncated and holds only 'c'.
     listing.max_sitemap_entries = 2
     part_xml = listing.as_xml_part(part_number=1)
     self.assertFalse(re.search(r'<loc>a</loc>', part_xml))
     self.assertFalse(re.search(r'<loc>b</loc>', part_xml))
     self.assertTrue(re.search(r'<loc>c</loc>', part_xml))
Ejemplo n.º 9
0
 def test17_as_xml_index(self):
     """Check as_xml_index() output and its refusal when no index is needed."""
     entries = [Resource(uri='a', lastmod='2006-01-01', length=12),
                Resource(uri='b', lastmod='2007-02-02', length=34),
                Resource(uri='c', lastmod='2008-03-03', length=56)]
     listing = ListBaseWithIndex(resources=entries)
     # Three resources with at most two per sitemap.
     listing.max_sitemap_entries = 2
     index_xml = listing.as_xml_index()
     has_second = re.search(r'<loc>/tmp/sitemap00001.xml</loc>', index_xml)
     self.assertTrue(has_second)
     has_third = re.search(r'<loc>/tmp/sitemap00002.xml</loc>', index_xml)
     self.assertFalse(has_third)
     # All three resources fit in one sitemap: an index is not required.
     listing.max_sitemap_entries = 3
     with self.assertRaises(ListBaseIndexError):
         listing.as_xml_index()
Ejemplo n.º 10
0
 def read_sitemap(self, path, sitemap=None):
     """Parse the sitemap file at ``path`` into ``sitemap`` and return it.

     A new ListBaseWithIndex is created when ``sitemap`` is None. The file
     is opened as UTF-8 text and passed to ``Sitemap.parse_xml`` with the
     target list as its ``resources`` argument.
     """
     target = ListBaseWithIndex() if sitemap is None else sitemap
     with open(path, "r", encoding="utf-8") as handle:
         Sitemap().parse_xml(handle, resources=target)
     return target
Ejemplo n.º 11
0
 def test_08_print_iter(self):
     """Exercise as_xml() when the resources are supplied as an iterator."""
     entries = [
         Resource(uri='a', lastmod='2001-01-01', length=1234),
         Resource(uri='b', lastmod='2002-02-02', length=56789),
         Resource(uri='c', lastmod='2003-03-03', length=0),
     ]
     # No count supplied: as_xml() is expected to fail with TypeError
     # (on the len() attempt against the iterator).
     uncounted = ListBaseWithIndex(resources=iter(entries))
     with self.assertRaises(TypeError):
         uncounted.as_xml()
     # Explicit count larger than max_sitemap_entries: as_xml() must raise.
     oversized = ListBaseWithIndex(resources=iter(entries), count=3)
     oversized.max_sitemap_entries = 2
     with self.assertRaises(ListBaseIndexError):
         oversized.as_xml()
     # Explicit count and no lowered limit: the full XML is expected.
     counted = ListBaseWithIndex(resources=iter(entries), count=3)
     counted.md['from'] = None  # avoid "now" being added
     expected = (
         '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
         '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
         'xmlns:rs="http://www.openarchives.org/rs/terms/">'
         '<rs:md capability="unknown" />'
         '<url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:md length="1234" /></url>'
         '<url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod><rs:md length="56789" /></url>'
         '<url><loc>c</loc><lastmod>2003-03-03T00:00:00Z</lastmod><rs:md length="0" /></url>'
         '</urlset>'
     )
     self.assertEqual(counted.as_xml(), expected)
Ejemplo n.º 12
0
 def test16_as_xml(self):
     """Check as_xml() for the single-file, refused and multifile cases."""
     listing = ListBaseWithIndex(resources=[
         Resource(uri='a', lastmod='2014-04-14', length=14),
         Resource(uri='b', lastmod='2015-05-15', length=15),
         Resource(uri='c', lastmod='2016-06-16', length=16),
     ])
     # Limit equals the number of resources: everything goes in one file.
     listing.max_sitemap_entries = 3
     single_xml = listing.as_xml()
     for pattern in (r'<urlset ',
                     r'<loc>a</loc>',
                     r'<loc>b</loc>',
                     r'<loc>c</loc>'):
         self.assertTrue(re.search(pattern, single_xml))
     # Needs multifile but it is not allowed.
     listing.max_sitemap_entries = 1
     with self.assertRaises(ListBaseIndexError):
         listing.as_xml()
     # Multifile allowed: a sitemap index is returned instead.
     index_xml = listing.as_xml(allow_multifile=True)
     for pattern in (r'<sitemapindex',
                     r'<loc>/tmp/sitemap00001.xml</loc>',
                     r'<loc>/tmp/sitemap00002.xml</loc>'):
         self.assertTrue(re.search(pattern, index_xml))
     self.assertFalse(
         re.search(r'<loc>/tmp/sitemap00003.xml</loc>', index_xml))
Ejemplo n.º 13
0
    def get_change_dump_index(self):
        """
        Build the change dump index and return it as XML.

        Starting at ``self.publish_date`` (or the current UTC time when it
        is unset), one ``changedump.xml`` resource is added for every window
        of ``self.interval_by_date`` days until the current UTC time is
        reached; the final window is cut off at the current time. Each
        resource carries the window bounds as ``md_from`` / ``md_until``.

        Arguments:
        Returns:
            The XML produced by ``ListBaseWithIndex.as_xml()``, or None when
            ``self._validation()`` returns a falsy value.
        Raises:
            ValueError: when ``self.interval_by_date`` is not positive, since
                the window would never advance.

        """
        if not self._validation():
            return None
        changedump = ListBaseWithIndex(capability_name='changedump')
        changedump.up = INVENIO_CAPABILITY_URL.format(request.url_root)
        change_date = self.publish_date or datetime.datetime.utcnow()
        # Both bounds are stamped as UTC below, so the upper bound must come
        # from the UTC clock too (kept naive to stay comparable with
        # publish_date, which is compared as a naive value here).
        day_now = datetime.datetime.utcnow()
        step = timedelta(days=self.interval_by_date)

        while change_date < day_now:
            until = min(change_date + step, day_now)
            if until <= change_date:
                # A zero or negative interval would loop forever.
                raise ValueError('interval_by_date must be a positive '
                                 'number of days')
            change = Resource(
                '{}/{}/changedump.xml'.format(self.url_path,
                                              change_date.strftime(r"%Y%m%d")),
                capability='changedump',
                md_from=change_date.replace(
                    tzinfo=datetime.timezone.utc).isoformat(),
                md_until=until.replace(
                    tzinfo=datetime.timezone.utc).isoformat())
            changedump.add(change)
            change_date = until
        return changedump.as_xml()
Ejemplo n.º 14
0
 def generator():
     """Yield (resource, path, relpath) for resources in the last sitemaps.

     Each file in ``self.paras.last_sitemaps`` is parsed into a fresh
     ListBaseWithIndex; resources whose ``change`` is "deleted" are skipped.
     A missing file is logged, counted in ``self.count_errors`` and
     reported to the observers.
     """
     for sitemap_path in self.paras.last_sitemaps:
         listbase = ListBaseWithIndex()
         if not os.path.exists(sitemap_path):
             LOG.warning("Unable to read sitemap: %s" % sitemap_path)
             self.count_errors += 1
             self.observers_inform(
                 self,
                 ResourceAuditorEvent.site_map_not_found,
                 file=sitemap_path)
             continue
         with open(sitemap_path, "r", encoding="utf-8") as handle:
             Sitemap().parse_xml(handle, resources=listbase)
         for entry in listbase.resources:
             if entry.change is not None and entry.change == "deleted":
                 continue
             path, relpath = self.extract_paths(entry.uri)
             yield entry, path, relpath
0
 def test20_index_as_xml(self):
     """Check index_as_xml() for an empty list and after adding a resource."""
     listing = ListBaseWithIndex()
     # Empty case: only the XML header and the capability metadata.
     xml_header = '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
     empty_body = (
         '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
         'xmlns:rs="http://www.openarchives.org/rs/terms/">'
         '<rs:md capability="unknown" />'
         '</sitemapindex>'
     )
     self.assertEqual(listing.index_as_xml(), xml_header + empty_body)
     # After adding one resource its loc/lastmod pair must show up.
     listing.add(Resource(uri='a', lastmod='2001-01-01', length=1234))
     index_xml = listing.index_as_xml()
     pattern = r'<loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod>'
     self.assertTrue(re.search(pattern, index_xml))
Ejemplo n.º 16
0
 def test02_print_iter(self):
     """Exercise as_xml() when the resources are supplied as an iterator."""
     entries = [
         Resource(uri='a', lastmod='2001-01-01', length=1234),
         Resource(uri='b', lastmod='2002-02-02', length=56789),
         Resource(uri='c', lastmod='2003-03-03', length=0),
     ]

     def fresh(**options):
         # Each list gets its own iterator over the same entries.
         return ListBaseWithIndex(resources=iter(entries), **options)

     # No count supplied: as_xml() is expected to fail with TypeError
     # (on the len() attempt against the iterator).
     listing = fresh()
     with self.assertRaises(TypeError):
         listing.as_xml()
     # Explicit count larger than max_sitemap_entries: as_xml() must raise.
     listing = fresh(count=3)
     listing.max_sitemap_entries = 2
     with self.assertRaises(ListBaseIndexError):
         listing.as_xml()
     # Explicit count and no lowered limit: the full XML is expected.
     listing = fresh(count=3)
     listing.md['from'] = None  # avoid "now" being added
     expected = (
         '<?xml version=\'1.0\' encoding=\'UTF-8\'?>\n'
         '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9" '
         'xmlns:rs="http://www.openarchives.org/rs/terms/">'
         '<rs:md capability="unknown" />'
         '<url><loc>a</loc><lastmod>2001-01-01T00:00:00Z</lastmod><rs:md length="1234" /></url>'
         '<url><loc>b</loc><lastmod>2002-02-02T00:00:00Z</lastmod><rs:md length="56789" /></url>'
         '<url><loc>c</loc><lastmod>2003-03-03T00:00:00Z</lastmod><rs:md length="0" /></url>'
         '</urlset>'
     )
     self.assertEqual(listing.as_xml(), expected)
Ejemplo n.º 17
0
 def test18_as_xml_part(self):
     """Check as_xml_part() error cases and the content of part 1."""
     entries = [
         Resource(uri='a', lastmod='2006-01-01', length=12),
         Resource(uri='b', lastmod='2007-02-02', length=34),
         Resource(uri='c', lastmod='2008-03-03', length=56),
     ]
     listing = ListBaseWithIndex(resources=entries)
     loc_a, loc_b, loc_c = r'<loc>a</loc>', r'<loc>b</loc>', r'<loc>c</loc>'
     # Unlimited entries: asking for a part makes no sense.
     listing.max_sitemap_entries = None
     with self.assertRaises(ListBaseIndexError):
         listing.as_xml_part()
     # Part number past the end.
     listing.max_sitemap_entries = 1
     with self.assertRaises(ListBaseIndexError):
         listing.as_xml_part(part_number=9)
     # One entry per part: part 1 holds only 'b'.
     listing.max_sitemap_entries = 1
     part_xml = listing.as_xml_part(part_number=1)
     self.assertFalse(re.search(loc_a, part_xml))
     self.assertTrue(re.search(loc_b, part_xml))
     self.assertFalse(re.search(loc_c, part_xml))
     # Two entries per part: part 1 is truncated and holds only 'c'.
     listing.max_sitemap_entries = 2
     part_xml = listing.as_xml_part(part_number=1)
     self.assertFalse(re.search(loc_a, part_xml))
     self.assertFalse(re.search(loc_b, part_xml))
     self.assertTrue(re.search(loc_c, part_xml))
Ejemplo n.º 18
0
 def test16_as_xml(self):
     """Check as_xml() for the single-file, refused and multifile cases."""
     entries = [Resource(uri='a', lastmod='2014-04-14', length=14),
                Resource(uri='b', lastmod='2015-05-15', length=15),
                Resource(uri='c', lastmod='2016-06-16', length=16)]
     listing = ListBaseWithIndex(resources=entries)
     # Limit equals the number of resources: everything goes in one file.
     listing.max_sitemap_entries = 3
     single_xml = listing.as_xml()
     self.assertTrue(re.search(r'<urlset ', single_xml))
     for loc_pattern in (r'<loc>a</loc>', r'<loc>b</loc>', r'<loc>c</loc>'):
         self.assertTrue(re.search(loc_pattern, single_xml))
     # Needs multifile but it is not allowed.
     listing.max_sitemap_entries = 1
     with self.assertRaises(ListBaseIndexError):
         listing.as_xml()
     # Multifile allowed: a sitemap index is returned instead.
     index_xml = listing.as_xml(allow_multifile=True)
     self.assertTrue(re.search(r'<sitemapindex', index_xml))
     for part_pattern in (r'<loc>/tmp/sitemap00001.xml</loc>',
                          r'<loc>/tmp/sitemap00002.xml</loc>'):
         self.assertTrue(re.search(part_pattern, index_xml))
     self.assertFalse(
         re.search(r'<loc>/tmp/sitemap00003.xml</loc>', index_xml))