Example #1
0
 def __init__(self, *a, **kw):
     """Initialise the spider and prepare its sitemap matching state.

     All positional and keyword arguments are forwarded unchanged to the
     parent initialiser.  Every ``(rule, callback)`` pair found in
     ``self.sitemap_rules`` is stored in ``self._cbs`` with the rule passed
     through ``regex()``; a callback supplied as a string is replaced by
     the attribute of that name on this instance.
     """
     super(SitemapSpider, self).__init__(*a, **kw)

     self._cbs = []
     for pattern, callback in self.sitemap_rules:
         # A string callback names an attribute of the spider itself.
         handler = getattr(self, callback) if isinstance(callback, basestring) else callback
         self._cbs.append((regex(pattern), handler))

     # Entries of sitemap_follow go through regex() as well.
     self._follow = [
         regex(entry)
         for entry in self.sitemap_follow
     ]

     self._current_sitemap_url = None
     # Keep a slice copy instead of aliasing the sitemap_urls attribute.
     self._sitemap_urls = self.sitemap_urls[:]
     self._site_urls = []
Example #2
0
 def __init__(self, *a, **kw):
     """Initialise the spider and prepare its sitemap matching state.

     All positional and keyword arguments are forwarded unchanged to the
     parent initialiser.  Every ``(rule, callback)`` pair found in
     ``self.sitemap_rules`` is stored in ``self._cbs`` with the rule passed
     through ``regex()``; a callback supplied as a string is replaced by
     the attribute of that name on this instance.
     """
     super(SitemapSpider, self).__init__(*a, **kw)

     self._cbs = []
     for pattern, callback in self.sitemap_rules:
         # A string callback names an attribute of the spider itself.
         handler = getattr(self, callback) if isinstance(callback, basestring) else callback
         self._cbs.append((regex(pattern), handler))

     # Entries of sitemap_follow go through regex() as well.
     self._follow = [
         regex(entry)
         for entry in self.sitemap_follow
     ]

     self._current_sitemap_url = None
     # Keep a slice copy instead of aliasing the sitemap_urls attribute.
     self._sitemap_urls = self.sitemap_urls[:]
     self._site_urls = []
    def __init__(self, allow=None, deny=None,
                 allow_domains=None, deny_domains=None,
                 tags=['a', 'area', 'link'], attrs=['href'], unique=True,
                 deny_extensions=None, filter_mobile=True):
        """Store the filtering configuration for this extractor.

        ``allow`` / ``deny`` are normalised with ``arg_to_iter`` and each
        item is passed through ``regex()``; the results are kept in
        ``allow_res`` / ``deny_res``.
        ``allow_domains`` / ``deny_domains`` are normalised with
        ``arg_to_iter`` and kept as sets.
        ``tags`` / ``attrs`` are copied into lists and exposed as the
        membership predicates ``tag_func`` / ``attr_func``.
        ``unique`` and ``filter_mobile`` are stored as given.
        ``deny_extensions`` falls back to ``IGNORED_EXTENSIONS`` when it is
        ``None``; each entry is stored with a leading ``'.'``.
        """
        allow_patterns = arg_to_iter(allow)
        self.allow_res = [regex(pattern) for pattern in allow_patterns]
        deny_patterns = arg_to_iter(deny)
        self.deny_res = [regex(pattern) for pattern in deny_patterns]

        self.allow_domains = set(arg_to_iter(allow_domains))
        self.deny_domains = set(arg_to_iter(deny_domains))
        self.unique = unique

        # No explicit list given: use the module-level default extensions.
        extensions = IGNORED_EXTENSIONS if deny_extensions is None else deny_extensions
        self.deny_extensions = {'.' + ext for ext in extensions}
        self.filter_mobile = filter_mobile

        # The predicates close over private list copies, so the caller's
        # (or the default) sequences are never referenced afterwards.
        tag_names = list(arg_to_iter(tags))
        self.tag_func = lambda x: x in tag_names

        attr_names = list(arg_to_iter(attrs))
        self.attr_func = lambda x: x in attr_names
 def test_regex(self):
     """regex() must return a compiled pattern for every accepted input.

     Covers a raw string, a unicode string and an already compiled
     pattern object.
     """
     compiled_type = type(re.compile(''))
     samples = (r'.*', u'abc', re.compile('.'))
     for sample in samples:
         self.assertIsInstance(regex(sample), compiled_type)
Example #5
0
 def test_regex(self):
     """regex() must return a compiled pattern for every accepted input.

     Covers a raw string, a unicode string and an already compiled
     pattern object.
     """
     compiled_type = type(re.compile(''))
     samples = (r'.*', u'abc', re.compile('.'))
     for sample in samples:
         self.assertIsInstance(regex(sample), compiled_type)