Beispiel #1
0
 def get_spiders_by_url(self, url):
     """Return the names of all the available spiders that are preferred to
     handle the given url.

     A spider is selected when ``is_url_from_any_domain`` accepts ``url``
     for the spider's own name or for any entry of its optional
     ``allowed_domains`` class attribute.

     :param url: the url to look up; it is passed unchanged to
         ``is_url_from_any_domain``.
     :return: list of matching spider names, in the iteration order of
         ``self._spiders``.
     """
     spiders = []
     # ``items()`` is available on both Python 2 and Python 3 mappings,
     # whereas ``iteritems()`` only exists on Python 2.
     for name, cls in self._spiders.items():
         # ``allowed_domains`` may be missing, None or a tuple; normalise it
         # to a list so the concatenation below cannot raise TypeError.
         extra_domains = getattr(cls, "allowed_domains", None) or ()
         allowed_domains = [name] + list(extra_domains)
         if is_url_from_any_domain(url, allowed_domains):
             spiders.append(name)
     return spiders
 def get_spiders_by_url(self, url):
     '''Return the names of all the available spiders that are preferred to
     handle the given url.

     A spider matches when ``is_url_from_any_domain`` accepts ``url`` for
     either the spider's name or one of the entries of its optional
     ``allowed_domains`` class attribute.

     :param url: the url to look up; it is passed unchanged to
         ``is_url_from_any_domain``.
     :return: list of matching spider names, in the iteration order of
         ``self._spiders``.
     '''
     spiders = []
     # ``items()`` works on Python 2 and Python 3 mappings alike, while
     # ``iteritems()`` is Python 2 only.
     for name, cls in self._spiders.items():
         # Tolerate a missing, None or tuple ``allowed_domains`` instead of
         # failing with TypeError on the list concatenation.
         extra_domains = getattr(cls, 'allowed_domains', None) or ()
         allowed_domains = [name] + list(extra_domains)
         if is_url_from_any_domain(url, allowed_domains):
             spiders.append(name)
     return spiders
 def url_allowed(self, url):
     """Tell whether ``url`` passes every filter configured on this object.

     The url is converted with ``to_str`` and parsed with ``urlparse``.
     The checks run in a fixed order and the first one that fails rejects
     the url:

     1. the scheme must be ``http``, ``https`` or ``file``;
     2. if ``self.filter_mobile`` is set, the netloc must not start with
        ``m.`` or ``pda.``;
     3. if ``self.allow_res`` is set, ``_matches`` must accept the url;
     4. if ``self.deny_res`` is set, ``_matches`` must not accept the url;
     5. if ``self.allow_domains`` is set, the url must belong to one of them;
     6. if ``self.deny_domains`` is set, the url must belong to none of them;
     7. if ``self.deny_extensions`` is set, the url must have none of them.

     Returns True when every enabled check passes, False otherwise.
     """
     url = to_str(url)
     parts = urlparse(url)

     if parts.scheme not in ['http', 'https', 'file']:
         return False

     # filter mobile and pda sites
     if self.filter_mobile and parts.netloc.startswith(('m.', 'pda.')):
         return False

     # Regular-expression filters work on the full url string.
     if self.allow_res and not _matches(url, self.allow_res):
         return False
     if self.deny_res and _matches(url, self.deny_res):
         return False

     # Domain and extension filters work on the parsed url.
     if self.allow_domains and not is_url_from_any_domain(
             parts, self.allow_domains):
         return False
     if self.deny_domains and is_url_from_any_domain(
             parts, self.deny_domains):
         return False
     if self.deny_extensions and has_url_any_extension(
             parts, self.deny_extensions):
         return False

     return True
Beispiel #4
0
    def test_is_url_from_any_domain(self):
        """Exercise ``is_url_from_any_domain`` with a table of cases.

        Each row holds a url, the candidate domain list and whether the
        function is expected to report a match. Rows are checked in order.
        """
        product_www = 'http://www.wheele-bin-art.co.uk/get/product/123'
        product_bare = 'http://wheele-bin-art.co.uk/get/product/123'
        product_mixed_case = 'http://www.Wheele-Bin-Art.co.uk/get/product/123'
        ip_with_port = 'http://192.169.0.15:8080/mypage.html'
        js_url = 'javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29'

        cases = [
            # (url, domains, expected)
            # A "www." subdomain matches, a partial label does not.
            (product_www, ['wheele-bin-art.co.uk'], True),
            (product_www, ['art.co.uk'], False),
            # Same expectations without the "www." prefix.
            (product_bare, ['wheele-bin-art.co.uk'], True),
            (product_bare, ['art.co.uk'], False),
            # Letter case differs between url and domain.
            (product_mixed_case, ['wheele-bin-art.CO.UK'], True),
            (product_mixed_case, ['WHEELE-BIN-ART.CO.UK'], True),
            # The port has to be part of the domain entry.
            (ip_with_port, ['192.169.0.15:8080'], True),
            (ip_with_port, ['192.169.0.15'], False),
            # javascript: urls never match, even with a domain-like suffix.
            (js_url, ['testdomain.com'], False),
            (js_url + '.testdomain.com', ['testdomain.com'], False),
        ]

        for url, domains, expected in cases:
            check = self.assertTrue if expected else self.assertFalse
            check(is_url_from_any_domain(url, domains))
Beispiel #5
0
    def test_is_url_from_any_domain(self):
        """Verify which url/domain pairs ``is_url_from_any_domain`` accepts."""

        def accepted(url, domain):
            # The url must be reported as belonging to ``domain``.
            self.assertTrue(is_url_from_any_domain(url, [domain]))

        def rejected(url, domain):
            # The url must not be reported as belonging to ``domain``.
            self.assertFalse(is_url_from_any_domain(url, [domain]))

        # With and without "www.": the full domain matches, a partial
        # label ("art.co.uk") does not.
        for page in ('http://www.wheele-bin-art.co.uk/get/product/123',
                     'http://wheele-bin-art.co.uk/get/product/123'):
            accepted(page, 'wheele-bin-art.co.uk')
            rejected(page, 'art.co.uk')

        # Letter case differs between the url and the domain entries.
        mixed_case = 'http://www.Wheele-Bin-Art.co.uk/get/product/123'
        for domain in ('wheele-bin-art.CO.UK', 'WHEELE-BIN-ART.CO.UK'):
            accepted(mixed_case, domain)

        # The port is part of what gets compared.
        ip_page = 'http://192.169.0.15:8080/mypage.html'
        accepted(ip_page, '192.169.0.15:8080')
        rejected(ip_page, '192.169.0.15')

        # javascript: urls never match, even with a domain-like suffix.
        script = 'javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29'
        rejected(script, 'testdomain.com')
        rejected(script + '.testdomain.com', 'testdomain.com')