def get_spiders_by_url(self, url):
    """Return the names of all the available spiders that are preferred to
    handle the given url.

    A spider is selected when ``is_url_from_any_domain`` reports that ``url``
    belongs either to the spider's own name (used as a domain) or to one of
    the entries of the spider class' optional ``allowed_domains`` attribute.

    :param url: the url to find candidate spiders for
    :return: list with the names of the matching spiders, in the iteration
        order of ``self._spiders``
    """
    spiders = []
    # ``items()`` is available on both Python 2 and Python 3 mappings,
    # whereas ``iteritems()`` only exists on Python 2.
    # NOTE(review): assumes ``self._spiders`` maps spider name -> spider class.
    for name, cls in self._spiders.items():
        # ``allowed_domains`` may be missing, None, or a non-list iterable
        # such as a tuple; normalise it so the concatenation cannot fail.
        extra_domains = getattr(cls, "allowed_domains", None) or ()
        allowed_domains = [name] + list(extra_domains)
        if is_url_from_any_domain(url, allowed_domains):
            spiders.append(name)
    return spiders
def get_spiders_by_url(self, url):
    '''Return the names of all the available spiders that are preferred to
    handle the given url.

    For every registered spider the candidate domains are the spider name
    itself plus whatever the spider class declares in its optional
    ``allowed_domains`` attribute; the spider is reported when
    ``is_url_from_any_domain`` accepts ``url`` for those domains.

    :param url: the url to find candidate spiders for
    :return: list with the names of the matching spiders
    '''
    spiders = []
    # Use ``items()`` rather than the Python 2-only ``iteritems()`` so the
    # lookup also works on Python 3 mappings.
    # NOTE(review): assumes ``self._spiders`` maps spider name -> spider class.
    for name, cls in self._spiders.items():
        # Tolerate a missing/None attribute and tuple-style declarations,
        # either of which would break a plain ``list + value`` concatenation.
        declared_domains = getattr(cls, 'allowed_domains', None) or ()
        allowed_domains = [name] + list(declared_domains)
        if is_url_from_any_domain(url, allowed_domains):
            spiders.append(name)
    return spiders
def url_allowed(self, url):
    """Tell whether ``url`` passes every filter configured on this object.

    The checks run in a fixed order and stop at the first one that rejects
    the url:

    1. the scheme must be ``http``, ``https`` or ``file``;
    2. when ``self.filter_mobile`` is set, the network location must not
       start with ``m.`` or ``pda.``;
    3. ``self.allow_res`` / ``self.deny_res`` are applied to the url string
       through ``_matches``;
    4. ``self.allow_domains`` / ``self.deny_domains`` are applied to the
       parsed url through ``is_url_from_any_domain``;
    5. ``self.deny_extensions`` is applied to the parsed url through
       ``has_url_any_extension``.

    A filter whose setting is empty/falsy is skipped.

    :param url: the url to check; it is normalised with ``to_str`` first
    :return: ``True`` when no filter rejects the url, ``False`` otherwise
    """
    url = to_str(url)
    parts = urlparse(url)

    if parts.scheme not in ['http', 'https', 'file']:
        return False

    # filter mobile and pda sites
    if self.filter_mobile and parts.netloc.startswith(('m.', 'pda.')):
        return False

    # Regex based filters work on the full url string.
    if self.allow_res and not _matches(url, self.allow_res):
        return False
    if self.deny_res and _matches(url, self.deny_res):
        return False

    # Domain and extension filters work on the already parsed url.
    if self.allow_domains and not is_url_from_any_domain(parts, self.allow_domains):
        return False
    if self.deny_domains and is_url_from_any_domain(parts, self.deny_domains):
        return False
    if self.deny_extensions and has_url_any_extension(parts, self.deny_extensions):
        return False

    return True
def test_is_url_from_any_domain(self):
    """Check ``is_url_from_any_domain`` against a table of expectations.

    Covers: a ``www.`` sub-domain, the bare domain, a partial domain suffix
    that must not match, mixed-case hosts and domains, a host with an
    explicit port, and ``javascript:`` pseudo-urls.
    """
    js_url = 'javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29'

    # (url, domains, expected result) -- evaluated in this exact order.
    expectations = [
        ('http://www.wheele-bin-art.co.uk/get/product/123', ['wheele-bin-art.co.uk'], True),
        ('http://www.wheele-bin-art.co.uk/get/product/123', ['art.co.uk'], False),
        ('http://wheele-bin-art.co.uk/get/product/123', ['wheele-bin-art.co.uk'], True),
        ('http://wheele-bin-art.co.uk/get/product/123', ['art.co.uk'], False),
        ('http://www.Wheele-Bin-Art.co.uk/get/product/123', ['wheele-bin-art.CO.UK'], True),
        ('http://www.Wheele-Bin-Art.co.uk/get/product/123', ['WHEELE-BIN-ART.CO.UK'], True),
        ('http://192.169.0.15:8080/mypage.html', ['192.169.0.15:8080'], True),
        ('http://192.169.0.15:8080/mypage.html', ['192.169.0.15'], False),
        (js_url, ['testdomain.com'], False),
        (js_url + '.testdomain.com', ['testdomain.com'], False),
    ]

    for candidate, domains, expected in expectations:
        outcome = is_url_from_any_domain(candidate, domains)
        if expected:
            self.assertTrue(outcome)
        else:
            self.assertFalse(outcome)
def test_is_url_from_any_domain(self):
    """Exercise ``is_url_from_any_domain`` url by url.

    Each scenario lists the url under test, the domain lists that must be
    accepted for it and the domain lists that must be rejected.
    """
    script_url = 'javascript:%20document.orderform_2581_1190810811.mode.value=%27add%27;%20javascript:%20document.orderform_2581_1190810811.submit%28%29'

    # (url, domain lists expected to match, domain lists expected not to match)
    scenarios = (
        # host with a "www." prefix
        ('http://www.wheele-bin-art.co.uk/get/product/123',
         [['wheele-bin-art.co.uk']],
         [['art.co.uk']]),
        # bare host
        ('http://wheele-bin-art.co.uk/get/product/123',
         [['wheele-bin-art.co.uk']],
         [['art.co.uk']]),
        # mixed-case host and domains
        ('http://www.Wheele-Bin-Art.co.uk/get/product/123',
         [['wheele-bin-art.CO.UK'], ['WHEELE-BIN-ART.CO.UK']],
         []),
        # host with an explicit port
        ('http://192.169.0.15:8080/mypage.html',
         [['192.169.0.15:8080']],
         [['192.169.0.15']]),
        # javascript pseudo-urls never belong to a domain
        (script_url,
         [],
         [['testdomain.com']]),
        (script_url + '.testdomain.com',
         [],
         [['testdomain.com']]),
    )

    for url, accepted, rejected in scenarios:
        for domains in accepted:
            self.assertTrue(is_url_from_any_domain(url, domains))
        for domains in rejected:
            self.assertFalse(is_url_from_any_domain(url, domains))