def test_generosity(self):
    robotstxt_incorrect = """
    Foo: Foobot
    Bar: /
    """
    rp = Protego.parse(content=robotstxt_incorrect)
    self.assertTrue(rp.can_fetch('http://foo.bar/x/y', 'FooBot'))

    robotstxt_incorrect_accepted = """
    user-agent foobot
    disallow /
    user agent harry potter
    disallow /horcrux
    request rate 1/10s 1820-1940
    """
    rp = Protego.parse(content=robotstxt_incorrect_accepted)
    self.assertFalse(rp.can_fetch('http://foo.bar/x/y', 'FooBot'))
    self.assertFalse(rp.can_fetch('http://foo.bar/horcrux', 'harry potter'))
    self.assertTrue(rp.can_fetch('http://foo.bar/abc', 'harry potter'))
    req_rate = rp.request_rate('harry potter')
    self.assertTrue(req_rate.requests == 1)
    self.assertTrue(req_rate.seconds == 10)
    self.assertTrue(req_rate.start_time.hour == 18)
    self.assertTrue(req_rate.start_time.minute == 20)
    self.assertTrue(req_rate.end_time.hour == 19)
    self.assertTrue(req_rate.end_time.minute == 40)

    wildcards_in_user_agent = """
    user-agent: foo*bot
    disallow: /myprofile
    """
    rp = Protego.parse(content=wildcards_in_user_agent)
    self.assertFalse(rp.can_fetch('http://foo.bar/myprofile', 'foo*bot'))
    self.assertFalse(rp.can_fetch('http://foo.bar/myprofile', 'foobot'))
def test_comments(self):
    content = """
    # comment 1
    User-Agent: one
    # comment 2
    # comment 3
    User-Agent: two
    Disallow: /one-two-bot
    # Disallow: /commented
    # comment 4
    User-Agent: *
    Disallow: /default-ua
    """
    rp = Protego.parse(content=content)
    self.assertFalse(rp.can_fetch("https://site.local/one-two-bot", "one"))
    self.assertFalse(rp.can_fetch("https://site.local/one-two-bot", "two"))
    self.assertTrue(rp.can_fetch("https://site.local/commented", "one"))
    self.assertTrue(rp.can_fetch("https://site.local/commented", "two"))
    self.assertTrue(rp.can_fetch("https://site.local/default-ua", "one"))
    self.assertTrue(rp.can_fetch("https://site.local/default-ua", "two"))

    content = ("User-agent: FooBot\n"
               "# Disallow: /\n"
               "Disallow: /foo/quz#qux\n"
               "Allow: /\n")
    rp = Protego.parse(content=content)
    self.assertTrue(rp.can_fetch("http://foo.bar/foo/bar", "FooBot"))
    self.assertFalse(rp.can_fetch("http://foo.bar/foo/quz", "FooBot"))
def test_percentage_encoding(self):
    content = ("User-agent: FooBot\n"
               "Disallow: /\n"
               "Allow: /foo/bar?qux=taz&baz=http://foo.bar?tar&par\n")
    rp = Protego.parse(content=content)
    self.assertTrue(
        rp.can_fetch(
            "http://foo.bar/foo/bar?qux=taz&baz=http://foo.bar?tar&par",
            "FooBot"))

    content = ("User-agent: FooBot\n"
               "Disallow: /\n"
               u"Allow: /foo/bar/ツ\n")
    rp = Protego.parse(content=content)
    self.assertTrue(
        rp.can_fetch("http://foo.bar/foo/bar/%E3%83%84", "FooBot"))
    self.assertTrue(rp.can_fetch(u"http://foo.bar/foo/bar/ツ", "FooBot"))

    content = ("User-agent: FooBot\n"
               "Disallow: /\n"
               "Allow: /foo/bar/%E3%83%84\n")
    rp = Protego.parse(content=content)
    self.assertTrue(
        rp.can_fetch("http://foo.bar/foo/bar/%E3%83%84", "FooBot"))
    self.assertTrue(rp.can_fetch(u"http://foo.bar/foo/bar/ツ", "FooBot"))

    content = ("User-agent: FooBot\n"
               "Disallow: /\n"
               "Allow: /foo/bar/%62%61%7A\n")
    rp = Protego.parse(content=content)
    self.assertTrue(rp.can_fetch("http://foo.bar/foo/bar/baz", "FooBot"))
    self.assertTrue(
        rp.can_fetch("http://foo.bar/foo/bar/%62%61%7A", "FooBot"))
def test_url_case_sensitivity(self):
    content = ("user-agent: FooBot\n"
               "disallow: /x/\n")
    rp = Protego.parse(content=content)
    self.assertFalse(rp.can_fetch("http://foo.bar/x/y", "FooBot"))

    content = ("user-agent: FooBot\n"
               "disallow: /X/\n")
    rp = Protego.parse(content=content)
    self.assertTrue(rp.can_fetch("http://foo.bar/x/y", "FooBot"))
def test_unicode_url_and_useragent(self):
    content = u"""
    User-Agent: *
    Disallow: /admin/
    Disallow: /static/
    # taken from https://en.wikipedia.org/robots.txt
    Disallow: /wiki/K%C3%A4ytt%C3%A4j%C3%A4:
    Disallow: /wiki/Käyttäjä:
    Disallow: /wiki/Keskustelu_käyttäjästä:

    User-Agent: UnicödeBöt
    Disallow: /some/randome/page.html
    """
    rp = Protego.parse(content=content)
    self.assertTrue(rp.can_fetch("https://site.local/", "*"))
    self.assertFalse(rp.can_fetch("https://site.local/admin/", "*"))
    self.assertFalse(rp.can_fetch("https://site.local/static/", "*"))
    self.assertTrue(
        rp.can_fetch("https://site.local/admin/", u"UnicödeBöt"))
    self.assertFalse(
        rp.can_fetch("https://site.local/wiki/K%C3%A4ytt%C3%A4j%C3%A4:", "*"))
    self.assertFalse(
        rp.can_fetch(u"https://site.local/wiki/Käyttäjä:", "*"))
    self.assertFalse(
        rp.can_fetch(
            u"https://site.local/wiki/Keskustelu_k%C3%A4ytt%C3%A4j%C3%A4st%C3%A4:",
            "*"))
    self.assertFalse(
        rp.can_fetch(u"https://site.local/wiki/Keskustelu_käyttäjästä:", "*"))
    self.assertTrue(
        rp.can_fetch("https://site.local/some/randome/page.html", "*"))
    self.assertFalse(
        rp.can_fetch("https://site.local/some/randome/page.html",
                     u"UnicödeBöt"))

    content = u"""
    # robots.txt for http://www.example.com/

    User-Agent: Jävla-Foobot
    Disallow: /

    User-Agent: \u041b\u044c\u0432\u0456\u0432-bot
    Disallow: /totalitarianism/
    """
    rp = Protego.parse(content=content)
    self.assertTrue(rp.can_fetch("/foo/bar.html", u"jävla-fanbot"))
    self.assertFalse(rp.can_fetch("/foo/bar.html", u"jävla-foobot"))
    self.assertTrue(rp.can_fetch("/", "foobot"))
    self.assertTrue(
        rp.can_fetch("/", u"Mozilla/5.0 (compatible; Львів-bot/1.1)"))
    self.assertFalse(
        rp.can_fetch("/totalitarianism/foo.htm",
                     u"Mozilla/5.0 (compatible; Львів-bot/1.1)"))
def test_unescaped_url(self):
    content = ("User-agent: * \n"
               "Disallow: / \n"
               "Allow: /a<d.html")
    rp = Protego.parse(content=content)
    self.assertTrue(rp.can_fetch("https://www.site.local/a<d.html", "*"))
    self.assertTrue(rp.can_fetch("https://www.site.local/a%3cd.html", "*"))

    content = ("User-agent: * \n"
               "Disallow: / \n"
               "Allow: /a<*")
    rp = Protego.parse(content=content)
    self.assertTrue(rp.can_fetch("https://www.site.local/a<d.html", "*"))
    self.assertTrue(rp.can_fetch("https://www.site.local/a%3cd.html", "*"))
def test_no_exceptions(path_to_robotstxt):
    try:
        with open(join(test_data_directory, path_to_robotstxt), 'rb') as f:
            try:
                content = f.read().decode('utf-8')
            except UnicodeDecodeError:
                # Downloaded robots.txt is malformed, ignore this
                return
        Protego.parse(content=content)
    except Exception as e:
        assert False, "{}. Exception raised while parsing {}".format(
            e, join(path_to_robotstxt, 'robots.txt'))
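# test_no_exceptions receives `path_to_robotstxt` as an argument, so it is
# presumably driven by pytest parametrization over a directory of collected
# robots.txt samples. A minimal sketch of one plausible wiring, assuming the
# samples live under `test_data_directory` (the collection logic below is an
# illustration, not the suite's actual code):
import os
import pytest

@pytest.mark.parametrize(
    'path_to_robotstxt',
    sorted(os.listdir(test_data_directory))
    if os.path.isdir(test_data_directory) else [])
def test_no_exceptions_smoke(path_to_robotstxt):
    # Delegate to the undecorated function above.
    test_no_exceptions(path_to_robotstxt)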
def test_allowed(self):
    content = ("User-agent: * \n"
               "Disallow: /disallowed \n"
               "Allow: /allowed \n"
               "Crawl-delay: 10")
    rp = Protego.parse(content=content)
    self.assertTrue(rp.can_fetch("https://www.site.local/allowed", "*"))
    self.assertFalse(rp.can_fetch("https://www.site.local/disallowed", "*"))

    content = ("User-agent: * \n"
               "Disallow: /d \n"
               "Crawl-delay: 10")
    rp = Protego.parse(content=content)
    self.assertTrue(rp.can_fetch("https://www.site.local/abc/d", "*"))
    self.assertFalse(rp.can_fetch("https://www.site.local/disallowed", "*"))
def test_user_agent_grouping(self):
    content = """
    User-Agent: one
    User-Agent: two
    Disallow: /success

    User-Agent: *
    Disallow: /failure
    """
    rp = Protego.parse(content=content)
    self.assertFalse(rp.can_fetch("https://site.local/success", "one"))
    self.assertFalse(rp.can_fetch("https://site.local/success", "two"))
    self.assertTrue(rp.can_fetch("https://site.local/failure", "one"))
    self.assertTrue(rp.can_fetch("https://site.local/failure", "two"))

    content = (
        "allow: /foo/bar/\n"
        "\n"
        "user-agent: FooBot\n"
        "disallow: /\n"
        "allow: /x/\n"
        "user-agent: BarBot\n"
        "disallow: /\n"
        "allow: /y/\n"
        "\n"
        "\n"
        "allow: /w/\n"
        "user-agent: BazBot\n"
        "\n"
        "user-agent: FooBot\n"
        "allow: /z/\n"
        "disallow: /\n")
    url_w = "http://foo.bar/w/a"
    url_x = "http://foo.bar/x/b"
    url_y = "http://foo.bar/y/c"
    url_z = "http://foo.bar/z/d"
    url_foo = "http://foo.bar/foo/bar/"
    rp = Protego.parse(content=content)
    self.assertTrue(rp.can_fetch(url_x, "FooBot"))
    self.assertTrue(rp.can_fetch(url_z, "FooBot"))
    self.assertFalse(rp.can_fetch(url_y, "FooBot"))
    self.assertTrue(rp.can_fetch(url_y, "BarBot"))
    self.assertTrue(rp.can_fetch(url_w, "BarBot"))
    self.assertFalse(rp.can_fetch(url_z, "BarBot"))
    self.assertTrue(rp.can_fetch(url_z, "BazBot"))
    self.assertFalse(rp.can_fetch(url_foo, 'FooBot'))
    self.assertFalse(rp.can_fetch(url_foo, 'BarBot'))
    self.assertFalse(rp.can_fetch(url_foo, 'BazBot'))
def test_empty_response(self):
    """empty response should equal 'allow all'"""
    rp = Protego.parse(content='')
    self.assertTrue(rp.can_fetch("https://site.local/", "*"))
    self.assertTrue(rp.can_fetch("https://site.local/", "chrome"))
    self.assertTrue(rp.can_fetch("https://site.local/index.html", "*"))
    self.assertTrue(rp.can_fetch("https://site.local/disallowed", "*"))
def test_malformed_crawl_delay(self):
    content = ("User-agent: * \n"
               "Disallow: /disallowed \n"
               "Allow: /allowed \n"
               "Crawl-delay: random_word")
    rp = Protego.parse(content=content)
    self.assertTrue(rp.crawl_delay('*') is None)
def test_path_matching(pattern, path, match):
    content = """
    User-Agent: *
    disallow: {}
    """.format(pattern)
    rp = Protego.parse(content)
    assert (not rp.can_fetch(path, '*')) == match
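# test_path_matching above expects (pattern, path, match) triples from a
# parametrize decorator that is not shown here. A few illustrative triples
# (assumed values, chosen to exercise prefix matching and the * and $
# wildcards; `match` is True when the path should be blocked):
import pytest

path_matching_cases = [
    ('/fish', '/fish.html', True),               # plain prefix match
    ('/fish*', '/fishheads/yummy.html', True),   # trailing * is redundant
    ('/*.php', '/index.php', True),              # * spans any run of characters
    ('/*.php$', '/index.php5', False),           # $ anchors the pattern to the end
    ('/fish', '/Fish.asp', False),               # path matching is case-sensitive
]

@pytest.mark.parametrize('pattern,path,match', path_matching_cases)
def test_path_matching_examples(pattern, path, match):
    test_path_matching(pattern, path, match)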
def test_record_precedence(rules, url, allowed):
    content = """
    User-Agent: *
    {}
    """.format(rules)
    rp = Protego.parse(content)
    assert rp.can_fetch(url, '*') == allowed
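# Likewise, test_record_precedence receives (rules, url, allowed) from an
# unshown parametrize decorator. Illustrative cases (assumed values) follow
# the longest-match rule: the most specific matching rule wins, and Allow
# beats Disallow on a tie:
import pytest

record_precedence_cases = [
    ('allow: /p\ndisallow: /', '/page', True),                    # longer Allow wins
    ('allow: /folder\ndisallow: /folder', '/folder/page', True),  # tie goes to Allow
    ('allow: /$\ndisallow: /', '/', True),                        # $ makes Allow more specific
]

@pytest.mark.parametrize('rules,url,allowed', record_precedence_cases)
def test_record_precedence_examples(rules, url, allowed):
    test_record_precedence(rules, url, allowed)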
def test_sitemaps_come_first(self):
    """Some websites have sitemaps before any robots directives"""
    content = ("Sitemap: https://www.foo.bar/sitmap.xml\n"
               "User-Agent: FootBot\n"
               "Disallow: /something")
    rp = Protego.parse(content=content)
    self.assertEqual(list(rp.sitemaps), ["https://www.foo.bar/sitmap.xml"])
def test_with_absolute_urls(self):
    content = ("user-agent: *\n"
               "disallow: http://ms-web00.walkerplus.com/\n")
    rp = Protego.parse(content=content)
    self.assertTrue(rp.can_fetch("http://foo.bar/", "FooBot"))
    self.assertFalse(
        rp.can_fetch("http://foo.bar/http://ms-web00.walkerplus.com/",
                     "FooBot"))
def test_no_sitemaps(self):
    content = ("User-agent: * \n"
               "Disallow: /disallowed \n"
               "Allow: /allowed \n"
               "Crawl-delay: 10")
    rp = Protego.parse(content=content)
    self.assertTrue(not list(rp.sitemaps))
def test_no_preferred_host(self):
    content = ("User-agent: * \n"
               "Disallow: /disallowed \n"
               "Allow: /allowed \n"
               "Crawl-delay: 10")
    rp = Protego.parse(content=content)
    self.assertTrue(rp.preferred_host is None)
def test_1994rfc_example(self):
    """Test parser on examples from the 1994 RFC."""
    content = """
    # robots.txt for http://www.example.com/

    User-agent: *
    Disallow: /cyberworld/map/  # This is an infinite virtual URL space
    Disallow: /tmp/  # these will soon disappear
    Disallow: /foo.html
    """
    rp = Protego.parse(content=content)
    user_agent = "CrunchyFrogBot"
    self.assertTrue(rp.can_fetch("/", user_agent))
    self.assertFalse(rp.can_fetch("/foo.html", user_agent))
    self.assertTrue(rp.can_fetch("/foo.htm", user_agent))
    self.assertTrue(rp.can_fetch("/foo.shtml", user_agent))
    self.assertFalse(rp.can_fetch("/foo.htmlx", user_agent))
    self.assertTrue(rp.can_fetch("/cyberworld/index.html", user_agent))
    self.assertFalse(rp.can_fetch("/tmp/foo.html", user_agent))
    # Since it is the caller's responsibility to make sure the host name
    # matches, the parser disallows foo.html regardless of what I pass for
    # host name and protocol.
    self.assertFalse(
        rp.can_fetch("http://example.com/foo.html", user_agent))
    self.assertFalse(
        rp.can_fetch("http://www.example.com/foo.html", user_agent))
    self.assertFalse(
        rp.can_fetch("http://www.example.org/foo.html", user_agent))
    self.assertFalse(
        rp.can_fetch("https://www.example.org/foo.html", user_agent))
    self.assertFalse(rp.can_fetch("ftp://example.net/foo.html", user_agent))
def test_url_parts(self):
    content = ("User-agent: * \n"
               "Disallow: /path;params?query \n")
    rp = Protego.parse(content=content)
    self.assertFalse(
        rp.can_fetch(
            "http://user@foo.bar:10/path;params?query#fragment", "*"))

    content = ("User-agent: * \n"
               "Disallow: /? \n")
    rp = Protego.parse(content=content)
    self.assertFalse(rp.can_fetch("/?query", "*"))
    self.assertTrue(rp.can_fetch('/', '*'))

    content = ("User-agent: * \n"
               "Disallow: /; \n")
    rp = Protego.parse(content=content)
    self.assertFalse(rp.can_fetch("/;params", "*"))
    self.assertTrue(rp.can_fetch('/', '*'))
async def _lauch_browser(self) -> None:
    self._pw = await async_playwright().start()
    pwOptions = self._settingsdict['PLAYWRIGHT_LAUNCH_OPTIONS']
    browser_type = self._settingsdict['PLAYWRIGHT_BROWSER_TYPE']
    if browser_type != "":
        if browser_type not in ['chromium', 'firefox', 'webkit']:
            raise RuntimeError(
                'Invalid PLAYWRIGHT_BROWSER_TYPE configuration')
        if browser_type == 'chromium':
            self._browser = await self._pw.chromium.launch(**pwOptions)
        elif browser_type == 'firefox':
            self._browser = await self._pw.firefox.launch(**pwOptions)
        elif browser_type == 'webkit':
            self._browser = await self._pw.webkit.launch(**pwOptions)
    # If no cookies path is provided, storage state is still returned,
    # but won't be saved to the disk.
    self._context = await self._browser.new_context(
        user_agent=self._settingsdict['USER_AGENT'],
        storage_state=self._settingsdict['COOKIES_PATH'])
    self._context.set_default_navigation_timeout(
        self._settingsdict['PLAYWRIGHT_NAVIGATION_TIMEOUT'])
    blankPage = await self._context.new_page()
    getUA = await blankPage.evaluate('''() => { return navigator.userAgent }''')
    # Load the robots.txt file
    response = await blankPage.goto(
        urlparse(self.base_url).scheme + '://'
        + urlparse(self.base_url).netloc + '/robots.txt')
    if response.ok:
        try:
            text = await blankPage.inner_text('pre')
        except TimeoutError:
            # robots.txt was not rendered inside a <pre> element;
            # fall back to the page body.
            text = await blankPage.inner_text('body')
        text = text + self._settingsdict['CUSTOM_ROBOT']
        self._robotsTxt = Protego.parse(text)
    else:
        # No robots.txt available: default to allow-all.
        self._robotsTxt = Protego.parse("""
        User-agent: *
        Allow: /
        """)
    self.crawllogger.info(
        '[000] Starting browser with User Agent: {}'.format(getUA))
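# Once _lauch_browser has populated self._robotsTxt, later navigation can be
# gated on it. A hedged sketch of such a helper method (everything here
# other than Protego's can_fetch/crawl_delay and the attributes used above
# is an assumption about the surrounding crawler class):
import asyncio

async def _goto_if_allowed(self, page, url: str) -> bool:
    ua = self._settingsdict['USER_AGENT']
    if not self._robotsTxt.can_fetch(url, ua):
        self.crawllogger.info('[000] Skipping disallowed URL: {}'.format(url))
        return False
    delay = self._robotsTxt.crawl_delay(ua)
    if delay is not None:
        await asyncio.sleep(delay)  # honour Crawl-delay before fetching
    await page.goto(url)
    return True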
def test_index_html_is_directory(self):
    content = ("User-Agent: *\n"
               "Allow: /allowed-slash/index.html\n"
               "Disallow: /\n")
    rp = Protego.parse(content=content)
    self.assertTrue(
        rp.can_fetch("http://foo.com/allowed-slash/", "footbot"))
    self.assertTrue(
        rp.can_fetch("http://foo.com/allowed-slash/index.html", "footbot"))
    self.assertFalse(
        rp.can_fetch("http://foo.com/allowed-slash/index.htm", "footbot"))
    self.assertFalse(
        rp.can_fetch("http://foo.com/anyother-url", "footbot"))
def test_empty_record_group(self):
    content = """
    User-Agent: harrybot
    Disallow: /

    User-Agent: testbot
    """
    rp = Protego.parse(content=content)
    self.assertTrue(rp.can_fetch("https://site.local/path1", "testbot"))
    self.assertTrue(rp.can_fetch("https://site.local/path2", "testbot"))
def test_special_symbols_dual_behaviour(self):
    """Special symbols such as * and $ should also be treated as
    ordinary characters."""
    content = ("user-agent: FooBot\n"
               "disallow: /x/abc$\n"
               "disallow: /x*x/abc\n")
    rp = Protego.parse(content=content)
    self.assertFalse(rp.can_fetch("http://foo.bar/x*x/abc", "FooBot"))
    self.assertFalse(rp.can_fetch("http://foo.bar/x/abc$", "FooBot"))
    self.assertFalse(rp.can_fetch("http://foo.bar/x/abc%24", "FooBot"))
def test_no_request_rate(self):
    content = """
    User-agent: one
    Request-rate: 1/10s

    User-agent: two
    Disallow: /
    """
    rp = Protego.parse(content=content)
    self.assertTrue(rp.request_rate('two') is None)
def test_crawl_delay(self):
    content = ("User-agent: * \n"
               "Disallow: /disallowed \n"
               "Allow: /allowed \n"
               "Crawl-delay: 10 \n"
               "User-agent: testbot\n"
               "Crawl-delay: 15 \n")
    rp = Protego.parse(content=content)
    self.assertTrue(rp.crawl_delay('*') == 10.0)
    self.assertTrue(rp.crawl_delay('testbot') == 15.0)
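# The crawl_delay value exercised above plugs naturally into a fetch loop.
# A minimal sketch (the URL list and fetch callable are placeholders, not
# part of the suite):
import time

def polite_fetch_all(rp, urls, user_agent='testbot', fetch=print):
    # Fall back to no delay when robots.txt specifies none.
    delay = rp.crawl_delay(user_agent) or 0
    for url in urls:
        if rp.can_fetch(url, user_agent):
            fetch(url)
            time.sleep(delay)  # wait Crawl-delay seconds between requests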
async def _lauch_browser(self) -> None:
    self._pw = await async_playwright().start()
    pwOptions = self._settingsdict['PLAYWRIGHT_LAUNCH_OPTIONS']
    browser_type = self._settingsdict['PLAYWRIGHT_BROWSER_TYPE']
    if browser_type != "":
        if browser_type not in ['chromium', 'firefox', 'webkit']:
            raise RuntimeError(
                'Invalid PLAYWRIGHT_BROWSER_TYPE configuration')
        if browser_type == 'chromium':
            self._browser = await self._pw.chromium.launch(**pwOptions)
        elif browser_type == 'firefox':
            self._browser = await self._pw.firefox.launch(**pwOptions)
        elif browser_type == 'webkit':
            self._browser = await self._pw.webkit.launch(**pwOptions)
    self._context = await self._browser.newContext()
    self._context.setDefaultNavigationTimeout(
        self._settingsdict['PLAYWRIGHT_NAVIGATION_TIMEOUT'])
    blankPage = await self._context.newPage()
    getUA = await blankPage.evaluate('''() => { return navigator.userAgent }''')
    # Load the robots.txt file
    response = await blankPage.goto(
        urlparse(self.base_url).scheme + '://'
        + urlparse(self.base_url).netloc + '/robots.txt')
    if response.ok:
        text = await blankPage.innerText('pre')
        self._robotsTxt = Protego.parse(text)
    else:
        # No robots.txt available: default to allow-all.
        self._robotsTxt = Protego.parse("""
        User-agent: *
        Allow: /
        """)
    self.crawllogger.info(
        '[000] Starting browser with User Agent: {}'.format(getUA))
def test_escaped_special_symbols(self):
    """Percent encoded special symbols should be treated as
    ordinary characters."""
    content = ("user-agent: FooBot\n"
               "disallow: /x/abc%24\n"
               "disallow: /x%2Ax/abc\n")
    rp = Protego.parse(content=content)
    self.assertFalse(rp.can_fetch("http://foo.bar/x/abc$abc", "FooBot"))
    self.assertFalse(rp.can_fetch("http://foo.bar/x/abc$", "FooBot"))
    self.assertTrue(rp.can_fetch("http://foo.bar/x/abc", "FooBot"))
    self.assertFalse(rp.can_fetch("http://foo.bar/x*x/abc", "FooBot"))
    self.assertFalse(rp.can_fetch("http://foo.bar/x*x/abcdef", "FooBot"))
    self.assertTrue(rp.can_fetch("http://foo.bar/xabcx/abc", "FooBot"))
def robotstxt_test(robotstxt_url, user_agents, urls):
    """Given a :attr:`robotstxt_url`, check which of the :attr:`user_agents`
    is allowed to fetch which of the :attr:`urls`.

    All the combinations of :attr:`user_agents` and :attr:`urls` will be
    checked and the results returned in one DataFrame.

    >>> robotstxt_test('https://facebook.com/robots.txt',
    ...                user_agents=['*', 'Googlebot', 'Applebot'],
    ...                urls=['/', '/bbc', '/groups', '/hashtag/'])
                          robotstxt_url user_agent   url_path  can_fetch
    0   https://facebook.com/robots.txt          *          /      False
    1   https://facebook.com/robots.txt          *       /bbc      False
    2   https://facebook.com/robots.txt          *    /groups      False
    3   https://facebook.com/robots.txt          *  /hashtag/      False
    4   https://facebook.com/robots.txt   Applebot          /       True
    5   https://facebook.com/robots.txt   Applebot       /bbc       True
    6   https://facebook.com/robots.txt   Applebot    /groups       True
    7   https://facebook.com/robots.txt   Applebot  /hashtag/      False
    8   https://facebook.com/robots.txt  Googlebot          /       True
    9   https://facebook.com/robots.txt  Googlebot       /bbc       True
    10  https://facebook.com/robots.txt  Googlebot    /groups       True
    11  https://facebook.com/robots.txt  Googlebot  /hashtag/      False

    :param url robotstxt_url: The URL of the robots.txt file
    :param str,list user_agents: One or more user agents
    :param str,list urls: One or more paths (relative) or URLs (absolute)
                          to check
    :return DataFrame robotstxt_test_df:
    """
    if not robotstxt_url.endswith('/robots.txt'):
        raise ValueError('Please make sure you enter a valid robots.txt URL')
    if isinstance(user_agents, str):
        user_agents = [user_agents]
    if isinstance(urls, str):
        urls = [urls]
    robots_open = urlopen(Request(robotstxt_url, headers=headers))
    robots_bytes = robots_open.readlines()
    robots_text = ''.join(line.decode() for line in robots_bytes)
    rp = Protego.parse(robots_text)
    test_list = []
    for path, agent in product(urls, user_agents):
        d = dict()
        d['user_agent'] = agent
        d['url_path'] = path
        d['can_fetch'] = rp.can_fetch(path, agent)
        test_list.append(d)
    df = pd.DataFrame(test_list)
    df.insert(0, 'robotstxt_url', robotstxt_url)
    df = df.sort_values(['user_agent', 'url_path']).reset_index(drop=True)
    return df
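# A quick usage sketch: scalar user_agents/urls are normalized to lists
# internally, so a single-agent, single-path check reads naturally (the URL
# below is illustrative):
if __name__ == '__main__':
    df = robotstxt_test('https://www.example.com/robots.txt',
                        user_agents='*',
                        urls='/private/')
    print(df)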
def test_skip_malformed_line(self):
    content = """
    User-Agent: one
    Disallow: /bot-one
    Harry Potter

    User-Agent: two
    Disallow: /bot-two
    """
    rp = Protego.parse(content=content)
    self.assertFalse(rp.can_fetch("https://site.local/bot-one", "one"))
    self.assertTrue(rp.can_fetch("https://site.local/bot-two", "one"))
    self.assertFalse(rp.can_fetch("https://site.local/bot-two", "two"))
    self.assertTrue(rp.can_fetch("https://site.local/bot-one", "two"))
def test_skip_unknown_directives(self):
    content = """
    User-Agent: one
    Disallow: /bot-one
    Harry: Potter

    User-Agent: two
    Disallow: /bot-two
    """
    rp = Protego.parse(content=content)
    self.assertFalse(rp.can_fetch("https://site.local/bot-one", "one"))
    self.assertTrue(rp.can_fetch("https://site.local/bot-two", "one"))
    self.assertFalse(rp.can_fetch("https://site.local/bot-two", "two"))
    self.assertTrue(rp.can_fetch("https://site.local/bot-one", "two"))