from reppy.robots import Robots


def test_cocrawler_reppy():
    r1 = Robots.parse('http://example.com/robots.txt', '''
User-Agent: foo
Allow: /

# comment
Disallow: /
Disallow: /disallowed
''')
    r2 = Robots.parse('http://example.com/robots.txt', '''
User-Agent: foo
Allow: /

Disallow: /
Disallow: /disallowed
''')
    r3 = Robots.parse('', '''
User-Agent: foo
Allow: /

Disallow: /
Disallow: /disallowed
''')

    # despite the blank line or comment, 'foo' is disallowed from /disallowed
    assert r1.allowed('/', 'foo') is True
    assert r1.allowed('/disallowed', 'foo') is False
    assert r2.allowed('/', 'foo') is True
    assert r2.allowed('/disallowed', 'foo') is False
    assert r3.allowed('/', 'foo') is True
    assert r3.allowed('/disallowed', 'foo') is False

    # a blank line does not reset the user-agent to *, so 'bar' has no rules
    assert r1.allowed('/', 'bar') is True
    assert r1.allowed('/disallowed', 'bar') is True
    assert r2.allowed('/', 'bar') is True
    assert r2.allowed('/disallowed', 'bar') is True
    assert r3.allowed('/', 'bar') is True
    assert r3.allowed('/disallowed', 'bar') is True

    # no substring weirdness, so 'foobar' does not match the 'foo' rules
    assert r1.allowed('/', 'foobar') is True
    assert r1.allowed('/disallowed', 'foobar') is True
    assert r2.allowed('/', 'foobar') is True
    assert r2.allowed('/disallowed', 'foobar') is True
    assert r3.allowed('/', 'foobar') is True
    assert r3.allowed('/disallowed', 'foobar') is True

def test_cocrawler_reppy_xfail():
    r4 = Robots.parse('', '''
User-agent: *
Disallow: //
''')

    # ibm.com, I'm looking at you
    assert r4.allowed('/foo', '') is True
    assert r4.allowed('/', '') is True

import urllib.parse

from reppy.robots import Robots


def getRobotParser(loader, startUrl):
    robotUrl = urllib.parse.urljoin(startUrl, "/robots.txt")
    page = loader.get(robotUrl, allow_redirects=True)

    if page is None:
        print("Could not read ROBOTS.TXT at: " + robotUrl)
        return None
    #end if

    rp = Robots.parse(robotUrl, page)
    print("Found ROBOTS.TXT at: " + robotUrl)
    return rp

def getRobotParser(startUrl):
    robotUrl = urllib.parse.urljoin(startUrl, "/robots.txt")
    page, _, _ = getPage(robotUrl)

    if page is None:
        print("Could not read ROBOTS.TXT at: " + robotUrl)
        return None
    #end if

    rp = Robots.parse(robotUrl, page)
    print("Found ROBOTS.TXT at: " + robotUrl)
    # NOTE: rp is discarded; as written, this function always returns None.
    # return rp
    return None

def __init__(self, robotstxt_body, spider):
    from reppy.robots import Robots
    self.spider = spider
    self.rp = Robots.parse('', robotstxt_body)

def parse(self, content, name):
    '''Parse the robots.txt in content and return the agent of the provided name.'''
    return Robots.parse('http://example.com', content).agent(name)

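# A minimal usage sketch (not taken from any of the sources above) of reppy's
# agent-based API, which the parse() helper above wraps. The robots.txt body
# and the 'my-bot' agent name are made up for illustration.
from reppy.robots import Robots

agent = Robots.parse('http://example.com/robots.txt', '''
User-agent: my-bot
Disallow: /private
''').agent('my-bot')

assert agent.allowed('/public/page.html')       # no rule matches, so allowed
assert not agent.allowed('/private/page.html')  # matches Disallow: /private
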
def benchmark_reppy_parser(website):
    from reppy.robots import Robots
    rp = Robots.parse('', website['robotstxt'])
    for link in website['links']:
        rp.allowed(link, 'googlebot')

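# Example call (hypothetical data): the 'robotstxt' and 'links' keys are
# inferred from the function body above, and the values are made up.
benchmark_reppy_parser({
    'robotstxt': 'User-agent: *\nDisallow: /private\n',
    'links': ['/index.html', '/private/admin', '/about'],
})
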
def parse_robots_txt(self, link_list):
    host, port = self.config.cache_server
    robotsURL = ''
    robots = None
    links = []

    for link_url in link_list:
        parsed_link = parse.urlparse(link_url)
        link_base = '{0.scheme}://{0.netloc}/'.format(parsed_link)

        if robots == None or link_base not in robotsURL:
            if 'today.uci.edu' in link_base:
                robots = Robots.parse('https://today.uci.edu/department/information_computer_sciences/robots.txt', '''
User-agent: *
Disallow: /*/calendar/*?*types*
Disallow: /*/browse*?*types*
Disallow: /*/calendar/200*
Disallow: /*/calendar/2015*
Disallow: /*/calendar/2016*
Disallow: /*/calendar/2017*
Disallow: /*/calendar/2018*
Disallow: /*/calendar/2019*
Disallow: /*/calendar/202*
Disallow: /*/calendar/week
Disallow: /*/search
Disallow: /*?utm
Allow: /
Allow: /*/search/events.ics
Allow: /*/search/events.xml
Allow: /*/calendar/ics
Allow: /*/calendar/xml
''')
            else:
                robotsURL = link_base + 'robots.txt'
                time.sleep(0.5)
                # get the robots.txt file
                try:
                    robots = Robots.fetch(f"http://{host}:{port}/",
                                          params=[("q", f"{robotsURL}"), ("u", f"{self.config.user_agent}")],
                                          timeout=20)
                except Exception as e:
                    print(e)
                    robots = None

                # WARNING: UNCOMMENTING BYPASSES CACHE
                # if the robots is empty, get the robots.txt from actual server
                # robots_str = str(robots)
                # robots_str = robots_str.split(': ')[1].split('}')[0]
                # if robots_str == '[]':
                #     robots = Robots.fetch(robotsURL, timeout=20)
                #     print(robots)

        if robots == None:
            links.append(link_url)
            continue

        if parsed_link.params == '':
            if parsed_link.query == '':
                query_only = '{0.path}/'.format(parsed_link)
            else:
                query_only = '{0.path}/?{0.query}'.format(parsed_link)
        else:
            if parsed_link.query == '':
                query_only = '{0.path}/{0.params}/'.format(parsed_link)
            else:
                query_only = '{0.path}/{0.params}/?{0.query}'.format(parsed_link)

        if robots.allowed(query_only, self.config.user_agent):
            links.append(link_url)

    return links

Allow: /serv
Allow: /~mak
Disallow: /
'''


@contextmanager
def timer(name, count):
    '''Time this block.'''
    start = time.time()
    try:
        yield count
    finally:
        duration = time.time() - start
        print(name)
        print('=' * 10)
        print('Total: %s' % duration)
        print('  Avg: %s' % (duration / count))
        print(' Rate: %s' % (count / duration))
        print('')


with timer('Parse', 100000) as count:
    for _ in xrange(count):
        Robots.parse('http://example.com/robots.txt', content)

parsed = Robots.parse('http://example.com/robots.txt', content)
with timer('Evaluate', 100000) as count:
    for _ in xrange(count):
        parsed.allowed('/org/example.html', 'other-bot')