def testRobot3(self):
    robots = RobotsCache()
    rules = robots.fetch("http://www.realwire.com/")
    crawl_delay = rules.delay("idiot")
    print("delay is:", crawl_delay)
    for i in range(1, 1000):
        print(rules.allowed("http://api.google.com/search/", agent="idiot"))
def __FetchRobotFileInfo__(self, url, robotDictForDomains, timeStamp):
    domainName = self.__GetComSubdomainOfUrl__(url)
    robotUrl = ""
    if domainName not in robotDictForDomains:
        robotUrl = self.__GetRobotUrlForUrl__(domainName)
        cache = RobotsCache()
        try:
            timeStamp[domainName] = datetime.datetime.now()
            robotFileObj = cache.fetch(robotUrl)
            doesUrlExistOnline = self.__DoesUrlExistOnline__(robotUrl)
        except:
            doesUrlExistOnline = False
            robotDictForDomains[domainName] = (doesUrlExistOnline, object)
        if doesUrlExistOnline == True:
            robotDictForDomains[domainName] = (doesUrlExistOnline, robotFileObj)
        else:
            robotDictForDomains[domainName] = (doesUrlExistOnline, object)
    doesUrlExistOnline = robotDictForDomains[domainName][0]
    robotFileObj = robotDictForDomains[domainName][1]
    # print "heyyy", robotUrl, doesUrlExistOnline, robotFileObj, robotDictForDomains
    return doesUrlExistOnline, robotFileObj, robotDictForDomains, timeStamp, domainName
def get_scanner_mock(request_limit):
    robots_cache = RobotsCache()
    robots_cache.fetch = MagicMock(return_value=robots_cache)
    robots_cache.allowed = MagicMock(return_value=True)
    robots_validator = RobotsValidator(agent='*')
    robots_validator.robots = robots_cache
    scanner = UrlScanner(request_limit)
    scanner.url_fetcher = get_url_fetcher_mock(request_limit)
    scanner.robots_validator = robots_validator
    return scanner
def robot_rules(_url_scheme, _url_netloc):
    # Return a robots rules object for the given scheme and netloc.
    _domain = urlunparse((_url_scheme, _url_netloc, '', '', '', ''))
    robots = RobotsCache()
    try:
        rules = robots.fetch(_domain, timeout=5)
    except Exception as exc:
        print('FAIL to fetch robots.txt {},{}'.format(_url_scheme, _url_netloc))
        print(exc)
        return None
    return rules
def robot_rules(_url_scheme, _url_netloc):
    # Return a robots rules object for the given scheme and netloc.
    # _parsed_url = urlparse(_url)
    _domain = urlunparse((_url_scheme, _url_netloc, '', '', '', ''))
    robots = RobotsCache()
    try:
        # print('DOMAIN: {}'.format(_domain))
        rules = robots.fetch(_domain)
    except Exception as exc:
        print('FAIL to fetch robots.txt')
        print(_url_scheme, _url_netloc)
        print(exc)
        return None
    return rules
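# A minimal, hypothetical sketch of how the rules object returned by
# robot_rules() above might be consumed. The URL and agent name are
# assumptions; the rules.allowed() and rules.delay() calls are the same
# reppy Rules methods used elsewhere in these snippets.
from urllib.parse import urlparse

def crawl_if_allowed(url, agent='my-crawler'):
    parsed = urlparse(url)
    rules = robot_rules(parsed.scheme, parsed.netloc)
    # If robots.txt could not be fetched, robot_rules() returns None;
    # here we choose to treat that as "allowed, no delay".
    if rules is None or rules.allowed(url, agent):
        delay = rules.delay(agent) if rules is not None else None
        return True, delay
    return False, None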
def __init__(self, robots_url=None):
    if robots_url:
        robots = RobotsCache()
        self._rules = robots.fetch(robots_url)
        self.is_use_robots = True
    else:
        self.is_use_robots = False
    self._url_norm = UrlNorm()
    self.counter = 0
    self.urls = dict()
    self.connections = defaultdict(set)
    self._lock = RLock()
def crawl_Pages(Seed):
    r = RobotsCache()
    robots_url = urljoin(Seed, '/robots.txt')
    x = r.fetch(robots_url)
    unvisited = [Seed]
    visited = []
    cnt = 0
    delay = 5
    while unvisited:
        page = unvisited.pop(0)
        hdr = {'User-Agent': '*'}
        try:
            req = urllib2.Request(page, headers=hdr)
            pagecontent = urllib2.urlopen(req)
            if page not in visited:
                time.sleep(delay)
                s = pagecontent.read()
                if ishtmlcontent(pagecontent):
                    soup = BeautifulSoup(s)
                    links = soup.findAll('a', href=True)
                    for l in links:
                        if isurlvalid(l['href']):
                            u1 = urljoin(page, l['href'])
                            unvisited.append(u1)
                    if x.allowed(page, '*'):
                        visited.append(page)
                        cnt = cnt + 1
                        print cnt
                        print 'Crawled:' + page
                        visited = remove_duplicates(visited)
                else:
                    if page.endswith(".pdf"):
                        visited.append(page)
                        cnt = cnt + 1
                        print 'Crawled:' + page
                        visited = remove_duplicates(visited)
                if len(visited) == 100:
                    unvisited = []
        except Exception, err:
            print Exception, err
            continue
class RobotsValidator(object):
    """ Validates urls via robots.txt file """

    def __init__(self, agent):
        self._agent = agent
        self.robots = RobotsCache()

    def get_allowed_from(self, child_urls):
        """
        :param child_urls: List of child urls to check robots.txt on
        :return: A list of allowed child urls to crawl
        """
        allowed = []
        domains = list(set('{0}'.format(get_domain(url)) for url in child_urls))
        domain_to_children = {domain: filter(lambda u: get_domain(u) == domain, child_urls)
                              for domain in domains}
        for domain in domain_to_children:
            try:
                rules = self.robots.fetch(domain)
                for url in domain_to_children[domain]:
                    if rules.allowed(url, self._agent):
                        allowed.append(url)
            except:
                allowed.extend(domain_to_children[domain])
        return allowed
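# A hypothetical usage sketch of RobotsValidator above. The agent string
# and example URLs are assumptions, and the class still relies on the
# get_domain() helper referenced in its body, which is not shown here.
validator = RobotsValidator(agent='my-crawler')
candidate_urls = [
    'http://example.com/page-1',
    'http://example.com/private/page-2',
]
# Only URLs permitted by each domain's robots.txt are returned; domains
# whose robots.txt cannot be fetched are treated as fully allowed.
allowed_urls = validator.get_allowed_from(candidate_urls)
print(allowed_urls)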
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(self.robots.allowed(
                'http://localhost:8080/foo', 'rogerbot'), True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', True), None)
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', False), None)
            self.robots.min_ttl = old_ttl

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(
                self.robots.fetch('http://localhost:8080/foo'), None)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.cache('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.add(self.robots.fetch(
                'http://localhost:8080/foo'))
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        self.assertRaises(ServerError, self.robots.allowed,
                          'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(self.robots.disallowed(
                'http://localhost:8080/foo', 'rogerbot'))
            urls = [
                'http://localhost:8080/foo',
                'http://localhost:8080/bar'
            ]
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(self.robots.delay(
                'http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a',
                    'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])
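# A small, hypothetical sketch of the cache behaviour exercised by the
# tests above, run against a live URL instead of the asis test server.
# The example.com URL and the 'rogerbot' agent are assumptions.
from reppy.cache import RobotsCache

robots = RobotsCache()
url = 'http://example.com/foo'
print(robots.find(url))                 # None: nothing cached yet
print(robots.allowed(url, 'rogerbot'))  # fetches robots.txt and caches the rules
print(robots.find(url))                 # now returns the cached rules object
print(robots.fetch(url))                # fetches fresh rules without caching them
robots.clear()                          # empties the cache again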
class Worker(object):
    def __init__(self, args):
        if args.statsd and not re.match(REGEX_STATSD_HOST, args.statsd):
            raise Exception("Invalid statsd host provided")
        self.statsd_host = self.statsd_port = None
        self.statsd_disabled = True
        if args.statsd:
            self.statsd_host, self.statsd_port = args.statsd.rsplit(":", 1)
            self.statsd_port = int(self.statsd_port)
            self.statsd_disabled = False
        self.statsd_connection = statsd.Connection(host=self.statsd_host,
                                                   port=self.statsd_port,
                                                   sample_rate=0.5,
                                                   disabled=self.statsd_disabled)
        self.statsd = statsd.Client("hydra_worker", self.statsd_connection)
        self.statsd_timers = {}
        self.statsd_counter = self.statsd.get_client(class_=statsd.Counter)
        self.get_info = self.time(self.get_info)
        utils.find_urls = self.time(utils.find_urls)
        self.get_jobs = self.time(self.get_jobs)
        self.done_jobs = self.time(self.done_jobs)
        self.get_tasks = self.time(self.get_tasks)
        self.args = args
        self.threads = []
        self.working = True
        self.break_now = False
        self.jobs = {}
        self.job_lock = threading.Lock()
        self.broker_lock = threading.Lock()
        self.insert_queue = queue()
        self.print_queue = queue()
        self.fill_queue = queue()
        self.robots = RobotsCache()
        self.worker_id = args.worker_id or uuid.uuid1().hex
        self.socket = context.socket(zmq.REQ)
        self.socket.connect(args.broker_address)
        self.get_info()
        self.headers = {"Accept-encoding": "gzip",
                        "User-agent": self.info["u"]}
        if args.db_override:
            self.info["d"] += " host=%s" % args.db_override
        self.database = database.Database(self.info["d"])

    def get_timer(self):
        return self.statsd.get_client(class_=statsd.Timer)

    def time(self, function):
        def wrapper(*args, **kwargs):
            with self.get_timer().time(function.__name__):
                return function(*args, **kwargs)
        return wrapper

    def start(self):
        threaded_jobs = []
        threaded_jobs.append(self.do_heartbeat)
        threaded_jobs.append(self.insert_urls)
        threaded_jobs.append(self.printer)
        threaded_jobs.append(self.fill_jobs)
        for job in threaded_jobs:
            new_thread = threading.Thread(target=job)
            new_thread.daemon = True
            new_thread.start()
            self.threads.append(new_thread)
        self.do_jobs()

    def get_info(self):
        self.socket.send(make_message(BYTE_HELLO, self.worker_id))
        response = check_message(self.socket.recv(), BYTE_HELLO)
        self.info = response[1]

    def get_jobs(self, count=1):
        self.print_queue.put("broker getting jobs")
        jobs = {}
        first_loop = True
        while self.working and len(jobs) < count:
            if not first_loop:
                time.sleep(2)
            first_loop = False
            count_needed = count - len(jobs) - 1
            self.broker_lock.acquire()
            self.socket.send(make_message(BYTE_GET_JOB, self.worker_id, count_needed))
            response = check_message(self.socket.recv(), BYTE_GET_JOB)
            self.broker_lock.release()
            for job, tasks in response[1]:
                job = job.decode("utf8")
                tasks = [[https, task.decode("utf8")] for https, task in tasks]
                jobs[job] = tasks
        for job, tasks in jobs.items():
            # Fetch robots.txt for the job's host and use its crawl delay when available.
            try:
                robots = self.robots.fetch("http://%s" % job, timeout=5)
                sleep = robots.delay(self.info["n"])
            except Exception as e:
                robots = sleep = None
                self.print_queue.put("failed to get robots")
            sleep = sleep or self.info["s"]
            self.jobs[job] = {"tasks": tasks, "robots": robots,
                              "sleep": sleep, "timestamp": 0.0}
            self.insert_queue.put([job, False, job])
        task_count = 0
        for _, tasks in jobs.items():
            task_count += len(tasks)
        self.print_queue.put("broker got %d jobs (%d tasks)" % (count, task_count))

    def done_jobs(self, *jobs):
        self.broker_lock.acquire()
        self.socket.send(make_message(BYTE_JOB_DONE, *jobs))
        self.socket.recv()
        self.broker_lock.release()
        self.job_lock.acquire()
        for job in jobs:
            self.jobs.pop(job)
        self.job_lock.release()

    def get_tasks(self, job):
        if job not in self.jobs:
            return 0
        new_urls = self.database.get_urls(job)
        self.print_queue.put("got tasks for %s (%d)" % (job, len(new_urls)))
        new_urls = [[https, task.decode("utf8")] for https, task in new_urls]
        new_allowed_urls = []
        for new_https, new_url in new_urls:
            task_scheme = "%s%s" % (utils.get_scheme(new_https), new_url)
            # Skip URLs that the host's robots.txt disallows for this agent.
            if not self.jobs[job]["robots"] or self.jobs[job]["robots"].allowed(
                    task_scheme, self.info["n"]):
                new_allowed_urls.append([new_https, new_url])
            else:
                self.database.timestamp(new_url)
        self.jobs[job]["tasks"] += new_urls
        return len(new_urls)

    def fill_jobs(self):
        while self.working:
            job = self.fill_queue.get(True)
            if self.get_tasks(job) == 0 and not self.jobs[job]["tasks"]:
                self.print_queue.put("removing job %s" % job)
                self.done_jobs(job)
                self.get_jobs()

    def yield_tasks(self):
        while self.working:
            tasks = []
            for job in list(self.jobs):
                if not self.jobs[job]["tasks"]:
                    self.fill_queue.put(job)
                    continue
                after_delay = self.jobs[job]["timestamp"] + self.jobs[job]["sleep"]
                time_since = after_delay - time.time()
                if not time_since > 0 or self.jobs[job]["timestamp"] == 0.0:
                    https, task = self.jobs[job]["tasks"].pop(0)
                    if not len(self.jobs[job]["tasks"]) > 0:
                        self.fill_queue.put(job)
                    task_scheme = "%s%s" % (utils.get_scheme(https), task)
                    tasks.append(task_scheme)
                    self.jobs[job]["timestamp"] = time.time()
            if tasks:
                yield tasks

    def printer(self):
        while self.working:
            line = self.print_queue.get(True)
            print datetime.datetime.now(), line

    def insert_urls(self):
        while self.working:
            self.database.start_transaction()
            urls = []
            hostnames = set([])
            for insert in xrange(self.info["b"]):
                try:
                    to_insert = self.insert_queue.get(True, 4)
                except Empty:
                    break
                if to_insert[0] and to_insert[2]:
                    urls.append(to_insert)
                    hostnames.add((to_insert[2],))
                self.insert_queue.task_done()
            if urls and hostnames:
                self.database.insert(urls, hostnames)
            self.database.stop_transaction()

    def do_jobs(self):
        self.get_jobs(self.args.jobs or 2)
        time_before_yield = 0.0
        for tasks in self.yield_tasks():
            if not time_before_yield == 0.0:
                self.print_queue.put("got yield tasks, took %f seconds" % (
                    time.time() - time_before_yield))
            time_before_get = time.time()
            get_requests = dict([(grequests.get(task, timeout=5), task)
                                 for task in tasks])
            grequests.map(get_requests.keys(), stream=True)
            self.print_queue.put("got responses, took %f seconds" % (
                time.time() - time_before_get))
            found_count = 0.0
            get_successful = []
            get_failed = []
            get_wrong_type = []
            get_responses = {}
            time_before_process = time.time()
            for request in list(get_requests):
                original_url = get_requests[request].split("://", 1)[-1]
                response = request.response
                if not response or not response.status_code < 400:
                    get_failed.append(original_url)
                elif not response.headers.get("content-type", "").startswith(
                        "text/"):
                    get_wrong_type.append(original_url)
                else:
                    get_responses[response] = original_url
            self.print_queue.put("finished processing, took %f seconds" % (
                time.time() - time_before_process))
            time_before_responses = time.time()
            gevent.joinall([gevent.spawn(getattr, response, "text")
                            for response in get_responses])
            self.print_queue.put("got second responses, took %f seconds" % (
                time.time() - time_before_responses))
            time_before_second_process = time.time()
            for response in list(get_responses):
                original_url = get_responses[response]
                try:
                    text = response.text
                except:
                    get_failed.append(original_url)
                    continue
                if not text:
                    get_failed.append(original_url)
                else:
                    actual_url = response.url
                    get_successful.append(original_url)
                    found_urls = utils.find_urls(response.text, actual_url)
                    found_count += len(found_urls)
                    for url in found_urls:
                        url_parts = utils.process_url(url)
                        if url_parts:
                            self.statsd_counter.increment("url_found")
                            self.insert_queue.put(url_parts)
            self.print_queue.put("finished second processing, took %f seconds" % (
                time.time() - time_before_second_process))
            time_taken = time.time() - time_before_get
            stats = "tried %d" % len(tasks)
            stats += ", success %d" % len(get_successful)
            stats += ", fail %d" % len(get_failed)
            stats += ", wrong %d" % len(get_wrong_type)
            stats += ", took %f seconds" % time_taken
            if get_successful:
                stats += ", found %d" % found_count
                stats += ", %f/site" % (found_count / len(get_successful))
                stats += ", %f/second" % (found_count / time_taken)
            self.print_queue.put(stats)
            for url in get_successful:
                self.database.timestamp(url)
            for url in get_failed:
                self.database.timestamp(url, 1)
            for url in get_wrong_type:
                self.database.timestamp(url, 2)
            time_before_join = time.time()
            self.insert_queue.join()
            self.print_queue.put("finished insert queue join, took %f seconds" % (
                time.time() - time_before_join))
            time_before_yield = time.time()

    def do_heartbeat(self):
        while self.working:
            time.sleep(self.info["h"])
            request = make_message(BYTE_HEARTBEAT, self.worker_id, *list(self.jobs))
            self.broker_lock.acquire()
            self.socket.send(request)
            response = check_message(self.socket.recv(), BYTE_HEARTBEAT)
            self.broker_lock.release()
            if len(response) > 1 and response[1] == BYTE_GET_JOB:
                for bad_job in response[2:]:
                    self.job_lock.acquire()
                    assert bad_job in self.jobs
                    del self.jobs[bad_job]
                    self.job_lock.release()
                self.get_jobs()
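# A self-contained sketch of the per-host politeness pattern used in
# Worker.get_jobs() above: fetch robots.txt with a timeout, read the
# crawl delay for the worker's agent, and fall back to a default when
# the fetch fails. The host, agent name, and default delay below are
# assumptions, not values from the original snippet.
from reppy.cache import RobotsCache

def politeness_delay(host, agent='my-crawler', default_delay=2.0):
    cache = RobotsCache()
    try:
        rules = cache.fetch("http://%s" % host, timeout=5)
        delay = rules.delay(agent)
    except Exception:
        delay = None
    return delay or default_delay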
#################################################
#################################################
# fetch content of url
while len(url_frontier) != 0:
    # pop any random url
    url = url_frontier.pop()
    try:
        print("\n---------------------------------------------------------")
        print("Crawling:", url)
        print("---------------------------------------------------------")
        # get the rules for this url's robots.txt
        r = robots_cache.fetch(Robots.robots_url(url))[1]
        # check whether it is allowed to crawl that url; if not, skip it
        if not robots_cache.allowed(url, '*'):
            print("This URL is restricted to be crawled.")
            continue
        # insert this link into the database
        cur.execute("INSERT OR IGNORE INTO crawled_urls (url_link) values(?)", (url,))
        # if it is allowed to crawl, then get the crawl delay
        crawl_delay = r.agent("*").delay
        if crawl_delay is not None:
            time.sleep(crawl_delay)
        else:
class Hodor(object):
    def __init__(self, url, config={}, proxies={}, auth=None, ua=DEFAULT_HODOR_UA,
                 pagination_max_limit=DEFAULT_HODOR_MAX_PAGES,
                 crawl_delay=DEFAULT_CRAWL_DELAY, ssl_verify=False,
                 trim_values=True, robots=True, reppy_capacity=100):
        self.content = None
        self.url = url
        self.domain = self._get_domain()
        self.proxies = proxies
        self.auth = auth
        self.ua = ua
        self.trim_values = trim_values
        self.ssl_verify = ssl_verify
        self.config = {}
        self.extra_config = {}
        self.robots = RobotsCache(capacity=reppy_capacity) if robots else None
        self._pages = []
        self._page_count = 0
        self._pagination_max_limit = pagination_max_limit
        self.crawl_delay = self._crawl_delay(crawl_delay)
        for k, v in config.items():
            if k.startswith("_"):
                self.extra_config[k.lstrip("_")] = v
            else:
                self.config[k] = v

    def _get_domain(self):
        parsed_uri = urlparse(self.url)
        return '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)

    def _crawl_delay(self, crawl_delay):
        if self.robots not in EMPTY_VALUES:
            expiry, robots = self.robots.fetch('{}robots.txt'.format(self.domain))
            delay = robots.agent(self.ua).delay
            try:
                crawl_delay = max(filter(partial(is_not, None), [delay, crawl_delay]))
            except ConnectionException:
                pass
        return crawl_delay

    def _fetch(self, url):
        '''Does the requests fetching and stores result in self.content'''
        if self.robots in EMPTY_VALUES or self.robots.allowed(url, self.ua):
            session = requests.session()
            headers = {'User-Agent': self.ua}
            if len(self.proxies) > 0:
                session.proxies = self.proxies
            if self.auth:
                r = session.get(url, headers=headers, auth=self.auth,
                                verify=self.ssl_verify)
            else:
                r = session.get(url, headers=headers, verify=self.ssl_verify)
            self.content = r.content
        return self.content

    @staticmethod
    def _get_value(content, rule):
        '''Returns result for a specific xpath'''
        try:
            tree = html.fromstring(content)
        except TypeError:
            tree = None
        post_processing = rule.get('transform', lambda data: data)
        data = ""
        if tree not in EMPTY_VALUES:
            if 'xpath' in rule:
                data = tree.xpath(rule['xpath'])
            elif 'css' in rule:
                data = [node.text_content() for node in tree.cssselect(rule['css'])]
            many = rule.get('many', True)
            if not many:
                if len(data) == 0:
                    data = None
                else:
                    data = post_processing(data[0])
            else:
                data = [post_processing(d) for d in data]
        return data

    @staticmethod
    def _group_data(data, groups, config):
        del_fields = []
        for dest, group_fields in groups.items():
            if '__all__' in group_fields or group_fields == '__all__':
                group_fields = [rule for rule in config.keys()
                                if not rule.startswith('_')]
                del_fields.extend(group_fields)
            gdata = []
            for field in group_fields:
                gdata.append(data[field])
            data[dest] = []
            for gd in zip(*gdata):
                d = {}
                for i, field in enumerate(group_fields):
                    d[field] = gd[i]
                data[dest].append(d)
        if len(del_fields) == 0:
            del_fields = [field for field_set in groups.values()
                          for field in field_set]
        for field in del_fields:
            if field in data:
                del data[field]

    def _package_pages(self):
        self._data = {}
        if len(self._pages) == 1:
            self._data = self._pages[0]
        else:
            self._data = {key: [] for key in self._pages[0].keys()}
            for page in self._pages:
                for k, v in page.items():
                    if hasattr(v, '__iter__'):
                        self._data[k].extend(v)
                    else:
                        self._data[k].append(v)
        return self._data

    @classmethod
    def _parse(cls, content, config={}, extra_config={}, trim_values=True):
        '''Parses the content based on the config set'''
        if len(config) == 0:
            _data = {'content': content}
        else:
            _data = {}
            try:
                str_class = basestring
            except NameError:
                str_class = str
            for key, rule in config.items():
                value = cls._get_value(content, rule)
                if trim_values and value not in EMPTY_VALUES:
                    if 'many' in rule and rule['many']:
                        value = [v.strip() if isinstance(v, str_class) else v
                                 for v in value]
                    else:
                        value = value.strip() if isinstance(value, str_class) else value
                _data[key] = value
        paginate_by = extra_config.get('paginate_by')
        if paginate_by:
            paginate_by = cls._get_value(content, paginate_by)
        groups = extra_config.get('groups', {})
        if groups:
            cls._group_data(_data, groups, config)
        return _data, paginate_by

    def _get(self, url):
        self._fetch(url)
        data, paginate_by = self._parse(self.content, self.config,
                                        self.extra_config, self.trim_values)
        if paginate_by not in EMPTY_VALUES:
            paginate_by = urljoin(self.domain, paginate_by)
        return data, paginate_by

    def get(self, url=None):
        url = url if url else self.url
        self._data, paginate_by = self._get(url)
        self._pages.append(self._data)
        self._page_count += 1
        if paginate_by and self._page_count < self._pagination_max_limit:
            time.sleep(self.crawl_delay)
            self.get(paginate_by)
        self._package_pages()
        return self._data

    @property
    def data(self):
        if not hasattr(self, '_data'):
            self.get()
        return self._data
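# A hypothetical usage sketch of the Hodor class above. The target URL,
# the xpath rule, and the user agent are assumptions; the 'xpath' and
# 'many' rule keys are taken from _get_value() in the class itself.
config = {
    'title': {'xpath': '//h1/text()', 'many': False},
}
scraper = Hodor('http://example.com/', config=config, ua='my-crawler', robots=True)
result = scraper.get()   # honours robots.txt and the crawl delay via RobotsCache
print(result.get('title'))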
def _fetch_sitemap_from_url(self, url):
    robots = RobotsCache()
    try:
        return robots.fetch(url, timeout=1.5).sitemaps
    except:
        return []
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot'),
                True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=True), None)
            # If we ignore the TTL, it should still be there.
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=False, honor_ttl=False), None)
            # However, if we honor the TTL, it should be missing in the cache.
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=False), None)
            self.robots.min_ttl = old_ttl

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(self.robots.fetch('http://localhost:8080/foo'), None)
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.assertNotEqual(self.robots.cache('http://localhost:8080/foo'), None)
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.add(self.robots.fetch('http://localhost:8080/foo'))
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        with mock.patch.object(self.robots.session, 'get', side_effect=TypeError):
            self.assertRaises(ServerError, self.robots.allowed,
                              'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(
                self.robots.disallowed('http://localhost:8080/foo', 'rogerbot'))
            urls = ['http://localhost:8080/foo', 'http://localhost:8080/bar']
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(
                self.robots.delay('http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a',
                    'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])

    def test_dns_exception(self):
        '''Raises an exception if url does not resolve.'''
        self.assertRaises(ConnectionException, self.robots.allowed,
                          'http://does-not-resolve', 'rogerbot')

    def test_malformed_url(self):
        '''Raises an exception if the url is malformed.'''
        self.assertRaises(MalformedUrl, self.robots.allowed,
                          'hhttp://moz.com', 'rogerbot')

    def test_ssl_exception(self):
        '''Raises an exception if there is an ssl error.'''
        with asis.Server('tests/asis/test_ssl_exception', port=8080):
            self.assertRaises(SSLException, self.robots.allowed,
                              'https://localhost:8080', 'rogerbot')

    def test_excessive_redirects(self):
        '''Raises an exception if there are too many redirects.'''
        with asis.Server('tests/asis/test_excessive_redirects', port=8080):
            self.assertRaises(ExcessiveRedirects, self.robots.allowed,
                              'http://localhost:8080/one', 'rogerbot')

    def test_bad_status_codes(self):
        '''Raises an exception if there is a 5xx status code.'''
        with asis.Server('tests/asis/test_bad_status_codes', port=8080):
            self.assertRaises(BadStatusCode, self.robots.allowed,
                              'http://localhost:8080', 'rogerbot')
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot'),
                True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', True), None)
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', False), None)

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(self.robots.fetch('http://localhost:8080/foo'), None)
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.assertNotEqual(self.robots.cache('http://localhost:8080/foo'), None)
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.add(self.robots.fetch('http://localhost:8080/foo'))
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        self.assertRaises(ServerError, self.robots.allowed,
                          'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(
                self.robots.disallowed('http://localhost:8080/foo', 'rogerbot'))
            urls = ['http://localhost:8080/foo', 'http://localhost:8080/bar']
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(
                self.robots.delay('http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a',
                    'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])