# Assumed imports for this snippet; BaseMiddleware, RobotsDisallowedError, and
# RobotsThrottledError are provided by the surrounding project and are not
# reproduced here.
import collections
import datetime
import urlparse

from reppy.cache import RobotsCache


class RobotsMiddleware(BaseMiddleware):
    def __init__(self, *args, **kwargs):
        self.cache = RobotsCache(*args, **kwargs)
        # Most recent visit time, keyed by user agent and then by hostname.
        self.visited = collections.defaultdict(dict)

    def check_disallow(self, url, agent):
        # Refuse to fetch URLs that robots.txt disallows for this agent.
        if not self.cache.allowed(url, agent):
            raise RobotsDisallowedError

    def check_crawl_delay(self, url, agent):
        # Enforce the site's Crawl-delay directive, if it declares one.
        delay = self.cache.delay(url, agent)
        if delay is None:
            return
        now = datetime.datetime.utcnow()
        host = urlparse.urlparse(url).hostname
        try:
            last_visit = self.visited[agent][host]
            if (now - last_visit).seconds < delay:
                raise RobotsThrottledError
        except KeyError:
            # First visit to this host for this agent; nothing to throttle.
            pass
        self.visited[agent][host] = now

    def before_send(self, request, *args, **kwargs):
        url = request.url
        agent = request.headers.get('User-Agent')
        self.check_disallow(url, agent)
        self.check_crawl_delay(url, agent)
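For orientation, here is a minimal, standalone sketch of the policy the middleware above enforces, using only reppy's RobotsCache and requests. The target URL and user agent are hypothetical placeholders, and the plain time.sleep throttle is a simplification of the middleware's per-agent, per-host timestamp bookkeeping.

# Minimal sketch (assumed usage): consult allowed() and delay() before fetching.
# The URL and user agent below are hypothetical placeholders.
import time

import requests
from reppy.cache import RobotsCache

cache = RobotsCache()
url = 'http://example.com/some/page'    # hypothetical target
agent = 'example-crawler'               # hypothetical user agent

if not cache.allowed(url, agent):
    raise RuntimeError('robots.txt disallows fetching {}'.format(url))

delay = cache.delay(url, agent)
if delay is not None:
    # Crude throttle: sleep the full crawl-delay before every request.
    time.sleep(delay)

response = requests.get(url, headers={'User-Agent': agent})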
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(self.robots.allowed(
                'http://localhost:8080/foo', 'rogerbot'), True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', True), None)
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', False), None)
            self.robots.min_ttl = old_ttl

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(
                self.robots.fetch('http://localhost:8080/foo'), None)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.cache('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.add(self.robots.fetch(
                'http://localhost:8080/foo'))
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        self.assertRaises(ServerError, self.robots.allowed,
                          'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(self.robots.disallowed(
                'http://localhost:8080/foo', 'rogerbot'))
            urls = [
                'http://localhost:8080/foo',
                'http://localhost:8080/bar'
            ]
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(self.robots.delay(
                'http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a',
                    'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])
""" """ # Imports import json import time import requests import urlparse doi_url = 'http://dx.doi.org/' # Get crawl-delay parameter from robots.txt from reppy.cache import RobotsCache robots = RobotsCache() doi_delay = robots.delay(doi_url, '*') def doi_to_csl(doi): """ Fetch CSL-formatted reference by DOI. """ # Build URL url = urlparse.urljoin(doi_url, doi) # Send request req = requests.get( url, headers={ 'accept' : 'application/citeproc+json' } ) # Wait for crawl-delay
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot'),
                True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', True), None)
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', False), None)

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(self.robots.fetch('http://localhost:8080/foo'), None)
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.assertNotEqual(self.robots.cache('http://localhost:8080/foo'), None)
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.add(self.robots.fetch('http://localhost:8080/foo'))
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        self.assertRaises(ServerError, self.robots.allowed,
                          'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(
                self.robots.disallowed('http://localhost:8080/foo', 'rogerbot'))
            urls = ['http://localhost:8080/foo', 'http://localhost:8080/bar']
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(
                self.robots.delay('http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a',
                    'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])
class Archiver(object):
    ARCHIVE_SUBFORUM_SUBURL_TEMPLATE = 'index.php/f-{forum_code}.html'
    ARCHIVE_SUBFORUM_SUBURL_RE_TEMPLATE = 'index.php/f-{forum_code}[^(.html)]?.html'
    ARCHIVE_THREAD_SUBURL_RE = 'index.php/t-[^(.html)]*.html'
    ARCHIVE_CSS_RE = '[^(.css)]*.css'

    def __init__(self, base_url, forum_codes, archive_location, user_agent,
                 worker_count):
        archiver_logger.info('Archiver initialized.')
        self.base_url = base_url
        self.archive_base_url = urljoin(self.base_url, ScraperConfig.ARCHIVE_SUBURL)
        self.forum_codes = forum_codes
        self.archive_location = archive_location
        self.user_agent = user_agent
        self.robot_parser = RobotsCache()
        self.scraper_timer = None
        self.shutdown_event = threading.Event()
        self.delay_time = 1
        self.workers = []
        self.worker_count = worker_count
        self.pages_need_visiting = Queue()
        self.pages_need_analysis_counter = RachetingCounter()
        self.pages_visited_lock = threading.Lock()
        self.pages_visited = []
        self.page_re_filters = []

    def setup(self):
        archiver_logger.info('Beginning Archiver setup.')
        success = True
        archiver_logger.info('Building page filters.')
        # Build regular expression filters for pages to attempt to crawl.
        archive_base_url = self.archive_base_url
        # Build regular expression for sub-forums we're interested in.
        for forum_code in self.forum_codes:
            regex = urljoin(
                archive_base_url,
                self.ARCHIVE_SUBFORUM_SUBURL_RE_TEMPLATE.format(
                    forum_code=forum_code))
            self.page_re_filters.append(re.compile(regex))
        # Add a regular expression for thread pages.
        thread_regex = urljoin(archive_base_url, self.ARCHIVE_THREAD_SUBURL_RE)
        self.page_re_filters.append(re.compile(thread_regex))
        # Finally add a regular expression to grab the archive CSS.
        css_regex = urljoin(archive_base_url, self.ARCHIVE_CSS_RE)
        self.page_re_filters.append(re.compile(css_regex))

        archiver_logger.info('Adding seed pages.')
        for fc in self.forum_codes:
            subforum_url = urljoin(
                self.archive_base_url,
                self.ARCHIVE_SUBFORUM_SUBURL_TEMPLATE.format(forum_code=fc))
            self.pages_need_visiting.put(subforum_url)
            self.pages_need_analysis_counter.increment()
            archiver_logger.info(
                'Archiver seeded with page {}.'.format(subforum_url))

        archiver_logger.info('Checking archive location...')
        # Setup archive location.
        base_path, new_archive = os.path.split(self.archive_location)
        if not os.path.exists(base_path) or not os.path.isdir(base_path):
            success = False
            archiver_logger.error(
                'Base path {} does not exist or is not a directory! '
                'Aborting!'.format(base_path))
            return success
        elif (os.path.exists(self.archive_location)
                and (not os.path.isdir(self.archive_location)
                     or os.listdir(self.archive_location))):
            success = False
            archiver_logger.error(
                'Archive location {} is either not a directory or is not '
                'empty! Aborting!'.format(self.archive_location))
            return success
        elif not os.path.exists(self.archive_location):
            archiver_logger.info('Creating archive directory {}.'.format(
                self.archive_location))
            try:
                os.mkdir(self.archive_location)
            except OSError:
                success = False
                archiver_logger.exception(
                    'Faulted attempting to create archive directory! Aborting!')
                return success
        else:
            archiver_logger.info(
                'Empty archive directory {} exists. Proceeding...'.format(
                    self.archive_location))

        # Attempt to retrieve robots.txt information about target site.
        if not self.robot_parser.allowed(self.base_url, self.user_agent):
            success = False
            archiver_logger.error('Not allowed to scrape {}! Aborting!'.format(
                self.base_url))
            return success
        else:
            archiver_logger.info(
                'Successfully polled {} for robots.txt, can scrape.'.format(
                    self.base_url))

        # Get crawl delay and build scraper timer.
        delay_time = self.robot_parser.delay(self.base_url, self.user_agent)
        if delay_time:
            archiver_logger.info(
                'Site crawl-delay: {} seconds.'.format(delay_time))
        else:
            delay_time = ScraperConfig.DEFAULT_CRAWL_DELAY
            archiver_logger.info(
                'No crawl delay for this site. Using default crawl delay of '
                '{} seconds.'.format(delay_time))
        archiver_logger.info('Initializing Scraper timer.')
        self.scraper_timer = ScraperTimer(delay_time)
        self.delay_time = delay_time

        if success:
            archiver_logger.info('Archiver setup success!')
        else:
            archiver_logger.error('Archiver setup failure! Check logs!')

        archiver_logger.info('Building workers...')
        for i in xrange(self.worker_count):
            archiver_logger.info('Adding worker {}.'.format(i + 1))
            worker = ArchiverWorker(
                self.shutdown_event, self.user_agent, self.robot_parser,
                self.scraper_timer, self.pages_need_visiting,
                self.pages_visited, self.pages_visited_lock,
                self.page_re_filters, self.pages_need_analysis_counter,
                self.archive_location)
            worker.daemon = True
            self.workers.append(worker)
        return success

    def run(self):
        archiver_logger.info('Starting workers...')
        for worker in self.workers:
            worker.start()
        while not self.pages_need_analysis_counter.empty():
            time.sleep(0.1)
        archiver_logger.info(
            'Finished archiving all possible pages. Shutting down.')
        archiver_logger.info('Waiting for threads to finish up.')
        self.shutdown_event.set()
        self.scraper_timer.wait()
        return True

    def teardown(self):
        if not self.shutdown_event.is_set():
            self.shutdown_event.set()
        return True
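A hedged sketch of how the Archiver's lifecycle (setup, run, teardown) might be driven. Every constructor argument below is a placeholder, and the surrounding project must still supply ScraperConfig, ScraperTimer, ArchiverWorker, RachetingCounter, and archiver_logger.

# Hypothetical driver; all argument values are placeholders.
archiver = Archiver(
    base_url='http://forum.example.com/',
    forum_codes=['12', '34'],
    archive_location='/tmp/forum-archive',
    user_agent='archiver-bot',
    worker_count=4)
try:
    if archiver.setup():
        archiver.run()
finally:
    archiver.teardown()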
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot'),
                True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=True), None)
            # If we ignore the TTL, it should still be there.
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=False, honor_ttl=False), None)
            # However, if we honor the TTL, it should be missing in the cache.
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=False), None)
            self.robots.min_ttl = old_ttl

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(self.robots.fetch('http://localhost:8080/foo'), None)
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.assertNotEqual(self.robots.cache('http://localhost:8080/foo'), None)
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'), None)
            self.robots.add(self.robots.fetch('http://localhost:8080/foo'))
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'), None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        with mock.patch.object(self.robots.session, 'get', side_effect=TypeError):
            self.assertRaises(ServerError, self.robots.allowed,
                              'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(
                self.robots.disallowed('http://localhost:8080/foo', 'rogerbot'))
            urls = ['http://localhost:8080/foo', 'http://localhost:8080/bar']
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(
                self.robots.delay('http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a',
                    'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])

    def test_dns_exception(self):
        '''Raises an exception if url does not resolve.'''
        self.assertRaises(ConnectionException, self.robots.allowed,
                          'http://does-not-resolve', 'rogerbot')

    def test_malformed_url(self):
        '''Raises an exception if the url is malformed.'''
        self.assertRaises(MalformedUrl, self.robots.allowed,
                          'hhttp://moz.com', 'rogerbot')

    def test_ssl_exception(self):
        '''Raises an exception if there is an ssl error.'''
        with asis.Server('tests/asis/test_ssl_exception', port=8080):
            self.assertRaises(SSLException, self.robots.allowed,
                              'https://localhost:8080', 'rogerbot')

    def test_excessive_redirects(self):
        '''Raises an exception if there are too many redirects.'''
        with asis.Server('tests/asis/test_excessive_redirects', port=8080):
            self.assertRaises(ExcessiveRedirects, self.robots.allowed,
                              'http://localhost:8080/one', 'rogerbot')

    def test_bad_status_codes(self):
        '''Raises an exception if there is a 5xx status code.'''
        with asis.Server('tests/asis/test_bad_status_codes', port=8080):
            self.assertRaises(BadStatusCode, self.robots.allowed,
                              'http://localhost:8080', 'rogerbot')