Example 1
class RobotsMiddleware(BaseMiddleware):

    def __init__(self, *args, **kwargs):
        self.cache = RobotsCache(*args, **kwargs)
        self.visited = collections.defaultdict(dict)

    def check_disallow(self, url, agent):
        if not self.cache.allowed(url, agent):
            raise RobotsDisallowedError

    def check_crawl_delay(self, url, agent):
        delay = self.cache.delay(url, agent)
        if delay is None:
            return
        now = datetime.datetime.utcnow()
        host = urlparse.urlparse(url).hostname
        try:
            last_visit = self.visited[agent][host]
            # total_seconds() avoids wrapping when more than a day has passed
            if (now - last_visit).total_seconds() < delay:
                raise RobotsThrottledError
        except KeyError:
            pass
        self.visited[agent][host] = now

    def before_send(self, request, *args, **kwargs):
        url = request.url
        agent = request.headers.get('User-Agent')
        self.check_disallow(url, agent)
        self.check_crawl_delay(url, agent)
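
A minimal usage sketch for the middleware above, assuming it can be constructed directly and driven by hand. The stand-in request object is hypothetical; RobotsCache, RobotsDisallowedError, and RobotsThrottledError are taken from the surrounding project.

# Hypothetical driver for RobotsMiddleware; FakeRequest stands in for
# whatever request type the real pipeline passes to before_send().
import collections

FakeRequest = collections.namedtuple('FakeRequest', ['url', 'headers'])

middleware = RobotsMiddleware()
request = FakeRequest(url='http://example.com/some/page',
                      headers={'User-Agent': 'my-crawler'})

try:
    # Raises RobotsDisallowedError if robots.txt forbids the URL, or
    # RobotsThrottledError if the host was visited within its crawl-delay.
    middleware.before_send(request)
except (RobotsDisallowedError, RobotsThrottledError):
    pass  # skip or reschedule the fetch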
Example 2
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(self.robots.allowed(
                'http://localhost:8080/foo', 'rogerbot'), True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', True), None)
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', False), None)
            self.robots.min_ttl = old_ttl

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(
                self.robots.fetch('http://localhost:8080/foo'), None)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.cache('http://localhost:8080/foo'), None)
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo'), None)
            self.robots.add(self.robots.fetch(
                'http://localhost:8080/foo'))
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo'), None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        self.assertRaises(ServerError, self.robots.allowed,
            'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(self.robots.disallowed(
                'http://localhost:8080/foo', 'rogerbot'))
            urls = [
                'http://localhost:8080/foo',
                'http://localhost:8080/bar'
            ]
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(self.robots.delay(
                'http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a',
                    'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])
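
Outside of unittest, the RobotsCache calls the suite above exercises look roughly like the sketch below; the URL and agent string are placeholders, and a reachable robots.txt is assumed.

from reppy.cache import RobotsCache

robots = RobotsCache()
url = 'http://example.com/page'   # placeholder URL
agent = 'my-crawler'              # placeholder user agent

if robots.allowed(url, agent):
    delay = robots.delay(url, agent)   # crawl-delay in seconds, or None
    sitemaps = robots.sitemaps(url)    # list of sitemap URLs, possibly empty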
Example 3
"""
"""

# Imports
import json
import time
import requests
import urlparse

doi_url = 'http://dx.doi.org/'

# Get crawl-delay parameter from robots.txt
from reppy.cache import RobotsCache
robots = RobotsCache()
doi_delay = robots.delay(doi_url, '*')

def doi_to_csl(doi):
    """ Fetch CSL-formatted reference by DOI. """

    # Build URL
    url = urlparse.urljoin(doi_url, doi)
    
    # Send request
    req = requests.get(
        url,
        headers={'accept': 'application/citeproc+json'}
    )

    # Wait for crawl-delay before issuing another request
    if doi_delay:
        time.sleep(doi_delay)

    # Return the parsed CSL (citeproc) JSON
    return req.json()
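
A brief usage sketch for the helper above; the DOI string is a placeholder rather than a real reference, and network access to dx.doi.org is assumed.

if __name__ == '__main__':
    # Placeholder DOI, for illustration only
    csl = doi_to_csl('10.1234/placeholder')
    print(csl.get('title'))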
Example 4
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot'),
                True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                                 None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo', True), None)
            # Now, it shouldn't be cached, so when we find it again, it should
            # be missing (or at least, requiring a refetch)
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo', False), None)

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(self.robots.fetch('http://localhost:8080/foo'),
                                None)
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.assertNotEqual(self.robots.cache('http://localhost:8080/foo'),
                                None)
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.add(self.robots.fetch('http://localhost:8080/foo'))
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        self.assertRaises(ServerError, self.robots.allowed,
                          'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(
                self.robots.disallowed('http://localhost:8080/foo',
                                       'rogerbot'))
            urls = ['http://localhost:8080/foo', 'http://localhost:8080/bar']
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(
                self.robots.delay('http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a', 'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])
Example 5

class Archiver(object):
    ARCHIVE_SUBFORUM_SUBURL_TEMPLATE = 'index.php/f-{forum_code}.html'
    ARCHIVE_SUBFORUM_SUBURL_RE_TEMPLATE = 'index.php/f-{forum_code}[^(.html)]?.html'
    ARCHIVE_THREAD_SUBURL_RE = 'index.php/t-[^(.html)]*.html'
    ARCHIVE_CSS_RE = '[^(.css)]*.css'

    def __init__(self, base_url, forum_codes, archive_location, user_agent,
                 worker_count):
        archiver_logger.info('Archiver initialized.')
        self.base_url = base_url
        self.archive_base_url = urljoin(self.base_url,
                                        ScraperConfig.ARCHIVE_SUBURL)
        self.forum_codes = forum_codes
        self.archive_location = archive_location
        self.user_agent = user_agent
        self.robot_parser = RobotsCache()
        self.scraper_timer = None
        self.shutdown_event = threading.Event()
        self.delay_time = 1

        self.workers = []
        self.worker_count = worker_count

        self.pages_need_visiting = Queue()
        self.pages_need_analysis_counter = RachetingCounter()
        self.pages_visited_lock = threading.Lock()
        self.pages_visited = []
        self.page_re_filters = []

    def setup(self):
        archiver_logger.info('Beginning Archiver setup.')
        success = True

        archiver_logger.info('Building page filters.')
        # Build regular expression filters for pages to attempt to crawl.
        archive_base_url = self.archive_base_url

        # Build regular expression for sub-forums we're interested in.
        for forum_code in self.forum_codes:
            regex = urljoin(
                archive_base_url,
                self.ARCHIVE_SUBFORUM_SUBURL_RE_TEMPLATE.format(
                    forum_code=forum_code))
            self.page_re_filters.append(re.compile(regex))

        # Add a regular expression for thread pages.
        thread_regex = urljoin(archive_base_url, self.ARCHIVE_THREAD_SUBURL_RE)
        self.page_re_filters.append(re.compile(thread_regex))

        # Finally add a regular expression to grab the archive CSS.
        css_regex = urljoin(archive_base_url, self.ARCHIVE_CSS_RE)
        self.page_re_filters.append(re.compile(css_regex))

        archiver_logger.info('Adding seed pages.')
        for fc in self.forum_codes:
            subforum_url = urljoin(
                self.archive_base_url,
                self.ARCHIVE_SUBFORUM_SUBURL_TEMPLATE.format(forum_code=fc))
            self.pages_need_visiting.put(subforum_url)
            self.pages_need_analysis_counter.increment()
            archiver_logger.info(
                'Archiver seeded with page {}.'.format(subforum_url))

        archiver_logger.info('Checking archive location...')
        # Setup archive location.
        base_path, new_archive = os.path.split(self.archive_location)
        if not os.path.exists(base_path) or not os.path.isdir(base_path):
            success = False
            archiver_logger.error(
                'Base path {} does not exist or is not a directory! '
                'Aborting!'.format(base_path))
            return success
        elif (os.path.exists(self.archive_location)
              and (not os.path.isdir(self.archive_location)
                   or os.listdir(self.archive_location))):
            success = False
            archiver_logger.error(
                'Archive location {} is either not a directory or is not '
                'empty! Aborting!'.format(self.archive_location))
            return success
        elif not os.path.exists(self.archive_location):
            archiver_logger.info('Creating archive directory {}.'.format(
                self.archive_location))
            try:
                os.mkdir(self.archive_location)
            except OSError:
                success = False
                archiver_logger.exception(
                    'Faulted attempting to create archive directory! Aborting!'
                )
                return success
        else:
            archiver_logger.info(
                'Empty archive directory {} exists. Proceeding...'.format(
                    self.archive_location))

        # Attempt to retrieve robots.txt information about target site.
        if not self.robot_parser.allowed(self.base_url, self.user_agent):
            success = False
            archiver_logger.error('Not allowed to scrape {}! Aborting!'.format(
                self.base_url))
            return success
        else:
            archiver_logger.info(
                'Successfully polled {} for robots.txt, can scrape.'.format(
                    self.base_url))

        # Get crawl delay and build scraper timer.
        delay_time = self.robot_parser.delay(self.base_url, self.user_agent)
        if delay_time:
            archiver_logger.info(
                'Site crawl-delay: {} seconds.'.format(delay_time))

        else:
            delay_time = ScraperConfig.DEFAULT_CRAWL_DELAY
            archiver_logger.info(
                'No crawl delay for this site. Using default crawl delay of {} seconds.'
                ''.format(delay_time))
        archiver_logger.info('Initializing Scraper timer.')
        self.scraper_timer = ScraperTimer(delay_time)
        self.delay_time = delay_time
        if success:
            archiver_logger.info('Archiver setup success!')
        else:
            archiver_logger.error('Archiver setup failure! Check logs!')
        archiver_logger.info('Building workers...')
        for i in xrange(self.worker_count):
            archiver_logger.info('Adding worker {}.'.format(i + 1))
            worker = ArchiverWorker(
                self.shutdown_event, self.user_agent, self.robot_parser,
                self.scraper_timer, self.pages_need_visiting,
                self.pages_visited, self.pages_visited_lock,
                self.page_re_filters, self.pages_need_analysis_counter,
                self.archive_location)
            worker.daemon = True
            self.workers.append(worker)
        return success

    def run(self):
        archiver_logger.info('Starting workers...')
        for worker in self.workers:
            worker.start()
        while not self.pages_need_analysis_counter.empty():
            time.sleep(0.1)
        archiver_logger.info(
            'Finished archiving all possible pages. Shutting down.')
        archiver_logger.info('Waiting for threads to finish up.')
        self.shutdown_event.set()
        self.scraper_timer.wait()
        return True

    def teardown(self):
        if not self.shutdown_event.is_set():
            self.shutdown_event.set()
        return True
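
A hedged sketch of driving the Archiver above; the constructor arguments are placeholders inferred from its signature, and ScraperConfig, ArchiverWorker, and the logging setup it relies on are assumed to come from the surrounding project.

# Hypothetical entry point; every value here is a placeholder.
archiver = Archiver(
    base_url='http://forum.example.com/',
    forum_codes=['1', '2'],
    archive_location='/tmp/forum-archive',
    user_agent='archive-bot',
    worker_count=4)

if archiver.setup():
    try:
        archiver.run()
    finally:
        archiver.teardown()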
Example 6
class TestCache(unittest.TestCase):
    def setUp(self):
        self.robots = RobotsCache()

    def test_404(self):
        '''When we get a 404, assume free range'''
        with asis.Server('tests/asis/test_404', port=8080):
            self.assertEqual(
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot'),
                True)

    def test_caching(self):
        '''We should be able to cache results'''
        with asis.Server('tests/asis/test_caching', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_context_manager(self):
        '''When using as a context manager, it should clear afterwards'''
        with asis.Server('tests/asis/test_context_manager', port=8080):
            with self.robots:
                self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                                 None)
                self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
                self.assertNotEqual(
                    self.robots.find('http://localhost:8080/foo'), None)
            # And now, we should have it no longer cached
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_expires(self):
        '''Should be able to recognize expired rules'''
        with asis.Server('tests/asis/test_expires', port=8080):
            old_ttl = self.robots.min_ttl
            self.robots.min_ttl = 0
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=True), None)
            # If we ignore the TTL, it should still be there.
            self.assertNotEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=False,
                                 honor_ttl=False), None)
            # However, if we honor the TTL, it should be missing in the cache.
            self.assertEqual(
                self.robots.find('http://localhost:8080/foo',
                                 fetch_if_missing=False), None)
            self.robots.min_ttl = old_ttl

    def test_clear(self):
        '''Should be able to explicitly clear rules'''
        with asis.Server('tests/asis/test_clear', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.allowed('http://localhost:8080/foo', 'rogerbot')
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)
            # Now if we clear the rules, we should not find it
            self.robots.clear()
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_fetch(self):
        '''Ensure that 'fetch' doesn't cache'''
        with asis.Server('tests/asis/test_fetch', port=8080):
            self.assertNotEqual(self.robots.fetch('http://localhost:8080/foo'),
                                None)
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)

    def test_cache(self):
        '''Ensure we can ask it to cache a result'''
        with asis.Server('tests/asis/test_cache', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.assertNotEqual(self.robots.cache('http://localhost:8080/foo'),
                                None)
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_add(self):
        '''We should be able to add rules that we get'''
        with asis.Server('tests/asis/test_add', port=8080):
            self.assertEqual(self.robots.find('http://localhost:8080/foo'),
                             None)
            self.robots.add(self.robots.fetch('http://localhost:8080/foo'))
            self.assertNotEqual(self.robots.find('http://localhost:8080/foo'),
                                None)

    def test_server_error(self):
        '''Make sure we can catch server errors'''
        with mock.patch.object(self.robots.session,
                               'get',
                               side_effect=TypeError):
            self.assertRaises(ServerError, self.robots.allowed,
                              'http://localhost:8080/foo', 'rogerbot')

    def test_disallowed(self):
        '''Check the disallowed interface'''
        with asis.Server('tests/asis/test_disallowed', port=8080):
            self.assertFalse(
                self.robots.disallowed('http://localhost:8080/foo',
                                       'rogerbot'))
            urls = ['http://localhost:8080/foo', 'http://localhost:8080/bar']
            self.assertEqual(self.robots.allowed(urls, 'rogerbot'), urls)
            self.assertEqual(self.robots.disallowed(urls, 'rogerbot'), [])

    def test_delay(self):
        '''Check the delay interface'''
        with asis.Server('tests/asis/test_delay', port=8080):
            self.assertEqual(
                self.robots.delay('http://localhost:8080/foo', 'rogerbot'), 5)

    def test_sitemaps(self):
        '''Check the sitemaps interface'''
        with asis.Server('tests/asis/test_sitemaps', port=8080):
            self.assertEqual(
                self.robots.sitemaps('http://localhost:8080/foo'), [
                    'http://localhost:8080/a', 'http://localhost:8080/b',
                    'http://localhost:8080/c'
                ])

    def test_dns_exception(self):
        '''Raises an exception if url does not resolve.'''
        self.assertRaises(ConnectionException, self.robots.allowed,
                          'http://does-not-resolve', 'rogerbot')

    def test_malformed_url(self):
        '''Raises an exception if the url is malformed.'''
        self.assertRaises(MalformedUrl, self.robots.allowed, 'hhttp://moz.com',
                          'rogerbot')

    def test_ssl_exception(self):
        '''Raises an exception if there is an ssl error.'''
        with asis.Server('tests/asis/test_ssl_exception', port=8080):
            self.assertRaises(SSLException, self.robots.allowed,
                              'https://localhost:8080', 'rogerbot')

    def test_excessive_redirects(self):
        '''Raises an exception if there are too many redirects.'''
        with asis.Server('tests/asis/test_excessive_redirects', port=8080):
            self.assertRaises(ExcessiveRedirects, self.robots.allowed,
                              'http://localhost:8080/one', 'rogerbot')

    def test_bad_status_codes(self):
        '''Raises an exception if there is a 5xx status code.'''
        with asis.Server('tests/asis/test_bad_status_codes', port=8080):
            self.assertRaises(BadStatusCode, self.robots.allowed,
                              'http://localhost:8080', 'rogerbot')