Example #1
0
class RobotCheck(Step):
    """Pipeline step that checks a task's URL path against robots.txt rules.

    Matchers are built once per ``<scheme>://<host>/robots.txt`` URL and kept
    in the class-level ``cache``, so they are shared by every instance.
    """

    # Matchers keyed by robots.txt URL.
    cache = LRUCache(1000)

    @classmethod
    def post_save_clear(cls, sender, document, **kwargs):
        """Drop the cached matchers (http and https) for ``document.site``.

        ``sender`` and ``kwargs`` are accepted but not used.
        """
        logging.debug("Removing robots.txt cache information for: %s", document.site)

        for scheme in ('http', 'https'):
            url = "%s://%s/robots.txt" % (scheme, document.site)
            if url in cls.cache:
                del cls.cache[url]

    def __init__(self, settings, **kwargs):
        """Initialization: keep ``settings`` and build the fetch/store steps."""
        self.settings = settings
        self.fetch    = Fetch(settings)
        self.store    = StoreResponse(settings)

    @gen.engine
    def process(self, task, callback=None, **kwargs):
        """Decide whether ``task`` may be crawled.

        Calls ``callback`` with ``(Step.CONTINUE, task)`` when the matcher
        allows ``task.url_path`` and with ``(Step.STOP, task)`` otherwise.
        The matcher is taken from ``cache`` or built (and cached) on a miss.
        """
        url = "%s://%s/robots.txt" % (task.url_scheme, task.url_host)

        if url in self.cache:
            matcher = self.cache[url]
        else:
            matcher = self.cache[url] = yield gen.Task(self.build_matcher, url)
            # TODO - Get Crawl Delay ``matcher.get_crawl_delay()``

        if matcher.is_allowed_path(task.url_path):
            callback((Step.CONTINUE, task))
        else:
            callback((Step.STOP, task))

    @gen.engine
    def build_matcher(self, url, callback):
        """Fetch and parse the robots.txt at ``url`` and pass a matcher to ``callback``.

        Stored ``RobotRule`` entries for the host are handed to the parser as
        extra rules, ordered by their ``order`` field. Any exception raised
        while constructing the parser is logged and re-raised.
        """
        task = Task(url)

        # Each extra rule is an (action, path, order) tuple, sorted by order.
        extra_rules = sorted(
            (('allow' if rule.flag else 'deny', rule.path, rule.order)
             for rule in RobotRule.objects(site=task.url_host)),
            key=lambda entry: entry[2])

        try:
            parser = RobotParser(useragent=self.settings.USER_AGENT, extra_rules=extra_rules)
        except Exception:
            # logging.exception records the message together with the active
            # traceback; the bare raise keeps that traceback intact.
            logging.exception("Exception building robot parser")
            raise

        yield gen.Task(self.fetch.process, task)

        # Save the robots.txt
        yield gen.Task(self.store.process, task)

        # Without content only the extra rules are in effect.
        if task.content:
            parser.parse(task.content)

        matcher = parser.matcher(self.settings.ROBOT_NAME)

        callback(matcher)
Example #2
0
    def __init__(self, redis):
        """Store the ``redis`` handle and create the queue's bookkeeping state."""
        super(RedisQueue, self).__init__()

        self.redis = redis
        self._owned = dict()
        self._seen_cache = LRUCache(10000)

        # Function-scope imports, used only to build the identifier below.
        import socket
        import os

        # Identifier of the form "<hostname>:<pid>".
        hostname = socket.gethostname()
        pid = os.getpid()
        self.guid = "%s:%s" % (hostname, pid)

        self._known_buckets = set()
Example #3
0
class DNSHandler(object):
    """Cache DNS Names - this is plugged into the Tornado Async Client"""
    def __init__(self):
        # Resolved addresses keyed by host name.
        self.cache = LRUCache(1000)

    def get(self, host, default=None):
        """Mimic a dictionary ``get``, resolving ``host`` on a cache miss.

        Returns the cached address when one is present. Otherwise the host
        is resolved with ``socket.getaddrinfo`` (a blocking call), the first
        result's address is cached and returned.

        If the host cannot be resolved (``socket.gaierror``) or cannot be
        encoded as a host name (``UnicodeError``), ``default`` is returned
        when the caller supplied one; with no default the error propagates,
        as it always did.
        """
        addr = self.cache.get(host, None)
        if addr:
            return addr

        try:
            addrinfo = socket.getaddrinfo(host, 80, 0, 0, socket.SOL_TCP)
        except (socket.gaierror, UnicodeError):
            if default is None:
                raise
            # Failures are not cached, so a later call resolves again.
            return default

        # Entry layout: (family, socktype, proto, canonname, sockaddr);
        # sockaddr[0] is the address string.
        address = addrinfo[0][4][0]

        self.cache[host] = address
        return address
Example #4
0
class DNSHandler(object):
    """Cache DNS Names - this is plugged into the Tornado Async Client"""

    def __init__(self):
        # Host name -> resolved address.
        self.cache = LRUCache(1000)

    def get(self, host, default=None):
        """Look up ``host`` like ``dict.get``, resolving it when not cached.

        A cached address is returned directly. On a miss the host is
        resolved synchronously via ``socket.getaddrinfo`` and the address of
        the first entry is stored in the cache and returned.

        When resolution fails with ``socket.gaierror``, or the host name is
        rejected with ``UnicodeError``, a caller-provided ``default`` is
        returned instead of raising. Without a default the exception is
        re-raised, which keeps the previous behaviour for such callers.
        """
        cached = self.cache.get(host, None)
        if cached:
            return cached

        try:
            results = socket.getaddrinfo(host, 80, 0, 0, socket.SOL_TCP)
        except (socket.gaierror, UnicodeError):
            if default is not None:
                # Nothing is cached for a failed lookup.
                return default
            raise

        # Each result is (family, socktype, proto, canonname, sockaddr);
        # the address string is the first element of sockaddr.
        sockaddr = results[0][4]
        resolved = sockaddr[0]

        self.cache[host] = resolved
        return resolved
Example #5
0
 def __init__(self):
     """Attach a fresh ``LRUCache`` to the instance as ``self.cache``."""
     # 1000 is passed straight to LRUCache; presumably its size limit.
     cache_arg = 1000
     self.cache = LRUCache(cache_arg)
Example #6
0
class Fetch(Step):
    """Pipeline step that downloads ``task.url`` with a per-host crawl delay.

    ``cache`` is shared by all instances and maps a host to the earliest
    ``time.time()`` timestamp at which the next request to that host may
    start. ``default_delay`` is the spacing in seconds and can be replaced
    through ``post_save``.
    """

    default_delay = 10
    cache = LRUCache(1000)

    def __init__(self, settings, user_settings=None, **kwargs):
        """Create the HTTP client and, if given, apply ``user_settings``."""
        self.client     = httpclient.AsyncHTTPClient()
        self.use_gzip   = settings.USE_GZIP
        self.user_agent = settings.USER_AGENT
        self.ioloop     = ioloop.IOLoop.instance()

        if user_settings:
            self.post_save(None, user_settings)

    def process(self, task, callback=None, **kwargs):
        """Build the request for ``task`` and fetch it now or on a timer.

        When the host's next allowed time is still in the future the fetch
        is scheduled for exactly that time; otherwise it starts immediately.
        In both cases the host's next allowed time is moved one ``delay``
        past the start of this fetch, so consecutive fetches to one host are
        always at least ``delay`` seconds apart.
        """
        task.request = httpclient.HTTPRequest(task.url, use_gzip=self.use_gzip, user_agent=self.user_agent)

        now = time.time()
        delay = self.delay
        next_allowed = self.cache.get(task.url_host, now)
        if next_allowed > now:
            # Host is busy: run at the reserved slot and reserve the one
            # after it. Storing the scheduled time itself would let a request
            # arriving right after this fetch start without any delay.
            self.cache[task.url_host] = next_allowed + delay
            logging.debug("Fetching on timer in %.2f seconds" % (next_allowed - now))
            self.ioloop.add_timeout(next_allowed, partial(self.fetch, task, callback))
        else:
            logging.debug("Fetcher not busy %r" % (next_allowed))
            self.cache[task.url_host] = now + delay
            self.fetch(task, callback)

    @property
    def delay(self):
        """Current crawl delay in seconds, read from the class attribute."""
        return self.__class__.default_delay

    @gen.engine
    def fetch(self, task, callback):
        """Perform the HTTP request and hand ``(Step.CONTINUE, task)`` to ``callback``.

        Sets ``task.response``; sets ``task.content`` from the response on a
        200, and to ``None`` on any code other than 200/301/302. Redirects
        (301/302) are only logged and leave ``task.content`` untouched.
        """
        logging.debug("Starting fetch of url=%s" % (task.url))
        task.response = yield gen.Task(self.client.fetch, task.request)

        body = task.response.body
        blen = len(body) if body else 0

        # Prefer the advertised content-length; fall back to the body length
        # when the header is missing or not a valid integer.
        try:
            raw_len = int(task.response.headers.get('content-length', blen))
        except (AttributeError, TypeError, ValueError):
            raw_len = blen

        logging.debug("Fetched code=%d len_raw=%d len=%d url=%s" % (task.response.code, raw_len, blen, task.url))
        PageStats.crawled(task.response.code, raw_len)

        if task.response.code == 200:
            task.content = task.content_from_response()
        elif task.response.code in (301, 302):
            logging.error("Unhandled Redirect code=%d url=%s" % (task.response.code, task.url))
        else:
            task.content = None

        callback((Step.CONTINUE, task))

    @classmethod
    def post_save(cls, sender, document, **kwargs):
        """Replace the class-wide delay with ``document.crawl_delay``."""
        cls.default_delay = document.crawl_delay
Example #7
0
 def __init__(self):
     """Initialise the instance with its own ``LRUCache`` in ``self.cache``."""
     # The cache is constructed with the single argument 1000.
     lru = LRUCache(1000)
     self.cache = lru