Example #1
    def push(self, request):
        '''Queue a request on its domain's queue, seeding the pld
        queue with an immediately-fetchable entry if that queue was
        previously empty, and return the number of requests pushed.'''
        key = self.getKey(request)
        q = qr.Queue(key)
        with self.req_lock:
            if not len(q):
                with self.pld_lock:
                    self.pldQueue.push_init(key, time.time())
            q.push(request)
        self.remaining += 1
        return 1
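
getKey isn't among these examples, but Example #6 scans Redis for keys matching 'domain:*' and Example #4 pulls the netloc out of request URLs, so a per-domain key of the form 'domain:<hostname>' is the natural reading. A minimal, hypothetical sketch under that assumption:

    import urlparse

    def getKey(self, request):
        # Hypothetical: key each queue by the request URL's hostname,
        # matching the 'domain:*' pattern scanned in Example #6
        return 'domain:%s' % urlparse.urlparse(request.url).netloc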
Example #2
    def setUp(self):
        # Start every test from a clean, empty queue
        r.delete('qrtestqueue')
        self.q = qr.Queue(key='qrtestqueue')
        self.assertEqual(len(self.q), 0)
Example #3
    def __init__(self, key):
        # A queue living on a remote Redis instance
        self.queue = qr.Queue(key, host='120.95.132.153', port=6379)
        self.key = key
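
The queue above supports the small set of operations the other examples lean on: push, pop, peek, and len(). A quick round trip, assuming a Redis server is reachable on localhost at the default port:

    q = qr.Queue('demo-queue', host='localhost', port=6379)
    q.push('hello')
    assert len(q) == 1
    item = q.peek()          # inspect the next element without removing it
    assert q.pop() == item   # pop removes and returns that same element
    assert len(q) == 0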
Example #4
    def pop(self, polite=True):
        '''Get the next request'''
        while True:

            # First, we pop the next thing in pldQueue *if* it's not a
            # premature fetch (and a race condition is not detected).
            with self.pld_lock:
                # Get the next plds we might want to fetch from
                next, when = self.pldQueue.peek(withscores=True)
                if not next:
                    # logger.debug('Nothing in pldQueue.')
                    return None
                # If the next-fetchable is too soon, wait. If we're
                # already waiting, don't schedule a double callLater.
                now = time.time()
                if polite and when > now:
                    with self.twi_lock:
                        if not (self.timer and self.timer.active()):
                            logger.debug('Waiting %f seconds on %s' %
                                         (when - now, next))
                            self.timer = reactor.callLater(
                                when - now, self.serveNext)
                    return None
                # If we get here, we don't need to wait. However, the
                # multithreaded nature of Twisted means that something
                # else might be waiting. Only clear timer if it's not
                # holding some other pending call.
                with self.twi_lock:
                    if not (self.timer and self.timer.active()):
                        self.timer = None
                # We know the time has passed (we peeked) so pop it.
                next = self.pldQueue.pop()

            # Get the queue pertaining to the PLD of interest and
            # acquire a request lock for it.
            q = qr.Queue(next)
            with self.req_lock:
                if len(q):
                    # If we've already saturated our parallel requests, then we'll
                    # wait some short amount of time before we make our next request.
                    # There is logic elsewhere so that if one of these requests
                    # completes before this small amount of time elapses, then it
                    # will be advanced accordingly.
                    if Counter.len(self.r, next) >= self.maxParallelRequests:
                        logger.debug('maxParallelRequests exceeded for %s' %
                                     next)
                        with self.pld_lock:
                            self.pldQueue.push_unique(next, time.time() + 20)
                        continue
                    # If the robots.txt for this request's domain hasn't
                    # been fetched yet, or has expired, we must fetch it
                    # before serving the request
                    v = q.peek()
                    domain = urlparse.urlparse(v.url).netloc
                    robot = reppy.findRobot('http://' + domain)
                    if not self.allowAll and (not robot or robot.expired):
                        logger.debug('Making robots request for %s' % next)
                        r = RobotsRequest('http://' + domain + '/robots.txt')
                        r._originalKey = next
                        # Increment the number of requests we currently have in flight
                        Counter.put(self.r, r)
                        return r
                    else:
                        logger.debug('Popping next request from %s' % next)
                        v = q.pop()
                        # This was the source of a rather difficult-to-track bug
                        # wherein the pld queue would slowly drain, despite there
                        # being plenty of logical queues to draw from. The problem
                        # was introduced by calling urlparse.urljoin when invoking
                        # the request's onURL method. As a result, certain redirects
                        # were making changes to the url, saving it as an updated
                        # value, but we'd then try to pop off the queue for the new
                        # hostname, when in reality, we should pop off the queue
                        # for the original hostname.
                        v._originalKey = next
                        # Increment the number of requests we currently have in flight
                        Counter.put(self.r, v)
                        # At this point, we should also schedule the next request
                        # to this domain.
                        with self.pld_lock:
                            self.pldQueue.push_unique(
                                next,
                                time.time() + self.crawlDelay(v))
                        return v
                else:
                    try:
                        if Counter.len(self.r, next) == 0:
                            logger.debug('Calling onEmptyQueue for %s' % next)
                            self.onEmptyQueue(next)
                            try:
                                with self.pld_lock:
                                    self.pldQueue.clear_ph(next)
                            except ValueError:
                                logger.error(
                                    'pldQueue.clear_ph failed for %s' % next)
                        else:
                            # Otherwise, we should try again in a little bit, and
                            # see if the last request has finished.
                            with self.pld_lock:
                                self.pldQueue.push_unique(
                                    next,
                                    time.time() + 20)
                            logger.debug(
                                'Requests still in flight for %s. Waiting' %
                                next)
                    except Exception:
                        logger.exception('onEmptyQueue failed for %s' % next)
                    continue

        logger.debug('Returning None (should not happen).')
        return None
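
PLDQueue itself is not shown in these examples, but the calls above (peek(withscores=True), pop, push_init, push_unique) read like a Redis sorted set whose score is the earliest time each domain may next be fetched. The sketch below is an assumption about those semantics, not the project's actual implementation; it uses the redis-py 3.x zadd signature and omits clear_ph. Note that pop is not atomic on its own, which is consistent with the caller serializing access through pld_lock:

    import redis

    class SortedSetQueue(object):
        '''Hypothetical stand-in for PLDQueue: a priority queue of
        domains backed by a Redis sorted set, scored by the earliest
        time each domain may next be fetched.'''

        def __init__(self, key, **kwargs):
            self.key = key
            self.r = redis.Redis(**kwargs)

        def push_init(self, member, when):
            # Add only if absent, so an existing schedule isn't clobbered
            self.r.zadd(self.key, {member: when}, nx=True)

        def push_unique(self, member, when):
            # (Re)schedule the member, replacing any existing score
            self.r.zadd(self.key, {member: when})

        def peek(self, withscores=False):
            # Look at the soonest-fetchable domain without removing it
            entries = self.r.zrange(self.key, 0, 0, withscores=withscores)
            if not entries:
                return (None, None) if withscores else None
            return entries[0]

        def pop(self):
            # Remove and return the soonest-fetchable domain
            entries = self.r.zrange(self.key, 0, 0)
            if not entries:
                return None
            self.r.zrem(self.key, entries[0])
            return entries[0]

Counter.len and Counter.put, used above to track in-flight requests per domain, would pair naturally with a plain Redis counter, but their exact semantics aren't visible here.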
Example #5
    def trim(self, request, trim):
        # Trim the queue for this request's domain down to `trim` entries
        with self.req_lock:
            qr.Queue(self.getKey(request)).trim(trim)
Example #6
    def __init__(self,
                 poolSize=10,
                 agent=None,
                 stopWhenDone=False,
                 delay=2,
                 allowAll=False,
                 use_lock=None,
                 **kwargs):

        # First, call the parent constructor
        BaseFetcher.__init__(self, poolSize, agent, stopWhenDone)

        # Import DownpourLock only if use_lock specified, because it uses
        # *NIX-specific features. We use one lock for the pldQueue and one
        # for all the request queues collectively. The latter is a tad
        # overly restrictive, but is far easier than managing hundreds
        # of locks for hundreds of queues.
        if use_lock:
            import DownpourLock
            self.pld_lock = DownpourLock.DownpourLock("%s_pld.lock" % use_lock)
            self.req_lock = DownpourLock.DownpourLock("%s_req.lock" % use_lock)
        else:
            self.pld_lock = threading.RLock()
            self.req_lock = threading.RLock()
        self.twi_lock = threading.RLock()  # Twisted reactor lock

        # Include a priority queue of plds
        self.pldQueue = PLDQueue('plds', **kwargs)
        # Make sure that there is an entry in the plds for
        # each domain waiting to be fetched. Also, include
        # the number of urls from each domain in the count
        # of remaining urls to be fetched.
        self.r = redis.Redis(**kwargs)
        # Redis pipelines batch commands and send them in a single
        # round trip; execute() returns one result per command, in
        # order. Queue an llen for each domain queue, then sum the
        # results to initialize the count of remaining requests.
        with self.r.pipeline() as p:
            for key in self.r.keys('domain:*'):
                with self.pld_lock:
                    self.pldQueue.push_init(key, 0)
                p.llen(key)
            self.remaining = sum(p.execute())
        # For whatever reason, pushing key names back into the
        # priority queue has been problematic. As such, we'll
        # set them aside as they fail, and then retry them at
        # some point. Like when the next request finishes.
        self.retries = []
        # Now make a queue for incoming requests
        self.requests = qr.Queue('request', **kwargs)
        self.delay = float(delay)
        # This is used when we have to impose a delay before
        # servicing the next available request.
        with self.twi_lock:
            self.timer = None
        # This is a way to ignore the allow/disallow directives
        # For example, if you're checking for allow in other places
        self.allowAll = allowAll
        self.userAgentString = reppy.getUserAgentString(self.agent)
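
The pipeline comment in this constructor is worth seeing in isolation: redis-py buffers each queued command, and execute() sends them all in one round trip, returning one result per command in order. A small standalone demonstration, assuming a local Redis with no other 'domain:*' keys present (the key names here are made up for the demo):

    import redis

    r = redis.Redis()
    r.delete('domain:a.example', 'domain:b.example')  # start clean
    r.rpush('domain:a.example', 'url1', 'url2')
    r.rpush('domain:b.example', 'url3')
    with r.pipeline() as p:
        for key in r.keys('domain:*'):
            p.llen(key)               # queued locally, nothing sent yet
        remaining = sum(p.execute())  # one round trip; a list of lengths
    assert remaining == 3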
Example #7

if __name__ == '__main__':
    import logging
    from downpour import BaseRequest

    # Turn on logging
    logger.setLevel(logging.DEBUG)

    # Seed the incoming-request queue; the key must match the
    # 'request' queue the constructor reads from
    q = qr.Queue('request')
    with open('urls.txt') as f:
        for line in f:
            q.push(BaseRequest(line.strip()))

    p = PoliteFetcher(100)
    p.start()