Example #1
0
    def pop(self, polite=True):
        '''Get the next request to issue, or None if nothing is ready.

        Repeatedly takes the earliest-scheduled key from ``self.pldQueue``
        and tries to produce a request from the ``qr.Queue`` of that name.

        polite -- when true (the default), a key whose scheduled time is
            still in the future is left in place: a ``reactor.callLater``
            to ``self.serveNext`` is arranged, unless one is already
            pending, and None is returned. When false the scheduled time
            is ignored.

        Returns one of:
          * None, if ``pldQueue`` is empty or (in polite mode) the head
            of ``pldQueue`` is not due yet;
          * a ``RobotsRequest`` for ``http://<netloc>/robots.txt`` when
            ``self.allowAll`` is false and ``reppy.findRobot`` yields
            nothing or an expired entry. The queued request is only
            peeked in this case and stays in its queue;
          * otherwise the request popped from the key's queue.

        Every returned request has ``_originalKey`` set to the key it was
        scheduled under and is registered with ``Counter.put``.
        '''
        while True:
            # First, take the next key off pldQueue *if* it is not a
            # premature fetch. The peek and the pop happen under the
            # same hold of pld_lock.
            with self.pld_lock:
                pld, when = self.pldQueue.peek(withscores=True)
                if not pld:
                    return None
                # If the next-fetchable is too soon, wait. If we're
                # already waiting, don't schedule a double callLater.
                now = time.time()
                if polite and when > now:
                    with self.twi_lock:
                        if not (self.timer and self.timer.active()):
                            logger.debug('Waiting %f seconds on %s',
                                         when - now, pld)
                            self.timer = reactor.callLater(
                                when - now, self.serveNext)
                    return None
                # No need to wait. Only forget the timer if it is not
                # holding some other pending call.
                with self.twi_lock:
                    if not (self.timer and self.timer.active()):
                        self.timer = None
                # We know the time has passed (we peeked) so pop it.
                pld = self.pldQueue.pop()

            # Get the queue pertaining to the key of interest and take
            # the request lock while we inspect it.
            q = qr.Queue(pld)
            with self.req_lock:
                if not len(q):
                    # Nothing queued under this key. Any failure in here
                    # is logged and the loop moves on to the next key.
                    try:
                        if Counter.len(self.r, pld) == 0:
                            logger.debug('Calling onEmptyQueue for %s', pld)
                            self.onEmptyQueue(pld)
                            try:
                                with self.pld_lock:
                                    self.pldQueue.clear_ph(pld)
                            except ValueError:
                                logger.error(
                                    'pldQueue.clear_ph failed for %s', pld)
                        else:
                            # Requests are still in flight: look at this
                            # key again 20 seconds from now.
                            with self.pld_lock:
                                self.pldQueue.push_unique(
                                    pld, time.time() + 20)
                            logger.debug(
                                'Requests still in flight for %s. Waiting',
                                pld)
                    except Exception:
                        logger.exception('onEmptyQueue failed for %s', pld)
                    continue

                # If we've already saturated our parallel requests, put
                # the key back 20 seconds out and look at the next one.
                # NOTE(review): with polite=False the same key can be
                # popped again immediately, so this spins until an
                # in-flight request completes -- confirm that is intended.
                if Counter.len(self.r, pld) >= self.maxParallelRequests:
                    logger.debug('maxParallelRequests exceeded for %s', pld)
                    with self.pld_lock:
                        self.pldQueue.push_unique(pld, time.time() + 20)
                    continue

                # If the robots for this particular request is not
                # fetched or it's expired, we have to request it first.
                request = q.peek()
                domain = urlparse.urlparse(request.url).netloc
                robot = reppy.findRobot('http://' + domain)
                if not self.allowAll and (not robot or robot.expired):
                    logger.debug('Making robots request for %s', pld)
                    robots_request = RobotsRequest(
                        'http://' + domain + '/robots.txt')
                    robots_request._originalKey = pld
                    # Increment the number of requests in flight.
                    # NOTE(review): the key is not pushed back onto
                    # pldQueue on this path; presumably the robots
                    # completion handler reschedules it -- confirm.
                    Counter.put(self.r, robots_request)
                    return robots_request

                logger.debug('Popping next request from %s', pld)
                request = q.pop()
                # Remember the key the request was scheduled under. A
                # redirect can rewrite request.url to another hostname,
                # and completion must then be accounted against the
                # original key's queue rather than the new hostname's
                # (this was once a hard-to-track bug where pldQueue
                # slowly drained).
                request._originalKey = pld
                # Increment the number of requests in flight.
                Counter.put(self.r, request)
                # Schedule the next request to this key.
                with self.pld_lock:
                    self.pldQueue.push_unique(
                        pld, time.time() + self.crawlDelay(request))
                return request
Example #2
0
    def pop(self, polite=True):
        '''Get the next request to issue, or None if nothing is ready.

        Each pass of the loop takes the earliest-scheduled key from
        ``self.pldQueue`` and tries to produce a request from the
        ``qr.Queue`` of that name, moving on to the next key when this
        one has nothing to offer right now.

        polite -- when true (the default), a key scheduled in the future is
            not popped; a ``reactor.callLater`` to ``self.serveNext`` is set
            up (unless ``self.timer`` is still active) and None is returned.
            When false the scheduled time is not consulted.

        Returns None when ``pldQueue`` is empty or, in polite mode, its head
        is not yet due. Returns a ``RobotsRequest`` for the host's
        ``/robots.txt`` when ``self.allowAll`` is false and
        ``reppy.findRobot`` gives nothing or an expired entry (the queued
        request is only peeked and stays queued). Otherwise returns the
        request popped from the key's queue. Returned requests carry
        ``_originalKey`` and are registered through ``Counter.put``.
        '''
        while True:
            # Peek and pop under one hold of pld_lock so the key we decide
            # on is the key we take.
            with self.pld_lock:
                pld, when = self.pldQueue.peek(withscores=True)
                if not pld:
                    return None
                # If the next-fetchable is too soon, wait. If we're
                # already waiting, don't schedule a double callLater.
                now = time.time()
                if polite and when > now:
                    with self.twi_lock:
                        if not (self.timer and self.timer.active()):
                            logger.debug('Waiting %f seconds on %s', when - now, pld)
                            self.timer = reactor.callLater(when - now, self.serveNext)
                    return None
                # We don't need to wait, but something else might be
                # waiting: only clear the timer if it is not holding
                # some other pending call.
                with self.twi_lock:
                    if not (self.timer and self.timer.active()):
                        self.timer = None
                # We know the time has passed (we peeked) so pop it.
                pld = self.pldQueue.pop()

            q = qr.Queue(pld)
            with self.req_lock:
                if not len(q):
                    # The key's queue is empty. Errors raised in this
                    # section are logged, never propagated.
                    try:
                        if Counter.len(self.r, pld) == 0:
                            logger.debug('Calling onEmptyQueue for %s', pld)
                            self.onEmptyQueue(pld)
                            try:
                                with self.pld_lock:
                                    self.pldQueue.clear_ph(pld)
                            except ValueError:
                                logger.error('pldQueue.clear_ph failed for %s', pld)
                        else:
                            # Try again in 20 seconds and see whether the
                            # last request has finished.
                            with self.pld_lock:
                                self.pldQueue.push_unique(pld, time.time() + 20)
                            logger.debug('Requests still in flight for %s. Waiting', pld)
                    except Exception:
                        logger.exception('onEmptyQueue failed for %s', pld)
                    continue

                # Parallel requests for this key are saturated: revisit it
                # in 20 seconds and try the next key instead.
                # NOTE(review): when polite is false nothing stops this key
                # from being popped again at once -- confirm callers rely
                # on that.
                if Counter.len(self.r, pld) >= self.maxParallelRequests:
                    logger.debug('maxParallelRequests exceeded for %s', pld)
                    with self.pld_lock:
                        self.pldQueue.push_unique(pld, time.time() + 20)
                    continue

                # Robots rules must be present and unexpired before the
                # queued request itself may be handed out.
                request = q.peek()
                domain = urlparse.urlparse(request.url).netloc
                robot = reppy.findRobot('http://' + domain)
                if not self.allowAll and (not robot or robot.expired):
                    logger.debug('Making robots request for %s', pld)
                    robots_request = RobotsRequest('http://' + domain + '/robots.txt')
                    robots_request._originalKey = pld
                    # Count it as in flight. The key is not re-pushed onto
                    # pldQueue here (NOTE(review): presumably done when the
                    # robots request completes -- confirm).
                    Counter.put(self.r, robots_request)
                    return robots_request

                logger.debug('Popping next request from %s', pld)
                request = q.pop()
                # Record the scheduling key on the request. Redirects may
                # change request.url to a different hostname; bookkeeping
                # on completion has to use the original key, otherwise
                # pldQueue slowly drains.
                request._originalKey = pld
                # Count it as in flight.
                Counter.put(self.r, request)
                # Schedule the next request to this key.
                with self.pld_lock:
                    self.pldQueue.push_unique(pld, time.time() + self.crawlDelay(request))
                return request
Example #3
0
 def pop(self, polite=True):
     '''Get the next request to issue, or None if nothing is ready.

     Each pass of the loop takes the earliest-scheduled key from
     ``self.pldQueue`` and tries to produce a request from the
     ``qr.Queue`` of that name.

     polite -- when true (the default), a key scheduled in the future is
         not popped; a ``reactor.callLater`` to ``self.serveNext`` is set
         up (unless ``self.timer`` is still active) and None is returned.
         When false the scheduled time is not consulted.

     Returns None when ``pldQueue`` is empty or, in polite mode, its head
     is not yet due. Returns a ``RobotsRequest`` for the host's
     ``/robots.txt`` when ``self.allowAll`` is false and
     ``reppy.findRobot`` gives nothing or an expired entry (the queued
     request is only peeked and stays queued). Otherwise returns the
     request popped from the key's queue. Returned requests carry
     ``_originalKey`` and are registered through ``Counter.put``.
     '''
     while True:
         # Get the next pld we might want to fetch from
         pld, when = self.pldQueue.peek(withscores=True)
         if not pld:
             return None
         # Read the clock on every pass: keys are re-pushed with fresh
         # time.time() scores below, so a timestamp taken once before
         # the loop would be stale after a `continue`.
         now = time.time()
         # If the next-fetchable is not soon enough, then wait. If a
         # call is already pending, don't schedule a second one.
         if polite and when > now:
             with self.tlock:
                 if not (self.timer and self.timer.active()):
                     logger.debug('Waiting %f seconds on %s', when - now, pld)
                     self.timer = reactor.callLater(when - now, self.serveNext)
             return None

         # Go ahead and pop this item
         pld = self.pldQueue.pop()
         # Only forget the timer when it is not holding a pending call;
         # dropping an active one would let a later pop() schedule a
         # second callLater.
         with self.tlock:
             if not (self.timer and self.timer.active()):
                 self.timer = None
         q = qr.Queue(pld)

         with self.lock:
             if not len(q):
                 # The key's queue is empty. Errors raised in this
                 # section are logged, never propagated.
                 try:
                     if Counter.len(self.r, pld) == 0:
                         logger.debug('Calling onEmptyQueue for %s', pld)
                         self.onEmptyQueue(pld)
                     else:
                         # Try again in 20 seconds and see whether the
                         # last request has finished.
                         self.pldQueue.push(pld, time.time() + 20)
                         logger.debug('Requests still in flight for %s. Waiting', pld)
                 except Exception:
                     logger.exception('onEmptyQueue failed for %s', pld)
                 continue

             # If we've already saturated our parallel requests, revisit
             # this key in 20 seconds and try the next key instead.
             if Counter.len(self.r, pld) >= self.maxParallelRequests:
                 self.pldQueue.push(pld, time.time() + 20)
                 continue

             # If the robots for this particular request is not fetched
             # or it's expired, then we'll have to make a request for it
             request = q.peek()
             domain = urlparse.urlparse(request.url).netloc
             robot = reppy.findRobot('http://' + domain)
             if not self.allowAll and (not robot or robot.expired):
                 logger.debug('Making robots request for %s', pld)
                 robots_request = RobotsRequest('http://' + domain + '/robots.txt')
                 robots_request._originalKey = pld
                 # Increment the number of requests we currently have in flight
                 Counter.put(self.r, robots_request)
                 return robots_request

             logger.debug('Popping next request from %s', pld)
             request = q.pop()
             # Record the scheduling key on the request. Redirects may
             # change request.url to a different hostname; bookkeeping on
             # completion has to use the original key, otherwise pldQueue
             # slowly drains.
             request._originalKey = pld
             # Increment the number of requests we currently have in flight
             Counter.put(self.r, request)
             # At this point, we should also schedule the next request
             # to this domain.
             self.pldQueue.push(pld, time.time() + self.crawlDelay(request))
             return request