def push(self, request):
    '''Append a request to the queue for its key and count it as remaining.

    The queue key is whatever ``self.getKey(request)`` returns. When that
    queue holds nothing at the moment of the push, the key is first handed
    to ``self.pldQueue.push_init`` together with the current time.

    Always returns 1.
    '''
    domain_key = self.getKey(request)
    pending = qr.Queue(domain_key)
    with self.req_lock:
        # A queue that is empty right now gets its key registered in the
        # pld queue before the request is added.
        queue_is_empty = len(pending) == 0
        if queue_is_empty:
            with self.pld_lock:
                self.pldQueue.push_init(domain_key, time.time())
        pending.push(request)
    self.remaining = self.remaining + 1
    return 1
def setUp(self):
    '''Start every test from an empty 'qrtestqueue' queue.'''
    # Drop whatever an earlier run left under this key
    # (`r` is presumably a module-level redis client -- confirm).
    r.delete('qrtestqueue')
    self.q = qr.Queue(key='qrtestqueue')
    # assertEqual is the supported name; the assertEquals alias is
    # deprecated and no longer exists on Python 3.12+.
    self.assertEqual(len(self.q), 0)
def __init__(self, key, host='120.95.132.153', port=6379):
    '''Open the qr queue stored under ``key``.

    ``host`` and ``port`` are forwarded to ``qr.Queue``. Their defaults
    are the values that used to be hard-coded here, so callers passing
    only ``key`` get the same queue as before.
    '''
    self.queue = qr.Queue(key, host=host, port=port)
    self.key = key
def pop(self, polite=True):
    '''Get the next request.

    Peeks at the head of ``self.pldQueue`` and, depending on what it
    finds, returns one of:

    * ``None`` when the pld queue yields no key, or when ``polite`` is
      true and the head key is not yet due (in which case a call to
      ``self.serveNext`` is scheduled with ``reactor.callLater`` unless
      a timer is already active);
    * a ``RobotsRequest`` for the domain's ``/robots.txt`` when
      ``self.allowAll`` is false and reppy has no robot for the domain
      or the one it has is expired;
    * otherwise the request popped from the key's queue.

    Keys whose queue is empty, or that already have
    ``self.maxParallelRequests`` requests in flight, do not produce a
    return value; the loop moves on to the next key.
    '''
    while True:
        # First, we pop the next thing in pldQueue *if* it's not a
        # premature fetch (and a race condition is not detected).
        with self.pld_lock:
            # Get the next plds we might want to fetch from
            next, when = self.pldQueue.peek(withscores=True)
            if not next:
                # logger.debug('Nothing in pldQueue.')
                return None
            # If the next-fetchable is too soon, wait. If we're
            # already waiting, don't schedule a double callLater.
            now = time.time()
            if polite and when > now:
                with self.twi_lock:
                    if not (self.timer and self.timer.active()):
                        logger.debug('Waiting %f seconds on %s' % (when - now, next))
                        self.timer = reactor.callLater(
                            when - now, self.serveNext)
                return None
            # If we get here, we don't need to wait. However, the
            # multithreaded nature of Twisted means that something
            # else might be waiting. Only clear timer if it's not
            # holding some other pending call.
            with self.twi_lock:
                if not (self.timer and self.timer.active()):
                    self.timer = None
            # We know the time has passed (we peeked) so pop it.
            next = self.pldQueue.pop()
        # Get the queue pertaining to the PLD of interest and
        # acquire a request lock for it.
        q = qr.Queue(next)
        with self.req_lock:
            if len(q):
                # If we've already saturated our parallel requests, then we'll
                # wait some short amount of time before we make our next request.
                # There is logic elsewhere so that if one of these requests
                # completes before this small amount of time elapses, then it
                # will be advanced accordingly.
                if Counter.len(self.r, next) >= self.maxParallelRequests:
                    logger.debug('maxParallelRequests exceeded for %s' % next)
                    # Put the key back, due again 20 seconds from now
                    with self.pld_lock:
                        self.pldQueue.push_unique(next, time.time() + 20)
                    continue
                # If the robots for this particular request is not fetched
                # or it's expired, then we'll have to make a request for it
                v = q.peek()
                domain = urlparse.urlparse(v.url).netloc
                robot = reppy.findRobot('http://' + domain)
                if not self.allowAll and (not robot or robot.expired):
                    logger.debug('Making robots request for %s' % next)
                    r = RobotsRequest('http://' + domain + '/robots.txt')
                    r._originalKey = next
                    # Increment the number of requests we currently have in flight
                    Counter.put(self.r, r)
                    # NOTE(review): this branch returns without pushing
                    # `next` back into pldQueue -- presumably the robots
                    # response handling reschedules it; confirm.
                    return r
                else:
                    logger.debug('Popping next request from %s' % next)
                    v = q.pop()
                    # This was the source of a rather difficult-to-track bug
                    # wherein the pld queue would slowly drain, despite there
                    # being plenty of logical queues to draw from. The problem
                    # was introduced by calling urlparse.urljoin when invoking
                    # the request's onURL method. As a result, certain redirects
                    # were making changes to the url, saving it as an updated
                    # value, but we'd then try to pop off the queue for the new
                    # hostname, when in reality, we should pop off the queue
                    # for the original hostname.
                    v._originalKey = next
                    # Increment the number of requests we currently have in flight
                    Counter.put(self.r, v)
                    # At this point, we should also schedule the next request
                    # to this domain.
                    with self.pld_lock:
                        self.pldQueue.push_unique(
                            next, time.time() + self.crawlDelay(v))
                    return v
            else:
                # The queue for this key is empty.
                try:
                    if Counter.len(self.r, next) == 0:
                        # Nothing queued and nothing in flight for this key
                        logger.debug('Calling onEmptyQueue for %s' % next)
                        self.onEmptyQueue(next)
                        try:
                            with self.pld_lock:
                                self.pldQueue.clear_ph(next)
                        except ValueError:
                            logger.error(
                                'pldQueue.clear_ph failed for %s' % next)
                    else:
                        # Otherwise, we should try again in a little bit, and
                        # see if the last request has finished.
                        with self.pld_lock:
                            self.pldQueue.push_unique(
                                next, time.time() + 20)
                        logger.debug(
                            'Requests still in flight for %s. Waiting' % next)
                except Exception:
                    # Log and keep going so one bad key does not stop the loop
                    logger.exception('onEmptyQueue failed for %s' % next)
                continue
    # NOTE(review): the loop above has no `break`, so these two lines look
    # unreachable -- every path through it returns or continues.
    logger.debug('Returning None (should not happen).')
    return None
def trim(self, request, trim):
    '''Trim the queue that ``request`` belongs to.

    The queue is looked up through ``self.getKey(request)`` and ``trim``
    is passed straight to that queue's own ``trim`` method. Everything
    happens while holding ``self.req_lock``.
    '''
    with self.req_lock:
        # Find the list for this request, then trim it
        domain_key = self.getKey(request)
        domain_queue = qr.Queue(domain_key)
        domain_queue.trim(trim)
def __init__(self, poolSize=10, agent=None, stopWhenDone=False,
             delay=2, allowAll=False, use_lock=None, **kwargs):
    '''Set up locks, the pld queue, the redis handle and bookkeeping.

    ``poolSize``, ``agent`` and ``stopWhenDone`` go to
    ``BaseFetcher.__init__``. ``delay`` is stored as a float.
    ``allowAll`` is stored as given; ``pop`` skips its robots.txt
    request when it is true. ``use_lock``, when truthy, is used as the
    filename prefix for two ``DownpourLock`` locks; otherwise in-process
    ``threading.RLock`` objects are used. Remaining keyword arguments
    are forwarded to ``PLDQueue``, ``redis.Redis`` and ``qr.Queue``.
    '''
    # The parent constructor runs before anything else
    BaseFetcher.__init__(self, poolSize, agent, stopWhenDone)

    # One lock guards the pldQueue and one guards all request queues
    # collectively. The latter is coarser than strictly necessary, but
    # far simpler than keeping a lock per queue.
    if not use_lock:
        self.pld_lock = threading.RLock()
        self.req_lock = threading.RLock()
    else:
        # DownpourLock relies on *NIX-only features, so it is imported
        # only when a lock name was actually requested.
        import DownpourLock
        self.pld_lock = DownpourLock.DownpourLock("%s_pld.lock" % use_lock)
        self.req_lock = DownpourLock.DownpourLock("%s_req.lock" % use_lock)
    # Lock around the Twisted reactor timer
    self.twi_lock = threading.RLock()

    # Priority queue of plds
    self.pldQueue = PLDQueue('plds', **kwargs)
    self.r = redis.Redis(**kwargs)

    # Every domain queue already in redis gets an entry in the pld
    # queue, and its length counts towards the urls still to fetch.
    # The lengths are requested through a pipeline so they come back
    # as one list, which is then summed.
    with self.r.pipeline() as pipe:
        for domain_key in self.r.keys('domain:*'):
            with self.pld_lock:
                self.pldQueue.push_init(domain_key, 0)
            pipe.llen(domain_key)
        queue_lengths = pipe.execute()
        self.remaining = sum(queue_lengths)

    # Pushing key names back into the priority queue has been
    # problematic; failed keys are set aside here to be retried later,
    # e.g. when the next request finishes.
    self.retries = []

    # Queue for incoming requests
    self.requests = qr.Queue('request', **kwargs)
    self.delay = float(delay)

    # Timer used when a delay must be imposed before servicing the
    # next available request.
    with self.twi_lock:
        self.timer = None

    # Lets callers ignore allow/disallow directives, for example when
    # they are checked somewhere else.
    self.allowAll = allowAll
    self.userAgentString = reppy.getUserAgentString(self.agent)
# NOTE(review): this span opens with an orphaned copy of the tail of a
# `pop` implementation; its beginning is not part of this span, so it
# cannot be parsed on its own. It is preserved verbatim below, as
# commented text, rather than dropped.
#
#     # see if the last request has finished.
#     with self.pld_lock:
#         self.pldQueue.push_unique(
#             next, time.time() + 20)
#     logger.debug(
#         'Requests still in flight for %s. Waiting' % next)
#     except Exception:
#         logger.exception('onEmptyQueue failed for %s' % next)
#     continue
#     logger.debug('Returning None (should not happen).')
#     return None

# Script entry point: queue one BaseRequest per line of urls.txt, then
# start a PoliteFetcher constructed with a first argument of 100.
if __name__ == '__main__':
    import logging
    from downpour import BaseRequest

    # Turn on logging
    logger.setLevel(logging.DEBUG)

    # NOTE(review): this pushes to the 'requests' key; confirm that is the
    # key the fetcher actually reads from.
    q = qr.Queue('requests')
    # open() instead of file(): file() only exists on Python 2, while
    # open() behaves the same here on both Python 2 and 3.
    with open('urls.txt') as f:
        for line in f:
            q.push(BaseRequest(line.strip()))

    p = PoliteFetcher(100)
    p.start()