def allowed(url, useragent, headers=None, meta_robots=None):
    '''Determine whether `useragent` may index `url`.

    Combines three signals, any one of which can forbid indexing:

      1. robots.txt (looked up through reppy),
      2. X-Robots-Tag response headers,
      3. meta robots directives.

    The headers must be a dictionary mapping of string key to list value:
        { 'Content-Type': ['...'], 'X-Powered-By': ['...', '...'] }

    The meta robots must be provided as a mapping of string key to directives:
        { 'robots': 'index, follow', 'foobot': 'index, nofollow' }

    Returns True only if every applicable signal permits indexing.
    '''
    # First, check robots.txt. If no robot object is known for this url,
    # err on the side of allowing.
    robot = reppy.findRobot(url)
    allowed = (robot is None) or robot.allowed(url, useragent)

    # Next, check the X-Robots-Tag. There can be multiple instances of
    # X-Robots-Tag in the headers. Each can be bot-specific, in the format:
    #     botname : directive
    # In the absence of a botname, it applies to all bots. Strictly speaking,
    # there is one directive that itself has a value (unavailable_after), but
    # as we're not interested in it, it's ok to ignore it.
    if headers:
        for tag in headers.get('x-robots-tag', []):
            botname, _, directive = tag.partition(':')
            # strip(): the documented format allows spaces around the colon
            if directive and botname.strip() == useragent:
                # This is when it applies just to us
                allowed = allowed and ('noindex' not in directive) and (
                    'none' not in directive)
            else:
                # This is when it applies to all bots (no botname prefix,
                # so the whole tag is the directive)
                allowed = allowed and ('noindex' not in botname) and (
                    'none' not in botname)

    # Now check for specific and general meta tags. A bot-specific entry
    # overrides the general 'robots' entry, as intended; previously the two
    # strings were concatenated, which made it impossible for a specific
    # 'index' to override a general 'noindex'.
    if meta_robots:
        directives = meta_robots.get(useragent)
        if directives is None:
            directives = meta_robots.get('robots', '')
        allowed = allowed and ('noindex' not in directives) and (
            'none' not in directives)
    return allowed
def allowed(url, useragent, headers=None, meta_robots=None):
    '''Determine whether `useragent` may index `url`.

    Three checks are applied; any one of them can forbid indexing:

      1. robots.txt (via reppy),
      2. X-Robots-Tag response headers,
      3. meta robots directives.

    The headers must be a dictionary mapping of string key to list value:
        { 'Content-Type': ['...'], 'X-Powered-By': ['...', '...'] }

    The meta robots must be provided as a mapping of string key to directives:
        { 'robots': 'index, follow', 'foobot': 'index, nofollow' }

    Returns True only when all applicable checks pass.
    '''
    # First, check robots.txt; with no cached robot for this url we allow.
    r = reppy.findRobot(url)
    allowed = (r is None) or r.allowed(url, useragent)

    # Next, check the X-Robots-Tag headers. There can be multiple instances
    # of X-Robots-Tag. Each can be specific, in which case it's in the format:
    #     botname : directive
    # In the absence of a botname, it applies to all bots. Strictly speaking,
    # the unavailable_after directive itself has a value, but as we're not
    # interested in it, it's ok to ignore it.
    if headers:
        for bot in headers.get('x-robots-tag', []):
            botname, sep, directive = bot.partition(':')
            # Strip because the documented format has spaces around ':'
            if directive and botname.strip() == useragent:
                # Applies just to us
                allowed = allowed and ('noindex' not in directive) and (
                    'none' not in directive)
            else:
                # Applies to all bots; the entire tag is the directive
                allowed = allowed and ('noindex' not in botname) and (
                    'none' not in botname)

    # Now check meta robots. Specific meta tags override general meta robots:
    # fall back to the general 'robots' entry only when there is no entry for
    # this particular useragent (concatenating both, as before, let a general
    # 'noindex' defeat a specific 'index').
    if meta_robots:
        s = meta_robots.get(useragent)
        if s is None:
            s = meta_robots.get('robots', '')
        allowed = allowed and ('noindex' not in s) and ('none' not in s)
    return allowed
def pop(self, polite=True):
    '''Return the next request to service, or None if nothing is ready.'''
    while True:
        # Peek at the head of the PLD queue under the lock so a premature
        # fetch (or a detected race) can bail out cleanly.
        with self.pld_lock:
            key, when = self.pldQueue.peek(withscores=True)
            if not key:
                return None
            now = time.time()
            # Not yet fetchable: schedule a single wake-up callback, never
            # a second one while the first is still pending.
            if polite and when > now:
                with self.twi_lock:
                    if not (self.timer and self.timer.active()):
                        logger.debug('Waiting %f seconds on %s' % (
                            when - now, key))
                        self.timer = reactor.callLater(when - now,
                                                       self.serveNext)
                return None
            # No need to wait. Another Twisted thread may still be waiting,
            # though, so only discard the timer when it is not holding some
            # other pending call.
            with self.twi_lock:
                if not (self.timer and self.timer.active()):
                    self.timer = None
            # The peek above showed the time has passed, so take it.
            key = self.pldQueue.pop()
        # Work with the logical queue for this PLD under the request lock.
        queue = qr.Queue(key)
        with self.req_lock:
            if not len(queue):
                # Nothing queued. Either the PLD is fully drained, or
                # requests are still in flight and we check back shortly.
                try:
                    if Counter.len(self.r, key) == 0:
                        logger.debug('Calling onEmptyQueue for %s' % key)
                        self.onEmptyQueue(key)
                        try:
                            with self.pld_lock:
                                self.pldQueue.clear_ph(key)
                        except ValueError:
                            logger.error(
                                'pldQueue.clear_ph failed for %s' % key)
                    else:
                        with self.pld_lock:
                            self.pldQueue.push_unique(key, time.time() + 20)
                        logger.debug(
                            'Requests still in flight for %s. Waiting' % key)
                except Exception:
                    logger.exception('onEmptyQueue failed for %s' % key)
                continue
            # Saturated on parallel requests? Re-queue and retry shortly; a
            # request completing elsewhere will advance this sooner.
            if Counter.len(self.r, key) >= self.maxParallelRequests:
                logger.debug('maxParallelRequests exceeded for %s' % key)
                with self.pld_lock:
                    self.pldQueue.push_unique(key, time.time() + 20)
                continue
            # Without a fresh robots.txt for this domain, fetch that first.
            head = queue.peek()
            domain = urlparse.urlparse(head.url).netloc
            robot = reppy.findRobot('http://' + domain)
            if not self.allowAll and (not robot or robot.expired):
                logger.debug('Making robots request for %s' % key)
                robots_request = RobotsRequest(
                    'http://' + domain + '/robots.txt')
                robots_request._originalKey = key
                # Count this as an in-flight request.
                Counter.put(self.r, robots_request)
                return robots_request
            logger.debug('Popping next request from %s' % key)
            request = queue.pop()
            # Remember the original hostname: redirects may rewrite the url,
            # but bookkeeping must stay keyed on the queue we popped from
            # (this once caused the pld queue to slowly drain).
            request._originalKey = key
            # Count this as an in-flight request.
            Counter.put(self.r, request)
            # Book the next visit to this domain after its crawl delay.
            with self.pld_lock:
                self.pldQueue.push_unique(
                    key, time.time() + self.crawlDelay(request))
            return request
    logger.debug('Returning None (should not happen).')
    return None
def pop(self, polite=True):
    '''Get the next request, or None when nothing is currently servable.'''
    while True:
        with self.pld_lock:
            # Look at the soonest PLD without removing it yet.
            pld, when = self.pldQueue.peek(withscores=True)
            if not pld:
                return None
            now = time.time()
            if polite and when > now:
                # Too early to fetch politely. Arrange exactly one deferred
                # call back into serveNext (no double callLater).
                with self.twi_lock:
                    if not (self.timer and self.timer.active()):
                        logger.debug('Waiting %f seconds on %s' % (
                            when - now, pld))
                        self.timer = reactor.callLater(
                            when - now, self.serveNext)
                return None
            # Clear the timer only if no other pending call owns it --
            # Twisted's threading means one may have been scheduled
            # concurrently.
            with self.twi_lock:
                if not (self.timer and self.timer.active()):
                    self.timer = None
            # Safe to remove: the peeked timestamp is in the past.
            pld = self.pldQueue.pop()
        q = qr.Queue(pld)
        with self.req_lock:
            if len(q):
                if Counter.len(self.r, pld) >= self.maxParallelRequests:
                    # Too many in flight already; retry in a bit. A finishing
                    # request elsewhere will advance this earlier if possible.
                    logger.debug('maxParallelRequests exceeded for %s' % pld)
                    with self.pld_lock:
                        self.pldQueue.push_unique(pld, time.time() + 20)
                    continue
                nxt = q.peek()
                host = urlparse.urlparse(nxt.url).netloc
                robot = reppy.findRobot('http://' + host)
                if not self.allowAll and (not robot or robot.expired):
                    # Missing or stale robots.txt -- fetch it before
                    # anything else from this domain.
                    logger.debug('Making robots request for %s' % pld)
                    robot_req = RobotsRequest('http://' + host + '/robots.txt')
                    robot_req._originalKey = pld
                    # One more request in flight.
                    Counter.put(self.r, robot_req)
                    return robot_req
                else:
                    logger.debug('Popping next request from %s' % pld)
                    req = q.pop()
                    # Redirects can rewrite req.url to another hostname; keep
                    # the key of the queue we actually popped from so the
                    # in-flight accounting drains the right logical queue.
                    req._originalKey = pld
                    # One more request in flight.
                    Counter.put(self.r, req)
                    # Book the next slot for this domain per its crawl delay.
                    with self.pld_lock:
                        self.pldQueue.push_unique(
                            pld, time.time() + self.crawlDelay(req))
                    return req
            else:
                try:
                    if Counter.len(self.r, pld) == 0:
                        logger.debug('Calling onEmptyQueue for %s' % pld)
                        self.onEmptyQueue(pld)
                        try:
                            with self.pld_lock:
                                self.pldQueue.clear_ph(pld)
                        except ValueError:
                            logger.error(
                                'pldQueue.clear_ph failed for %s' % pld)
                    else:
                        # Still waiting on outstanding requests; poll again
                        # shortly to see if the last one has finished.
                        with self.pld_lock:
                            self.pldQueue.push_unique(pld, time.time() + 20)
                        logger.debug(
                            'Requests still in flight for %s. Waiting' % pld)
                except Exception:
                    logger.exception('onEmptyQueue failed for %s' % pld)
                continue
    logger.debug('Returning None (should not happen).')
    return None
def pop(self, polite=True):
    '''Get the next request, or None when nothing is ready yet.

    Pops the soonest PLD off the queue and returns either a RobotsRequest
    (when the domain's robots.txt is missing or expired) or the next queued
    request for that domain. When the soonest entry is not yet fetchable and
    `polite` is set, schedules a reactor callback and returns None instead.
    '''
    while True:
        # Get the next pld we might want to fetch from
        key, when = self.pldQueue.peek(withscores=True)
        if not key:
            return None
        # Recompute 'now' on every pass: a previous iteration may have
        # re-queued an item via `continue`, and comparing against a stale
        # timestamp taken before the loop would let too-early fetches
        # through the politeness check.
        now = time.time()
        # If the next-fetchable is not soon enough, then wait
        if polite and when > now:
            with self.tlock:
                if not (self.timer and self.timer.active()):
                    logger.debug('Waiting %f seconds on %s' % (
                        when - now, key))
                    self.timer = reactor.callLater(when - now, self.serveNext)
            return None
        # Go ahead and pop this item; the peek showed its time has passed
        key = self.pldQueue.pop()
        # Unset the timer
        self.timer = None
        q = qr.Queue(key)
        with self.lock:
            if len(q):
                # If we've already saturated our parallel requests, then
                # we'll wait some short amount of time before we make our
                # next request. There is logic elsewhere so that if one of
                # these requests completes before this small amount of time
                # elapses, then it will be advanced accordingly.
                if Counter.len(self.r, key) >= self.maxParallelRequests:
                    self.pldQueue.push(key, time.time() + 20)
                    continue
                # If the robots for this particular request is not fetched
                # or it's expired, then we'll have to make a request for it
                v = q.peek()
                domain = urlparse.urlparse(v.url).netloc
                robot = reppy.findRobot('http://' + domain)
                if not self.allowAll and (not robot or robot.expired):
                    logger.debug('Making robots request for %s' % key)
                    r = RobotsRequest('http://' + domain + '/robots.txt')
                    r._originalKey = key
                    # Increment the number of requests currently in flight
                    Counter.put(self.r, r)
                    return r
                logger.debug('Popping next request from %s' % key)
                v = q.pop()
                # Keep the original hostname: redirects may update v.url to a
                # new host, but bookkeeping must pop from the queue for the
                # original hostname (source of a hard-to-track drain bug).
                v._originalKey = key
                # Increment the number of requests currently in flight
                Counter.put(self.r, v)
                # Schedule the next request to this domain.
                self.pldQueue.push(key, time.time() + self.crawlDelay(v))
                return v
            else:
                try:
                    if Counter.len(self.r, key) == 0:
                        logger.debug('Calling onEmptyQueue for %s' % key)
                        self.onEmptyQueue(key)
                    else:
                        # Try again in a little bit; the last request for
                        # this domain may have finished by then.
                        self.pldQueue.push(key, time.time() + 20)
                        logger.debug(
                            'Requests still in flight for %s. Waiting' % key)
                except Exception:
                    logger.exception('onEmptyQueue failed for %s' % key)
                continue
    return None