def allowed(url, useragent, headers=None, meta_robots=None):
    '''Determine whether `useragent` may index `url`.

    Combines three signals, any one of which can forbid indexing:

      1. robots.txt (looked up through reppy),
      2. X-Robots-Tag response headers,
      3. meta robots directives.

    The headers must be a dictionary mapping of string key to list value:
        { 'Content-Type': ['...'], 'X-Powered-By': ['...', '...'] }

    The meta robots must be provided as a mapping of string key to directives:
        { 'robots': 'index, follow', 'foobot': 'index, nofollow' }

    Returns True only if every applicable signal permits indexing.
    '''
    # First, check robots.txt. If no robot object is known for this url,
    # err on the side of allowing.
    robot = reppy.findRobot(url)
    allowed = (robot is None) or robot.allowed(url, useragent)

    # Next, check the X-Robots-Tag. There can be multiple instances of
    # X-Robots-Tag in the headers. Each can be bot-specific, in the format:
    #     botname : directive
    # In the absence of a botname, it applies to all bots. Strictly speaking,
    # there is one directive that itself has a value (unavailable_after), but
    # as we're not interested in it, it's ok to ignore it.
    if headers:
        for tag in headers.get('x-robots-tag', []):
            botname, _, directive = tag.partition(':')
            # strip(): the documented format allows spaces around the colon
            if directive and botname.strip() == useragent:
                # This is when it applies just to us
                allowed = allowed and ('noindex' not in directive) and (
                    'none' not in directive)
            else:
                # This is when it applies to all bots (no botname prefix,
                # so the whole tag is the directive)
                allowed = allowed and ('noindex' not in botname) and (
                    'none' not in botname)

    # Now check for specific and general meta tags. A bot-specific entry
    # overrides the general 'robots' entry, as intended; previously the two
    # strings were concatenated, which made it impossible for a specific
    # 'index' to override a general 'noindex'.
    if meta_robots:
        directives = meta_robots.get(useragent)
        if directives is None:
            directives = meta_robots.get('robots', '')
        allowed = allowed and ('noindex' not in directives) and (
            'none' not in directives)
    return allowed
def allowed(url, useragent, headers=None, meta_robots=None):
    '''Determine whether `useragent` may index `url`.

    Three checks are applied; any one of them can forbid indexing:

      1. robots.txt (via reppy),
      2. X-Robots-Tag response headers,
      3. meta robots directives.

    The headers must be a dictionary mapping of string key to list value:
        { 'Content-Type': ['...'], 'X-Powered-By': ['...', '...'] }

    The meta robots must be provided as a mapping of string key to directives:
        { 'robots': 'index, follow', 'foobot': 'index, nofollow' }

    Returns True only when all applicable checks pass.
    '''
    # First, check robots.txt; with no cached robot for this url we allow.
    r = reppy.findRobot(url)
    allowed = (r is None) or r.allowed(url, useragent)

    # Next, check the X-Robots-Tag headers. There can be multiple instances
    # of X-Robots-Tag. Each can be specific, in which case it's in the format:
    #     botname : directive
    # In the absence of a botname, it applies to all bots. Strictly speaking,
    # the unavailable_after directive itself has a value, but as we're not
    # interested in it, it's ok to ignore it.
    if headers:
        for bot in headers.get('x-robots-tag', []):
            botname, sep, directive = bot.partition(':')
            # Strip because the documented format has spaces around ':'
            if directive and botname.strip() == useragent:
                # Applies just to us
                allowed = allowed and ('noindex' not in directive) and (
                    'none' not in directive)
            else:
                # Applies to all bots; the entire tag is the directive
                allowed = allowed and ('noindex' not in botname) and (
                    'none' not in botname)

    # Now check meta robots. Specific meta tags override general meta robots:
    # fall back to the general 'robots' entry only when there is no entry for
    # this particular useragent (concatenating both, as before, let a general
    # 'noindex' defeat a specific 'index').
    if meta_robots:
        s = meta_robots.get(useragent)
        if s is None:
            s = meta_robots.get('robots', '')
        allowed = allowed and ('noindex' not in s) and ('none' not in s)
    return allowed
def pop(self, polite=True):
    '''Return the next request to service, or None if nothing is ready.'''
    while True:
        # Peek at the head of the PLD queue under the lock so a premature
        # fetch (or a detected race) can bail out cleanly.
        with self.pld_lock:
            key, when = self.pldQueue.peek(withscores=True)
            if not key:
                return None
            now = time.time()
            # Not yet fetchable: schedule a single wake-up callback, never
            # a second one while the first is still pending.
            if polite and when > now:
                with self.twi_lock:
                    if not (self.timer and self.timer.active()):
                        logger.debug('Waiting %f seconds on %s' % (
                            when - now, key))
                        self.timer = reactor.callLater(when - now,
                                                       self.serveNext)
                return None
            # No need to wait. Another Twisted thread may still be waiting,
            # though, so only discard the timer when it is not holding some
            # other pending call.
            with self.twi_lock:
                if not (self.timer and self.timer.active()):
                    self.timer = None
            # The peek above showed the time has passed, so take it.
            key = self.pldQueue.pop()
        # Work with the logical queue for this PLD under the request lock.
        queue = qr.Queue(key)
        with self.req_lock:
            if not len(queue):
                # Nothing queued. Either the PLD is fully drained, or
                # requests are still in flight and we check back shortly.
                try:
                    if Counter.len(self.r, key) == 0:
                        logger.debug('Calling onEmptyQueue for %s' % key)
                        self.onEmptyQueue(key)
                        try:
                            with self.pld_lock:
                                self.pldQueue.clear_ph(key)
                        except ValueError:
                            logger.error(
                                'pldQueue.clear_ph failed for %s' % key)
                    else:
                        with self.pld_lock:
                            self.pldQueue.push_unique(key, time.time() + 20)
                        logger.debug(
                            'Requests still in flight for %s. Waiting' % key)
                except Exception:
                    logger.exception('onEmptyQueue failed for %s' % key)
                continue
            # Saturated on parallel requests? Re-queue and retry shortly; a
            # request completing elsewhere will advance this sooner.
            if Counter.len(self.r, key) >= self.maxParallelRequests:
                logger.debug('maxParallelRequests exceeded for %s' % key)
                with self.pld_lock:
                    self.pldQueue.push_unique(key, time.time() + 20)
                continue
            # Without a fresh robots.txt for this domain, fetch that first.
            head = queue.peek()
            domain = urlparse.urlparse(head.url).netloc
            robot = reppy.findRobot('http://' + domain)
            if not self.allowAll and (not robot or robot.expired):
                logger.debug('Making robots request for %s' % key)
                robots_request = RobotsRequest(
                    'http://' + domain + '/robots.txt')
                robots_request._originalKey = key
                # Count this as an in-flight request.
                Counter.put(self.r, robots_request)
                return robots_request
            logger.debug('Popping next request from %s' % key)
            request = queue.pop()
            # Remember the original hostname: redirects may rewrite the url,
            # but bookkeeping must stay keyed on the queue we popped from
            # (this once caused the pld queue to slowly drain).
            request._originalKey = key
            # Count this as an in-flight request.
            Counter.put(self.r, request)
            # Book the next visit to this domain after its crawl delay.
            with self.pld_lock:
                self.pldQueue.push_unique(
                    key, time.time() + self.crawlDelay(request))
            return request
    logger.debug('Returning None (should not happen).')
    return None
def pop(self, polite=True):
    '''Get the next request, or None when nothing is currently servable.'''
    while True:
        with self.pld_lock:
            # Look at the soonest PLD without removing it yet.
            pld, when = self.pldQueue.peek(withscores=True)
            if not pld:
                return None
            now = time.time()
            if polite and when > now:
                # Too early to fetch politely. Arrange exactly one deferred
                # call back into serveNext (no double callLater).
                with self.twi_lock:
                    if not (self.timer and self.timer.active()):
                        logger.debug('Waiting %f seconds on %s' % (
                            when - now, pld))
                        self.timer = reactor.callLater(
                            when - now, self.serveNext)
                return None
            # Clear the timer only if no other pending call owns it --
            # Twisted's threading means one may have been scheduled
            # concurrently.
            with self.twi_lock:
                if not (self.timer and self.timer.active()):
                    self.timer = None
            # Safe to remove: the peeked timestamp is in the past.
            pld = self.pldQueue.pop()
        q = qr.Queue(pld)
        with self.req_lock:
            if len(q):
                if Counter.len(self.r, pld) >= self.maxParallelRequests:
                    # Too many in flight already; retry in a bit. A finishing
                    # request elsewhere will advance this earlier if possible.
                    logger.debug('maxParallelRequests exceeded for %s' % pld)
                    with self.pld_lock:
                        self.pldQueue.push_unique(pld, time.time() + 20)
                    continue
                nxt = q.peek()
                host = urlparse.urlparse(nxt.url).netloc
                robot = reppy.findRobot('http://' + host)
                if not self.allowAll and (not robot or robot.expired):
                    # Missing or stale robots.txt -- fetch it before
                    # anything else from this domain.
                    logger.debug('Making robots request for %s' % pld)
                    robot_req = RobotsRequest('http://' + host + '/robots.txt')
                    robot_req._originalKey = pld
                    # One more request in flight.
                    Counter.put(self.r, robot_req)
                    return robot_req
                else:
                    logger.debug('Popping next request from %s' % pld)
                    req = q.pop()
                    # Redirects can rewrite req.url to another hostname; keep
                    # the key of the queue we actually popped from so the
                    # in-flight accounting drains the right logical queue.
                    req._originalKey = pld
                    # One more request in flight.
                    Counter.put(self.r, req)
                    # Book the next slot for this domain per its crawl delay.
                    with self.pld_lock:
                        self.pldQueue.push_unique(
                            pld, time.time() + self.crawlDelay(req))
                    return req
            else:
                try:
                    if Counter.len(self.r, pld) == 0:
                        logger.debug('Calling onEmptyQueue for %s' % pld)
                        self.onEmptyQueue(pld)
                        try:
                            with self.pld_lock:
                                self.pldQueue.clear_ph(pld)
                        except ValueError:
                            logger.error(
                                'pldQueue.clear_ph failed for %s' % pld)
                    else:
                        # Still waiting on outstanding requests; poll again
                        # shortly to see if the last one has finished.
                        with self.pld_lock:
                            self.pldQueue.push_unique(pld, time.time() + 20)
                        logger.debug(
                            'Requests still in flight for %s. Waiting' % pld)
                except Exception:
                    logger.exception('onEmptyQueue failed for %s' % pld)
                continue
    logger.debug('Returning None (should not happen).')
    return None
def pop(self, polite=True):
    '''Get the next request, or None when nothing is ready yet.

    Pops the soonest PLD off the queue and returns either a RobotsRequest
    (when the domain's robots.txt is missing or expired) or the next queued
    request for that domain. When the soonest entry is not yet fetchable and
    `polite` is set, schedules a reactor callback and returns None instead.
    '''
    while True:
        # Get the next pld we might want to fetch from
        key, when = self.pldQueue.peek(withscores=True)
        if not key:
            return None
        # Recompute 'now' on every pass: a previous iteration may have
        # re-queued an item via `continue`, and comparing against a stale
        # timestamp taken before the loop would let too-early fetches
        # through the politeness check.
        now = time.time()
        # If the next-fetchable is not soon enough, then wait
        if polite and when > now:
            with self.tlock:
                if not (self.timer and self.timer.active()):
                    logger.debug('Waiting %f seconds on %s' % (
                        when - now, key))
                    self.timer = reactor.callLater(when - now, self.serveNext)
            return None
        # Go ahead and pop this item; the peek showed its time has passed
        key = self.pldQueue.pop()
        # Unset the timer
        self.timer = None
        q = qr.Queue(key)
        with self.lock:
            if len(q):
                # If we've already saturated our parallel requests, then
                # we'll wait some short amount of time before we make our
                # next request. There is logic elsewhere so that if one of
                # these requests completes before this small amount of time
                # elapses, then it will be advanced accordingly.
                if Counter.len(self.r, key) >= self.maxParallelRequests:
                    self.pldQueue.push(key, time.time() + 20)
                    continue
                # If the robots for this particular request is not fetched
                # or it's expired, then we'll have to make a request for it
                v = q.peek()
                domain = urlparse.urlparse(v.url).netloc
                robot = reppy.findRobot('http://' + domain)
                if not self.allowAll and (not robot or robot.expired):
                    logger.debug('Making robots request for %s' % key)
                    r = RobotsRequest('http://' + domain + '/robots.txt')
                    r._originalKey = key
                    # Increment the number of requests currently in flight
                    Counter.put(self.r, r)
                    return r
                logger.debug('Popping next request from %s' % key)
                v = q.pop()
                # Keep the original hostname: redirects may update v.url to a
                # new host, but bookkeeping must pop from the queue for the
                # original hostname (source of a hard-to-track drain bug).
                v._originalKey = key
                # Increment the number of requests currently in flight
                Counter.put(self.r, v)
                # Schedule the next request to this domain.
                self.pldQueue.push(key, time.time() + self.crawlDelay(v))
                return v
            else:
                try:
                    if Counter.len(self.r, key) == 0:
                        logger.debug('Calling onEmptyQueue for %s' % key)
                        self.onEmptyQueue(key)
                    else:
                        # Try again in a little bit; the last request for
                        # this domain may have finished by then.
                        self.pldQueue.push(key, time.time() + 20)
                        logger.debug(
                            'Requests still in flight for %s. Waiting' % key)
                except Exception:
                    logger.exception('onEmptyQueue failed for %s' % key)
                continue
    return None