Example 1
    def __init__(self, poolSize=10, agent=None, stopWhenDone=False,
        delay=2, allowAll=False, use_lock=None, **kwargs):

        # First, call the parent constructor
        BaseFetcher.__init__(self, poolSize, agent, stopWhenDone)

        # Import DownpourLock only if use_lock specified, because it uses
        # *NIX-specific features. We use one lock for the pldQueue and one
        # for all the request queues collectively. The latter is a tad
        # overly restrictive, but is far easier than managing hundreds
        # of locks for hundreds of queues.
        if use_lock:
            import DownpourLock
            self.pld_lock = DownpourLock.DownpourLock("%s_pld.lock" % use_lock)
            self.req_lock = DownpourLock.DownpourLock("%s_req.lock" % use_lock)
        else:
            self.pld_lock = threading.RLock()
            self.req_lock = threading.RLock()
        self.twi_lock = threading.RLock()  # Twisted reactor lock

        # Include a priority queue of plds
        self.pldQueue = PLDQueue('plds', **kwargs)
        # Make sure that there is an entry in the plds for
        # each domain waiting to be fetched. Also, include
        # the number of urls from each domain in the count
        # of remaining urls to be fetched.
        self.r = redis.Redis(**kwargs)
        # Redis has a pipeline feature that allows for bulk
        # requests, the result of which is a list of the
        # result of each individual request. Thus, only get
        # the length of each of the queues in the pipeline
        # as we're just going to set remaining to the sum
        # of the lengths of each of the domain queues.
        with self.r.pipeline() as p:
            for key in self.r.keys('domain:*'):
                with self.pld_lock:
                    self.pldQueue.push_init(key, 0)
                p.llen(key)
            self.remaining = sum(p.execute())
        # For whatever reason, pushing key names back into the
        # priority queue has been problematic. As such, we'll
        # set them aside as they fail, and then retry them at
        # some point. Like when the next request finishes.
        self.retries = []
        # Now make a queue for incoming requests
        self.requests = qr.Queue('request', **kwargs)
        self.delay = float(delay)
        # This is used when we have to impose a delay before
        # servicing the next available request.
        with self.twi_lock:
            self.timer = None
        # This is a way to ignore the allow/disallow directives
        # For example, if you're checking for allow in other places
        self.allowAll = allowAll
        self.userAgentString = reppy.getUserAgentString(self.agent)
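
The real DownpourLock is not shown in these examples. As a rough illustration of why it is *NIX-specific, here is a minimal sketch of a file-backed exclusive lock built on fcntl.flock; the FileLock name and the overall approach are assumptions made for illustration, not the actual DownpourLock implementation.

import fcntl
import os

class FileLock(object):
    """Illustrative stand-in for DownpourLock: an exclusive flock on a file.

    fcntl.flock is only available on *NIX, which is why the constructor
    above falls back to threading.RLock when use_lock is not given.
    """
    def __init__(self, path):
        self.path = path
        self.fd = os.open(path, os.O_CREAT | os.O_RDWR)

    def acquire(self):
        fcntl.flock(self.fd, fcntl.LOCK_EX)

    def release(self):
        fcntl.flock(self.fd, fcntl.LOCK_UN)

    def __enter__(self):
        self.acquire()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.release()

# Used the same way as the locks in the constructor above:
#     lock = FileLock('example_pld.lock')
#     with lock:
#         ...critical section...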
Example 2
 def grow(self, upto=10000):
     # Drain up to `upto` items from the incoming request queue,
     # pushing each into its per-domain queue.
     count = 0
     t = time.time()
     r = self.requests.pop()
     while r and count < upto:
         count += self.push(r) or 0
         r = self.requests.pop()
     logger.debug('Grew by %i' % count)
     return BaseFetcher.grew(self, count)
Example 3
 def grow(self, upto=10000):
     # Same as Example 2, but each pop from the shared request queue is
     # guarded by req_lock so concurrent growers do not race.
     count = 0
     t = time.time()
     with self.req_lock:
         r = self.requests.pop()
     while r and count < upto:
         count += self.push(r) or 0
         with self.req_lock:
             r = self.requests.pop()
     logger.debug('Grew by %i' % count)
     return BaseFetcher.grew(self, count)
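
The only difference from Example 2 is that every pop from the shared request queue happens under req_lock. Below is a minimal, self-contained sketch of that locked-pop pattern, using an in-memory deque as a stand-in for the Redis-backed qr.Queue; all names here are illustrative, not part of downpour.

import threading
from collections import deque

requests = deque(range(100))      # stand-in for qr.Queue('request')
req_lock = threading.RLock()

def locked_pop():
    # Mirrors `with self.req_lock: r = self.requests.pop()` above.
    with req_lock:
        return requests.popleft() if requests else None

def grow(upto=10000):
    count = 0
    r = locked_pop()
    while r is not None and count < upto:
        count += 1                # stand-in for `self.push(r) or 0`
        r = locked_pop()
    return count

threads = [threading.Thread(target=grow) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(len(requests))              # 0: the queue has been fully drained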
Example 4
 def __init__(self, poolSize=10, agent=None, stopWhenDone=False, 
     delay=2, allowAll=False, **kwargs):
     
     # Call the parent constructor
     BaseFetcher.__init__(self, poolSize, agent, stopWhenDone)
     # Include a priority queue of plds
     self.pldQueue = qr.PriorityQueue('plds', **kwargs)
     # Make sure that there is an entry in the plds for
     # each domain waiting to be fetched. Also, include
     # the number of urls from each domain in the count
     # of remaining urls to be fetched.
     self.r = redis.Redis(**kwargs)
     # Redis has a pipeline feature that allows for bulk
     # requests, the result of which is a list of the 
     # result of each individual request. Thus, only get
     # the length of each of the queues in the pipeline
     # as we're just going to set remaining to the sum
     # of the lengths of each of the domain queues.
     with self.r.pipeline() as p:
         for key in self.r.keys('domain:*'):
             self.pldQueue.push(key, 0)
             p.llen(key)
         self.remaining = sum(p.execute())
     # For whatever reason, pushing key names back into the 
     # priority queue has been problematic. As such, we'll
     # set them aside as they fail, and then retry them at
     # some point. Like when the next request finishes.
     self.retries = []
     # Now make a queue for incoming requests
     self.requests = qr.Queue('request', **kwargs)
     self.delay = float(delay)
     # This is used when we have to impose a delay before
     # servicing the next available request.
     self.timer = None
     # This is a way to ignore the allow/disallow directives
     # For example, if you're checking for allow in other places
     self.allowAll = allowAll
     self.userAgentString = reppy.getUserAgentString(self.agent)
     self.lock  = threading.RLock()
     self.tlock = threading.RLock()
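
These constructors use threading.RLock rather than threading.Lock, presumably so that a thread already holding one of the locks can call into another method that takes the same lock without deadlocking. A tiny illustration of that reentrancy (the function names are made up):

import threading

lock = threading.RLock()

def inner():
    # Re-acquiring the lock from the same thread is fine with an RLock;
    # with a plain threading.Lock this nested acquire would deadlock.
    with lock:
        return 'inner'

def outer():
    with lock:
        return inner()

print(outer())  # prints 'inner'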
Example 5
#! /usr/bin/env python

import logging
from downpour import logger
from downpour.test import run, host
from downpour.test import ExpectRequest
from downpour.test import ExamineRequest
from downpour import BaseFetcher, BaseRequest

logger.setLevel(logging.CRITICAL)

fetcher = BaseFetcher(stopWhenDone=True)

# Try a plain-and-simple request
fetcher.push(
    ExpectRequest('200 Test',
                  host + 'asis/ok.asis',
                  expectHeaders={
                      'content-type': ['text/html'],
                      'content-length': ['11']
                  },
                  expectStatus=('HTTP/1.1', '200', 'OK'),
                  expectURL=host + 'asis/ok.asis',
                  expectSuccess='Hello world'))

# Try a redirect request, making sure we get
# every url we expect to get.
fetcher.push(
    ExpectRequest(
        '301 Redirect Test',
        host + 'asis/301_to_ok.asis',
Example 6
    def __init__(self,
                 poolSize=10,
                 agent=None,
                 stopWhenDone=False,
                 delay=2,
                 allowAll=False,
                 use_lock=None,
                 **kwargs):

        # First, call the parent constructor
        BaseFetcher.__init__(self, poolSize, agent, stopWhenDone)

        # Import DownpourLock only if use_lock specified, because it uses
        # *NIX-specific features. We use one lock for the pldQueue and one
        # for all the request queues collectively. The latter is a tad
        # overly restrictive, but is far easier than managing hundreds
        # of locks for hundreds of queues.
        if use_lock:
            import DownpourLock
            self.pld_lock = DownpourLock.DownpourLock("%s_pld.lock" % use_lock)
            self.req_lock = DownpourLock.DownpourLock("%s_req.lock" % use_lock)
        else:
            self.pld_lock = threading.RLock()
            self.req_lock = threading.RLock()
        self.twi_lock = threading.RLock()  # Twisted reactor lock

        # Include a priority queue of plds
        self.pldQueue = PLDQueue('plds', **kwargs)
        # Make sure that there is an entry in the plds for
        # each domain waiting to be fetched. Also, include
        # the number of urls from each domain in the count
        # of remaining urls to be fetched.
        self.r = redis.Redis(**kwargs)
        # Redis has a pipeline feature that allows for bulk
        # requests, the result of which is a list of the
        # result of each individual request. Thus, only get
        # the length of each of the queues in the pipeline
        # as we're just going to set remaining to the sum
        # of the lengths of each of the domain queues.
        with self.r.pipeline() as p:
            for key in self.r.keys('domain:*'):
                with self.pld_lock:
                    self.pldQueue.push_init(key, 0)
                p.llen(key)
            self.remaining = sum(p.execute())
        # For whatever reason, pushing key names back into the
        # priority queue has been problematic. As such, we'll
        # set them aside as they fail, and then retry them at
        # some point. Like when the next request finishes.
        self.retries = []
        # Now make a queue for incoming requests
        self.requests = qr.Queue('request', **kwargs)
        self.delay = float(delay)
        # This is used when we have to impose a delay before
        # servicing the next available request.
        with self.twi_lock:
            self.timer = None
        # This is a way to ignore the allow/disallow directives
        # For example, if you're checking for allow in other places
        self.allowAll = allowAll
        self.userAgentString = reppy.getUserAgentString(self.agent)
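
The pipeline block above queues one LLEN per per-domain key and sums the replies returned by a single execute() call. Here is a self-contained sketch of the same pattern against a local Redis instance; the sample keys and URLs are made up, and a fresh database with no other 'domain:*' keys is assumed.

import redis

r = redis.Redis()

# Seed a couple of per-domain queues so the pattern has something to count.
r.rpush('domain:example.com', 'http://example.com/a', 'http://example.com/b')
r.rpush('domain:example.org', 'http://example.org/')

# Queue one LLEN per key, then sum the list of replies from execute().
with r.pipeline() as p:
    for key in r.keys('domain:*'):
        p.llen(key)
    remaining = sum(p.execute())

print(remaining)  # 3 with the sample data above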
Example 7
#! /usr/bin/env python

import logging
from downpour import logger
from downpour.test import run, host
from downpour.test import ExpectRequest
from downpour.test import ExamineRequest
from downpour import BaseFetcher, BaseRequest

logger.setLevel(logging.CRITICAL)

fetcher = BaseFetcher(stopWhenDone=True)

# Try a plain-and-simple request
fetcher.push(ExpectRequest('200 Test', host + 'asis/ok.asis',
    expectHeaders={
        'content-type': ['text/html'],
        'content-length': ['11']
    },
    expectStatus=('HTTP/1.1', '200', 'OK'),
    expectURL=host + 'asis/ok.asis',
    expectSuccess='Hello world'))

# Try a redirect request, making sure we get
# every url we expect to get.
fetcher.push(ExpectRequest('301 Redirect Test', host + 'asis/301_to_ok.asis',
    expectURL=[
        host + 'asis/301_to_ok.asis',
        host + 'asis/ok.asis'
    ]))

fetcher.push(ExpectRequest('302 Redirect Test', host + 'asis/302_to_ok.asis',
    expectURL=[
        host + 'asis/302_to_ok.asis',
Example 8
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
# 
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

from downpour import BaseRequest, BaseFetcher, logger
import logging

logger.setLevel(logging.DEBUG)

# Read in a set of urls to fetch
with open('urls.txt') as f:
    reqs = [BaseRequest(u) for u in f.read().strip().split('\n')]

# Now start it!
BaseFetcher(100, reqs).start()