Example #1
    def __init__(self,
                 concurrent_requests=128,
                 download_delay=0,
                 download_timeout=5,
                 retry_on_timeout=False,
                 queue_size=1024):
        """
        Crawler engine, the brain of this crawler.

        :param concurrent_requests: maximum number of requests handled simultaneously
        :param download_delay: delay between two download batches, defaults to 0
        :param download_timeout: timeout for a single download
        :param retry_on_timeout: when True, requests that fail with a timeout are retried
        :param queue_size: maximum size of the requests and responses queues
        """
        self.logger = logging.getLogger(__name__)
        self.status = False
        self.concurrent_requests = concurrent_requests
        self.download_delay = download_delay
        self.engine_idle_timeout = 1.5 * download_timeout
        self.download_timeout = download_timeout
        self.retry_on_download_timeout = retry_on_timeout
        self._requests_queue = Queue(queue_size)
        self._responses_queue = Queue(queue_size)
        self._spiders = {}

        # filter duplicate requests in the queue; a scalable Bloom filter is used instead of a set so memory stays bounded
        self._seen = pybloom.ScalableBloomFilter()
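
A minimal sketch of how such an engine might use the _seen filter to drop duplicate requests before they enter the queue; the add_request method and the request.url attribute are assumptions, not part of the original example:

    def add_request(self, request):
        # hypothetical helper: enqueue a request only if its URL has not been seen yet
        if request.url in self._seen:
            self.logger.debug("duplicate request dropped: %s", request.url)
            return False
        self._seen.add(request.url)           # may report rare false positives, never false negatives
        self._requests_queue.put(request)     # hand the request over to the download loop
        return True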
Example #2
def domain_grab(urls, http_obj=None, pool_size=10, retries=5, proxy=None, delay=10, debug=True, queue_links=UberIterator()):
	if isinstance(urls, basestring):
		if '\n' in urls:
			urls = [url.strip() for url in urls.split('\n') if len(url.strip())]
		else:
			urls = [urls]
	domains = {urlparse.urlparse(url).netloc for url in urls}
	queue_links += urls
	seen_links = pybloom.ScalableBloomFilter(initial_capacity=100, error_rate=0.001, mode=pybloom.ScalableBloomFilter.SMALL_SET_GROWTH)
	# add each starting url individually (adding the whole list would only store one useless entry)
	for url in urls:
		seen_links.add(url)
	while queue_links:
		if debug:
			progress_counter = 0
			progress_total = len(queue_links)

		for page in multi_grab(queue_links,http_obj=http_obj,pool_size=pool_size,retries=retries,proxy=proxy,delay=delay):
			if debug:
				progress_counter += 1
				print 'Got %s, Link %s/%s (%s%%)' % (page.final_url,progress_counter,progress_total,int((float(progress_counter)/progress_total)*100))
			if urlparse.urlparse(page.final_url).netloc in domains:
				new_links = {link for link in page.internal_links() if link not in seen_links and link.lower().split('.')[-1] not in ('jpg','gif','jpeg','pdf','doc','docx','ppt','txt')}
				queue_links += list(new_links)
				for link in new_links:
					seen_links.add(link)
				yield page

		if debug:
			print 'Seen Links: %s' %  len(seen_links)
			print 'Bloom Capacity: %s' % seen_links.capacity
			print 'Links in Queue: %s' % len(queue_links)
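
A hypothetical call site for domain_grab (the starting URL and the index_page step are placeholders; multi_grab and UberIterator come from the surrounding codebase):

for page in domain_grab('http://example.com/', pool_size=20, retries=3):
	index_page(page)  # placeholder for whatever should happen with each crawled page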
Example #3
 def __init__(self, name=None):
     if name and not name.endswith('.bloom'):
         name += '.bloom'
     self.name = name
     self.add_counter = 0
     try:
         self.bloom = pybloom.ScalableBloomFilter.fromfile(
             open(self.name, 'rb'))
     except Exception:  # no saved filter could be loaded (missing file or no name): start fresh
         self.bloom = pybloom.ScalableBloomFilter(
             initial_capacity=100,
             error_rate=0.001,
             mode=pybloom.ScalableBloomFilter.SMALL_SET_GROWTH)
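
The example only loads the filter from disk; a minimal sketch of the matching write-back step using pybloom's tofile (the save method name is an assumption):

 def save(self):
     # hypothetical counterpart to fromfile above: persist the filter if a file name was given
     if self.name:
         with open(self.name, 'wb') as f:
             self.bloom.tofile(f)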
Example #4
    def __init__(self,
                 black_patterns=(CONFIG_URLPATTERN_ALL, ),
                 white_patterns=("^http", ),
                 capacity=None):
        """
        constructor
        """
        self.re_black_list = [
            re.compile(_pattern, flags=re.IGNORECASE)
            for _pattern in black_patterns
        ]
        self.re_white_list = [
            re.compile(_pattern, flags=re.IGNORECASE)
            for _pattern in white_patterns
        ]

        self.url_set = set() if not capacity else None
        self.bloom_filter = pybloom.ScalableBloomFilter(
            capacity, error_rate=0.001) if capacity else None
        return
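
A minimal sketch of how this filter might be queried; the method name and the precedence between the white and black lists are assumptions, not part of the original class:

    def is_new_and_allowed(self, url):
        # hypothetical helper: accept a url only if it matches a white pattern,
        # matches no black pattern, and has not been recorded before
        if not any(p.search(url) for p in self.re_white_list):
            return False
        if any(p.search(url) for p in self.re_black_list):
            return False
        container = self.url_set if self.url_set is not None else self.bloom_filter
        if url in container:
            return False
        container.add(url)
        return True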
Example #5
 def __init__(self,
              is_link_interesting,
              gui=False,
              timeout=5,
              **browser_kwargs):
     '''
     is_link_interesting(a_href, a_text): a function that looks at a link
                                          text and target url, and returns
                                          True if the crawler should follow
                                          the link
     gui: True if you want to see the crawler
     timeout: how long to wait for the url to load and for its JS to execute
     browser_kwargs: these are passed directly to the spynner module
     '''
     self.timeout = timeout
     self.is_link_interesting = is_link_interesting
     # Setup the browser
     self.download_dir_tmp = tempfile.mkdtemp(prefix='crawler_')
     browser_config = {
         'debug_level': spynner.WARNING,
         'download_directory': self.download_dir_tmp,
         'user_agent': ('Mozilla/5.0 (compatible; MSIE 9.0;'
                        ' Windows NT 6.1; Trident/5.0)'),
     }
     browser_config.update(browser_kwargs)
     self.browser = spynner.browser.Browser(**browser_config)  # pass the merged config (defaults plus overrides)
     self.browser.set_html_parser(pyquery.PyQuery)
     if gui:
         self.browser.create_webview()
         self.browser.show()
     # Create the bloom filter
     self.bloom_filter = pybloom.ScalableBloomFilter()
     # Create the queue
     self.queue = Queue.Queue()
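
A minimal sketch of how the queue and the Bloom filter above could work together; the _enqueue_if_new name is an assumption:

 def _enqueue_if_new(self, url):
     # hypothetical helper: queue each url at most once; the Bloom filter remembers
     # everything that has already been queued
     if url in self.bloom_filter:
         return
     self.bloom_filter.add(url)
     self.queue.put(url)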
Example #6
 def __init__(self, start_items=10000, err_rate=0.0001):
     # use the constructor arguments instead of the undefined name 'err' and the magic mode value 4
     self.bloom = pybloom.ScalableBloomFilter(start_items, err_rate,
                                              pybloom.ScalableBloomFilter.LARGE_SET_GROWTH)
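
For reference, a self-contained sketch of the deduplication pattern the examples above share; the sample URLs are placeholders:

import pybloom

seen = pybloom.ScalableBloomFilter(initial_capacity=1000, error_rate=0.001,
                                   mode=pybloom.ScalableBloomFilter.SMALL_SET_GROWTH)

for url in ('http://example.com/a', 'http://example.com/b', 'http://example.com/a'):
    if url in seen:   # membership may be a rare false positive, never a false negative
        continue      # treat as already seen
    seen.add(url)
    print(url)        # only previously unseen urls reach this point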