def __init__(self, cache=None, cache_file=None, read_cache=True, write_cache=True, use_network=True,
        user_agent=None, timeout=30, delay=5, proxies=None, proxy_file=None, max_proxy_errors=5,
        opener=None, headers=None, data=None, num_retries=0, num_redirects=1,
        force_html=False, force_ascii=False, max_size=None, default='', pattern=None):
    """
    `cache' is a pdict object to use for the cache
    `cache_file' sets the filename to store cached data
    `read_cache' sets whether to read from the cache
    `write_cache' sets whether to write to the cache
    `use_network' sets whether to download content not in the cache
    `user_agent' sets the User Agent to download content with
    `timeout' is the maximum amount of time to wait for a http response
    `delay' is the minimum amount of time (in seconds) to wait after downloading content from a domain per proxy
    `proxy_file' is a filename to read proxies from
    `max_proxy_errors' is the maximum number of consecutive errors allowed per proxy before discarding it
        an error is only counted if another proxy is able to successfully download the URL
        set to None to disable
    `proxies' is a list of proxies to cycle through when downloading content
    `opener' sets an optional opener to use instead of using urllib2 directly
    `headers' are the headers to include in the request
    `data' is what to post at the URL
    `num_retries' sets how many times to retry downloading a URL after an error
    `num_redirects' sets how many times the URL is allowed to be redirected, to avoid an infinite loop
    `force_html' sets whether to download non-text data
    `force_ascii' sets whether to only return ascii characters
    `max_size' determines the maximum number of bytes that will be downloaded, or None to disable
    `default' is what to return when no content can be downloaded
    `pattern' is a regular expression that the downloaded HTML has to match to be considered a valid download
    """
    socket.setdefaulttimeout(timeout)
    need_cache = read_cache or write_cache
    if pdict and need_cache:
        cache_file = cache_file or settings.cache_file
        self.cache = cache or pdict.PersistentDict(cache_file)
    else:
        self.cache = None
        if need_cache:
            common.logger.info('Cache disabled because could not import pdict')

    self.settings = adt.Bag(
        read_cache = read_cache,
        write_cache = write_cache,
        use_network = use_network,
        delay = delay,
        proxies = collections.deque((common.read_list(proxy_file) if proxy_file else []) or proxies or []),
        proxy_file = proxy_file,
        max_proxy_errors = max_proxy_errors,
        user_agent = user_agent,
        opener = opener,
        headers = headers,
        data = data,
        num_retries = num_retries,
        num_redirects = num_redirects,
        force_html = force_html,
        force_ascii = force_ascii,
        max_size = max_size,
        default = default,
        pattern = pattern
    )
    self.last_load_time = self.last_mtime = time.time()
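# Usage sketch (hedged): this constructor appears to belong to the Download class used
# elsewhere in this module (see Download.proxy_performance in get() below). The import
# path below is an assumption for illustration and may differ in your checkout.
#
#     from webscraping import download
#     D = download.Download(
#         cache_file='cache.db',      # pages are stored in a pdict.PersistentDict here
#         delay=5,                    # wait at least 5 seconds per domain/proxy between requests
#         num_retries=2,              # retry a failed download twice
#         user_agent='MyCrawler/1.0'
#     )
#     # nothing is fetched yet; pages are downloaded lazily by D.get(url)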
def parse_proxy(proxy):
    """Parse a proxy into its fragments
    Returns a dict with username, password, host, and port

    >>> f = parse_proxy('login:pw@66.197.208.200:8080')
    >>> f.username
    'login'
    >>> f.password
    'pw'
    >>> f.host
    '66.197.208.200'
    >>> f.port
    '8080'
    >>> f = parse_proxy('66.197.208.200')
    >>> f.username == f.password == f.port == ''
    True
    >>> f.host
    '66.197.208.200'
    """
    fragments = adt.Bag()
    if isinstance(proxy, basestring):
        # match an optional user:password@ prefix, an IPv4 host, and an optional :port suffix
        match = re.match('((?P<username>\w+):(?P<password>\w+)@)?(?P<host>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})(:(?P<port>\d+))?', proxy)
        if match:
            groups = match.groupdict()
            fragments.username = groups.get('username') or ''
            fragments.password = groups.get('password') or ''
            fragments.host = groups.get('host')
            fragments.port = groups.get('port') or ''
    return fragments
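# Sketch of how the parsed fragments could be wired into urllib2 (an illustrative
# assumption -- the opener construction used by this module is not shown in this section):
#
#     import urllib2
#     f = parse_proxy('login:pw@66.197.208.200:8080')
#     auth = '%s:%s@' % (f.username, f.password) if f.username else ''
#     proxy_url = 'http://%s%s:%s' % (auth, f.host, f.port or '80')
#     opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy_url}))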
def __init__(self, cache=None, cache_file=None, read_cache=True, write_cache=True, use_network=True,
        user_agent=None, timeout=30, delay=5, proxies=None, proxy_file=None, max_proxy_errors=5,
        opener=None, headers=None, data=None, num_retries=0, num_redirects=1,
        force_html=False, force_ascii=False, max_size=None, default='', pattern=None, acceptable_errors=None):
    socket.setdefaulttimeout(timeout)
    need_cache = read_cache or write_cache
    if pdict and need_cache:
        cache_file = cache_file or settings.cache_file
        self.cache = cache or pdict.PersistentDict(cache_file)
    else:
        self.cache = None
        if need_cache:
            common.logger.info('Cache disabled because could not import pdict')

    self.settings = adt.Bag(
        read_cache=read_cache,
        write_cache=write_cache,
        use_network=use_network,
        delay=delay,
        proxies=(common.read_list(proxy_file) if proxy_file else []) or proxies or [],
        proxy_file=proxy_file,
        max_proxy_errors=max_proxy_errors,
        user_agent=user_agent,
        opener=opener,
        headers=headers,
        data=data,
        num_retries=num_retries,
        num_redirects=num_redirects,
        force_html=force_html,
        force_ascii=force_ascii,
        max_size=max_size,
        default=default,
        pattern=pattern,
        acceptable_errors=acceptable_errors
    )
    self.last_load_time = self.last_mtime = time.time()
    self.num_downloads = self.num_errors = 0
def __init__(self, url=None, urls=None, url_iter=None, num_threads=20, cb=None, depth=True, max_errors=None, pattern=None, **kwargs):
    self.settings = adt.Bag(
        read_cache=True,
        write_cache=True,
        num_redirects=5,
        num_retries=2,
        timeout=20,
        headers={},
        num_threads=num_threads,
        cb=cb,
        url_iter=url_iter,
        depth=depth,
        pattern=pattern
    )
    self.settings.update(**kwargs)
    self.D = download.Download(**kwargs)
    self.kwargs = kwargs
    # queue of html to be written to cache
    self.cache_queue = []
    # URLs that are waiting to download
    self.download_queue = collections.deque()
    if urls:
        self.download_queue.extend(urls)
    if url:
        self.download_queue.append(url)  # XXX create compressed dict data type for large in memory?
    # URLs currently downloading
    self.processing = {}
    # deferreds that are downloading
    self.downloading = []
    # URLs that have been found before
    self.found = adt.HashDict()
    for url in self.download_queue:
        self.found[url] = True
    self.state = download.State()
    self.max_errors = max_errors
    self.num_errors = 0  # counter for the number of consecutive errors
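# Usage sketch: this constructor appears to belong to a threaded crawler that wraps
# download.Download. The class name ThreadedCrawler is hypothetical, and the callback
# contract (return follow-up URLs to enqueue) is an assumption inferred from the
# cb/url_iter/found attributes above -- check the rest of the module for the real API.
#
#     def scrape(D, url, html):
#         # ... extract data from html ...
#         return []  # assumed contract: URLs to crawl next
#
#     crawler = ThreadedCrawler(url='http://example.com', num_threads=10, cb=scrape)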
def get(self, url, **kwargs):
    """Download this URL and return the HTML.
    By default HTML is cached so each URL only has to be downloaded once.

    url:
        what to download
    kwargs:
        override any of the arguments passed to constructor
    """
    self.reload_proxies()
    self.proxy = None  # the current proxy
    self.final_url = None  # for tracking redirects
    self.response_code = ''  # keep response code
    self.response_headers = {}  # keep response headers
    self.downloading_error = None  # keep downloading error
    self.num_downloads = self.num_errors = 0  # track the number of downloads and errors made

    # update settings with any local overrides
    settings = adt.Bag(self.settings)
    settings.update(kwargs)

    # check cache for whether this content is already downloaded
    key = self.get_key(url, settings.data)
    if self.cache and settings.read_cache:
        try:
            html = self.cache[key]
            if self.invalid_response(html, settings.pattern):
                # invalid result from download
                html = None
        except KeyError:
            pass  # have not downloaded yet
        else:
            if not html and settings.num_retries > 0:
                # try downloading again
                common.logger.debug('Redownloading')
                settings.num_retries -= 1
            else:
                # return previously downloaded content
                return html or settings.default
    if not settings.use_network:
        # only want previously cached content
        return settings.default

    html = None
    failed_proxies = set()  # record which proxies failed to download for this URL
    # attempt downloading content at URL
    while settings.num_retries >= 0 and html is None:
        settings.num_retries -= 1
        if settings.proxy:
            self.proxy = settings.proxy
        else:
            self.proxy = self.get_proxy(settings.proxies)
        # crawl slowly for each domain to reduce risk of being blocked
        self.throttle(url, delay=settings.delay, proxy=self.proxy)
        html = self.fetch(url, headers=settings.headers, data=settings.data, proxy=self.proxy,
            user_agent=settings.user_agent, opener=settings.opener, pattern=settings.pattern,
            max_size=settings.max_size)

        if html:
            # successfully downloaded
            self.num_downloads += 1
            if settings.max_proxy_errors is not None:
                Download.proxy_performance.success(self.proxy)
                # record which proxies failed for this download
                for proxy in failed_proxies:
                    if Download.proxy_performance.error(proxy) > settings.max_proxy_errors:
                        # this proxy has had too many consecutive errors so remove it
                        common.logger.warning('Removing unstable proxy from list after %d consecutive errors: %s' % (settings.max_proxy_errors, proxy))
                        settings.proxies.remove(proxy)
        else:
            # download failed - try again
            self.num_errors += 1
            failed_proxies.add(self.proxy)

    if html:
        if settings.num_redirects > 0:
            # allowed to redirect
            redirect_url = get_redirect(url=url, html=html)
            if redirect_url:
                # found a redirection
                common.logger.debug('%s redirecting to %s' % (url, redirect_url))
                settings.num_redirects -= 1
                html = self.get(redirect_url, **settings) or ''
                # make relative links absolute so they will still work after the redirect
                relative_re = re.compile('(<\s*a[^>]+href\s*=\s*["\']?)(?!http)([^"\'>]+)', re.IGNORECASE)
                try:
                    html = relative_re.sub(lambda m: m.group(1) + urlparse.urljoin(url, m.group(2)), html)
                except UnicodeDecodeError:
                    pass
        html = self._clean_content(html=html, max_size=settings.max_size,
            force_html=settings.force_html, force_ascii=settings.force_ascii)

    if self.cache and settings.write_cache:
        # cache results
        self.cache[key] = html
        if url != self.final_url:
            # cache what URL was redirected to
            self.cache.meta(key, dict(url=self.final_url))

    # return default if no content
    return html or settings.default
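# Usage sketch for get(), reusing the Download construction assumed earlier:
#
#     D = download.Download(num_retries=1)
#     html = D.get('http://example.com')   # downloaded and written to the cache
#     html = D.get('http://example.com')   # served from the cache this time
#     # per-call keyword arguments override the constructor settings, e.g. force a
#     # fresh download that is only considered valid if it matches `pattern':
#     html = D.get('http://example.com', read_cache=False, pattern='<title>')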