class HTTPClient:
    """Simple urllib2-based HTTP client with retries, request throttling and shared cookies."""

    global_cookies_jar = CookieJar()

    def __init__(self, proxy=None, no_proxy=False, repeats_on_failure=5,
                 average_wait_between_requests=0, max_wait_between_requests=0,
                 timeout=60):
        self.repeats_on_failure = repeats_on_failure
        self.proxy = proxy
        self.no_proxy = no_proxy
        self.opener = self.build_opener()
        self.last_request_time = 0
        self.average_wait_between_requests = average_wait_between_requests
        self.max_wait_between_requests = max_wait_between_requests
        self.sleeper = Sleeper()
        self.timeout = timeout

    class Error(Exception):
        pass

    class AdditionalHeaders(urllib2.BaseHandler):
        # Adds browser-like headers to every outgoing request
        def http_request(self, req):
            req.add_header(
                'User-Agent',
                'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0'
            )
            req.add_header(
                'Accept',
                'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
            )
            req.add_header('Accept-Language', 'en-us,en;q=0.5')
            req.add_header('Accept-Encoding', 'gzip, deflate')
            return req

        https_request = http_request

    def build_opener(self):
        handlers = [
            HTTPClient.AdditionalHeaders,
            urllib2.HTTPHandler,
        ]
        if self.no_proxy:
            handlers.append(urllib2.ProxyHandler({}))
        elif self.proxy:
            handlers.append(
                urllib2.ProxyHandler({
                    'http': self.proxy,
                    'https': self.proxy
                }))
        handlers.extend([
            HTTPEquivProcessor,
            urllib2.HTTPCookieProcessor(HTTPClient.global_cookies_jar)
        ])
        opener = urllib2.build_opener(*handlers)
        return opener

    def _get_random_interval(self, mid, maximum):
        # Sample a delay from a Gaussian centred on mid, rejecting values outside [0, maximum]
        while True:
            r = random.gauss(mid, (maximum - mid) / 3)
            if 0 <= r <= maximum:
                return r

    def _delay(self, url):
        # Throttling is disabled unless both wait parameters are set
        if not self.average_wait_between_requests or not self.max_wait_between_requests:
            return
        pause = self._get_random_interval(self.average_wait_between_requests,
                                          self.max_wait_between_requests)
        time_from_last_request = time() - self.last_request_time
        pause = pause - time_from_last_request
        if pause > 0:
            logging.debug('Waiting %f seconds to %s' % (pause, url))
            self.sleeper.sleep(pause)
        self.last_request_time = time()

    def open_url(self, url, post_args=None, resume=None, refer_url=None):
        if post_args:
            post_args = bytes(urlencode(post_args), 'UTF-8')
        retries = self.repeats_on_failure
        res = None
        req = urllib2.Request(url, post_args)
        if refer_url:
            req.add_header('Referer', refer_url)
        self._delay(url)
        while retries:
            try:
                # with Timeout(timeout+1):  # sometimes socket timeout is not working
                res = self.opener.open(req, timeout=self.timeout)
                break
            except (IOError, urllib2.HTTPError, BadStatusLine, IncompleteRead,
                    socket.timeout) as e:
                if isinstance(e, urllib2.HTTPError) and hasattr(e, 'code') \
                        and str(e.code) == '404':
                    # res is still None here; the error body is available on the exception
                    logging.warn('Url %s not found\n%s', url, e.read())
                    raise NotFound('Url %s not found' % url)
                pause = self._get_random_interval(
                    self.average_wait_between_requests,
                    self.max_wait_between_requests)
                logging.warn(
                    'IO or HTTPError (%s) while trying to get url %s, will retry in %f secs'
                    % (str(e), url, pause))
                retries -= 1
                self.sleeper.sleep(pause)
                self.last_request_time = time()
        if not res:
            raise HTTPClient.Error('Cannot load resource %s' % url)
        return res

    def save_file(self, url, filename, post_args=None, resume=True, refer_url=None):
        res = self.open_url(url, post_args, resume, refer_url)
        directory = os.path.split(filename)[0]
        if directory and not os.path.exists(directory):
            os.makedirs(directory)
        with open(filename, 'wb') as f:
            while True:
                chunk = res.read(1048576)  # copy in 1 MiB chunks
                if not chunk:
                    break
                f.write(chunk)

    def load_page(self, url, post_args=None):
        res = self.open_url(url, post_args)
        data = decode_data(res)
        logging.debug('Loaded page from url %s' % url)
        pg = BeautifulSoup(data, PARSER)
        return pg

    def stop(self):
        self.sleeper.stop()
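
# Usage sketch (illustrative, not part of the original module): the URL, path and
# parameter values below are placeholders chosen for the example; Sleeper,
# HTTPEquivProcessor, decode_data, PARSER, NotFound etc. are assumed to be defined
# or imported elsewhere in this module.
if __name__ == '__main__':
    client = HTTPClient(average_wait_between_requests=2,
                        max_wait_between_requests=5,
                        timeout=30)
    page = client.load_page('http://example.com/')  # parsed BeautifulSoup document
    print(page.title)
    client.save_file('http://example.com/data.bin', '/tmp/example/data.bin')
    client.stop()  # stops the Sleeper helper (project-specific behaviour assumed)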