Example #1
def __init__(self,
             proxy=None,
             no_proxy=False,
             repeats_on_failure=5,
             average_wait_between_requests=0,
             max_wait_between_requests=0,
             timeout=60):
    self.repeats_on_failure = repeats_on_failure
    self.proxy = proxy
    self.no_proxy = no_proxy
    self.opener = self.build_opener()
    self.last_request_time = 0
    self.average_wait_between_requests = average_wait_between_requests
    self.max_wait_between_requests = max_wait_between_requests
    self.sleeper = Sleeper()
    self.timeout = timeout
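The constructor in this example only records configuration: how many times a failed request is retried (repeats_on_failure), optional proxy settings, the average and maximum pause inserted between requests, and a socket timeout. A minimal instantiation sketch follows, assuming the full HTTPClient class shown in Example #3 below; the module name httpclient is hypothetical and is not given anywhere in these examples.

# Hypothetical import path; the examples do not reveal the real module name.
from httpclient import HTTPClient

# Route traffic through a local proxy, retry failed requests three times,
# and pause roughly 2 s (never more than 5 s) between consecutive requests.
client = HTTPClient(proxy='http://127.0.0.1:8080',
                    repeats_on_failure=3,
                    average_wait_between_requests=2,
                    max_wait_between_requests=5,
                    timeout=30)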
Example #2
def __init__(self, proxy=None, no_proxy=False, repeats_on_failure=5,
             average_wait_between_requests=0, max_wait_between_requests=0, timeout=60):
    self.repeats_on_failure = repeats_on_failure
    self.proxy = proxy
    self.no_proxy = no_proxy
    self.opener = self.build_opener()
    self.last_request_time = 0
    self.average_wait_between_requests = average_wait_between_requests
    self.max_wait_between_requests = max_wait_between_requests
    self.sleeper = Sleeper()
    self.timeout = timeout
Example #3
class HTTPClient:
    global_cookies_jar = CookieJar()

    def __init__(self,
                 proxy=None,
                 no_proxy=False,
                 repeats_on_failure=5,
                 average_wait_between_requests=0,
                 max_wait_between_requests=0,
                 timeout=60):
        self.repeats_on_failure = repeats_on_failure
        self.proxy = proxy
        self.no_proxy = no_proxy
        self.opener = self.build_opener()
        self.last_request_time = 0
        self.average_wait_between_requests = average_wait_between_requests
        self.max_wait_between_requests = max_wait_between_requests
        self.sleeper = Sleeper()
        self.timeout = timeout

    class Error(Exception):
        pass

    class AdditionalHeaders(urllib2.BaseHandler):
        def http_request(self, req):
            req.add_header(
                'User-Agent',
                'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0'
            )
            req.add_header(
                'Accept',
                'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
            )
            req.add_header('Accept-Language', 'en-us,en;q=0.5')
            req.add_header('Accept-Encoding', 'gzip, deflate')
            return req

        https_request = http_request

    def build_opener(self):
        handlers = [
            HTTPClient.AdditionalHeaders,
            urllib2.HTTPHandler,
        ]
        if self.no_proxy:
            handlers.append(urllib2.ProxyHandler({}))
        elif self.proxy:
            handlers.append(
                urllib2.ProxyHandler({
                    'http': self.proxy,
                    'https': self.proxy
                }))
        handlers.extend([
            HTTPEquivProcessor,
            urllib2.HTTPCookieProcessor(HTTPClient.global_cookies_jar)
        ])
        opener = urllib2.build_opener(*handlers)
        return opener

    def _get_random_interval(self, mid, max):
        # Sample from a Gaussian centred on mid, rejecting values outside [0, max]
        while True:
            r = random.gauss(mid, (max - mid) / 3)
            if 0 <= r <= max:
                return r

    def _delay(self, url):
        if not self.average_wait_between_requests or not self.max_wait_between_requests:
            return
        pause = self._get_random_interval(self.average_wait_between_requests,
                                          self.max_wait_between_requests)
        time_from_last_request = time() - self.last_request_time
        pause = pause - time_from_last_request
        if pause > 0:
            logging.debug('Waiting %f seconds to %s' % (pause, url))
            self.sleeper.sleep(pause)
        self.last_request_time = time()

    def open_url(self, url, post_args=None, resume=None, refer_url=None):
        if post_args:
            post_args = bytes(urlencode(post_args), 'UTF-8')
        retries = self.repeats_on_failure
        res = None
        req = urllib2.Request(url, post_args)
        if refer_url:
            req.add_header('Referer', refer_url)
        self._delay(url)
        while retries:
            try:
                #with Timeout(timeout+1) : # sometimes socket timeout is not working
                res = self.opener.open(req, timeout=self.timeout)
                break
            except (IOError, urllib2.HTTPError, BadStatusLine, IncompleteRead,
                    socket.timeout) as e:
                if isinstance(e, urllib2.HTTPError) and hasattr(
                        e, 'code') and str(e.code) == '404':
                    logging.warn('Url %s not found\n%s', url, e.read())
                    raise NotFound('Url %s not found' % url)
                pause = self._get_random_interval(
                    self.average_wait_between_requests,
                    self.max_wait_between_requests)
                logging.warn(
                    'IO or HTTPError (%s) while trying to get url %s, will retry in %f secs'
                    % (str(e), url, pause))
                retries -= 1
                self.sleeper.sleep(pause)
                self.last_request_time = time()
        if not res:
            raise HTTPClient.Error('Cannot load resource %s' % url)
        return res

    def save_file(self,
                  url,
                  filename,
                  post_args=None,
                  resume=True,
                  refer_url=None):
        res = self.open_url(url, post_args, resume, refer_url)
        p, _ = os.path.split(filename)
        if p and not os.path.exists(p):
            os.makedirs(p)
        with open(filename, 'wb') as f:
            # Stream the response to disk in 1 MiB chunks
            while True:
                r = res.read(1048576)
                if not r:
                    break
                f.write(r)

    def load_page(self, url, post_args=None):
        res = self.open_url(url, post_args)
        data = decode_data(res)
        logging.debug('Loaded page from url %s' % url)
        pg = BeautifulSoup(data, PARSER)
        return pg

    def stop(self):
        self.sleeper.stop()
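Example #3 is the complete client: all instances share one CookieJar, every request is sent with browser-like headers, requests are paced with a randomized delay, and each fetch is wrapped in a retry loop that raises HTTPClient.Error after repeats_on_failure failures. A short usage sketch, assuming the project-specific helpers referenced above (Sleeper, HTTPEquivProcessor, decode_data, PARSER, NotFound) are importable exactly as in the original project:

client = HTTPClient(repeats_on_failure=3,
                    average_wait_between_requests=1,
                    max_wait_between_requests=3)
try:
    # load_page returns a BeautifulSoup document parsed with PARSER
    page = client.load_page('http://example.com/index.html')
    print(page.title.string)
    # save_file streams the response body to disk in 1 MiB chunks
    client.save_file('http://example.com/data.bin', '/tmp/downloads/data.bin')
finally:
    client.stop()  # delegates to Sleeper.stop(), presumably interrupting any pending pause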
Example #4
class HTTPClient:
    global_cookies_jar = CookieJar()

    def __init__(self, proxy=None, no_proxy=False, repeats_on_failure=5,
                 average_wait_between_requests=0, max_wait_between_requests=0, timeout=60):
        self.repeats_on_failure = repeats_on_failure
        self.proxy = proxy
        self.no_proxy = no_proxy
        self.opener = self.build_opener()
        self.last_request_time = 0
        self.average_wait_between_requests = average_wait_between_requests
        self.max_wait_between_requests = max_wait_between_requests
        self.sleeper = Sleeper()
        self.timeout = timeout
    
    class Error(Exception): pass
    
    class AdditionalHeaders(urllib2.BaseHandler):
        def http_request(self, req):
            req.add_header('User-Agent', 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:12.0) Gecko/20100101 Firefox/12.0')
            req.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
            req.add_header('Accept-Language', 'en-us,en;q=0.5')
            req.add_header('Accept-Encoding', 'gzip, deflate')
            return req
        https_request = http_request
    
    def build_opener(self):
        handlers = [HTTPClient.AdditionalHeaders, urllib2.HTTPHandler]
        if self.no_proxy:
            handlers.append(urllib2.ProxyHandler({}))
        elif self.proxy:
            handlers.append(urllib2.ProxyHandler({'http': self.proxy, 'https': self.proxy}))
        handlers.extend([HTTPEquivProcessor, urllib2.HTTPCookieProcessor(HTTPClient.global_cookies_jar)])
        opener = urllib2.build_opener(*handlers)
        return opener

    def _get_random_interval(self, mid, max):
        # Sample from a Gaussian centred on mid, rejecting values outside [0, max]
        while True:
            r = random.gauss(mid, (max - mid) / 3)
            if 0 <= r <= max:
                return r

    def _delay(self, url):
        if not self.average_wait_between_requests or not self.max_wait_between_requests:
            return
        pause = self._get_random_interval(self.average_wait_between_requests, self.max_wait_between_requests)
        time_from_last_request = time() - self.last_request_time
        pause = pause - time_from_last_request
        if pause > 0:
            logging.debug('Waiting %f seconds to %s' % (pause, url))
            self.sleeper.sleep(pause)
        self.last_request_time = time()
        
    def open_url(self, url, post_args=None, resume=None, refer_url=None):
        if post_args:
            post_args = bytes(urlencode(post_args), 'UTF-8')
        retries = self.repeats_on_failure
        res = None
        req = urllib2.Request(url, post_args)
        if refer_url:
            req.add_header('Referer', refer_url)
        self._delay(url)
        while retries:
            try:
                #with Timeout(timeout+1) : # sometimes socket timeout is not working
                res = self.opener.open(req, timeout=self.timeout)
                break
            except (IOError, urllib2.HTTPError, BadStatusLine, IncompleteRead, socket.timeout) as e:
                if isinstance(e, urllib2.HTTPError) and hasattr(e, 'code') and str(e.code) == '404':
                    logging.warn('Url %s not found\n%s', url, e.read())
                    raise NotFound('Url %s not found' % url)
                pause = self._get_random_interval(self.average_wait_between_requests, self.max_wait_between_requests)
                logging.warn('IO or HTTPError (%s) while trying to get url %s, will retry in %f secs' % (str(e), url, pause))
                retries -= 1
                self.sleeper.sleep(pause)
                self.last_request_time = time()
        if not res:
            raise HTTPClient.Error('Cannot load resource %s' % url)
        return res
    
    def save_file(self, url, filename, post_args=None, resume=True, refer_url=None):
        res = self.open_url(url, post_args, resume, refer_url)
        p, _ = os.path.split(filename)
        if p and not os.path.exists(p):
            os.makedirs(p)
        with open(filename, 'wb') as f:
            # Stream the response to disk in 1 MiB chunks
            while True:
                r = res.read(1048576)
                if not r:
                    break
                f.write(r)
        
    def load_page(self, url, post_args=None):
        res = self.open_url(url, post_args)
        data = decode_data(res)
        logging.debug('Loaded page from url %s' % url)
        pg = BeautifulSoup(data, PARSER)
        return pg
    
    def stop(self):
        self.sleeper.stop()