Example #1
0
    def urlopen(self,
                url,
                timeout=30,
                params=None,
                headers=None,
                opener=None,
                multipart=False,
                show_error=True):
        """Open `url` and return the response body (gzip-decoded if needed).

        Fills in default request headers, throttles per-host via `self.wait`,
        and temporarily disables hosts that produced too many failed requests.

        :param url: URL to open; quoted with a safe-character whitelist.
        :param timeout: socket timeout in seconds.
        :param params: dict (urlencoded) or raw string used as POST data;
            falsy -> GET request with no body.
        :param headers: extra request headers; defaults are filled in below.
        :param opener: optional urllib2 opener to use instead of urlopen.
        :param multipart: POST `params` as multipart/form-data.
        :param show_error: when True, log failures and return '' for a
            disabled host; when False, raise instead so the caller handles it.
        :return: response body as a byte string ('' for a disabled host).
        :raises IOError: re-raised after bookkeeping when the request fails.
        """
        url = urllib2.quote(ss(url), safe="%/:=&?~#+!$,;'@()*[]")

        if not headers: headers = {}
        if not params: params = {}

        # Fill in some headers
        parsed_url = urlparse(url)
        host = '%s%s' % (parsed_url.hostname,
                         (':' +
                          str(parsed_url.port) if parsed_url.port else ''))

        headers['Referer'] = headers.get('Referer',
                                         '%s://%s' % (parsed_url.scheme, host))
        headers['Host'] = headers.get('Host', host)
        headers['User-Agent'] = headers.get('User-Agent', self.user_agent)
        headers['Accept-encoding'] = headers.get('Accept-encoding', 'gzip')
        headers['Connection'] = headers.get('Connection', 'keep-alive')
        headers['Cache-Control'] = headers.get('Cache-Control', 'max-age=0')

        # Don't try for failed requests: hosts with too many failures are
        # disabled for 15 minutes (900 seconds).
        if self.http_failed_disabled.get(host, 0) > 0:
            if self.http_failed_disabled[host] > (time.time() - 900):
                log.info2(
                    'Disabled calls to %s for 15 minutes because so many failed requests.',
                    host)
                if not show_error:
                    # BUGFIX: interpolate host into the message; the original
                    # raised a literal '%s' placeholder.
                    raise Exception(
                        'Disabled calls to %s for 15 minutes because so many failed requests'
                        % host)
                else:
                    return ''
            else:
                # Cool-down expired: reset the failure bookkeeping.
                del self.http_failed_request[host]
                del self.http_failed_disabled[host]

        # Per-host rate limiting.
        self.wait(host)
        try:

            # Make sure opener has the correct headers
            if opener:
                opener.add_headers = headers

            if multipart:
                log.info('Opening multipart url: %s, params: %s',
                         (url, [x for x in params.iterkeys()] if isinstance(
                             params, dict) else 'with data'))
                request = urllib2.Request(url, params, headers)

                if opener:
                    opener.add_handler(MultipartPostHandler())
                else:
                    cookies = cookielib.CookieJar()
                    opener = urllib2.build_opener(
                        urllib2.HTTPCookieProcessor(cookies),
                        MultipartPostHandler)

                response = opener.open(request, timeout=timeout)
            else:
                log.info('Opening url: %s, params: %s',
                         (url, [x for x in params.iterkeys()] if isinstance(
                             params, dict) else 'with data'))

                # Raw string params are sent as-is; dicts are urlencoded.
                if isinstance(params, (str, unicode)) and len(params) > 0:
                    data = params
                else:
                    data = tryUrlencode(params) if len(params) > 0 else None

                request = urllib2.Request(url, data, headers)

                if opener:
                    response = opener.open(request, timeout=timeout)
                else:
                    response = urllib2.urlopen(request, timeout=timeout)

            # unzip if needed
            if response.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(response.read())
                f = gzip.GzipFile(fileobj=buf)
                data = f.read()
                f.close()
            else:
                data = response.read()
            response.close()

            # Success: reset the failure counter for this host.
            self.http_failed_request[host] = 0
        except IOError:
            if show_error:
                log.error('Failed opening url in %s: %s %s',
                          (self.getName(), url, traceback.format_exc(1)))

            # Save failed requests by hosts
            try:
                if not self.http_failed_request.get(host):
                    self.http_failed_request[host] = 1
                else:
                    self.http_failed_request[host] += 1

                    # Disable temporarily after >5 failures (local IPs exempt).
                    if self.http_failed_request[host] > 5 and not isLocalIP(
                            host):
                        self.http_failed_disabled[host] = time.time()

            except Exception:
                # Best-effort bookkeeping only; never mask the original error.
                log.debug('Failed logging failed requests for %s: %s',
                          (url, traceback.format_exc()))

            raise

        self.http_last_use[host] = time.time()

        return data
Example #2
0
    def urlopen(self,
                url,
                timeout=30,
                params=None,
                headers=None,
                opener=None,
                multipart=False,
                show_error=True):
        """Open `url` and return the response body (gzip-decoded if needed).

        Fills in default request headers, throttles per-host via `self.wait`,
        and temporarily disables hosts that produced too many failed requests.

        :param url: URL to open.
        :param timeout: socket timeout in seconds.
        :param params: dict of POST parameters; falsy -> GET with no body.
        :param headers: extra request headers; defaults are filled in below.
        :param opener: optional urllib2 opener to use instead of urlopen.
        :param multipart: POST `params` as multipart/form-data.
        :param show_error: when True, log failures and return '' for a
            disabled host; when False, raise instead so the caller handles it.
        :return: response body as a byte string ('' for a disabled host).
        :raises IOError: re-raised after bookkeeping when the request fails.
        """
        url = ss(url)

        if not headers: headers = {}
        if not params: params = {}

        # Parse once instead of calling urlparse(url) per header.
        host = urlparse(url).hostname

        # Fill in some headers
        headers['Referer'] = headers.get('Referer', host)
        headers['Host'] = headers.get('Host', host)
        headers['User-Agent'] = headers.get(
            'User-Agent',
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.7; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
        )
        headers['Accept-encoding'] = headers.get('Accept-encoding', 'gzip')

        # Don't try for failed requests: hosts with too many failures are
        # disabled for 15 minutes (900 seconds).
        if self.http_failed_disabled.get(host, 0) > 0:
            if self.http_failed_disabled[host] > (time.time() - 900):
                log.info2(
                    'Disabled calls to %s for 15 minutes because so many failed requests.',
                    host)
                if not show_error:
                    # BUGFIX: the original used a bare `raise` here with no
                    # active exception, which itself raises a TypeError.
                    raise Exception(
                        'Disabled calls to %s for 15 minutes because so many failed requests'
                        % host)
                else:
                    return ''
            else:
                # Cool-down expired: reset the failure bookkeeping.
                del self.http_failed_request[host]
                del self.http_failed_disabled[host]

        # Per-host rate limiting.
        self.wait(host)
        try:

            if multipart:
                log.info('Opening multipart url: %s, params: %s',
                         (url, [x for x in params.iterkeys()] if isinstance(
                             params, dict) else 'with data'))
                request = urllib2.Request(url, params, headers)

                if opener:
                    opener.add_handler(MultipartPostHandler())
                else:
                    cookies = cookielib.CookieJar()
                    opener = urllib2.build_opener(
                        urllib2.HTTPCookieProcessor(cookies),
                        MultipartPostHandler)

                response = opener.open(request, timeout=timeout)
            else:
                # BUGFIX: guard the iterkeys() call like the multipart branch
                # does, so a non-dict `params` no longer crashes the log call.
                log.info('Opening url: %s, params: %s',
                         (url, [x for x in params.iterkeys()] if isinstance(
                             params, dict) else 'with data'))
                data = tryUrlencode(params) if len(params) > 0 else None
                request = urllib2.Request(url, data, headers)

                if opener:
                    response = opener.open(request, timeout=timeout)
                else:
                    response = urllib2.urlopen(request, timeout=timeout)

            # unzip if needed
            if response.info().get('Content-Encoding') == 'gzip':
                buf = StringIO(response.read())
                f = gzip.GzipFile(fileobj=buf)
                data = f.read()
                # BUGFIX: close the gzip file (was leaked).
                f.close()
            else:
                data = response.read()
            # BUGFIX: close the response (was leaked).
            response.close()

            # Success: reset the failure counter for this host.
            self.http_failed_request[host] = 0
        except IOError:
            if show_error:
                log.error('Failed opening url in %s: %s %s',
                          (self.getName(), url, traceback.format_exc(1)))

            # Save failed requests by hosts
            try:
                if not self.http_failed_request.get(host):
                    self.http_failed_request[host] = 1
                else:
                    self.http_failed_request[host] += 1

                    # Disable temporarily after >5 failures.
                    if self.http_failed_request[host] > 5:
                        self.http_failed_disabled[host] = time.time()

            except Exception:
                # Best-effort bookkeeping only; never mask the original error.
                log.debug('Failed logging failed requests for %s: %s',
                          (url, traceback.format_exc()))

            raise

        self.http_last_use[host] = time.time()

        return data