Example #1
    def process_request(self, request, spider):
        for p in self.no_proxy_patterns:
            if p.search(request.url):
                return
        retries = request.meta.get('retry_times', None)
        # Do not override a proxy that was specified manually
        if 'proxy' in request.meta:
            if retries is None:
                return

        # When the maximum retry count is reached, access directly from the local machine, so there is always one local attempt after proxy failures.
        if retries == self.max_retry_times:
            now = time.time()
            should_sleep = self.local_interval - (now - self.local_last_use_time)
            if should_sleep > 0:
                log.msg('ProxyMiddleware: proxy failed, sleeping %s before local access' % should_sleep, log.DEBUG)
                time.sleep(should_sleep)
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme
        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return
        self._set_proxy(request, scheme)
Example #2
def get_environ_proxies(netloc):
    """Return a dict of environment proxies."""

    get_proxy = lambda k: os.environ.get(k) or os.environ.get(k.upper())

    # First check whether no_proxy is defined. If it is, check that the URL
    # we're getting isn't in the no_proxy list.
    no_proxy = get_proxy('no_proxy')

    if no_proxy:
        # We need to check whether we match here. We need to see if we match
        # the end of the netloc, both with and without the port.
        no_proxy = no_proxy.replace(' ', '').split(',')

        for host in no_proxy:
            if netloc.endswith(host) or netloc.split(':')[0].endswith(host):
                # The URL does match something in no_proxy, so we don't want
                # to apply the proxies on this URL.
                return {}

    # If the system proxy settings indicate that this URL should be bypassed,
    # don't proxy.
    if proxy_bypass(netloc):
        return {}

    # If we get here, we either didn't have no_proxy set or we're not going
    # anywhere that no_proxy applies to, and the system settings don't require
    # bypassing the proxy for the current URL.
    return getproxies()
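The no_proxy handling above matches hosts by suffix, with and without the port. A minimal sketch of how it behaves (the proxy URL and host names below are illustrative, not from any real configuration):

import os

# Illustrative environment: one HTTP proxy plus a no_proxy suffix list.
os.environ['http_proxy'] = 'http://proxy.example.com:3128'
os.environ['no_proxy'] = 'internal.example.com, localhost'

print(get_environ_proxies('api.internal.example.com:8080'))  # {} -- the netloc suffix matches no_proxy
print(get_environ_proxies('www.python.org'))                 # falls through to getproxies()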
Example #3
def using_http_proxy(url):
    """
    Return True if the url will use HTTP proxy.
    Returns False otherwise.
    """
    up = urlparse(url)
    return up.scheme.lower() in getproxies() and not proxy_bypass(up.netloc)
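using_http_proxy is a simple predicate over the environment proxy settings; for example (illustrative):

print(using_http_proxy('http://www.example.com/'))  # True only if an http proxy is configured and example.com is not bypassed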
Example #4
  def process_request(self, request, spider):
    # No proxy available, or no proxy needed (e.g. localhost)
    if request.meta.get('no_proxy', 0) >= 1:
      logging.info('proxy not needed, url=[%s]' % request.url)
      return

    # Pick a proxy for the request scheme
    parsed = urlparse_cached(request)
    scheme = parsed.scheme if parsed.scheme in self.proxy_pool else 'http'
    # This host must bypass the proxy
    if proxy_bypass(parsed.hostname):
      logging.info('proxy bypassed for hostname: %s' % parsed.hostname)
      return

    if scheme not in self.proxy_pool or len(self.proxy_pool.get(scheme)) < 1:
      self.update_proxy_pool()
    if len(self.proxy_pool[scheme]) < 1:
      logging.warning("更新代理库后仍然没有适合[%s]的代理,使用http协议替换" % scheme)
      scheme = 'http'

    # Assign the proxy
    creds, proxy_url = self.proxy_pool[scheme].pop()
    logging.debug('picked a proxy. creds=[%s] proxy_url=[%s]' % (creds, proxy_url))
    request.meta['proxy'] = proxy_url
    if creds:
      request.headers['Proxy-Authorization'] = creds
    logging.debug('proxy set. url=[%s] proxy_url=[%s]' % (request.url, proxy_url))
    return
Example #5
    def proxy_open(self, req, proxy, type):
        orig_type = req.get_type()
        proxy_type, user, password, hostport = _parse_proxy(proxy)

        if proxy_type is None:
            proxy_type = orig_type

        if req.host and proxy_bypass(req.host):
            return None

        if user and password:
            user_pass = '%s:%s' % (unquote(user), unquote(password))
            creds = base64.b64encode(user_pass).strip()
            req.add_header('Proxy-authorization', 'Basic ' + creds)
        hostport = unquote(hostport)
        req.set_proxy(hostport, proxy_type)

        if orig_type == proxy_type or orig_type == 'https':
            # let other handlers take care of it
            return None
        else:
            # need to start over, because the other handlers don't
            # grok the proxy's URL type
            # e.g. if we have a constructor arg proxies like so:
            # {'http': 'ftp://proxy.example.com'}, we may end up turning
            # a request for http://acme.example.com/a into one for
            # ftp://proxy.example.com/a
            return self.parent.open(req, timeout=req.timeout)
Example #8
def open_http(url, data=None):
    """Use HTTP protocol."""
    import httplib
    user_passwd = None
    proxy_passwd = None
    if isinstance(url, str):
        host, selector = urllib.splithost(url)
        if host:
            user_passwd, host = urllib.splituser(host)
            host = urllib.unquote(host)
        realhost = host
    else:
        host, selector = url
        # check whether the proxy contains authorization information
        proxy_passwd, host = urllib.splituser(host)
        # now we proceed with the url we want to obtain
        urltype, rest = urllib.splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != 'http':
            realhost = None
        else:
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
            if user_passwd:
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if urllib.proxy_bypass(realhost):
                host = realhost

        #print "proxy via http:", host, selector
    if not host: raise IOError('http error', 'no host given')

    if proxy_passwd:
        import base64
        proxy_auth = base64.b64encode(proxy_passwd).strip()
    else:
        proxy_auth = None

    if user_passwd:
        import base64
        auth = base64.b64encode(user_passwd).strip()
    else:
        auth = None
    c = FakeHTTPConnection(host)
    if data is not None:
        c.putrequest('POST', selector)
        c.putheader('Content-Type', 'application/x-www-form-urlencoded')
        c.putheader('Content-Length', '%d' % len(data))
    else:
        c.putrequest('GET', selector)
    if proxy_auth: c.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
    if auth: c.putheader('Authorization', 'Basic %s' % auth)
    if realhost: c.putheader('Host', realhost)
    for args in urllib.URLopener().addheaders:
        c.putheader(*args)
    c.endheaders()
    return c
Example #9
def open_http(url, data=None):
    """Use HTTP protocol."""
    import httplib
    user_passwd = None
    proxy_passwd = None
    if isinstance(url, str):
        host, selector = urllib.splithost(url)
        if host:
            user_passwd, host = urllib.splituser(host)
            host = urllib.unquote(host)
        realhost = host
    else:
        host, selector = url
        # check whether the proxy contains authorization information
        proxy_passwd, host = urllib.splituser(host)
        # now we proceed with the url we want to obtain
        urltype, rest = urllib.splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != 'http':
            realhost = None
        else:
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
            if user_passwd:
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if urllib.proxy_bypass(realhost):
                host = realhost

        #print "proxy via http:", host, selector
    if not host: raise IOError, ('http error', 'no host given')

    if proxy_passwd:
        import base64
        proxy_auth = base64.b64encode(proxy_passwd).strip()
    else:
        proxy_auth = None

    if user_passwd:
        import base64
        auth = base64.b64encode(user_passwd).strip()
    else:
        auth = None
    c = FakeHTTPConnection(host)
    if data is not None:
        c.putrequest('POST', selector)
        c.putheader('Content-Type', 'application/x-www-form-urlencoded')
        c.putheader('Content-Length', '%d' % len(data))
    else:
        c.putrequest('GET', selector)
    if proxy_auth: c.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
    if auth: c.putheader('Authorization', 'Basic %s' % auth)
    if realhost: c.putheader('Host', realhost)
    for args in urllib.URLopener().addheaders: c.putheader(*args)
    c.endheaders()
    return c
Example #10
 def ignore_proxy_host (self):
     """Check if self.host is in the $no_proxy ignore list."""
     if urllib.proxy_bypass(self.host):
         return True
     no_proxy = os.environ.get("no_proxy")
     if no_proxy:
         entries = [parse_host_port(x) for x in no_proxy.split(",")]
         for host, port in entries:
             if host.lower() == self.host and port == self.port:
                 return True
     return False
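The snippet above relies on a parse_host_port helper that is not shown; a minimal sketch of what such a helper could look like (an assumption, not the project's actual implementation):

def parse_host_port(entry):
    # Hypothetical helper: split a "host[:port]" entry from $no_proxy into
    # a lowercased host and an optional integer port (None when absent).
    host, _, port = entry.strip().partition(':')
    return host.lower(), int(port) if port else None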
Example #11
    def process_request(self, request, spider):
        # ignore if a proxy is already set
        if 'proxy' in request.meta:
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)
Example #13
 def proxy_open(self, req, proxy, type):
     orig_type = req.get_type()
     proxy_type, user, password, hostport = _parse_proxy(proxy)
     if proxy_type is None:
         proxy_type = orig_type
     if req.host and proxy_bypass(req.host):
         return
     if user and password:
         user_pass = '%s:%s' % (unquote(user), unquote(password))
         creds = base64.b64encode(user_pass).strip()
         req.add_header('Proxy-authorization', 'Basic ' + creds)
     hostport = unquote(hostport)
     req.set_proxy(hostport, proxy_type)
     if orig_type == proxy_type or orig_type == 'https':
         return
     else:
         return self.parent.open(req, timeout=req.timeout)
Example #15
def find_proxy(url):
    scheme, netloc, path, pars, query, fragment = urlparse.urlparse(url)
    proxies = urllib.getproxies()
    proxyhost = None
    if scheme in proxies:
        if '@' in netloc:
            sidx = netloc.find('@') + 1
        else:
            sidx = 0
        eidx = netloc.find(':')
        if eidx == -1:
            eidx = len(netloc)
        host = netloc[sidx:eidx]
        if not (host == '127.0.0.1' or urllib.proxy_bypass(host)):
            proxyurl = proxies[scheme]
            proxyelems = urlparse.urlparse(proxyurl)
            proxyhost = proxyelems[1]
    if DEBUG:
        print >> sys.stderr, 'find_proxy: Got proxies', proxies, 'selected', proxyhost, 'URL was', url
    return proxyhost
Example #17
    def process_request(self, request, spider):
        # ignore if a proxy is already set
        if 'proxy' in request.meta:
            # parse out user/password
            # if present, set up proxy authentication
            creds, proxy = self._get_proxy(request.meta['proxy'], 'http')

            if creds:
                request.meta['proxy'] = proxy
                request.headers['Proxy-Authorization'] = 'Basic ' + creds
            return

        parsed = urlparse_cached(request)
        scheme = parsed.scheme

        # 'no_proxy' is only supported by http schemes
        if scheme in ('http', 'https') and proxy_bypass(parsed.hostname):
            return

        if scheme in self.proxies:
            self._set_proxy(request, scheme)
Example #18
def find_proxy(url):
    """ Returns proxy host as "host:port" string """
    (scheme, netloc, path, pars, query, fragment) = urlparse.urlparse(url)
    proxies = urllib.getproxies()
    proxyhost = None
    if scheme in proxies:
        if '@' in netloc:
            sidx = netloc.find('@')+1
        else:
            sidx = 0
        # IPVSIX TODO: what if host is IPv6 address
        eidx = netloc.find(':')
        if eidx == -1:
            eidx = len(netloc)
        host = netloc[sidx:eidx]
        if not (host == "127.0.0.1" or urllib.proxy_bypass(host)):
            proxyurl = proxies[scheme]
            proxyelems = urlparse.urlparse(proxyurl)
            proxyhost = proxyelems[1]

    if DEBUG:
        print >>sys.stderr,"find_proxy: Got proxies",proxies,"selected",proxyhost,"URL was",url
    return proxyhost
Example #19
def find_proxy(url):
    """ Returns proxy host as "host:port" string """
    (scheme, netloc, path, pars, query, fragment) = urlparse.urlparse(url)
    proxies = urllib.getproxies()
    proxyhost = None
    if scheme in proxies:
        if '@' in netloc:
            sidx = netloc.find('@') + 1
        else:
            sidx = 0
        # IPVSIX TODO: what if host is IPv6 address
        eidx = netloc.find(':')
        if eidx == -1:
            eidx = len(netloc)
        host = netloc[sidx:eidx]
        if not (host == "127.0.0.1" or urllib.proxy_bypass(host)):
            proxyurl = proxies[scheme]
            proxyelems = urlparse.urlparse(proxyurl)
            proxyhost = proxyelems[1]

    if DEBUG:
        print >> sys.stderr, "find_proxy: Got proxies", proxies, "selected", proxyhost, "URL was", url
    return proxyhost
Example #20
 def proxy_open(self, req, proxy, type):
     orig_type = req.get_type()
     proxy_type, user, password, hostport = _parse_proxy(proxy)
     if proxy_type is None:
         proxy_type = orig_type
     req.get_host()
     if req.host and proxy_bypass(req.host):
         return None
     if user and password:
         user_pass = '%s:%s' % (unquote(user), unquote(password))
         creds = base64.encodestring(user_pass).strip()
         req.add_header('Proxy-authorization', 'Basic ' + creds)
     hostport = unquote(hostport)
     req.set_proxy(hostport, proxy_type)
     if orig_type == proxy_type:
         # let other handlers take care of it
         # XXX this only makes sense if the proxy is before the
         # other handlers
         return None
     else:
         # need to start over, because the other handlers don't
         # grok the proxy's URL type
         return self.parent.open(req)
Example #21
    def _setup_connection(self, protocol, netloc):
        """Takes care of managing proxies if any. This is a first attempt to
        manage proxies. Authentication is not yet taken into account. This all
        stuff is not tested yet.

        Parameters
        ----------
        protocol: str
            http or https
        netloc: str
            url to connect to

        Returns
        -------
        HTTP(S)Connection
            properly set up in case of proxies
        """
        proxies = urllib.getproxies()
        # We process proxy if a proxy is defined for this protocol and the
        # netloc to connect to is not in the bypass list.
        if protocol in proxies and urllib.proxy_bypass(netloc) == 0:
            proxy = proxies[protocol]
            urltype, proxyhost = urllib.splittype(proxy)
            host, selector = urllib.splithost(proxyhost)
            host, port = urllib.splitport(host)
            if protocol == 'https':
                self.connections[protocol+netloc] = client.HTTPSConnection(host, port)
                self.connections[protocol+netloc].set_tunnel(netloc, 443)
            else:
                self.connections[protocol+netloc] = client.HTTPConnection(host, port)
                self.connections[protocol+netloc].set_tunnel(netloc, 80)
        else:
            if protocol == 'https':
                self.connections[protocol+netloc] = client.HTTPSConnection(netloc)
            else:
                self.connections[protocol+netloc] = client.HTTPConnection(netloc)
Example #22
    def __init__(
        self,
        url,
        method="GET",
        data=None,
        headers=None,
        headers_only=False,
        user_agent=None,
        follow_location=False,
        force_quiet=True,
    ):
        GObjectWrapper.__init__(self)
        self.result = StringIO.StringIO()
        self.result_headers = StringIO.StringIO()

        if isinstance(url, unicode):
            self.url = url.encode("utf-8")
        else:
            self.url = url
        self.method = method
        self.data = data
        self.headers = headers
        self.status = None

        # the actual curl request object
        self.curl = pycurl.Curl()
        if logging.root.level == logging.DEBUG and not force_quiet:
            self.curl.setopt(pycurl.VERBOSE, 1)

        self.curl.setopt(pycurl.WRITEFUNCTION, self.result.write)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.result_headers.write)
        # We want to use gzip and deflate if possible:
        self.curl.setopt(pycurl.ENCODING, "")  # use all available encodings
        self.curl.setopt(pycurl.URL, self.url)

        # let's set the HTTP request method
        if method == "GET":
            self.curl.setopt(pycurl.HTTPGET, 1)
        elif method == "POST":
            self.curl.setopt(pycurl.POST, 1)
        elif method == "PUT":
            self.curl.setopt(pycurl.UPLOAD, 1)
        else:
            self.curl.setopt(pycurl.CUSTOMREQUEST, method)
        if data:
            if method == "PUT":
                self.data = StringIO.StringIO(data)
                self.curl.setopt(pycurl.READFUNCTION, self.data.read)
                self.curl.setopt(pycurl.INFILESIZE, len(self.data.getvalue()))
            else:
                self.curl.setopt(pycurl.POSTFIELDS, self.data)
                self.curl.setopt(pycurl.POSTFIELDSIZE, len(self.data))
        if headers:
            self.curl.setopt(pycurl.HTTPHEADER, headers)
        if headers_only:
            self.curl.setopt(pycurl.HEADER, 1)
            self.curl.setopt(pycurl.NOBODY, 1)
        if user_agent:
            self.curl.setopt(pycurl.USERAGENT, user_agent)
        if follow_location:
            self.curl.setopt(pycurl.FOLLOWLOCATION, 1)

        if libproxy:
            for proxy in proxy_factory.getProxies(self.url):
                # only use the first one
                self.curl.setopt(pycurl.PROXY, proxy)
                break
        else:
            # Proxy: let's be careful to isolate the protocol to ensure that we
            # support the case where http and https might use different proxies
            split_url = self.url.split("://", 1)
            if len(split_url) > 1:
                # We were able to get a protocol
                protocol, address = split_url
                host, _path = urllib.splithost("//" + address)
                proxies = urllib.getproxies()
                if protocol in proxies and not urllib.proxy_bypass(host):
                    self.curl.setopt(pycurl.PROXY, proxies[protocol])

        # self reference required, because CurlMulti will only return
        # Curl handles
        self.curl.request = self
Example #23
    def loadPage(self, url, uri=None, method="GET", params="", additionalParams=""):
        if not url:
            logging.error("Request URL undefined")
            tools.exitErr()

        if not url.startswith("http"):
            url = "https://" + url
        urlData = urlparse(url)
        if not uri:
            url = "%s://%s" (urlData.scheme, urlData.netloc)
            uri = urlData.path + '?' + urlData.query

        # prepare params, append to uri
        if params:
            params = urlencode(params) + additionalParams
            if method == "GET":
                uri += ('?' if uri.find('?') == -1 else '&') + params
                params = ""

        # insert local cookies in request
        headers = {
            "Cookie": '; '.join([key + '=' + self.cookies[key] for key in self.cookies.keys()])
        }

        if method == "POST":
            headers["Content-type"] = "application/x-www-form-urlencoded"

        if self._proxy is None or proxy_bypass(urlData.hostname):
            host = urlData.hostname
            port = urlData.port
            real_host = real_port = None
        else:
            host = self._proxy.hostname
            port = self._proxy.port
            real_host = urlData.hostname
            real_port = urlData.port

        logging.debug("Request URL: %s:/%s > %s # %s", url,
                      uri, unquote(params), headers["Cookie"])

        conn = httplib.HTTPSConnection(host, port)

        if real_host is not None:
            conn.set_tunnel(real_host, real_port, headers=self._proxy_auth)
        if config.DEBUG:
            conn.set_debuglevel(1)

        conn.request(method, url + uri, params, headers)
        response = conn.getresponse()
        data = response.read()
        conn.close()

        logging.debug("Response : %s > %s",
                      response.status,
                      response.getheaders())
        result = tools.Struct(status=response.status,
                              location=response.getheader('location', None),
                              data=data)

        # update local cookies
        sk = Cookie.SimpleCookie(response.getheader("Set-Cookie", ""))
        for key in sk:
            self.cookies[key] = sk[key].value
        # delete cookies whose content is "deleteme"
        for key in self.cookies.keys():
            if self.cookies[key] == "deleteme":
                del self.cookies[key]

        return result
Example #24
def proxied(value):
    netloc = urlparse(value).netloc
    proxied = bool(getproxies_environment()) and not proxy_bypass(netloc)
    return proxied
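A quick usage sketch for proxied (illustrative):

print(proxied('http://www.example.com/'))  # True only when environment proxies are set and example.com is not bypassed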
Example #25
File: conf.py Project: myMeow/osc
def _build_opener(url):
    from osc.core import __version__

    global config
    apiurl = urljoin(*parse_apisrv_url(None, url))
    if "last_opener" not in _build_opener.__dict__:
        _build_opener.last_opener = (None, None)
    if apiurl == _build_opener.last_opener[0]:
        return _build_opener.last_opener[1]

    # respect no_proxy env variable
    if urllib.proxy_bypass(apiurl):
        # initialize with empty dict
        proxyhandler = urllib2.ProxyHandler({})
    else:
        # read proxies from env
        proxyhandler = urllib2.ProxyHandler()

    # workaround for http://bugs.python.org/issue9639
    authhandler_class = urllib2.HTTPBasicAuthHandler
    if (
        sys.version_info >= (2, 6, 6)
        and sys.version_info < (2, 7, 1)
        and not "reset_retry_count" in dir(urllib2.HTTPBasicAuthHandler)
    ):
        print >> sys.stderr, "warning: your urllib2 version seems to be broken. " "Using a workaround for http://bugs.python.org/issue9639"

        class OscHTTPBasicAuthHandler(urllib2.HTTPBasicAuthHandler):
            def http_error_401(self, *args):
                response = urllib2.HTTPBasicAuthHandler.http_error_401(self, *args)
                self.retried = 0
                return response

            def http_error_404(self, *args):
                self.retried = 0
                return None

        authhandler_class = OscHTTPBasicAuthHandler
    elif sys.version_info >= (2, 6, 6) and sys.version_info < (2, 7, 1):

        class OscHTTPBasicAuthHandler(urllib2.HTTPBasicAuthHandler):
            def http_error_404(self, *args):
                self.reset_retry_count()
                return None

        authhandler_class = OscHTTPBasicAuthHandler
    elif sys.version_info >= (2, 6, 5) and sys.version_info < (2, 6, 6):
        # workaround for broken urllib2 in python 2.6.5: wrong credentials
        # lead to an infinite recursion
        class OscHTTPBasicAuthHandler(urllib2.HTTPBasicAuthHandler):
            def retry_http_basic_auth(self, host, req, realm):
                # don't retry if auth failed
                if req.get_header(self.auth_header, None) is not None:
                    return None
                return urllib2.HTTPBasicAuthHandler.retry_http_basic_auth(self, host, req, realm)

        authhandler_class = OscHTTPBasicAuthHandler

    options = config["api_host_options"][apiurl]
    # with None as first argument, it will always use this username/password
    # combination for urls for which arg2 (apisrv) is a super-url
    authhandler = authhandler_class(urllib2.HTTPPasswordMgrWithDefaultRealm())
    authhandler.add_password(None, apiurl, options["user"], options["pass"])

    if options["sslcertck"]:
        try:
            import oscssl
            from M2Crypto import m2urllib2
        except ImportError, e:
            print e
            raise NoSecureSSLError(
                "M2Crypto is needed to access %s in a secure way.\nPlease install python-m2crypto." % apiurl
            )

        cafile = options.get("cafile", None)
        capath = options.get("capath", None)
        if not cafile and not capath:
            for i in ["/etc/pki/tls/cert.pem", "/etc/ssl/certs"]:
                if os.path.isfile(i):
                    cafile = i
                    break
                elif os.path.isdir(i):
                    capath = i
                    break
        ctx = oscssl.mySSLContext()
        if ctx.load_verify_locations(capath=capath, cafile=cafile) != 1:
            raise Exception("No CA certificates found")
        opener = m2urllib2.build_opener(
            ctx,
            oscssl.myHTTPSHandler(ssl_context=ctx, appname="osc"),
            urllib2.HTTPCookieProcessor(cookiejar),
            authhandler,
            proxyhandler,
        )
Example #26
    def loadPage(self,
                 url,
                 uri=None,
                 method="GET",
                 params="",
                 additionalParams=""):
        if not url:
            logging.error("Request URL undefined")
            tools.exitErr()

        if not url.startswith("http"):
            url = "https://" + url
        urlData = urlparse(url)
        if not uri:
            url = "%s://%s" (urlData.scheme, urlData.netloc)
            uri = urlData.path + '?' + urlData.query

        # prepare params, append to uri
        if params:
            params = urlencode(params) + additionalParams
            if method == "GET":
                uri += ('?' if uri.find('?') == -1 else '&') + params
                params = ""

        # insert local cookies in request
        headers = {
            "Cookie":
            '; '.join(
                [key + '=' + self.cookies[key] for key in self.cookies.keys()])
        }

        if method == "POST":
            headers["Content-type"] = "application/x-www-form-urlencoded"

        if self._proxy is None or proxy_bypass(urlData.hostname):
            host = urlData.hostname
            port = urlData.port
            real_host = real_port = None
        else:
            host = self._proxy.hostname
            port = self._proxy.port
            real_host = urlData.hostname
            real_port = urlData.port

        logging.debug("Request URL: %s:/%s > %s # %s", url, uri,
                      unquote(params), headers["Cookie"])

        conn = httplib.HTTPSConnection(host, port)

        if real_host is not None:
            conn.set_tunnel(real_host, real_port, headers=self._proxy_auth)
        if config.DEBUG:
            conn.set_debuglevel(1)

        conn.request(method, url + uri, params, headers)
        response = conn.getresponse()
        data = response.read()
        conn.close()

        logging.debug("Response : %s > %s", response.status,
                      response.getheaders())
        result = tools.Struct(status=response.status,
                              location=response.getheader('location', None),
                              data=data)

        # update local cookies
        sk = Cookie.SimpleCookie(response.getheader("Set-Cookie", ""))
        for key in sk:
            self.cookies[key] = sk[key].value
        # delete cookies whose content is "deleteme"
        for key in self.cookies.keys():
            if self.cookies[key] == "deleteme":
                del self.cookies[key]

        return result
Example #27
def open_http(url, data=None):
    """Use HTTP protocol."""
    import httplib

    user_passwd = None
    proxy_passwd = None
    if isinstance(url, str):
        host, selector = urllib.splithost(url)
        if host:
            user_passwd, host = urllib.splituser(host)
            host = urllib.unquote(host)
        realhost = host
    else:
        host, selector = url
        # check whether the proxy contains authorization information
        proxy_passwd, host = urllib.splituser(host)
        # now we proceed with the url we want to obtain
        urltype, rest = urllib.splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != "http":
            realhost = None
        else:
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
            if user_passwd:
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if urllib.proxy_bypass(realhost):
                host = realhost

        # print "proxy via http:", host, selector
    if not host:
        raise IOError, ("http error", "no host given")

    if proxy_passwd:
        import base64

        proxy_auth = base64.b64encode(proxy_passwd).strip()
    else:
        proxy_auth = None

    if user_passwd:
        import base64

        auth = base64.b64encode(user_passwd).strip()
    else:
        auth = None
    h = HTTP(host)
    if data is not None:
        h.putrequest("POST", selector)
        h.putheader("Content-Type", "application/x-www-form-urlencoded")
        h.putheader("Content-Length", "%d" % len(data))
    else:
        h.putrequest("GET", selector)
    if proxy_auth:
        h.putheader("Proxy-Authorization", "Basic %s" % proxy_auth)
    if auth:
        h.putheader("Authorization", "Basic %s" % auth)
    if realhost:
        h.putheader("Host", realhost)
    for args in urllib.URLopener().addheaders:
        h.putheader(*args)
    h.endheaders()
    return h
Example #28
File: conf.py Project: stskeeps/osc
def _build_opener(url):
    from osc.core import __version__
    global config
    apiurl = urljoin(*parse_apisrv_url(None, url))
    if 'last_opener' not in _build_opener.__dict__:
        _build_opener.last_opener = (None, None)
    if apiurl == _build_opener.last_opener[0]:
        return _build_opener.last_opener[1]

    # respect no_proxy env variable
    if urllib.proxy_bypass(apiurl):
        # initialize with empty dict
        proxyhandler = urllib2.ProxyHandler({})
    else:
        # read proxies from env
        proxyhandler = urllib2.ProxyHandler()

    # workaround for http://bugs.python.org/issue9639
    authhandler_class = urllib2.HTTPBasicAuthHandler
    if sys.version_info >= (2, 6, 6) and sys.version_info < (2, 7, 1) \
        and not 'reset_retry_count' in dir(urllib2.HTTPBasicAuthHandler):
        print >>sys.stderr, 'warning: your urllib2 version seems to be broken. ' \
            'Using a workaround for http://bugs.python.org/issue9639'

        class OscHTTPBasicAuthHandler(urllib2.HTTPBasicAuthHandler):
            def http_error_401(self, *args):
                response = urllib2.HTTPBasicAuthHandler.http_error_401(self, *args)
                self.retried = 0
                return response

            def http_error_404(self, *args):
                self.retried = 0
                return None

        authhandler_class = OscHTTPBasicAuthHandler
    elif sys.version_info >= (2, 6, 6) and sys.version_info < (2, 7, 1):
        class OscHTTPBasicAuthHandler(urllib2.HTTPBasicAuthHandler):
            def http_error_404(self, *args):
                self.reset_retry_count()
                return None

        authhandler_class = OscHTTPBasicAuthHandler
    elif sys.version_info >= (2, 6, 5) and sys.version_info < (2, 6, 6):
        # workaround for broken urllib2 in python 2.6.5: wrong credentials
        # lead to an infinite recursion
        class OscHTTPBasicAuthHandler(urllib2.HTTPBasicAuthHandler):
            def retry_http_basic_auth(self, host, req, realm):
                # don't retry if auth failed
                if req.get_header(self.auth_header, None) is not None:
                    return None
                return urllib2.HTTPBasicAuthHandler.retry_http_basic_auth(self, host, req, realm)

        authhandler_class = OscHTTPBasicAuthHandler

    options = config['api_host_options'][apiurl]
    # with None as first argument, it will always use this username/password
    # combination for urls for which arg2 (apisrv) is a super-url
    authhandler = authhandler_class( \
        urllib2.HTTPPasswordMgrWithDefaultRealm())
    authhandler.add_password(None, apiurl, options['user'], options['pass'])

    if options['sslcertck']:
        try:
            import oscssl
            from M2Crypto import m2urllib2
        except ImportError, e:
            print e
            raise NoSecureSSLError('M2Crypto is needed to access %s in a secure way.\nPlease install python-m2crypto.' % apiurl)

        cafile = options.get('cafile', None)
        capath = options.get('capath', None)
        if not cafile and not capath:
            for i in ['/etc/pki/tls/cert.pem', '/etc/ssl/certs']:
                if os.path.isfile(i):
                    cafile = i
                    break
                elif os.path.isdir(i):
                    capath = i
                    break
        ctx = oscssl.mySSLContext()
        if ctx.load_verify_locations(capath=capath, cafile=cafile) != 1:
            raise Exception('No CA certificates found')
        opener = m2urllib2.build_opener(ctx, oscssl.myHTTPSHandler(ssl_context=ctx, appname='osc'), urllib2.HTTPCookieProcessor(cookiejar), authhandler, proxyhandler)
Example #29
    def __init__(self, url, method='GET', data=None, headers=None,
            headers_only=False, user_agent=None, follow_location=False,
            force_quiet=True):
        GObjectWrapper.__init__(self)
        self.result = StringIO.StringIO()
        self.result_headers = StringIO.StringIO()

        if isinstance(url, unicode):
            self.url = url.encode("utf-8")
        else:
            self.url = url
        self.method = method
        self.data = data
        self.headers = headers
        self.status = None

        # the actual curl request object
        self.curl = pycurl.Curl()
        if (logging.root.level == logging.DEBUG and not force_quiet):
            self.curl.setopt(pycurl.VERBOSE, 1)

        self.curl.setopt(pycurl.WRITEFUNCTION, self.result.write)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.result_headers.write)
        # We want to use gzip and deflate if possible:
        self.curl.setopt(pycurl.ENCODING, "") # use all available encodings
        self.curl.setopt(pycurl.URL, self.url)

        # let's set the HTTP request method
        if method == 'GET':
            self.curl.setopt(pycurl.HTTPGET, 1)
        elif method == 'POST':
            self.curl.setopt(pycurl.POST, 1)
        elif method == 'PUT':
            self.curl.setopt(pycurl.UPLOAD, 1)
        else:
            self.curl.setopt(pycurl.CUSTOMREQUEST, method)
        if data:
            if method == "PUT":
                self.data = StringIO.StringIO(data)
                self.curl.setopt(pycurl.READFUNCTION, self.data.read)
                self.curl.setopt(pycurl.INFILESIZE, len(self.data.getvalue()))
            else:
                self.curl.setopt(pycurl.POSTFIELDS, self.data)
                self.curl.setopt(pycurl.POSTFIELDSIZE, len(self.data))
        if headers:
            self.curl.setopt(pycurl.HTTPHEADER, headers)
        if headers_only:
            self.curl.setopt(pycurl.HEADER, 1)
            self.curl.setopt(pycurl.NOBODY, 1)
        if user_agent:
            self.curl.setopt(pycurl.USERAGENT, user_agent)
        if follow_location:
            self.curl.setopt(pycurl.FOLLOWLOCATION, 1)

        if libproxy:
            for proxy in proxy_factory.getProxies(self.url):
                # if we connect to localhost (localtm) with proxy specifically
                # set to direct://, libcurl connects fine, but then asks
                #   GET http://localhost:55555/unit/en/af/whatever
                # instead of
                #   GET /unit/en/af/whatever
                # and it doesn't work. We have to set it specifically to ""
                # though, otherwise it seems to fall back to environment
                # variables.
                if proxy == "direct://":
                    proxy = ""
                self.curl.setopt(pycurl.PROXY, proxy)
                #only use the first one
                break
        else:
            # Proxy: let's be careful to isolate the protocol to ensure that we
            # support the case where http and https might use different proxies
            split_url = self.url.split('://', 1)
            if len(split_url) > 1:
                #We were able to get a protocol
                protocol, address = split_url
                host, _path = urllib.splithost('//' + address)
                proxies = urllib.getproxies()
                if protocol in proxies and not urllib.proxy_bypass(host):
                    self.curl.setopt(pycurl.PROXY, proxies[protocol])

        # self reference required, because CurlMulti will only return
        # Curl handles
        self.curl.request = self