def extractReqTarget(full_link):
    """Extract the request target from a qunar link.

    Returns, in priority order:
      * ``None`` when the link does not mention "qunar";
      * everything after the first ``qrt=`` marker when present
        (may include trailing query parameters);
      * the literal ``'qde'`` for ``html.ng`` pages;
      * otherwise the host component of the URL, or ``None`` when the
        link has no host (e.g. no scheme/netloc).
    """
    # Local import keeps this drop-in even if the module-level `ur`
    # alias is removed; urlsplit supersedes the deprecated
    # ur.splittype/ur.splithost pair used previously.
    from urllib.parse import urlsplit

    # Normalize once instead of wrapping in str() for every test.
    link = str(full_link)
    if "qunar" not in link:
        return None
    if "qrt=" in link:
        # Everything after the first 'qrt=' marker.
        return link.partition('qrt=')[2]
    if "html.ng" in link:
        return 'qde'
    # netloc is '' when the link has no '//host' part; splittype+splithost
    # returned None in that case, so preserve that contract.
    host = urlsplit(link).netloc
    return host or None
def __init__(self, uri, basepath=None):
    """Resolve *uri* and populate mimetype/file/data/uri/local attributes.

    Four cases are handled: a ``data:`` URI (payload decoded into
    ``self.data``), a ``file:`` URL, an ``http(s)`` URL fetched via
    httplib, and anything else treated as a local file path (optionally
    joined with *basepath*).
    """
    self.basepath = basepath
    self.mimetype = None
    self.file = None
    self.data = None
    self.uri = None
    self.local = None
    self.tmp_file = None
    uri = uri or str()
    if not isinstance(uri, str):
        # bytes in: decode so all string tests below behave uniformly
        uri = uri.decode("utf-8")
    log.debug("FileObject %r, Basepath: %r", uri, basepath)

    # Data URI
    if uri.startswith("data:"):
        m = _rx_datauri.match(uri)
        self.mimetype = m.group("mime")
        # percent-decode first, then base64-decode the payload
        b64 = urllib_unquote(m.group("data")).encode("utf-8")
        self.data = base64.b64decode(b64)
    else:
        # Check if we have an external scheme
        if basepath and not urlparse.urlparse(uri).scheme:
            urlParts = urlparse.urlparse(basepath)
        else:
            urlParts = urlparse.urlparse(uri)
        log.debug("URLParts: {}".format((urlParts, urlParts.scheme)))
        if urlParts.scheme == 'file':
            if basepath and uri.startswith('/'):
                uri = urlparse.urljoin(basepath, uri[1:])
            urlResponse = urllib2.urlopen(uri)
            self.mimetype = urlResponse.info().get(
                "Content-Type", '').split(";")[0]
            self.uri = urlResponse.geturl()
            self.file = urlResponse
        # Drive letters have len==1 but we are looking
        # for things like http:
        elif urlParts.scheme in ('http', 'https'):
            log.debug("Sending request for {} with httplib".format(uri))
            # External data
            if basepath:
                uri = urlparse.urljoin(basepath, uri)
            log.debug("Uri parsed: {}".format(uri))
            #path = urlparse.urlsplit(url)[2]
            #mimetype = getMimeType(path)
            # Using HTTPLIB
            server, path = urllib2.splithost(uri[uri.find("//"):])
            if uri.startswith("https://"):
                conn = httplib.HTTPSConnection(server, **httpConfig)
            else:
                conn = httplib.HTTPConnection(server)
            conn.request("GET", path)
            r1 = conn.getresponse()
            # log.debug("HTTP %r %r %r %r", server, path, uri, r1)
            if (r1.status, r1.reason) == (200, "OK"):
                self.mimetype = r1.getheader(
                    "Content-Type", '').split(";")[0]
                self.uri = uri
                log.debug("here")
                if r1.getheader("content-encoding") == "gzip":
                    import gzip
                    # BUGFIX: r1.read() yields bytes, so GzipFile needs a
                    # binary buffer; six.StringIO fails on Python 3.
                    self.file = gzip.GzipFile(
                        mode="rb", fileobj=six.BytesIO(r1.read()))
                else:
                    self.file = pisaTempFile(r1.read())
            else:
                log.debug(
                    "Received non-200 status: {}".format((r1.status, r1.reason)))
                # Fall back to urllib2 (handles redirects, auth, ...)
                try:
                    urlResponse = urllib2.urlopen(uri)
                except urllib2.HTTPError as e:
                    log.error("Could not process uri: {}".format(e))
                    return
                self.mimetype = urlResponse.info().get(
                    "Content-Type", '').split(";")[0]
                self.uri = urlResponse.geturl()
                self.file = urlResponse
        else:
            log.debug("Unrecognized scheme, assuming local file path")
            # Local data
            if basepath:
                if sys.platform == 'win32' and os.path.isfile(basepath):
                    # basepath may itself be a file; join against its dir
                    basepath = os.path.dirname(basepath)
                uri = os.path.normpath(os.path.join(basepath, uri))
            if os.path.isfile(uri):
                self.uri = uri
                self.local = uri
                self.setMimeTypeByName(uri)
                if self.mimetype and self.mimetype.startswith('text'):
                    self.file = open(uri, "r")  # removed bytes... lets hope it goes ok :/
                else:
                    # removed bytes... lets hope it goes ok :/
                    self.file = open(uri, "rb")
def __init__(self, uri, basepath=None):
    """Resolve *uri* and populate mimetype/file/data/uri/local attributes.

    Handles a ``data:`` URI (payload decoded into ``self.data``), a
    ``file:`` URL, an ``http(s)`` URL fetched via httplib, or a local
    file path (optionally joined with *basepath*).
    """
    self.basepath = basepath
    self.mimetype = None
    self.file = None
    self.data = None
    self.uri = None
    self.local = None
    self.tmp_file = None
    uri = uri or str()
    if type(uri) != str:
        # bytes in: decode so all string tests below behave uniformly
        uri = uri.decode("utf-8")
    log.debug("FileObject %r, Basepath: %r", uri, basepath)

    # Data URI
    if uri.startswith("data:"):
        m = _rx_datauri.match(uri)
        self.mimetype = m.group("mime")
        self.data = base64.b64decode(m.group("data").encode("utf-8"))
    else:
        # Check if we have an external scheme
        if basepath and not urlparse.urlparse(uri).scheme:
            urlParts = urlparse.urlparse(basepath)
        else:
            urlParts = urlparse.urlparse(uri)
        log.debug("URLParts: {}".format((urlParts, urlParts.scheme)))
        if urlParts.scheme == 'file':
            if basepath and uri.startswith('/'):
                uri = urlparse.urljoin(basepath, uri[1:])
            urlResponse = urllib2.urlopen(uri)
            self.mimetype = urlResponse.info().get(
                "Content-Type", '').split(";")[0]
            self.uri = urlResponse.geturl()
            self.file = urlResponse
        # Drive letters have len==1 but we are looking
        # for things like http:
        elif urlParts.scheme in ('http', 'https'):
            log.debug("Sending request for {} with httplib".format(uri))
            # External data
            if basepath:
                uri = urlparse.urljoin(basepath, uri)
            log.debug("Uri parsed: {}".format(uri))
            #path = urlparse.urlsplit(url)[2]
            #mimetype = getMimeType(path)
            # Using HTTPLIB
            server, path = urllib2.splithost(uri[uri.find("//"):])
            if uri.startswith("https://"):
                conn = httplib.HTTPSConnection(server)
            else:
                conn = httplib.HTTPConnection(server)
            conn.request("GET", path)
            r1 = conn.getresponse()
            # log.debug("HTTP %r %r %r %r", server, path, uri, r1)
            if (r1.status, r1.reason) == (200, "OK"):
                self.mimetype = r1.getheader(
                    "Content-Type", '').split(";")[0]
                self.uri = uri
                log.debug("here")
                if r1.getheader("content-encoding") == "gzip":
                    import gzip
                    # BUGFIX: r1.read() yields bytes, so GzipFile needs a
                    # binary buffer; six.StringIO fails on Python 3.
                    self.file = gzip.GzipFile(
                        mode="rb", fileobj=six.BytesIO(r1.read()))
                else:
                    self.file = r1
            else:
                log.debug("Received non-200 status: {}".format(
                    (r1.status, r1.reason)))
                # Fall back to urllib2 (handles redirects, auth, ...)
                try:
                    urlResponse = urllib2.urlopen(uri)
                except urllib2.HTTPError as e:
                    log.error("Could not process uri: {}".format(e))
                    return
                self.mimetype = urlResponse.info().get(
                    "Content-Type", '').split(";")[0]
                self.uri = urlResponse.geturl()
                self.file = urlResponse
        else:
            log.debug("Unrecognized scheme, assuming local file path")
            # Local data
            if basepath:
                uri = os.path.normpath(os.path.join(basepath, uri))
            if os.path.isfile(uri):
                self.uri = uri
                self.local = uri
                self.setMimeTypeByName(uri)
                if self.mimetype and self.mimetype.startswith('text'):
                    self.file = open(
                        uri, "r")  # removed bytes... lets hope it goes ok :/
                else:
                    self.file = open(
                        uri, "rb")  # removed bytes... lets hope it goes ok :/
# Demo script: list urllib2's attributes, then exercise its (long
# deprecated) split* URL helpers on a sample Bing image-search URL.
attribute_names = dir(urllib2)  # renamed: the original shadowed builtin `list`
for attr_name in attribute_names:
    print(attr_name)

# NOTE(review): this first URL was a dead store in the original (it was
# immediately overwritten); kept as a comment for reference.
# url = 'http://www.bing.com/images/search?q=%d0%93%d1%80%d1%83%d0%b7%d0%b8%d1%8f&FORM=HDRSC2'
url = 'http://www.bing.com/images/search?q=Imanuel+Kant&view=detailv2&&&id=20C00C8B61AC086C2988CB7172395A3AD1B87A9C&selectedIndex=19&ccid=o1O3XNKY&simid=608053578826975455&thid=JN.INhbeB%2bbzfDQuY0MgDnrNA&ajaxhist=0'

# unquote undoes percent-escaping for display alongside the raw URL
s1 = urllib2.unquote(url)
print(s1)
print(url)

print('splitattr')
x = urllib2.splitattr(url)
print(x)
print('splithost')
x = urllib2.splithost(url)
print(x)
print('splitpasswd')
x = urllib2.splitpasswd(url)
print(x)
print('splitport')
x = urllib2.splitport(url)
print(x)
print('splittype')
x = urllib2.splittype(url)
print(x)
print('splituser')
x = urllib2.splituser(url)
print(x)
print('splitvalue')
# The original never printed this last result; preserved as-is.
x = urllib2.splitvalue(url)
def __init__(self, uri, basepath=None):
    """Resolve *uri* and populate mimetype/file/data/uri/local attributes.

    Handles a ``data:`` URI (payload sanitized and base64-decoded into
    ``self.data``), a ``file:`` URL, an ``http(s)`` URL fetched via
    httplib, or a local file path (optionally joined with *basepath*).
    """
    self.basepath = basepath
    self.mimetype = None
    self.file = None
    self.data = None
    self.uri = None
    self.local = None
    self.tmp_file = None
    uri = uri or str()
    if not isinstance(uri, str):
        # bytes in: decode so all string tests below behave uniformly
        uri = uri.decode("utf-8")
    log.debug("FileObject %r, Basepath: %r", uri, basepath)

    # Data URI
    if uri.startswith("data:"):
        m = _rx_datauri.match(uri)
        self.mimetype = m.group("mime")
        b64 = urllib_unquote(m.group("data"))
        # The data may be incorrectly unescaped... repairs needed
        b64 = b64.strip("b'").strip("'").encode()
        b64 = re.sub(b"\\n", b'', b64)
        # Keep only base64-alphabet characters (raw bytes pattern fixes
        # the former invalid \+ \/ escapes). This also strips any '='
        # padding, which is restored just below.
        b64 = re.sub(rb'[^A-Za-z0-9+/]+', b'', b64)
        # BUGFIX: re-add padding to make the length a multiple of 4.
        # This line was commented out, so b64decode raised
        # binascii.Error for any payload whose encoding was padded.
        b64 += b"=" * ((4 - len(b64) % 4) % 4)
        self.data = base64.b64decode(b64)
    else:
        # Check if we have an external scheme
        if basepath and not urlparse.urlparse(uri).scheme:
            urlParts = urlparse.urlparse(basepath)
        else:
            urlParts = urlparse.urlparse(uri)
        log.debug("URLParts: {}".format((urlParts, urlParts.scheme)))
        if urlParts.scheme == 'file':
            if basepath and uri.startswith('/'):
                uri = urlparse.urljoin(basepath, uri[1:])
            urlResponse = urllib2.urlopen(uri)
            self.mimetype = urlResponse.info().get(
                "Content-Type", '').split(";")[0]
            self.uri = urlResponse.geturl()
            self.file = urlResponse
        # Drive letters have len==1 but we are looking
        # for things like http:
        elif urlParts.scheme in ('http', 'https'):
            log.debug("Sending request for {} with httplib".format(uri))
            # External data
            if basepath:
                uri = urlparse.urljoin(basepath, uri)
            log.debug("Uri parsed: {}".format(uri))
            #path = urlparse.urlsplit(url)[2]
            #mimetype = getMimeType(path)
            # Using HTTPLIB
            server, path = urllib2.splithost(uri[uri.find("//"):])
            if uri.startswith("https://"):
                conn = httplib.HTTPSConnection(server, **httpConfig)
            else:
                conn = httplib.HTTPConnection(server)
            conn.request("GET", path)
            r1 = conn.getresponse()
            # log.debug("HTTP %r %r %r %r", server, path, uri, r1)
            if (r1.status, r1.reason) == (200, "OK"):
                self.mimetype = r1.getheader(
                    "Content-Type", '').split(";")[0]
                self.uri = uri
                log.debug("here")
                if r1.getheader("content-encoding") == "gzip":
                    import gzip
                    # bytes body, so a binary buffer is required here
                    self.file = gzip.GzipFile(
                        mode="rb", fileobj=six.BytesIO(r1.read()))
                else:
                    self.file = pisaTempFile(r1.read())
            else:
                log.debug("Received non-200 status: {}".format(
                    (r1.status, r1.reason)))
                # Fall back to urllib2 (handles redirects, auth, ...)
                try:
                    urlResponse = urllib2.urlopen(uri)
                except urllib2.HTTPError as e:
                    log.error("Could not process uri: {}".format(e))
                    return
                self.mimetype = urlResponse.info().get(
                    "Content-Type", '').split(";")[0]
                self.uri = urlResponse.geturl()
                self.file = urlResponse
        else:
            log.debug("Unrecognized scheme, assuming local file path")
            # Local data
            if basepath:
                if sys.platform == 'win32' and os.path.isfile(basepath):
                    # basepath may itself be a file; join against its dir
                    basepath = os.path.dirname(basepath)
                uri = os.path.normpath(os.path.join(basepath, uri))
            if os.path.isfile(uri):
                self.uri = uri
                self.local = uri
                self.setMimeTypeByName(uri)
                if self.mimetype and self.mimetype.startswith('text'):
                    self.file = open(
                        uri, "r")  # removed bytes... lets hope it goes ok :/
                else:
                    # removed bytes... lets hope it goes ok :/
                    self.file = open(uri, "rb")
sp1=request.splitquery() TypeError: splitquery() missing 1 required positional argument: 'url' >>> sp1=request.splitquery(url1) >>> spl Traceback (most recent call last): File "<pyshell#31>", line 1, in <module> spl NameError: name 'spl' is not defined >>> sp1 ('https://stackoverflow.com/questions/34475051/need-to-install-urllib2-for-python-3-5-1', 'admin=no¶m=yes:8000') >>> dir(request) ['AbstractBasicAuthHandler', 'AbstractDigestAuthHandler', 'AbstractHTTPHandler', 'BaseHandler', 'CacheFTPHandler', 'ContentTooShortError', 'DataHandler', 'FTPHandler', 'FancyURLopener', 'FileHandler', 'HTTPBasicAuthHandler', 'HTTPCookieProcessor', 'HTTPDefaultErrorHandler', 'HTTPDigestAuthHandler', 'HTTPError', 'HTTPErrorProcessor', 'HTTPHandler', 'HTTPPasswordMgr', 'HTTPPasswordMgrWithDefaultRealm', 'HTTPPasswordMgrWithPriorAuth', 'HTTPRedirectHandler', 'HTTPSHandler', 'MAXFTPCACHE', 'OpenerDirector', 'ProxyBasicAuthHandler', 'ProxyDigestAuthHandler', 'ProxyHandler', 'Request', 'URLError', 'URLopener', 'UnknownHandler', '__all__', '__builtins__', '__cached__', '__doc__', '__file__', '__loader__', '__name__', '__package__', '__spec__', '__version__', '_cut_port_re', '_ftperrors', '_have_ssl', '_localhost', '_noheaders', '_opener', '_parse_proxy', '_proxy_bypass_macosx_sysconf', '_randombytes', '_safe_gethostbyname', '_thishost', '_url_tempfiles', 'addclosehook', 'addinfourl', 'base64', 'bisect', 'build_opener', 'contextlib', 'email', 'ftpcache', 'ftperrors', 'ftpwrapper', 'getproxies', 'getproxies_environment', 'getproxies_registry', 'hashlib', 'http', 'install_opener', 'io', 'localhost', 'noheaders', 'os', 'parse_http_list', 'parse_keqv_list', 'pathname2url', 'posixpath', 'proxy_bypass', 'proxy_bypass_environment', 'proxy_bypass_registry', 'quote', 're', 'request_host', 'socket', 'splitattr', 'splithost', 'splitpasswd', 'splitport', 'splitquery', 'splittag', 'splittype', 'splituser', 'splitvalue', 'ssl', 'string', 'sys', 'tempfile', 'thishost', 'time', 'to_bytes', 'unquote', 
'unquote_to_bytes', 'unwrap', 'url2pathname', 'urlcleanup', 'urljoin', 'urlopen', 'urlparse', 'urlretrieve', 'urlsplit', 'urlunparse', 'warnings'] >>> sp1=request.splitattr(url1) >>> sp1 ('https://stackoverflow.com/questions/34475051/need-to-install-urllib2-for-python-3-5-1?admin=no¶m=yes:8000', []) >>> sp1=request.splithost(url1) >>> sp1 (None, 'https://stackoverflow.com/questions/34475051/need-to-install-urllib2-for-python-3-5-1?admin=no¶m=yes:8000') >>> sp1=request.splittag(url1) >>> sp1 ('https://stackoverflow.com/questions/34475051/need-to-install-urllib2-for-python-3-5-1?admin=no¶m=yes:8000', None) >>> sp1=request.urlcleanup(url1) Traceback (most recent call last): File "<pyshell#40>", line 1, in <module> sp1=request.urlcleanup(url1) TypeError: urlcleanup() takes 0 positional arguments but 1 was given >>> sp1=request.urlparse(url1) >>> p1 Traceback (most recent call last): File "<pyshell#42>", line 1, in <module> p1