def unquote(content, encoding='utf-8', errors='replace'):
    """Replace %xx escapes with their single-character equivalents.

    Compatibility wrapper around the interpreter's own unquote: Python 3
    accepts ``encoding``/``errors`` keywords, while Python 2's signature
    rejects them with TypeError, in which case the plain positional call
    is used instead. With errors='replace', invalid sequences become a
    placeholder character rather than raising.

    Args:
        content (str): The quoted URI string to unquote.
        encoding (:obj:`str`, optional): Codec used for %xx sequences.
        errors (:obj:`str`, optional): How to handle invalid characters
            found in the encoded string (as defined by the codec).

    Returns:
        str: The unquoted URI string ('' when content is empty/falsy).
    """
    if content:
        try:
            # Python 3.x signature with decoding options
            result = _unquote(content, encoding=encoding, errors=errors)
        except TypeError:
            # Python 2.7 signature takes no keyword arguments
            result = _unquote(content)
        return result
    return ''
def textExtraction(wikidocument, lang):
    """Extract cleaned text and link lists from one rendered wiki HTML page.

    Runs the page body through a fixed sequence of module-level regex
    substitutions; the order of the steps matters (links are rewritten
    before tags are stripped, etc.). Python 2 code: ``body`` starts as a
    byte string and is decoded/encoded as UTF-8 below.

    Args:
        wikidocument: raw HTML of a wiki page (UTF-8 byte string).
        lang: language code associated with this wiki.

    Returns:
        A 3-tuple ``(body, internal_links, interlanguage_links)`` where
        ``body`` is the UTF-8 encoded extracted text, ``internal_links``
        is a list of ``(lang, unquoted_url)`` pairs and
        ``interlanguage_links`` a list of ``(lang_ref, unquoted_url)``.
    """
    #extract the body part of the HTML document
    body=_body_re.search(wikidocument).group(1)
    #list internal links, percent-decoding each URL
    internal_links=[(lang, _unquote(url)) for (url, document_name) in _internal_link.findall(body)]
    #list interlanguage links as (language, url) pairs
    interlanguage_links=[(lang_ref, _unquote(url)) for (lang_ref, url) in _interlanguage_link.findall(body)]
    #replace each link with its visible text (capture group 2)
    body=_link_re.sub((lambda match: match.group(2)), body)
    #suppress the table of contents
    body=_table_toc_re.sub("\n", body)
    #suppress images
    body=_img_re.sub("", body)
    #suppress scripts
    body=_script_re.sub("", body)
    #suppress citations
    body=_cite_re.sub("", body)
    #suppress <sup> markup, keeping the inner text (capture group 1)
    body=_sup_re.sub((lambda match: match.group(1)), body)
    #suppress tables
    body=_table_re.sub("\n", body)
    ##suppress everything after "see also" (kept disabled; French-wiki specific)
    #see_also_re=_re.compile("<h2><span class=\"mw-headline\" id=\"Voir_aussi\">Voir aussi</span></h2>", _re.DOTALL)
    #match=see_also_re.search(body)
    #if match:
    #body=body[:match.start()]
    #only keep p and hx elements
    body="\n".join(_p_and_hx_re.findall(body))
    #remove remaining (formatting) tags
    body=_tags_re.sub("", body)
    #the following step is encoding dependent (Python 2: bytes -> unicode)
    body=body.decode("utf8")
    #split lines: append a newline after each _end_line_re match
    body=_end_line_re.sub((lambda match: match.group(0)+"\n"), body)
    #encoding normalization — presumably resolves HTML entities via _entity_callback; confirm against its definition
    body=_entity_re.sub(_entity_callback, body)
    return (body.encode("utf8"), internal_links, interlanguage_links)
def textExtraction(wikidocument, lang):
    """Strip a rendered wiki HTML page down to text plus two link lists.

    PEP8-spaced variant of the same regex pipeline: each step below
    rewrites ``body`` in place, and the ordering is significant. Python 2
    code — ``body`` is a byte string until the explicit UTF-8 decode.

    Args:
        wikidocument: raw HTML of a wiki page (UTF-8 byte string).
        lang: language code associated with this wiki.

    Returns:
        ``(body, internal_links, interlanguage_links)``: the UTF-8
        encoded extracted text, the ``(lang, unquoted_url)`` internal
        links and the ``(lang_ref, unquoted_url)`` interlanguage links.
    """
    #extract the body part of the document
    body = _body_re.search(wikidocument).group(1)
    #list internal links, percent-decoding each URL
    internal_links = [(lang, _unquote(url)) for (url, document_name) in _internal_link.findall(body)]
    #list interlanguage links as (language, url) pairs
    interlanguage_links = [(lang_ref, _unquote(url)) for (lang_ref, url) in _interlanguage_link.findall(body)]
    #replace each link with its visible text (capture group 2)
    body = _link_re.sub((lambda match: match.group(2)), body)
    #suppress the table of contents
    body = _table_toc_re.sub("\n", body)
    #suppress images
    body = _img_re.sub("", body)
    #suppress scripts
    body = _script_re.sub("", body)
    #suppress citations
    body = _cite_re.sub("", body)
    #suppress <sup> markup, keeping the inner text (capture group 1)
    body = _sup_re.sub((lambda match: match.group(1)), body)
    #suppress tables
    body = _table_re.sub("\n", body)
    ##suppress everything after "see also" (kept disabled; French-wiki specific)
    #see_also_re=_re.compile("<h2><span class=\"mw-headline\" id=\"Voir_aussi\">Voir aussi</span></h2>", _re.DOTALL)
    #match=see_also_re.search(body)
    #if match:
    #body=body[:match.start()]
    #only keep p and hx elements
    body = "\n".join(_p_and_hx_re.findall(body))
    #remove remaining (formatting) tags
    body = _tags_re.sub("", body)
    #the following step is encoding dependent (Python 2: bytes -> unicode)
    body = body.decode("utf8")
    #split lines: append a newline after each _end_line_re match
    body = _end_line_re.sub((lambda match: match.group(0) + "\n"), body)
    #encoding normalization — presumably resolves HTML entities via _entity_callback; confirm against its definition
    body = _entity_re.sub(_entity_callback, body)
    return (body.encode("utf8"), internal_links, interlanguage_links)
def unquote(content, encoding='utf-8', errors='replace'):
    """Common unquote helper, compatible with Python 2 and 3.

    Python 3's unquote takes encoding/errors keywords; Python 2's does
    not and raises TypeError, which triggers the positional fallback.
    Empty/falsy input short-circuits to ''.
    """
    if not content:
        return ''
    try:
        # Python 3.x path
        return _unquote(content, encoding=encoding, errors=errors)
    except TypeError:
        # Python 2.7 path: no keyword arguments supported
        return _unquote(content)
def unquote(s):
    """Percent-decode *s* and guarantee a unicode result.

    PY3's unquote always returns unicode; PY2's tends to return whatever
    type it was given (unlike quote), so a bytes result is decoded as
    UTF-8 before being returned, just to be safe.
    """
    decoded = _unquote(s)
    if not isinstance(decoded, bytes_type):
        return decoded
    return decoded.decode('utf-8')
def unquote(s):
    """Percent-decode *s*, coercing the result to unicode via to_unicode."""
    decoded = _unquote(s)
    return to_unicode(decoded)
def unquote(string):
    """Percent-decode *string* (Python 2 helper).

    Unicode input is UTF-8 encoded before unquoting, since PY2's
    unquote operates on byte strings. Only unicode/str are accepted.
    """
    assert (type(string) in [unicode, str])
    if isinstance(string, unicode):
        return _unquote(string.encode('utf-8'))
    return _unquote(string)
def getUrlParams(url=None):
    """Parse the query string of *url* into a dict.

    Args:
        url: URL to parse; when None, the current request URL is read
            from the REQUEST_URI environment variable via getEnv.

    Returns:
        dict mapping parameter names to percent-decoded values. Only
        'name=value' parts containing exactly one '=' are kept; values
        are unquoted, names are left as-is. Duplicate names keep the
        last occurrence (same as the original dict-from-pairs build).
    """
    if url is None:
        url = getEnv('REQUEST_URI')
    # Index 4 of the urlparse 6-tuple is the query component.
    query = urlparse(url)[4]
    params = {}
    for part in query.split('&'):
        # Split each part once instead of three times (the original
        # recomputed part.split('=') for the key, value, and length test).
        pieces = part.split('=')
        if len(pieces) == 2:
            params[pieces[0]] = _unquote(pieces[1])
    return params
def unquote(string):
    """Percent-decode *string* (Python 2 helper).

    Exact-unicode input (subclasses excluded, hence the identity check
    on type) is UTF-8 encoded first, since PY2's unquote wants bytes.
    """
    encoded = string.encode('utf-8') if type(string) is unicode else string
    return _unquote(encoded)
def unquote(x):
    """Percent-decode *x* and return unicode text (Python 2 helper).

    Unicode input is round-tripped through UTF-8 bytes because PY2's
    unquote operates on byte strings; the result is decoded back.
    """
    raw = x.encode('utf-8') if isinstance(x, unicode_type) else x
    return _unquote(raw).decode('utf-8')
def unquote(s):
    """Re-encode *s* to bytes via _reencode, percent-decode, return UTF-8 text."""
    return _unquote(_reencode(s)).decode('utf-8')
def unquote(*l):
    """Percent-decode every argument.

    A single argument yields a single decoded value; any other argument
    count (including zero) yields a tuple of decoded values. Each value
    is normalized with unicodeToStr before unquoting.
    """
    if len(l) == 1:
        return _unquote(unicodeToStr(l[0]))
    return tuple(_unquote(unicodeToStr(s)) for s in l)
def unquote(s):
    """Percent-decode *s* and return it as unicode (Python 2 helper)."""
    decoded = _unquote(s.encode("utf-8"))
    return unicode(decoded, "utf-8")
# coding=utf-8 # Copyright 2008-9, Sean B. Palmer, inamidst.com # Copyright 2012, Elsie Powell, embolalia.com # Licensed under the Eiffel Forum License 2. from __future__ import unicode_literals, absolute_import, print_function, division import re from sopel import web from sopel.module import commands, example import requests import xmltodict import sys if sys.version_info.major < 3: from urllib import quote_plus, unquote as _unquote unquote = lambda s: _unquote(s.encode('utf-8')).decode('utf-8') else: from urllib.parse import quote_plus, unquote def formatnumber(n): """Format a number with beautiful commas.""" parts = list(str(n)) for i in range((len(parts) - 3), 0, -3): parts.insert(i, ',') return ''.join(parts) r_bing = re.compile(r'<h2(?: class=" b_topTitle")?><a href="([^"]+)"')
def unquote(url):
    """Percent-decode *url*, always returning text on both Python versions."""
    text = u(url)
    if not PY3:
        # Python 2: unquote yields bytes, so decode explicitly.
        return _unquote(text).decode("utf-8")
    return _unquote(text, encoding="utf-8")
def unquote(s):
    """Percent-decode *s* via a UTF-8 byte round trip (Python 2 idiom)."""
    raw = s.encode('utf-8')
    return _unquote(raw).decode('utf-8')
def unquote(string):
    """Percent-decode *string*; unicode input is UTF-8 encoded first (Python 2)."""
    assert(type(string) in [unicode, str])
    if isinstance(string, unicode):
        string = string.encode('utf-8')
    return _unquote(string)
def unquote(s):
    """Percent-decode *s*, converting to and from the platform-native string type."""
    native = nativeString(s)
    return networkString(_unquote(native))
def unquote(data, encoding='utf-8', errors='replace'):
    """Percent-decode *data* and interpret the decoded bytes with *encoding*.

    Bug fix: the original called ``_unquote(data)`` bare, which (on
    Python 3) already decodes the %xx bytes as UTF-8 with 'replace', so
    the subsequent ``.encode('latin1').decode(encoding, errors)`` mangled
    any multibyte or non-UTF-8 input (replacement characters, or a
    UnicodeEncodeError from latin-1). Decoding with latin-1 first is
    lossless — every byte maps to the code point of the same ordinal —
    so re-encoding as latin-1 recovers the raw unquoted bytes, which are
    then decoded once with the caller's codec and error policy.

    Args:
        data: percent-encoded string.
        encoding: codec for the unquoted bytes (default 'utf-8').
        errors: error policy for that decode (default 'replace').

    Returns:
        The decoded text.
    """
    raw = _unquote(data, encoding='latin1', errors='strict').encode('latin1')
    return raw.decode(encoding, errors)
# coding=utf-8 # Copyright 2008-9, Sean B. Palmer, inamidst.com # Copyright 2012, Elsie Powell, embolalia.com # Licensed under the Eiffel Forum License 2. from __future__ import unicode_literals, absolute_import, print_function, division import re import sys if sys.version_info.major < 3: from urllib import unquote as _unquote unquote = lambda s: _unquote(s.encode('utf-8')).decode('utf-8') else: from urllib.parse import unquote import requests import xmltodict from sopel import web from sopel.module import commands, example def formatnumber(n): """Format a number with beautiful commas.""" parts = list(str(n)) for i in range((len(parts) - 3), 0, -3): parts.insert(i, ',') return ''.join(parts) r_bing = re.compile(r'<h2(?: class=" b_topTitle")?><a href="([^"]+)"')
def unquote_to_bytes(data):
    """Percent-decode *data* to a byte string; unicode input must be ASCII."""
    payload = data.encode('ascii') if isinstance(data, unicode) else data
    return _unquote(payload)
def unquote(s):
    """Percent-decode *s* and return the result as ASCII bytes.

    Bytes input is first decoded as ASCII so the stdlib unquote sees
    text; the unquoted text is then re-encoded as ASCII bytes.
    """
    text = s.decode("ascii") if isinstance(s, bytes) else s
    return _unquote(text).encode("ascii")
def unquote(x):
    """Percent-decode *x* and hand back unicode text (Python 2 helper).

    Unicode input is UTF-8 encoded before unquoting because PY2's
    unquote works on byte strings; the byte result is decoded back.
    """
    payload = x.encode('utf-8') if isinstance(x, unicode) else x
    decoded = _unquote(payload)
    return decoded.decode('utf-8')
def unquote(value, encoding, errors):
    """Percent-decode *value*, then decode the resulting byte string
    with the caller's *encoding* and *errors* policy (Python 2)."""
    decoded = _unquote(value)
    return decoded.decode(encoding, errors)