def getUrl(url, cookieJar=None, post=None, timeout=20, headers=None, noredir=False):
    """Fetch a URL and return the raw response body.

    Optionally reuses an existing cookie jar, sends POST data, adds extra
    headers (as (name, value) pairs) and can disable redirect following.
    """
    cookie_handler = urllib_request.HTTPCookieProcessor(cookieJar)
    if noredir:
        opener = urllib_request.build_opener(
            NoRedirection, cookie_handler,
            urllib_request.HTTPBasicAuthHandler(),
            urllib_request.HTTPHandler())
    else:
        opener = urllib_request.build_opener(
            cookie_handler,
            urllib_request.HTTPBasicAuthHandler(),
            urllib_request.HTTPHandler())
    req = urllib_request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.154 Safari/537.36'
    )
    if headers:
        for h, hv in headers:
            req.add_header(h, hv)
    response = opener.open(req, post, timeout=timeout)
    link = response.read()
    response.close()
    return link
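
# --- Usage sketch (illustrative, not part of the module) -----------------
# A minimal example of calling getUrl(); the URLs and header pair are
# hypothetical. Note that `headers` is iterated as (name, value) pairs, so
# an iterable of tuples is expected rather than a dict, and POST data
# should be bytes on Python 3.
def _example_getUrl():
    cj = http_cookiejar.LWPCookieJar()
    page = getUrl('https://example.com/page',
                  cookieJar=cj,
                  headers=[('Referer', 'https://example.com/')])
    body = getUrl('https://example.com/login',
                  cookieJar=cj,
                  post='user=abc&pass=xyz'.encode('utf-8'))
    return page, body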
def _update_opener(self, drop_tls_level=False):
    """
    Builds and installs a new opener to be used by all future calls to
    :func:`urllib2.urlopen`.
    """
    handlers = [urllib_request.HTTPCookieProcessor(self._cj),
                urllib_request.HTTPBasicAuthHandler()]

    if self._http_debug:
        handlers += [urllib_request.HTTPHandler(debuglevel=1)]
    else:
        handlers += [urllib_request.HTTPHandler()]

    if self._proxy:
        handlers += [urllib_request.ProxyHandler({'http': self._proxy})]

    try:
        import platform
        node = platform.node().lower()
    except Exception:
        node = ''

    if not self._ssl_verify or node == 'xboxone':
        # Certificate verification explicitly disabled.
        try:
            import ssl
            ctx = ssl.create_default_context()
            ctx.check_hostname = False
            ctx.verify_mode = ssl.CERT_NONE
            if self._http_debug:
                handlers += [urllib_request.HTTPSHandler(context=ctx, debuglevel=1)]
            else:
                handlers += [urllib_request.HTTPSHandler(context=ctx)]
        except Exception:
            pass
    else:
        try:
            import ssl
            import certifi
            ctx = ssl.create_default_context(cafile=certifi.where())
            if drop_tls_level:
                # SSLContext.protocol is read-only, so the old assignment
                # `ctx.protocol = ssl.PROTOCOL_TLSv1_1` silently failed.
                # Cap the negotiated version instead (Python 3.7+); on older
                # interpreters the surrounding except swallows the error.
                ctx.maximum_version = ssl.TLSVersion.TLSv1_1
            if self._http_debug:
                handlers += [urllib_request.HTTPSHandler(context=ctx, debuglevel=1)]
            else:
                handlers += [urllib_request.HTTPSHandler(context=ctx)]
        except Exception:
            pass

    opener = urllib_request.build_opener(*handlers)
    urllib_request.install_opener(opener)
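
# --- Usage sketch (illustrative) ------------------------------------------
# The drop_tls_level flag caps the context at TLS 1.1, which suggests a
# retry pattern for hosts that fail the initial handshake. This helper is
# an assumption based on that parameter, not documented behaviour; `net`
# stands for an instance of the class that owns _update_opener().
def _fetch_with_tls_fallback(net, url):
    net._update_opener()
    try:
        return urllib_request.urlopen(url).read()
    except OSError:  # URLError and ssl.SSLError are both OSError subclasses
        # Retry once with the TLS version ceiling lowered.
        net._update_opener(drop_tls_level=True)
        return urllib_request.urlopen(url).read()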
def _update_opener(self):
    '''
    Builds and installs a new opener to be used by all future calls to
    :func:`urllib2.urlopen`.
    '''
    if self._http_debug:
        http = urllib_request.HTTPHandler(debuglevel=1)
    else:
        http = urllib_request.HTTPHandler()

    if self._proxy:
        opener = urllib_request.build_opener(
            urllib_request.HTTPCookieProcessor(self._cj),
            urllib_request.ProxyHandler({'http': self._proxy}),
            urllib_request.HTTPBasicAuthHandler(), http)
    else:
        opener = urllib_request.build_opener(
            urllib_request.HTTPCookieProcessor(self._cj),
            urllib_request.HTTPBasicAuthHandler(), http)
    urllib_request.install_opener(opener)
base_hdrs = {'User-Agent': USER_AGENT,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
             'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
             'Accept-Encoding': 'gzip',
             'Accept-Language': 'en-US,en;q=0.8',
             'Connection': 'keep-alive'}
openloadhdr = base_hdrs

progress = xbmcgui.DialogProgress()
dialog = xbmcgui.Dialog()
urlopen = urllib_request.urlopen
cj = http_cookiejar.LWPCookieJar(TRANSLATEPATH(cookiePath))
Request = urllib_request.Request

# Use a single HTTPSHandler with verification disabled. The previous code
# appended it after a default HTTPSHandler(); handlers of equal priority
# run in insertion order, so the verifying handler added first would have
# shadowed the unverified-context one.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
handlers = [urllib_request.HTTPBasicAuthHandler(),
            urllib_request.HTTPHandler(),
            urllib_request.HTTPSHandler(context=ssl_context)]


def kodilog(logvar, level=LOGINFO):
    xbmc.log("@@@@Cumination: " + str(logvar), level)


@url_dispatcher.register()
def clear_cache():
    """ Clear the cache database. """
def getRegexParsed(regexs, url, cookieJar=None, forCookieJarOnly=False,
                   recursiveCall=False, cachedPages={}, rawPost=False,
                   cookie_jar_file=None):
    doRegexs = re.compile(r'\$doregex\[([^\]]*)\]').findall(url)
    setresolved = True
    for k in doRegexs:
        if k in regexs:
            m = regexs[k]
            cookieJarParam = False
            if 'cookiejar' in m:  # either create or reuse an existing jar
                cookieJarParam = m['cookiejar']
                if '$doregex' in cookieJarParam:
                    cookieJar = getRegexParsed(regexs, m['cookiejar'],
                                               cookieJar, True, True,
                                               cachedPages)
                    cookieJarParam = True
                else:
                    cookieJarParam = True
            if cookieJarParam:
                if cookieJar is None:
                    cookie_jar_file = None
                    if 'open[' in m['cookiejar']:
                        cookie_jar_file = m['cookiejar'].split('open[')[1].split(']')[0]
                    cookieJar = getCookieJar(cookie_jar_file)
                    if cookie_jar_file:
                        saveCookieJar(cookieJar, cookie_jar_file)
                elif 'save[' in m['cookiejar']:
                    cookie_jar_file = m['cookiejar'].split('save[')[1].split(']')[0]
                    complete_path = os.path.join(profile, cookie_jar_file)
                    saveCookieJar(cookieJar, cookie_jar_file)

            if m['page'] and '$doregex' in m['page']:
                pg = getRegexParsed(regexs, m['page'], cookieJar,
                                    recursiveCall=True, cachedPages=cachedPages)
                if len(pg) == 0:
                    pg = 'http://regexfailed'
                m['page'] = pg

            if 'setcookie' in m and m['setcookie'] and '$doregex' in m['setcookie']:
                m['setcookie'] = getRegexParsed(regexs, m['setcookie'], cookieJar,
                                                recursiveCall=True, cachedPages=cachedPages)
            if 'appendcookie' in m and m['appendcookie'] and '$doregex' in m['appendcookie']:
                m['appendcookie'] = getRegexParsed(regexs, m['appendcookie'], cookieJar,
                                                   recursiveCall=True, cachedPages=cachedPages)
            if 'post' in m and '$doregex' in m['post']:
                m['post'] = getRegexParsed(regexs, m['post'], cookieJar,
                                           recursiveCall=True, cachedPages=cachedPages)
            if 'rawpost' in m and '$doregex' in m['rawpost']:
                m['rawpost'] = getRegexParsed(regexs, m['rawpost'], cookieJar,
                                              recursiveCall=True, cachedPages=cachedPages,
                                              rawPost=True)
            if 'rawpost' in m and '$epoctime$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime$', getEpocTime())
            if 'rawpost' in m and '$epoctime2$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime2$', getEpocTime2())

            link = ''
            if m['page'] and m['page'] in cachedPages and 'ignorecache' not in m and not forCookieJarOnly:
                link = cachedPages[m['page']]
            else:
                if m['page'] and m['page'].startswith('http'):
                    if '$epoctime$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime$', getEpocTime())
                    if '$epoctime2$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime2$', getEpocTime2())
                    # Optional headers ride along after a '|' in the page url
                    page_split = m['page'].split('|')
                    pageUrl = page_split[0]
                    header_in_page = None
                    if len(page_split) > 1:
                        header_in_page = page_split[1]

                    # Remember the system proxies so they can be restored
                    # after a per-request proxy override.
                    current_proxies = urllib_request.ProxyHandler(
                        urllib_request.getproxies())

                    req = urllib_request.Request(pageUrl)
                    if 'proxy' in m:
                        proxytouse = m['proxy']
                        if pageUrl[:5] == "https":
                            proxy = urllib_request.ProxyHandler({'https': proxytouse})
                        else:
                            proxy = urllib_request.ProxyHandler({'http': proxytouse})
                        opener = urllib_request.build_opener(proxy)
                        urllib_request.install_opener(opener)

                    req.add_header(
                        'User-Agent',
                        'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1'
                    )
                    proxytouse = None

                    if 'referer' in m:
                        req.add_header('Referer', m['referer'])
                    if 'accept' in m:
                        req.add_header('Accept', m['accept'])
                    if 'agent' in m:
                        req.add_header('User-agent', m['agent'])
                    if 'x-req' in m:
                        req.add_header('X-Requested-With', m['x-req'])
                    if 'x-addr' in m:
                        req.add_header('x-addr', m['x-addr'])
                    if 'x-forward' in m:
                        req.add_header('X-Forwarded-For', m['x-forward'])
                    if 'setcookie' in m:
                        req.add_header('Cookie', m['setcookie'])
                    if 'appendcookie' in m:
                        # Append 'domain:name=value' pairs to the cookie jar
                        cookiestoApend = m['appendcookie']
                        cookiestoApend = cookiestoApend.split(';')
                        for h in cookiestoApend:
                            n, v = h.split('=')
                            w, n = n.split(':')
                            ck = http_cookiejar.Cookie(
                                version=0, name=n, value=v, port=None,
                                port_specified=False, domain=w,
                                domain_specified=False, domain_initial_dot=False,
                                path='/', path_specified=True, secure=False,
                                expires=None, discard=True, comment=None,
                                comment_url=None, rest={'HttpOnly': None},
                                rfc2109=False)
                            cookieJar.set_cookie(ck)
                    if 'origin' in m:
                        req.add_header('Origin', m['origin'])
                    if header_in_page:
                        header_in_page = header_in_page.split('&')
                        for h in header_in_page:
                            n, v = h.split('=')
                            req.add_header(n, v)

                    if cookieJar is not None:
                        cookie_handler = urllib_request.HTTPCookieProcessor(cookieJar)
                        opener = urllib_request.build_opener(
                            cookie_handler,
                            urllib_request.HTTPBasicAuthHandler(),
                            urllib_request.HTTPHandler())
                        urllib_request.install_opener(opener)
                        if 'noredirect' in m:
                            opener = urllib_request.build_opener(
                                cookie_handler, NoRedirection,
                                urllib_request.HTTPBasicAuthHandler(),
                                urllib_request.HTTPHandler())
                            urllib_request.install_opener(opener)
                    elif 'noredirect' in m:
                        opener = urllib_request.build_opener(
                            NoRedirection,
                            urllib_request.HTTPBasicAuthHandler(),
                            urllib_request.HTTPHandler())
                        urllib_request.install_opener(opener)

                    if 'connection' in m:
                        from keepalive import HTTPHandler
                        keepalive_handler = HTTPHandler()
                        opener = urllib_request.build_opener(keepalive_handler)
                        urllib_request.install_opener(opener)

                    post = None
                    if 'post' in m:
                        # 'post' holds 'name:value' pairs separated by commas
                        postData = m['post']
                        splitpost = postData.split(',')
                        post = {}
                        for p in splitpost:
                            n = p.split(':')[0]
                            v = p.split(':')[1]
                            post[n] = v
                        post = urllib_parse.urlencode(post)
                    if 'rawpost' in m:
                        post = m['rawpost']

                    link = ''
                    try:
                        if post:
                            # urlopen on Python 3 requires bytes for the body
                            response = urllib_request.urlopen(req, six.ensure_binary(post))
                        else:
                            response = urllib_request.urlopen(req)
                        if response.info().get('Content-Encoding') == 'gzip':
                            import gzip
                            buf = six.BytesIO(response.read())
                            f = gzip.GzipFile(fileobj=buf)
                            link = f.read()
                        else:
                            link = response.read()
                        if 'proxy' in m and current_proxies is not None:
                            # Restore the system proxies
                            urllib_request.install_opener(
                                urllib_request.build_opener(current_proxies))
                        link = javascriptUnEscape(link)
                        if 'includeheaders' in m:
                            link += '$$HEADERS_START$$:'
                            for b in response.headers:
                                link += b + ':' + response.headers.get(b) + '\n'
                            link += '$$HEADERS_END$$:'
                        response.close()
                    except Exception:
                        pass
                    cachedPages[m['page']] = link
                    if forCookieJarOnly:
                        return cookieJar
                elif m['page'] and not m['page'].startswith('http'):
                    if m['page'].startswith('$pyFunction:'):
                        val = doEval(m['page'].split('$pyFunction:')[1], '', cookieJar, m)
                        if forCookieJarOnly:
                            return cookieJar
                        link = val
                        link = javascriptUnEscape(link)
                    else:
                        link = m['page']

            if '$doregex' in m['expres']:
                m['expres'] = getRegexParsed(regexs, m['expres'], cookieJar,
                                             recursiveCall=True, cachedPages=cachedPages)
            if not m['expres'] == '':
                if '$LiveStreamCaptcha' in m['expres']:
                    val = askCaptcha(m, link, cookieJar)
                    url = url.replace("$doregex[" + k + "]", val)
                elif m['expres'].startswith('$pyFunction:') or '#$pyFunction' in m['expres']:
                    val = ''
                    if m['expres'].startswith('$pyFunction:'):
                        val = doEval(m['expres'].split('$pyFunction:')[1], link, cookieJar, m)
                    else:
                        val = doEvalFunction(m['expres'], link, cookieJar, m)
                    if 'ActivateWindow' in m['expres']:
                        return
                    if forCookieJarOnly:
                        return cookieJar
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        return listrepeat, eval(val), m, regexs, cookieJar
                    try:
                        url = url.replace(u"$doregex[" + k + "]", val)
                    except Exception:
                        url = url.replace("$doregex[" + k + "]", six.ensure_text(val))
                else:
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        ret = re.findall(m['expres'], link)
                        return listrepeat, ret, m, regexs
                    val = ''
                    if not link == '':
                        reg = re.compile(m['expres']).search(link)
                        try:
                            val = reg.group(1).strip()
                        except Exception:
                            traceback.print_exc()
                    elif m['page'] == '' or m['page'] is None:
                        val = m['expres']
                    if rawPost:
                        val = urllib_parse.quote_plus(val)
                    if 'htmlunescape' in m:
                        # The bare Python 2 'import HTMLParser' broke on
                        # Python 3; html.unescape is the modern equivalent.
                        try:
                            from html import unescape
                        except ImportError:
                            from six.moves.html_parser import HTMLParser
                            unescape = HTMLParser().unescape
                        val = unescape(val)
                    try:
                        url = url.replace("$doregex[" + k + "]", val)
                    except Exception:
                        url = url.replace("$doregex[" + k + "]", six.ensure_text(val))
            else:
                url = url.replace("$doregex[" + k + "]", '')
    if '$epoctime$' in url:
        url = url.replace('$epoctime$', getEpocTime())
    if '$epoctime2$' in url:
        url = url.replace('$epoctime2$', getEpocTime2())
    if '$GUID$' in url:
        import uuid
        url = url.replace('$GUID$', str(uuid.uuid1()).upper())
    if '$get_cookies$' in url:
        url = url.replace('$get_cookies$', getCookiesString(cookieJar))
    if recursiveCall:
        return url
    if url == "":
        return
    else:
        return url, setresolved
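
# --- Usage sketch (illustrative) ------------------------------------------
# A hypothetical regexs structure built from the keys getRegexParsed()
# reads above ('page', 'expres', 'referer', ...). The $doregex[name]
# token in the url is replaced with group(1) of the match; the site and
# pattern here are made up.
def _example_getRegexParsed():
    regexs = {
        'token': {
            'page': 'https://example.com/player',   # page to fetch
            'expres': r'file:\s*"([^"]+)"',         # group(1) fills the token
            'referer': 'https://example.com/',
        }
    }
    result = getRegexParsed(regexs, 'https://cdn.example.com/hls?auth=$doregex[token]')
    if result:
        final_url, setresolved = result
        return final_url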
base_hdrs = {'User-Agent': USER_AGENT,
             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
             'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
             'Accept-Encoding': 'gzip',
             'Accept-Language': 'en-US,en;q=0.8',
             'Connection': 'keep-alive'}
openloadhdr = base_hdrs

progress = xbmcgui.DialogProgress()
dialog = xbmcgui.Dialog()
urlopen = urllib_request.urlopen
cj = http_cookiejar.LWPCookieJar(TRANSLATEPATH(cookiePath))
Request = urllib_request.Request

handlers = [urllib_request.HTTPBasicAuthHandler(),
            urllib_request.HTTPHandler(),
            urllib_request.HTTPSHandler()]

# ssl.create_default_context() first appeared in Python 2.7.9 / 3.4, hence
# the version gate. Replace the default HTTPSHandler rather than appending
# a second one: handlers of equal priority run in insertion order, so the
# verifying handler added first would shadow the unverified-context one.
if (2, 7, 8) < sys.version_info:
    try:
        ssl_context = ssl.create_default_context()
        ssl_context.check_hostname = False
        ssl_context.verify_mode = ssl.CERT_NONE
        handlers[-1] = urllib_request.HTTPSHandler(context=ssl_context)
    except Exception:
        pass


def kodilog(logvar, level=LOGINFO):
    xbmc.log("@@@@Cumination: " + str(logvar), level)
def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=True,
                 cookies=True, replace_headers=False, add_referer=False,
                 only_headers=False, bypass_cloudflare=True, bypass_testcookie=True,
                 no_decode=False, method=None, cache=CACHE_ENABLED,
                 cache_expiration=CACHE_EXPIRATION):
    """
    Download a web page and return the result.

    :type url: str
    :type post: dict, str
    :type headers: dict, list
    :type timeout: int
    :type follow_redirects: bool
    :type cookies: bool, dict
    :type replace_headers: bool
    :type add_referer: bool
    :type only_headers: bool
    :type bypass_cloudflare: bool
    :type cache: bool
    :type cache_expiration: timedelta
    :return: result
    """
    arguments = locals().copy()

    if cache:
        try:
            cache_key = '|'.join(["%s:%s" % (k, v) for k, v in
                                  sorted(arguments.items(), key=lambda x: x[0]) if v]).encode()
            cache_key = CACHE_PREFIX + hashlib.sha1(cache_key).hexdigest()
            cacheado = CACHE.get(cache_key)
            if cacheado:
                return HTTPResponse(cacheado)
        except Exception:
            pass

    response = {}

    # POST passed as a dict
    if type(post) == dict:
        post = urllib_parse.urlencode(post)

    # URL quoting
    url = urllib_parse.quote(url, safe="%/:=&?~#+!$,;'@()*[]")

    # Default headers, unless others are specified
    request_headers = default_headers.copy()

    # Headers passed as parameters
    if headers is not None:
        if not replace_headers:
            request_headers.update(dict(headers))
        else:
            request_headers = dict(headers)

    # Referer
    if add_referer:
        request_headers["Referer"] = "/".join(url.split("/")[:3])

    # Handlers
    handlers = list()
    handlers.append(HTTPHandler(debuglevel=False))
    handlers.append(HTTPSHandler(debuglevel=False))
    handlers.append(urllib_request.HTTPBasicAuthHandler())

    # Redirects
    if not follow_redirects:
        handlers.append(NoRedirectHandler())
    else:
        handlers.append(HTTPRedirectHandler())

    # Dict of cookies for the session
    if type(cookies) == dict:
        for name, value in cookies.items():
            if not type(value) == dict:
                value = {'value': value}
            ck = Cookie(
                version=0, name=name, value=value.get('value', ''), port=None,
                port_specified=False,
                domain=value.get('domain', urllib_parse.urlparse(url)[1]),
                domain_specified=False, domain_initial_dot=False,
                path=value.get('path', '/'), path_specified=True, secure=False,
                expires=value.get('expires', time.time() + 3600 * 24),
                discard=True, comment=None, comment_url=None,
                rest={'HttpOnly': None}, rfc2109=False)
            cj.set_cookie(ck)

    if cookies:
        handlers.append(urllib_request.HTTPCookieProcessor(cj))

    # Opener
    opener = urllib_request.build_opener(*handlers)

    # Timer
    inicio = time.time()

    # Request
    req = Request(url, six.ensure_binary(post) if post else None,
                  request_headers, method=method)

    try:
        handle = opener.open(req, timeout=timeout)
    except HTTPError as handle:
        response["sucess"] = False
        response["code"] = handle.code
        response["error"] = handle.__dict__.get("reason", str(handle))
        response["headers"] = dict(handle.headers.items())
        response['cookies'] = get_cookies(urllib_parse.urlparse(url)[1])
        if not only_headers:
            response["data"] = handle.read()
        else:
            response["data"] = b""
        response["time"] = time.time() - inicio
        response["url"] = handle.geturl()
    except Exception as e:
        response["sucess"] = False
        response["code"] = e.__dict__.get("errno", e.__dict__.get("code", str(e)))
        response["error"] = e.__dict__.get("reason", str(e))
        response["headers"] = {}
        response['cookies'] = get_cookies(urllib_parse.urlparse(url)[1])
        response["data"] = b""
        response["time"] = time.time() - inicio
        response["url"] = url
    else:
        response["sucess"] = True
        response["code"] = handle.code
        response["error"] = None
        response["headers"] = dict(handle.headers.items())
        response['cookies'] = get_cookies(urllib_parse.urlparse(url)[1])
        if not only_headers:
            response["data"] = handle.read()
        else:
            response["data"] = b""
        response["time"] = time.time() - inicio
        response["url"] = handle.geturl()

    response['headers'] = dict([(k.lower(), v) for k, v in response['headers'].items()])

    # Save the cookies
    if cookies:
        save_cookies()

    # Gzip
    if response["headers"].get('content-encoding') == 'gzip':
        response["data"] = gzip.GzipFile(fileobj=BytesIO(response["data"])).read()

    # Binary payloads are neither decoded nor checked for cloudflare, etc.
    if not is_binary(response):
        response['data'] = six.ensure_str(response['data'], errors='replace')
        if not no_decode:
            response["data"] = six.ensure_str(HTMLParser().unescape(
                six.ensure_text(response['data'], errors='replace')))

        # Anti TestCookie (slowAES challenge)
        if bypass_testcookie:
            if 'document.cookie="__test="+toHex(slowAES.decrypt(c,2,a,b))+"' in response['data']:
                # str.decode("HEX") / str.encode("HEX") were Python 2 only;
                # binascii works on both interpreters.
                from binascii import hexlify, unhexlify
                a = unhexlify(re.findall(r'a=toNumbers\("([^"]+)"\)', response['data'])[0])
                b = unhexlify(re.findall(r'b=toNumbers\("([^"]+)"\)', response['data'])[0])
                c = unhexlify(re.findall(r'c=toNumbers\("([^"]+)"\)', response['data'])[0])
                arguments['bypass_testcookie'] = False
                test_value = six.ensure_str(hexlify(ii11.new(a, ii11.MODE_CBC, b).decrypt(c)))
                if not type(arguments['cookies']) == dict:
                    arguments['cookies'] = {'__test': test_value}
                else:
                    arguments['cookies']['__test'] = test_value
                response = downloadpage(**arguments).__dict__

        # Anti Cloudflare
        if bypass_cloudflare:
            response = retry_if_cloudflare(response, arguments)

    if cache:
        CACHE.set(cache_key, response, expiration=cache_expiration)

    return HTTPResponse(response)
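
# --- Usage sketch (illustrative) ------------------------------------------
# A hypothetical call to downloadpage(); the URL and headers are made up.
# The returned HTTPResponse exposes the result dict as attributes,
# including the 'sucess' flag (spelled that way throughout this module).
def _example_downloadpage():
    response = downloadpage('https://example.com/api',
                            post={'q': 'test'},   # dicts are urlencoded
                            headers={'Referer': 'https://example.com/'},
                            timeout=15,
                            follow_redirects=True)
    if response.sucess:
        return response.code, len(response.data)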