def _update_opener(self, drop_tls_level=False):
    """
    Builds and installs a new opener to be used by all future calls to
    :func:`urllib2.urlopen`.

    :param drop_tls_level: when True, cap the negotiated TLS version at 1.1
        as a compatibility fallback for servers that fail the default
        handshake.
    """
    # Handlers shared by every configuration: cookie persistence, basic
    # auth, and plain HTTP (with optional wire-level debug output).
    handlers = [
        urllib_request.HTTPCookieProcessor(self._cj),
        urllib_request.HTTPBasicAuthHandler()
    ]
    if self._http_debug:
        handlers += [urllib_request.HTTPHandler(debuglevel=1)]
    else:
        handlers += [urllib_request.HTTPHandler()]

    if self._proxy:
        # NOTE(review): only plain-HTTP requests are proxied here; other
        # openers in this file also proxy 'https' — confirm intent.
        handlers += [urllib_request.ProxyHandler({'http': self._proxy})]

    try:
        import platform
        node = platform.node().lower()
    except Exception:
        node = ''

    if not self._ssl_verify or node == 'xboxone':
        # Certificate verification disabled (or running on an Xbox One):
        # accept any certificate and skip hostname checks.
        try:
            import ssl
            ctx = ssl.create_default_context()
            ctx.check_hostname = False
            ctx.verify_mode = ssl.CERT_NONE
            if self._http_debug:
                handlers += [
                    urllib_request.HTTPSHandler(context=ctx, debuglevel=1)
                ]
            else:
                handlers += [urllib_request.HTTPSHandler(context=ctx)]
        except Exception:
            # ssl unavailable: fall back to the default HTTPS handler.
            pass
    else:
        try:
            import ssl
            import certifi
            ctx = ssl.create_default_context(cafile=certifi.where())
            if drop_tls_level:
                # BUGFIX: the old code assigned to ``ctx.protocol``, which is
                # a read-only attribute — the assignment raised and the bare
                # ``except`` silently skipped installing the HTTPS handler
                # whenever a TLS downgrade was requested.  Capping
                # ``maximum_version`` achieves the intended downgrade.
                ctx.maximum_version = ssl.TLSVersion.TLSv1_1
            if self._http_debug:
                handlers += [
                    urllib_request.HTTPSHandler(context=ctx, debuglevel=1)
                ]
            else:
                handlers += [urllib_request.HTTPSHandler(context=ctx)]
        except Exception:
            # ssl or certifi unavailable: fall back to the default handler.
            pass

    opener = urllib_request.build_opener(*handlers)
    urllib_request.install_opener(opener)
def urlopen(request):
    """
    Open *request* with a 60-second timeout, lazily installing a global
    opener (UCR-configured proxy plus HTTPS handler) on the first call.

    :param request: URL string or :class:`urllib.request.Request`.
    :returns: the response object from ``urllib_request.urlopen``.
    """
    # BUGFIX: ``getattr`` guards the very first call — if the function
    # attribute was never initialised at module level, plain attribute
    # access would raise AttributeError instead of installing the opener.
    if not getattr(urlopen, '_opener_installed', False):
        handler = []
        proxy_http = ucr_get('proxy/http')
        if proxy_http:
            # Route both schemes through the configured HTTP proxy.
            handler.append(urllib_request.ProxyHandler({'http': proxy_http, 'https': proxy_http}))
        handler.append(HTTPSHandler())
        opener = urllib_request.build_opener(*handler)
        urllib_request.install_opener(opener)
        urlopen._opener_installed = True
    return urllib_request.urlopen(request, timeout=60)
def install_opener(ucr):
    """
    Build and install a global urllib opener.

    Honours the ``proxy/http`` UCR setting (applied to both ``http`` and
    ``https`` schemes) and always adds an ``HTTPSHandler``.
    """
    proxy_http = ucr.get('proxy/http')
    handlers = []
    if proxy_http:
        proxies = {'http': proxy_http, 'https': proxy_http}
        handlers.append(urllib_request.ProxyHandler(proxies))
    handlers.append(HTTPSHandler())
    urllib_request.install_opener(urllib_request.build_opener(*handlers))
def _update_opener(self):
    '''
    Builds and installs a new opener to be used by all future calls to
    :func:`urllib2.urlopen`.
    '''
    # Enable wire-level debug output on the HTTP handler when requested.
    debuglevel = 1 if self._http_debug else 0
    handlers = [urllib_request.HTTPCookieProcessor(self._cj)]
    if self._proxy:
        handlers.append(urllib_request.ProxyHandler({'http': self._proxy}))
    handlers.append(urllib_request.HTTPBasicAuthHandler())
    handlers.append(urllib_request.HTTPHandler(debuglevel=debuglevel))
    urllib_request.install_opener(urllib_request.build_opener(*handlers))
def getUrl(url, proxy=None, timeout=TIMEOUT, cookies=True):
    """
    Fetch *url* and return the raw response body, or ``''`` on any error.

    Side effect: updates the module-level ``cs`` string with the cookies
    collected during the request (``name=value;`` pairs).

    :param url: URL to fetch.
    :param proxy: optional proxy mapping for ``ProxyHandler``.
        BUGFIX: the default used to be a mutable ``{}`` shared across all
        calls; ``None`` is equivalent for the ``if proxy:`` check below and
        is safe.
    :param timeout: socket timeout in seconds.
    :param cookies: when True (and no proxy is given), collect cookies into
        a fresh ``LWPCookieJar``.
    """
    global cs
    cookie = []
    if proxy:
        urllib_request.install_opener(
            urllib_request.build_opener(urllib_request.ProxyHandler(proxy)))
    elif cookies:
        cookie = http_cookiejar.LWPCookieJar()
        opener = urllib_request.build_opener(
            urllib_request.HTTPCookieProcessor(cookie))
        urllib_request.install_opener(opener)
    req = urllib_request.Request(url)
    req.add_header(
        'User-Agent',
        'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36'
    )
    try:
        response = urllib_request.urlopen(req, timeout=timeout)
        linkSRC = response.read()
        response.close()
    except Exception:
        # Best-effort fetch: any failure yields an empty body.  (Narrowed
        # from a bare ``except:`` so KeyboardInterrupt/SystemExit propagate.)
        linkSRC = ''
    cs = ''.join(['%s=%s;' % (c.name, c.value) for c in cookie])
    return linkSRC
def getRegexParsed(
        regexs,
        url,
        cookieJar=None,
        forCookieJarOnly=False,
        recursiveCall=False,
        cachedPages={},
        rawPost=False,
        cookie_jar_file=None):
    """
    Expand every ``$doregex[key]`` placeholder in *url* using the rule
    dictionaries in *regexs*.

    For each key, the matching rule ``m = regexs[key]`` may fetch a page
    (``m['page']``), apply a regex or a ``$pyFunction:`` expression
    (``m['expres']``) to it, and substitute the result back into *url*.
    Rules are expanded recursively (page, post, cookies and expression may
    themselves contain ``$doregex[...]``).  Fetched pages are cached in
    *cachedPages*, and cookies are managed through *cookieJar*.

    :param regexs: dict mapping placeholder keys to rule dicts.  A rule dict
        is mutated in place as its fields are expanded.
    :param url: string containing zero or more ``$doregex[...]`` markers.
    :param cookieJar: cookie jar reused/created via ``getCookieJar``.
    :param forCookieJarOnly: when True, return the cookie jar as soon as the
        page fetch (or page-level pyFunction) has run.
    :param recursiveCall: True for internal re-entrant calls; changes the
        return value to the bare expanded string.
    :param cachedPages: page-body cache keyed by ``m['page']``.
        NOTE(review): mutable default — shared across calls; this appears
        intentional (process-wide cache) but confirm.
    :param rawPost: when True, quote the extracted value with
        ``urllib_parse.quote_plus`` before substitution.
    :returns: ``url`` when *recursiveCall*; otherwise ``(url, setresolved)``,
        or ``None`` when the final url is empty; special ``listrepeat``
        rules return a ``(listrepeat, matches, m, regexs[, cookieJar])``
        tuple instead.
    """
    #0,1,2 = URL, regexOnly, CookieJarOnly
    #cachedPages = {}
    #print 'url',url
    # Collect the placeholder keys present in the url.
    doRegexs = re.compile('\$doregex\[([^\]]*)\]').findall(url)
    # print 'doRegexs',doRegexs,regexs
    setresolved = True
    for k in doRegexs:
        if k in regexs:
            #print 'processing ' ,k
            m = regexs[k]
            #print m
            # --- cookie jar setup -----------------------------------------
            cookieJarParam = False
            if 'cookiejar' in m:  # so either create or reuse existing jar
                #print 'cookiejar exists',m['cookiejar']
                cookieJarParam = m['cookiejar']
                if '$doregex' in cookieJarParam:
                    # The cookiejar spec itself needs expansion first.
                    cookieJar = getRegexParsed(regexs, m['cookiejar'],
                                               cookieJar, True, True,
                                               cachedPages)
                    cookieJarParam = True
                else:
                    cookieJarParam = True
            #print 'm[cookiejar]',m['cookiejar'],cookieJar
            if cookieJarParam:
                if cookieJar == None:
                    #print 'create cookie jar'
                    cookie_jar_file = None
                    # 'open[name]' loads a previously saved jar by name.
                    if 'open[' in m['cookiejar']:
                        cookie_jar_file = m['cookiejar'].split(
                            'open[')[1].split(']')[0]
                        # print 'cookieJar from file name',cookie_jar_file
                    cookieJar = getCookieJar(cookie_jar_file)
                    # print 'cookieJar from file',cookieJar
                    if cookie_jar_file:
                        saveCookieJar(cookieJar, cookie_jar_file)
                    #cookieJar = http_cookiejar.LWPCookieJar()
                    #print 'cookieJar new',cookieJar
                elif 'save[' in m['cookiejar']:
                    # 'save[name]' persists the current jar under that name.
                    cookie_jar_file = m['cookiejar'].split('save[')[1].split(
                        ']')[0]
                    complete_path = os.path.join(profile, cookie_jar_file)
                    # print 'complete_path',complete_path
                    saveCookieJar(cookieJar, cookie_jar_file)

            # --- recursive expansion of the rule's own fields -------------
            if m['page'] and '$doregex' in m['page']:
                pg = getRegexParsed(regexs, m['page'], cookieJar,
                                    recursiveCall=True,
                                    cachedPages=cachedPages)
                if len(pg) == 0:
                    # Sentinel so a failed expansion still yields a fetchable
                    # (if bogus) http url downstream.
                    pg = 'http://regexfailed'
                m['page'] = pg

            if 'setcookie' in m and m['setcookie'] and '$doregex' in m[
                    'setcookie']:
                m['setcookie'] = getRegexParsed(regexs, m['setcookie'],
                                                cookieJar,
                                                recursiveCall=True,
                                                cachedPages=cachedPages)
            if 'appendcookie' in m and m['appendcookie'] and '$doregex' in m[
                    'appendcookie']:
                m['appendcookie'] = getRegexParsed(regexs,
                                                   m['appendcookie'],
                                                   cookieJar,
                                                   recursiveCall=True,
                                                   cachedPages=cachedPages)
            if 'post' in m and '$doregex' in m['post']:
                m['post'] = getRegexParsed(regexs, m['post'], cookieJar,
                                           recursiveCall=True,
                                           cachedPages=cachedPages)
                # print 'post is now',m['post']
            if 'rawpost' in m and '$doregex' in m['rawpost']:
                m['rawpost'] = getRegexParsed(regexs, m['rawpost'],
                                              cookieJar,
                                              recursiveCall=True,
                                              cachedPages=cachedPages,
                                              rawPost=True)
                #print 'rawpost is now',m['rawpost']
            if 'rawpost' in m and '$epoctime$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime$',
                                                    getEpocTime())
            if 'rawpost' in m and '$epoctime2$' in m['rawpost']:
                m['rawpost'] = m['rawpost'].replace('$epoctime2$',
                                                    getEpocTime2())

            # --- obtain the page body (cache, http fetch, or pyFunction) --
            link = ''
            if m['page'] and m[
                    'page'] in cachedPages and not 'ignorecache' in m and forCookieJarOnly == False:
                #print 'using cache page',m['page']
                link = cachedPages[m['page']]
            else:
                if m['page'] and not m['page'] == '' and m['page'].startswith(
                        'http'):
                    if '$epoctime$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime$',
                                                      getEpocTime())
                    if '$epoctime2$' in m['page']:
                        m['page'] = m['page'].replace('$epoctime2$',
                                                      getEpocTime2())
                    #print 'Ingoring Cache',m['page']
                    # 'url|header=value&header=value' syntax: split extra
                    # request headers off the page url.
                    page_split = m['page'].split('|')
                    pageUrl = page_split[0]
                    header_in_page = None
                    if len(page_split) > 1:
                        header_in_page = page_split[1]
                    # if
                    # proxy = urllib_request.ProxyHandler({ ('https' ? proxytouse[:5]=="https":"http") : proxytouse})
                    # opener = urllib_request.build_opener(proxy)
                    # urllib_request.install_opener(opener)
                    # print 'urllib_request.getproxies',urllib_request.getproxies()
                    # Remember the ambient proxies so they can be restored
                    # after a per-rule proxy override (see below).
                    current_proxies = urllib_request.ProxyHandler(
                        urllib_request.getproxies())
                    #print 'getting pageUrl',pageUrl
                    req = urllib_request.Request(pageUrl)
                    if 'proxy' in m:
                        proxytouse = m['proxy']
                        # print 'proxytouse',proxytouse
                        # urllib_request.getproxies= lambda: {}
                        if pageUrl[:5] == "https":
                            proxy = urllib_request.ProxyHandler(
                                {'https': proxytouse})
                            #req.set_proxy(proxytouse, 'https')
                        else:
                            proxy = urllib_request.ProxyHandler(
                                {'http': proxytouse})
                            #req.set_proxy(proxytouse, 'http')
                        # NOTE(review): installs a global opener — affects
                        # every other urlopen in the process until restored.
                        opener = urllib_request.build_opener(proxy)
                        urllib_request.install_opener(opener)
                    req.add_header(
                        'User-Agent',
                        'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1'
                    )
                    proxytouse = None

                    # Optional per-rule request headers.
                    if 'referer' in m:
                        req.add_header('Referer', m['referer'])
                    if 'accept' in m:
                        req.add_header('Accept', m['accept'])
                    if 'agent' in m:
                        req.add_header('User-agent', m['agent'])
                    if 'x-req' in m:
                        req.add_header('X-Requested-With', m['x-req'])
                    if 'x-addr' in m:
                        req.add_header('x-addr', m['x-addr'])
                    if 'x-forward' in m:
                        req.add_header('X-Forwarded-For', m['x-forward'])
                    if 'setcookie' in m:
                        # print 'adding cookie',m['setcookie']
                        req.add_header('Cookie', m['setcookie'])
                    if 'appendcookie' in m:
                        # print 'appending cookie to cookiejar',m['appendcookie']
                        # Format: 'domain:name=value;domain:name=value'.
                        cookiestoApend = m['appendcookie']
                        cookiestoApend = cookiestoApend.split(';')
                        for h in cookiestoApend:
                            n, v = h.split('=')
                            w, n = n.split(':')
                            ck = http_cookiejar.Cookie(
                                version=0,
                                name=n,
                                value=v,
                                port=None,
                                port_specified=False,
                                domain=w,
                                domain_specified=False,
                                domain_initial_dot=False,
                                path='/',
                                path_specified=True,
                                secure=False,
                                expires=None,
                                discard=True,
                                comment=None,
                                comment_url=None,
                                rest={'HttpOnly': None},
                                rfc2109=False)
                            cookieJar.set_cookie(ck)
                    if 'origin' in m:
                        req.add_header('Origin', m['origin'])
                    if header_in_page:
                        # Extra headers from the 'url|h=v&h=v' suffix.
                        header_in_page = header_in_page.split('&')
                        for h in header_in_page:
                            n, v = h.split('=')
                            req.add_header(n, v)

                    # --- opener selection (cookies / redirects / keepalive)
                    if not cookieJar == None:
                        # print 'cookieJarVal',cookieJar
                        cookie_handler = urllib_request.HTTPCookieProcessor(
                            cookieJar)
                        opener = urllib_request.build_opener(
                            cookie_handler,
                            urllib_request.HTTPBasicAuthHandler(),
                            urllib_request.HTTPHandler())
                        # NOTE(review): install_opener returns None, so
                        # ``opener`` is clobbered here — looks unintended but
                        # is harmless since it is rebuilt before reuse.
                        opener = urllib_request.install_opener(opener)
                        # print 'noredirect','noredirect' in m
                        if 'noredirect' in m:
                            opener = urllib_request.build_opener(
                                cookie_handler, NoRedirection,
                                urllib_request.HTTPBasicAuthHandler(),
                                urllib_request.HTTPHandler())
                            opener = urllib_request.install_opener(opener)
                    elif 'noredirect' in m:
                        opener = urllib_request.build_opener(
                            NoRedirection,
                            urllib_request.HTTPBasicAuthHandler(),
                            urllib_request.HTTPHandler())
                        opener = urllib_request.install_opener(opener)
                    if 'connection' in m:
                        # print '..........................connection//////.',m['connection']
                        from keepalive import HTTPHandler
                        keepalive_handler = HTTPHandler()
                        opener = urllib_request.build_opener(
                            keepalive_handler)
                        urllib_request.install_opener(opener)
                    #print 'after cookie jar'

                    # --- POST body construction ----------------------------
                    post = None
                    if 'post' in m:
                        postData = m['post']
                        #if '$LiveStreamRecaptcha' in postData:
                        #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                        #    if captcha_challenge:
                        #        postData=postData.replace('$LiveStreamRecaptcha','manual_recaptcha_challenge_field:'+captcha_challenge+',recaptcha_response_field:'+catpcha_word+',id:'+idfield)
                        # 'name:value,name:value' → urlencoded form body.
                        splitpost = postData.split(',')
                        post = {}
                        for p in splitpost:
                            n = p.split(':')[0]
                            v = p.split(':')[1]
                            post[n] = v
                        post = urllib_parse.urlencode(post)
                    if 'rawpost' in m:
                        # rawpost overrides 'post' and is sent verbatim.
                        post = m['rawpost']
                        #if '$LiveStreamRecaptcha' in post:
                        #    (captcha_challenge,catpcha_word,idfield)=processRecaptcha(m['page'],cookieJar)
                        #    if captcha_challenge:
                        #        post=post.replace('$LiveStreamRecaptcha','&manual_recaptcha_challenge_field='+captcha_challenge+'&recaptcha_response_field='+catpcha_word+'&id='+idfield)

                    # --- perform the request -------------------------------
                    link = ''
                    try:
                        if post:
                            response = urllib_request.urlopen(req, post)
                        else:
                            response = urllib_request.urlopen(req)
                        if response.info().get('Content-Encoding') == 'gzip':
                            import gzip
                            buf = six.BytesIO(response.read())
                            f = gzip.GzipFile(fileobj=buf)
                            link = f.read()
                        else:
                            link = response.read()
                        if 'proxy' in m and not current_proxies is None:
                            # Restore the proxies that were active before the
                            # per-rule override.
                            urllib_request.install_opener(
                                urllib_request.build_opener(current_proxies))
                        link = javascriptUnEscape(link)
                        #print repr(link)
                        #print link This just print whole webpage in LOG
                        if 'includeheaders' in m:
                            #link+=str(response.headers.get('Set-Cookie'))
                            link += '$$HEADERS_START$$:'
                            for b in response.headers:
                                link += b + ':' + response.headers.get(
                                    b) + '\n'
                            link += '$$HEADERS_END$$:'
                        # print link
                        response.close()
                    except:
                        # NOTE(review): bare except — any fetch error leaves
                        # ``link`` as '' and is silently cached below.
                        pass
                    cachedPages[m['page']] = link
                    #print link
                    #print 'store link for',m['page'],forCookieJarOnly
                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                elif m['page'] and not m['page'].startswith('http'):
                    # Non-http 'page': either a pyFunction producing the body
                    # or a literal string used as the body.
                    if m['page'].startswith('$pyFunction:'):
                        val = doEval(m['page'].split('$pyFunction:')[1], '',
                                     cookieJar, m)
                        if forCookieJarOnly:
                            return cookieJar  # do nothing
                        link = val
                        link = javascriptUnEscape(link)
                    else:
                        link = m['page']

            # --- apply the expression and substitute into the url ----------
            if '$doregex' in m['expres']:
                m['expres'] = getRegexParsed(regexs, m['expres'], cookieJar,
                                             recursiveCall=True,
                                             cachedPages=cachedPages)
            if not m['expres'] == '':
                #print 'doing it ',m['expres']
                if '$LiveStreamCaptcha' in m['expres']:
                    val = askCaptcha(m, link, cookieJar)
                    #print 'url and val',url,val
                    url = url.replace("$doregex[" + k + "]", val)
                elif m['expres'].startswith(
                        '$pyFunction:') or '#$pyFunction' in m['expres']:
                    #print 'expeeeeeeeeeeeeeeeeeee',m['expres']
                    val = ''
                    if m['expres'].startswith('$pyFunction:'):
                        val = doEval(m['expres'].split('$pyFunction:')[1],
                                     link, cookieJar, m)
                    else:
                        val = doEvalFunction(m['expres'], link, cookieJar, m)
                    if 'ActivateWindow' in m['expres']:
                        return
                    if forCookieJarOnly:
                        return cookieJar  # do nothing
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        # NOTE(review): eval() on the pyFunction result —
                        # executes addon-supplied data; inherent to this
                        # design but worth flagging.
                        return listrepeat, eval(val), m, regexs, cookieJar
                    try:
                        url = url.replace(u"$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]",
                                          six.ensure_text(val))
                else:
                    if 'listrepeat' in m:
                        listrepeat = m['listrepeat']
                        ret = re.findall(m['expres'], link)
                        return listrepeat, ret, m, regexs
                    val = ''
                    if not link == '':
                        #print 'link',link
                        # First capture group of the rule's regex, stripped.
                        reg = re.compile(m['expres']).search(link)
                        try:
                            val = reg.group(1).strip()
                        except:
                            traceback.print_exc()
                    elif m['page'] == '' or m['page'] == None:
                        # No page at all: the expression IS the value.
                        val = m['expres']
                    if rawPost:
                        # print 'rawpost'
                        val = urllib_parse.quote_plus(val)
                    if 'htmlunescape' in m:
                        #val=urllib_parse.unquote_plus(val)
                        # NOTE(review): Python-2-only module; on Python 3
                        # this import fails — confirm six/html.parser shim.
                        import HTMLParser
                        val = HTMLParser.HTMLParser().unescape(val)
                    try:
                        url = url.replace("$doregex[" + k + "]", val)
                    except:
                        url = url.replace("$doregex[" + k + "]",
                                          six.ensure_text(val))
                    #print 'ur',url
                    #return val
            else:
                # Empty expression: drop the placeholder.
                url = url.replace("$doregex[" + k + "]", '')

    # --- global token substitutions on the fully expanded url --------------
    if '$epoctime$' in url:
        url = url.replace('$epoctime$', getEpocTime())
    if '$epoctime2$' in url:
        url = url.replace('$epoctime2$', getEpocTime2())
    if '$GUID$' in url:
        import uuid
        url = url.replace('$GUID$', str(uuid.uuid1()).upper())
    if '$get_cookies$' in url:
        url = url.replace('$get_cookies$', getCookiesString(cookieJar))
    if recursiveCall:
        return url
    #print 'final url',repr(url)
    if url == "":
        return
    else:
        return url, setresolved