def __init__(self, url):
    self._url = url
    self._opener = urllib2.build_opener(
        CustomHTTPErrorHandler(),
        urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    self._xsrf_token = None
#!/usr/bin/env python
import urllib
import urllib2
import cookielib

filename = 'login.txt'
cookie = cookielib.MozillaCookieJar()
cookie.load(filename, ignore_discard=True, ignore_expires=True)
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
postdata = urllib.urlencode({
    'user': '******',
    'pwd': '123456'
})
loginUrl = "http://192.168.63.241:8000/cookie.php"
result = opener.open(loginUrl, postdata)
print result.read()
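# A minimal sketch (reusing the URL and placeholder credentials from the
# snippet above, not verified) of how a Mozilla-format cookie file like
# 'login.txt' could be produced in the first place: log in once with an empty
# MozillaCookieJar attached, then persist the cookies for later runs.
import urllib
import urllib2
import cookielib

cookie = cookielib.MozillaCookieJar('login.txt')
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
postdata = urllib.urlencode({'user': '******', 'pwd': '123456'})
opener.open("http://192.168.63.241:8000/cookie.php", postdata)
cookie.save(ignore_discard=True, ignore_expires=True)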
    # Class-body excerpt: the surrounding HTTPMyDebugProcessor class header and
    # its http_request method (aliased below) are not shown in this fragment.
    def http_response(self, request, response):
        if __debug__:
            code, msg, hdrs = response.code, response.msg, response.info()
            self.httpout.write("HTTP/1.x %s %s\n" % (code, msg))
            self.httpout.write(str(hdrs))
        return response

    https_request = http_request
    https_response = http_response


# Example
cjar = cookielib.LWPCookieJar()
opener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cjar),
    HTTPMyDebugProcessor(),
)
#opener = urllib2.build_opener(HTTPMyDebugProcessor(),)
urllib2.install_opener(opener)
##response = urllib2.urlopen("http://www.google.com")
#response = urllib2.urlopen("https://www.idcourts.us/repository/start.do")
#response = urllib2.urlopen("https://www.idcourts.us/repository/searchParty.do")
req = urllib2.Request(
    'http://www.microsoft.com/windows/windows-7/default.aspx')
#req = urllib2.Request('https://www.idcourts.us/repository/start.do')
res = opener.open(req)
print cjar
for c in cjar:
    cookie_str = "%s=%s" % (c.name, c.value)
def getRegexParsed( regexs, url, cookieJar=None, forCookieJarOnly=False, recursiveCall=False): #0,1,2 = URL, regexOnly, CookieJarOnly cachedPages = {} #print 'url',url doRegexs = re.compile('\$doregex\[([^\]]*)\]').findall(url) print 'doRegexs', doRegexs, regexs for rege in doRegexs: k = regexs.find("regex", {"name": rege}) if not k == None: cookieJarParam = False if k.cookiejar: cookieJarParam = k.cookiejar.text if '$doregex' in cookieJarParam: cookieJar = getRegexParsed(regexs, cookieJarParam, cookieJar, True, True) cookieJarParam = True else: cookieJarParam = True if cookieJarParam: if cookieJar == None: #print 'create cookie jar' import cookielib cookieJar = cookielib.LWPCookieJar() #print 'cookieJar new',cookieJar page = k.page.text if '$doregex' in page: page = getRegexParsed(regexs, page, cookieJar, recursiveCall=True) postInput = None if k.post: postInput = k.post.text if '$doregex' in postInput: postInput = getRegexParsed(regexs, postInput, cookieJar, recursiveCall=True) print 'post is now', postInput if page in cachedPages: link = cachedPages[page] else: #print 'Ingoring Cache',m['page'] req = urllib2.Request(page) req.add_header( 'User-Agent', 'Mozilla/5.0 (Windows NT 6.1; rv:14.0) Gecko/20100101 Firefox/14.0.1' ) if k.refer: req.add_header('Referer', k.refer.text) if k.agent: req.add_header('User-agent', k.agent.text) if not cookieJar == None: #print 'cookieJarVal',cookieJar cookie_handler = urllib2.HTTPCookieProcessor(cookieJar) opener = urllib2.build_opener( cookie_handler, urllib2.HTTPBasicAuthHandler(), urllib2.HTTPHandler()) opener = urllib2.install_opener(opener) #print 'after cookie jar' post = None if postInput: postData = postInput splitpost = postData.split(',') post = {} for p in splitpost: n = p.split(':')[0] v = p.split(':')[1] post[n] = v post = urllib.urlencode(post) if post: response = urllib2.urlopen(req, post) else: response = urllib2.urlopen(req) link = response.read() response.close() cachedPages[page] = link if forCookieJarOnly: return cookieJar # do nothing print 'link', link print k.expres.text reg = re.compile(k.expres.text).search(link) url = url.replace("$doregex[" + rege + "]", reg.group(1).strip()) if recursiveCall: return url print 'final url', url return url
# -*- coding=utf-8 -*-
import urllib
import urllib2
import cookielib

# Build a CookieJar() object to hold the cookie values.
cookie = cookielib.CookieJar()
# Build a handler with HTTPCookieProcessor() to manage the cookies;
# its argument is the CookieJar() created above.
cookie_handler = urllib2.HTTPCookieProcessor(cookie)
# Build a custom opener.
opener = urllib2.build_opener(cookie_handler)
# Set the opener's addheaders parameter to add HTTP request headers.
opener.addheaders = [(
    "User-Agent",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
)]
# Renren login endpoint.
url = "http://www.renren.com/PLogin.do"
# Account and password used to log in.
data = {"email": "*****@*****.**", "password": "******"}
# Encode the form with urlencode().
data = urllib.urlencode(data)
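# Hedged continuation of the snippet above: send the urlencoded form through
# the custom opener. Whether the endpoint still accepts this form is not
# verified here; the request/response handling is only a sketch.
request = urllib2.Request(url, data)
response = opener.open(request)
# The CookieJar now holds the session cookies, so later requests made with
# this opener are sent as the logged-in user.
print response.read()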
# Excerpt from a larger main(): the enclosing try block that reads the batch
# file (and the test selecting stdin vs. a file) begins just above this fragment.
                batchfd = sys.stdin
            else:
                batchfd = open(opts.batchfile, 'r')
            batchurls = batchfd.readlines()
            batchurls = [x.strip() for x in batchurls]
            batchurls = [
                x for x in batchurls
                if len(x) > 0 and not re.search(r'^[#/;]', x)
            ]
        except IOError:
            sys.exit(u'ERROR: batch file could not be read')
    all_urls = batchurls + args
    all_urls = map(lambda url: url.strip(), all_urls)

    # General configuration
    cookie_processor = urllib2.HTTPCookieProcessor(jar)
    proxy_handler = urllib2.ProxyHandler()
    opener = urllib2.build_opener(proxy_handler, cookie_processor,
                                  YoutubeDLHandler())
    urllib2.install_opener(opener)
    socket.setdefaulttimeout(
        300)  # 5 minutes should be enough (famous last words)

    extractors = gen_extractors()

    if opts.list_extractors:
        for ie in extractors:
            print(ie.IE_NAME)
            matchedUrls = filter(lambda url: ie.suitable(url), all_urls)
            all_urls = filter(lambda url: url not in matchedUrls, all_urls)
            for mu in matchedUrls:
def opener():
    return urllib2.build_opener(urllib2.HTTPCookieProcessor())
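# Possible usage of the opener() helper above: each call returns a fresh
# opener with its own empty in-memory cookie jar, so sessions do not leak
# between callers. The URL is a placeholder.
response = opener().open('http://example.com/')
print response.getcode()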
def __init__(self, host, port, io_loop_executor=None):
    self.host = host
    self.port = port
    self.io_loop_executor = io_loop_executor
    self.cookie_jar = cookielib.CookieJar()
    self.opener = urllib2.build_opener(
        urllib2.HTTPCookieProcessor(self.cookie_jar),
        SilentHTTPErrorProcessor())
def request(url, close=True, redirect=True, error=False, proxy=None, post=None, headers=None, mobile=False, XHR=False, limit=None, referer=None, cookie=None, compression=True, output='', timeout='30'): try: if not url: return handlers = [] if not proxy == None: handlers += [ urllib2.ProxyHandler({'http': '%s' % (proxy)}), urllib2.HTTPHandler ] opener = urllib2.build_opener(*handlers) opener = urllib2.install_opener(opener) if output == 'cookie' or output == 'extended' or not close == True: cookies = cookielib.LWPCookieJar() handlers += [ urllib2.HTTPHandler(), urllib2.HTTPSHandler(), urllib2.HTTPCookieProcessor(cookies) ] opener = urllib2.build_opener(*handlers) opener = urllib2.install_opener(opener) if (2, 7, 8) < sys.version_info < (2, 7, 12): try: import ssl ssl_context = ssl.create_default_context() ssl_context.check_hostname = False ssl_context.verify_mode = ssl.CERT_NONE handlers += [urllib2.HTTPSHandler(context=ssl_context)] opener = urllib2.build_opener(*handlers) opener = urllib2.install_opener(opener) except: pass if url.startswith('//'): url = 'http:' + url _headers = {} try: _headers.update(headers) except: pass if 'User-Agent' in _headers: pass elif not mobile == True: #headers['User-Agent'] = agent() _headers['User-Agent'] = cache.get(randomagent, 1) else: _headers['User-Agent'] = 'Apple-iPhone/701.341' if 'Referer' in _headers: pass elif referer is not None: _headers['Referer'] = referer if not 'Accept-Language' in _headers: _headers['Accept-Language'] = 'en-US' if 'X-Requested-With' in _headers: pass elif XHR == True: _headers['X-Requested-With'] = 'XMLHttpRequest' if 'Cookie' in _headers: pass elif not cookie == None: _headers['Cookie'] = cookie if 'Accept-Encoding' in _headers: pass elif compression and limit is None: _headers['Accept-Encoding'] = 'gzip' if redirect == False: class NoRedirection(urllib2.HTTPErrorProcessor): def http_response(self, request, response): return response opener = urllib2.build_opener(NoRedirection) opener = urllib2.install_opener(opener) try: del _headers['Referer'] except: pass if isinstance(post, dict): post = utils.byteify(post) post = urllib.urlencode(post) url = utils.byteify(url) request = urllib2.Request(url, data=post) _add_request_header(request, _headers) try: response = urllib2.urlopen(request, timeout=int(timeout)) except urllib2.HTTPError as response: if response.code == 503: cf_result = response.read(5242880) try: encoding = response.info().getheader('Content-Encoding') except: encoding = None if encoding == 'gzip': cf_result = gzip.GzipFile( fileobj=StringIO.StringIO(cf_result)).read() if 'cf-browser-verification' in cf_result: netloc = '%s://%s' % (urlparse.urlparse(url).scheme, urlparse.urlparse(url).netloc) if not netloc.endswith('/'): netloc += '/' ua = _headers['User-Agent'] cf = cache.get(cfcookie().get, 168, netloc, ua, timeout) _headers['Cookie'] = cf request = urllib2.Request(url, data=post) _add_request_header(request, _headers) response = urllib2.urlopen(request, timeout=int(timeout)) else: log_utils.log( 'Request-Error (%s): %s' % (str(response.code), url), log_utils.LOGDEBUG) if error == False: return else: log_utils.log( 'Request-Error (%s): %s' % (str(response.code), url), log_utils.LOGDEBUG) if error == False: return if output == 'cookie': try: result = '; '.join( ['%s=%s' % (i.name, i.value) for i in cookies]) except: pass try: result = cf except: pass if close == True: response.close() return result elif output == 'geturl': result = response.geturl() if close == True: response.close() return result elif output 
== 'headers': result = response.headers if close == True: response.close() return result elif output == 'chunk': try: content = int(response.headers['Content-Length']) except: content = (2049 * 1024) if content < (2048 * 1024): return result = response.read(16 * 1024) if close == True: response.close() return result if limit == '0': result = response.read(224 * 1024) elif not limit == None: result = response.read(int(limit) * 1024) else: result = response.read(5242880) try: encoding = response.info().getheader('Content-Encoding') except: encoding = None if encoding == 'gzip': result = gzip.GzipFile(fileobj=StringIO.StringIO(result)).read() if 'sucuri_cloudproxy_js' in result: su = sucuri().get(result) _headers['Cookie'] = su request = urllib2.Request(url, data=post) _add_request_header(request, _headers) response = urllib2.urlopen(request, timeout=int(timeout)) if limit == '0': result = response.read(224 * 1024) elif not limit == None: result = response.read(int(limit) * 1024) else: result = response.read(5242880) try: encoding = response.info().getheader('Content-Encoding') except: encoding = None if encoding == 'gzip': result = gzip.GzipFile( fileobj=StringIO.StringIO(result)).read() if 'Blazingfast.io' in result and 'xhr.open' in result: netloc = '%s://%s' % (urlparse.urlparse(url).scheme, urlparse.urlparse(url).netloc) ua = _headers['User-Agent'] _headers['Cookie'] = cache.get(bfcookie().get, 168, netloc, ua, timeout) result = _basic_request(url, headers=_headers, post=post, timeout=timeout, limit=limit) if output == 'extended': try: response_headers = dict([(item[0].title(), item[1]) for item in response.info().items()]) except: response_headers = response.headers response_code = str(response.code) try: cookie = '; '.join( ['%s=%s' % (i.name, i.value) for i in cookies]) except: pass try: cookie = cf except: pass if close == True: response.close() return (result, response_code, response_headers, _headers, cookie) else: if close == True: response.close() return result except Exception as e: log_utils.log('Request-Error: (%s) => %s' % (str(e), url), log_utils.LOGDEBUG) return
def set_cookiejar(self, cj):
    self.cookiejar = cj
    saveheaders = self.opener.addheaders
    self.opener = u2.build_opener(
        u2.HTTPCookieProcessor(self.cookiejar), GZipProcessor())
    self.opener.addheaders = saveheaders
def requestIPs(host):
    try:
        # cookie
        cookieJar = cookielib.LWPCookieJar()
        cookie_support = urllib2.HTTPCookieProcessor(cookieJar)
        opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
        urllib2.install_opener(opener)
        # Server addresses
        urlServer = 'http://tools.fastweb.com.cn/index.php/Index/Mdig'
        urlServerData = 'http://tools.fastweb.com.cn/index.php/Index/sendMdig'
        urlServerResult = 'http://tools.fastweb.com.cn/index.php/Index/getMdigResultOne'
        # header
        headers = {'User-Agent': 'Mozilla/4.0', 'Referer': '******'}
        # Open the main page
        urllib2.urlopen(urlServer)

        ## Send the PING request
        # POST data
        postData = {
            'query_type': 'A',
            'domain_name': host,
            'city': '6,7,8,1,2,3,4,5,15,27,28,29,30,31,22,23,24,25,26,16,17,18,9,10,11,12,13,14,19,20,21,32,33,34,36,37',
            'isp': '1,2,3,5,8,12',
            'rand': '5244'
        }
        postData = urllib.urlencode(postData)
        # POST
        if debug:
            print 'Submitting request.'
        request = urllib2.Request(urlServerData, postData, headers)
        response = urllib2.urlopen(request)
        responseData = json.loads(response.read())
        if responseData['status'] == 1:
            if debug:
                print 'Request submitted.'
            task_id = responseData['data']['task_id']
            view_ids = responseData['data']['view_ids']
            _from = responseData['data']['from']
            result_id = 0
            ipList = []
            print 'Waiting for the server to return data.'
            # Poll for results every 1.5 seconds
            while True:
                time.sleep(1.5)
                # POST data
                postData = {
                    'task_id': task_id,
                    'view_ids': view_ids,
                    'from': _from,
                    'query_type': 'A',
                    'result_id': result_id
                }
                postData = urllib.urlencode(postData)
                # POST
                request = urllib2.Request(urlServerResult, postData, headers)
                response = urllib2.urlopen(request)
                responseData = json.loads(response.read())
                if responseData['status'] == 1 and responseData['info'] == '0':
                    result_id = responseData['data']['result_id']
                    if type(responseData['data']) != dict:
                        continue
                    for x in responseData['data'].values():
                        for y in x:
                            if type(y) != dict:
                                continue
                            if y['type'] == 'a':
                                i = y['result'].index('(')
                                ip = y['result'][:i]
                                if not ip in ipList:
                                    ipList.append(ip)
                    if debug:
                        print 'Partial data received, still waiting. (%d IP entries)' % len(ipList)
                elif responseData['status'] == 0 and responseData['info'] == '1':
                    print 'IP list fetched successfully.'
                    return ipList
                elif responseData['status'] == 1 and responseData['info'] == '1':
                    raise Exception('Error while fetching the IP list.')
                else:
                    if debug:
                        print 'Still waiting for the server to return data.'
        else:
            raise Exception(responseData['info'])
    except Exception, e:
        print 'Could not fetch the IP list'
        raise e
import HTMLParser
import urlparse
import urllib
import urllib2
import cookielib
import string
import re

# Main page used to log in.
hosturl = 'https://www.douban.com/'
# Page that receives and processes the POST data (we send our constructed POST data here).
posturl = 'https://book.douban.com/'
# Set up a cookie processor: it stores cookies sent by the server and sends
# the local cookies back with every request.
cj = cookielib.LWPCookieJar()
cookie_support = urllib2.HTTPCookieProcessor(cj)
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)
# Open the login page first (this fetches the cookies; without them the later
# POST would not succeed).
h = urllib2.urlopen(hosturl)
# Build the headers; at minimum these two fields, taken from a captured request.
headers = {
    'User-Agent':
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:14.0) Gecko/20100101 Firefox/14.0.1',
    'Referer': '******'
}
# Build the POST data, also taken from the captured request.
postData = {
    'op': 'dmlogin',
    # ... (remaining form fields are truncated in the original excerpt)
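# A sketch of how the (truncated) postData above would typically be used once
# the remaining form fields are filled in: urlencode it and POST it to posturl
# with the opener installed above, so the cookies picked up from hosturl are
# sent along automatically.
postData = urllib.urlencode(postData)
request = urllib2.Request(posturl, postData, headers)
response = urllib2.urlopen(request)
print response.read()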
def logintopronto(username, password, debug):
    params = urllib.urlencode({
        'loginUserId': username,
        'authType': 'Pronto',
        'loginPassword': password,
        'submit': 'Login'
    })
    opener = urllib2.build_opener(
        urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
    urllib2.install_opener(opener)
    puts(colored.white("Contacting ProntoNetworks..."))
    if debug:
        if not os.path.exists('debug/'):
            os.makedirs('debug/')

    with indent(5, quote=">"):
        puts(colored.yellow("Fetching site"))
    mainreq = urllib2.Request(
        BASE_URL + '/registration/Main.jsp?wispId=1&nasId=00:15:17:c8:09:b1')
    mainres = urllib2.urlopen(mainreq)
    if debug:
        with open('debug/main.txt', 'wb') as f:
            f.write(mainres.read())
            f.close()
        with indent(5, quote=colored.white("DEBUG:")):
            puts(colored.red("logged /registration/Main.jsp response"))

    with indent(5, quote=">"):
        puts(colored.yellow("Sending credentials"))
    loginReq = urllib2.Request(BASE_URL + '/registration/chooseAuth.do', params)
    loginRes = urllib2.urlopen(loginReq)
    if debug:
        with open('debug/login.txt', 'wb') as f:
            f.write(loginRes.read())
            f.close()
        with indent(5, quote=colored.white("DEBUG:")):
            puts(
                colored.red("logged /registration/chooseAuth.do response"))

    with indent(5, quote=">"):
        puts(colored.yellow("Checking plan"))
    planreq = urllib2.Request(
        BASE_URL + '/registration/main.do?content_key=%2FSelectedPlan.jsp')
    planres = urllib2.urlopen(planreq)
    planSoup = BeautifulSoup(planres.read())
    data = planSoup.findAll('td', attrs={
        'class': 'formFieldRight',
        'colspan': '2'
    })
    planDetails = []
    for i in range(0, len(data) - 1):
        kids = data[i].parent.findAll('td')
        planDetails.append(str(kids[1].text))
    if debug:
        with open('debug/plan.txt', 'wb') as f:
            f.write(loginRes.read())
            f.close()
        with indent(5, quote=colored.white("DEBUG:")):
            puts(
                colored.red(
                    "logged /registration/main.do?content_key=%2FSelectedPlan.jsp response"
                ))

    sedate = datetime.strptime(planDetails[2], "%m/%d/%Y %H:%M:%S")
    # enddate = datetime.strptime(planDetails[3], "%m/%d/%Y %H:%M:%S")
    cycleStart = getcyclestartdate(sedate)
    historyparams = urllib.urlencode({
        "location": "allLocations",
        "parameter": "custom",
        "customStartMonth": cycleStart['mm'],
        "customStartDay": cycleStart['dd'],
        "customStartYear": cycleStart['yy'],
        "customEndMonth": 04,
        "customEndDay": 01,
        "customEndYear": 2016,  # Lazy, so hardcoding end year.
        "button": "View"
    })
def get_cookies():
    cj = cookielib.LWPCookieJar()
    cookie_support = urllib2.HTTPCookieProcessor(cj)
    opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
    urllib2.install_opener(opener)
class YoujiaoSpider(Spider): # name='ry' name = 'youjiao' download_delay = 5 allowed_domains = ['youjiao.com'] # start_urls = ['http://www.youjiao.com/sejy/qnkf/'] # start_urls = ['http://www.youjiao.com/etly/kpzs/gxkj/'] start_urls = [ 'http://www.youjiao.com/ysx/ysx/', 'http://www.youjiao.com/ysx/zhengce/', 'http://www.youjiao.com/ysx/zsjz/', 'http://www.youjiao.com/ysx/zexiao/', 'http://www.youjiao.com/ysx/zhinan/', 'http://www.youjiao.com/ysx/xuequfang/', 'http://www.youjiao.com/ysx/rxcs/', 'http://www.youjiao.com/ysx/shiti/', 'http://www.youjiao.com/ysx/xianchang/', 'http://www.youjiao.com/ysx/yxxj/', 'http://www.youjiao.com/ysx/jingyan/', 'http://www.youjiao.com/ysx/mxdt/', 'http://www.youjiao.com/ysx/zdxx/', 'http://www.youjiao.com/sejy/czrj/', 'http://www.youjiao.com/sejy/wenti/', 'http://www.youjiao.com/sejy/jtjy/', 'http://www.youjiao.com/sejy/qnkf/', 'http://www.youjiao.com/sejy/yspy/', 'http://www.youjiao.com/sejy/etxl/', 'http://www.youjiao.com/sejy/tsdx/', 'http://www.youjiao.com/sejy/yyxx/', 'http://www.youjiao.com/sejy/pyxx/', 'http://www.youjiao.com/sejy/jyzj/', 'http://www.youjiao.com/sejy/gwjy/', 'http://www.youjiao.com/sejy/jyxd/', 'http://www.youjiao.com/sejy/ryzd/', 'http://www.youjiao.com/sejy/zaojiao/quming/', 'http://www.youjiao.com/sejy/zaojiao/ceping/', 'http://www.youjiao.com/sejy/zaojiao/yyxw/', 'http://www.youjiao.com/sejy/zaojiao/czzb/', 'http://www.youjiao.com/sejy/zaojiao/zqjy/', 'http://www.youjiao.com/sejy/zaojiao/xgpy/', 'http://www.youjiao.com/sejy/zaojiao/qinzijiaoliu/', 'http://www.youjiao.com/sejy/zaojiao/zjyx/', 'http://www.youjiao.com/sejy/zaojiao/wanju/', 'http://www.youjiao.com/sejy/zaojiao/zlkf/', 'http://www.youjiao.com/sejy/taijiao/tjff/', 'http://www.youjiao.com/sejy/taijiao/tjxd/', 'http://www.youjiao.com/sejy/taijiao/yuyan/', 'http://www.youjiao.com/sejy/taijiao/ydtj/', 'http://www.youjiao.com/sejy/taijiao/fmtj/', 'http://www.youjiao.com/sejy/taijiao/yinyue/', 'http://www.youjiao.com/sejy/taijiao/tjgs/', 'http://www.youjiao.com/sejy/taijiao/tjyy/', 'http://www.youjiao.com/sejy/taijiao/mrtj/', 'http://www.youjiao.com/sejy/taijiao/yytj/', 'http://www.youjiao.com/etly/zlyx/', 'http://www.youjiao.com/etly/etgq/', 'http://www.youjiao.com/etly/etgs/', 'http://www.youjiao.com/etly/seyy/', 'http://www.youjiao.com/etly/gushi/', 'http://www.youjiao.com/etly/lianxi/', 'http://www.youjiao.com/etly/erge/', 'http://www.youjiao.com/etly/youxi/', 'http://www.youjiao.com/etly/etdw/', 'http://www.youjiao.com/etly/mhsj/', 'http://www.youjiao.com/etly/sqgs/', 'http://www.youjiao.com/etly/qzgs/', 'http://www.youjiao.com/etly/kpzs/gxkj/', 'http://www.youjiao.com/etly/kpzs/zwls/', 'http://www.youjiao.com/etly/kpzs/twdl/', 'http://www.youjiao.com/etly/kpzs/kpcs/', 'http://www.youjiao.com/etly/kpzs/smkx/', 'http://www.youjiao.com/etly/kpzs/rtkx/', 'http://www.youjiao.com/etly/kpzs/jckx/', 'http://www.youjiao.com/etly/kpzs/yyys/', 'http://www.youjiao.com/etly/kpzs/hjkx/', 'http://www.youjiao.com/etly/kpzs/jskx/', 'http://www.youjiao.com/etly/kpzs/rcsh/', 'http://www.youjiao.com/etly/kpzs/msqw/', 'http://www.youjiao.com/etly/etdw/', 'http://www.youjiao.com/etly/etwj/', 'http://www.youjiao.com/jkbb/huli/', 'http://www.youjiao.com/jkbb/yfjj/', 'http://www.youjiao.com/jkbb/mianyi/', 'http://www.youjiao.com/jkbb/jibing/', 'http://www.youjiao.com/jkbb/zjdy/', 'http://www.youjiao.com/jkbb/jkys/', 'http://www.youjiao.com/jkbb/yewy/', 'http://www.youjiao.com/jkbb/fushi/', 'http://www.youjiao.com/jkbb/mmyy/', 'http://www.youjiao.com/jkbb/etyy/', 
'http://www.youjiao.com/jkbb/bbyp/', 'http://www.youjiao.com/shipu/yqsp/', 'http://www.youjiao.com/shipu/fushi/', 'http://www.youjiao.com/shipu/etsp/', 'http://www.youjiao.com/shipu/jtsp/', 'http://www.youjiao.com/shipu/chsp/', 'http://www.youjiao.com/shipu/pengren/', 'http://www.youjiao.com/shipu/meishi/', 'http://www.youjiao.com/ssmm/mswh/', 'http://www.youjiao.com/shipu/meirong/', 'http://www.youjiao.com/shipu/jianfei/', 'http://www.youjiao.com/yyzn/zhunbeihuaiyun/huaiyunzhishi/', 'http://www.youjiao.com/yyzn/zhunbeihuaiyun/yunqianzhunbei/', 'http://www.youjiao.com/yyzn/zhunbeihuaiyun/huaiyunjinji/', 'http://www.youjiao.com/yyzn/zhunbeihuaiyun/yichuanyousheng/', 'http://www.youjiao.com/yyzn/zhunbeihuaiyun/biyunliuchan/', 'http://www.youjiao.com/yyzn/zhunbeihuaiyun/buyunbuyu/', 'http://www.youjiao.com/yyzn/zhunbeihuaiyun/shengnanshengnv/', 'http://www.youjiao.com/yyzn/gongju/', 'http://www.youjiao.com/ssmm/xmdr/', 'http://www.youjiao.com/ssmm/jtlc/', 'http://www.youjiao.com/ssmm/fscl/', 'http://www.youjiao.com/ssmm/mswh/', 'http://www.youjiao.com/ssmm/pxgx/', 'http://www.youjiao.com/ssmm/meirong/', 'http://www.youjiao.com/ssmm/sushen/', 'http://www.youjiao.com/ssmm/yuedu/', 'http://www.youjiao.com/ssmm/zhichang/', 'http://www.youjiao.com/ssmm/bbsj/', 'http://www.youjiao.com/ssmm/lydj/', 'http://www.youjiao.com/ssmm/jiaju/', 'http://www.youjiao.com/ssmm/fqgx/', 'http://www.youjiao.com/ssmm/mmxl/' ] # url_pattern=[r'.*rank=sale&type=hot.*'] url_pattern = [r'id=.*'] url_extractor = LxmlLinkExtractor(allow=url_pattern) item_dict = {} cj = cookielib.LWPCookieJar() cookie_support = urllib2.HTTPCookieProcessor(cj) opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler) urllib2.install_opener(opener) def start_requests(self): for url in self.start_urls: """爬取网站分页""" # for i in range(2, 3): # next_url = url + 'index_' + str(i) + '.shtml' # yield SplashRequest(next_url, callback=self.parse, args={ # 'wait': 0.5, 'html': 1, # }) yield SplashRequest(url, callback=self.parse, args={ 'wait': 0.5, 'html': 1, }) def parse(self, response): # print response.url lis = response.xpath("//*[@id='content']/li/div/a/@href").extract() # print lis for li in lis: news_href = li """判断网页是否已经被爬取过""" news_id = news_href.split('/')[-1].split('.')[0] ret = Sql.select_name(news_id) link_hdfs = InsecureClient("http://192.168.10.117:50070", user='******') # Base HDFS web client dir_ls = link_hdfs.list('/testdir') today = datetime.date.today() today_dir = '/testdir' + '/' + str(today) if str(today) not in dir_ls: link_hdfs.makedirs(today_dir) txt_name = news_id + '.txt' yesterday = today - datetime.timedelta(days=1) yesterday_dir = '/testdir' + '/' + str(yesterday) if str(yesterday) not in dir_ls: if ret[0] == 1 and txt_name in link_hdfs.list(today_dir): print "<+_+>-----已经存在HDFS与Mysql中-----<+_+>" pass else: yield Request(news_href, callback=self.get_content) else: if ret[0] == 1 and txt_name in link_hdfs.list(yesterday_dir): print "<+_+>-----已经存在HDFS与Mysql中-----<+_+>" pass elif ret[0] == 1 and txt_name in link_hdfs.list(today_dir): print "<+_+>-----已经存在HDFS与Mysql中-----<+_+>" else: yield Request(news_href, callback=self.get_content) def get_content(self, response): item = YoujiaoItem() """提取info_id""" info_id = response.url.split('/')[-1].split('.')[0] item['info_id'] = info_id """提取新闻标题""" title = response.xpath( "//div[@class='wrapper']/div/div[@class='wrapper']/div[@class='container']" "/div[@class='content']/h1/text()").extract() item['title'] = title[0] """提取新闻地址""" url = response.url item['link'] = url # 
url_id = url.split('/')[-1].split('.')[0] # print url_id """提取新闻发布时间""" o_time = response.xpath("//p[@class='data']/text()").extract()[-1] # print time online_time = o_time.replace('(', '').replace(')', '').replace( ' ', '').replace(':', '').replace('-', '') item['online_time'] = online_time """提取新闻来源""" news_from = response.xpath("//p[@class='data']/em/text()").extract() try: item['news_from'] = news_from[0] except IndexError: item['news_from'] = news_from """提取新闻来源地址""" item['source'] = None """提取新闻关键字""" tags = response.xpath("//*[@id='xg_tag']/span/a/text()").extract() tag = ','.join(tags) item['tag'] = tag """提取新闻描述""" desc = response.xpath("//meta[@name='description']/@content").extract() item['news_description'] = desc[0] """提取新闻作者""" try: author = response.xpath( "//p[@class='data']/text()").extract()[1].replace(' ', '') author = author.split(':')[-1] item['author'] = author except: author = None item['author'] = author """提取版块""" section = response.xpath( "//div[@class='logoArea']/span/a/text()").extract() section = '-'.join(section) item['section'] = section """提取评论数""" try: cmt_url = response.xpath( "//*[@id='bbs']/iframe[@id='rating']/@src").extract()[0] cmt_req = urllib2.Request(cmt_url) cmt_html = urllib2.urlopen(cmt_req).read() # print cmt_html cmt_re = re.findall(r'<span class="talk">(.*?)</span>', cmt_html) item['comment_count'] = cmt_re[0] except: item['comment_count'] = None """提取内容""" soup = BeautifulSoup(response.text, 'lxml') [script.extract() for script in soup.find_all('script')] # 去除script标签 [style.extract() for style in soup.find_all('style')] # 去除style标签 text_soup = soup.find(class_='content_txt').get_text() if '上一页' in text_soup: text = text_soup.split('上一页')[0] else: text = text_soup.split('幼教网微信')[0] text = text.split('\n') # text = response.xpath("//div[@class='content_txt']/p/text()").extract() body = [] for t_body in text: bdy = t_body.replace('\r', '').replace('\n', '').replace( '\t', '').replace(u'\xa0', '') body.append(bdy) """提取网页内容分页""" try: next_urls = response.xpath( "//div[@class='pages']/a/@href").extract()[:-1] header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/55.0.2883.87 Safari/537.36' } for i in range(len(next_urls)): if item['info_id'] in next_urls[i]: next_url = next_urls[i] next_url_req = urllib2.Request(next_url, headers=header) next_url_html = urllib2.urlopen(next_url_req).read() soup_body = BeautifulSoup( next_url_html, 'html.parser').find(class_='content_txt').get_text() n_body = soup_body.split('上一页')[0] n_body = n_body.split('\n') for bdy in n_body: n_by = bdy.replace('\n', '').replace('\r', '').replace( '\t', '').replace(u'\xa0', '') # print n_by body.append(n_by) except Exception, e: print Exception, "没有下一页内容", e pass while '' in body: body.remove('') item['news_body'] = body return item
def get_cookie(self, netloc, ua, timeout):
    try:
        headers = {'User-Agent': ua}
        request = urllib2.Request(netloc)
        _add_request_header(request, headers)
        try:
            response = urllib2.urlopen(request, timeout=int(timeout))
        except urllib2.HTTPError as response:
            result = response.read(5242880)
            try:
                encoding = response.info().getheader('Content-Encoding')
            except:
                encoding = None
            if encoding == 'gzip':
                result = gzip.GzipFile(
                    fileobj=StringIO.StringIO(result)).read()

        jschl = re.findall('name="jschl_vc" value="(.+?)"/>', result)[0]
        init = re.findall('setTimeout\(function\(\){\s*.*?.*:(.*?)};',
                          result)[-1]
        builder = re.findall(r"challenge-form\'\);\s*(.*)a.v", result)[0]
        decryptVal = self.parseJSString(init)
        lines = builder.split(';')
        for line in lines:
            if len(line) > 0 and '=' in line:
                sections = line.split('=')
                line_val = self.parseJSString(sections[1])
                decryptVal = int(
                    eval(
                        str(decryptVal) + sections[0][-1] + str(line_val)))
        answer = decryptVal + len(urlparse.urlparse(netloc).netloc)
        query = '%s/cdn-cgi/l/chk_jschl?jschl_vc=%s&jschl_answer=%s' % (
            netloc, jschl, answer)
        if 'type="hidden" name="pass"' in result:
            passval = re.findall('name="pass" value="(.*?)"', result)[0]
            query = '%s/cdn-cgi/l/chk_jschl?pass=%s&jschl_vc=%s&jschl_answer=%s' % (
                netloc, urllib.quote_plus(passval), jschl, answer)
            time.sleep(6)
        cookies = cookielib.LWPCookieJar()
        handlers = [
            urllib2.HTTPHandler(),
            urllib2.HTTPSHandler(),
            urllib2.HTTPCookieProcessor(cookies)
        ]
        opener = urllib2.build_opener(*handlers)
        # install_opener() returns None; it installs the opener globally so the
        # urlopen() call below uses the cookie-aware handlers.
        urllib2.install_opener(opener)
        try:
            request = urllib2.Request(query)
            _add_request_header(request, headers)
            response = urllib2.urlopen(request, timeout=int(timeout))
        except:
            pass
        cookie = '; '.join(['%s=%s' % (i.name, i.value) for i in cookies])
        if 'cf_clearance' in cookie:
            self.cookie = cookie
    except:
        pass
import urllib2
import urllib
import cookielib

values = {"name": "wowo", "age": "20"}
data = urllib.urlencode(values)
headers = {"User-Agent": "Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)"}

cookie = cookielib.MozillaCookieJar()
handler = urllib2.HTTPCookieProcessor(cookie)
opener = urllib2.build_opener(handler)

request = urllib2.Request("http://www.wangzhi.com", data, headers)
response = opener.open(request)
print response.read()
print cookie
for item in cookie:
    print "-" * 10
    print item.name
    print item.value
cookie.save("d:\\app.txt", True, True)
response.close()
def VIDEO(url): urlogin = '******' cookiejar = cookielib.LWPCookieJar() cookiejar = urllib2.HTTPCookieProcessor(cookiejar) opener = urllib2.build_opener(cookiejar) urllib2.install_opener(opener) values = { 'ref': 'http://veehd.com/', 'uname': uname, 'pword': pwd, 'submit': 'Login', 'terms': 'on' } user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' headers = {'User-Agent': user_agent} data = urllib.urlencode(values) req = urllib2.Request(urlogin, data, headers) response = urllib2.urlopen(req) if url.find('flv') > 0: req = urllib2.Request(url) req.add_header( 'User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3' ) response = urllib2.urlopen(req) link = response.read() vpi = re.compile('"/vpi.+?h=(.+?)"').findall(link)[0] req = urllib2.Request('http://veehd.com/vpi?h=' + vpi) req.add_header( 'User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3' ) response = urllib2.urlopen(req) link = response.read() swap = re.compile('"url":"(.+?)"').findall(link)[0] finalurl = swap.replace('%2F', '/').replace('%3F', '?').replace( '%3D', '=').replace('%25', '%').replace('%2F', '/').replace('%26', '&').replace('%3A', ':') if (vhd.getSetting('download') == '0'): dia = xbmcgui.Dialog() ret = dia.select('Streaming Options', ['Play', 'Download']) if (ret == 0): item = xbmcgui.ListItem(path=finalurl) xbmcplugin.setResolvedUrl(pluginhandle, True, item) elif (ret == 1): path = xbmc.translatePath( os.path.join(vhd.getSetting('download_path'), name)) Download(finalurl, path + name + '.avi') else: return elif (vhd.getSetting('download') == '1'): item = xbmcgui.ListItem(path=finalurl) xbmcplugin.setResolvedUrl(pluginhandle, True, item) elif (vhd.getSetting('download') == '2'): path = xbmc.translatePath( os.path.join(vhd.getSetting('download_path'), name)) Download(finalurl, path + name + '.avi') else: return if url.find('flv') < 0: req = urllib2.Request(url) req.add_header( 'User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3' ) response = urllib2.urlopen(req) link = response.read() vpi = re.compile('"/vpi.+?h=(.+?)"').findall(link)[0] req = urllib2.Request('http://veehd.com/vpi?h=' + vpi) req.add_header( 'User-Agent', 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.9.0.3) Gecko/2008092417 Firefox/3.0.3' ) response = urllib2.urlopen(req) link = response.read() finalurl = re.compile('param name="src" value="(.+?)"').findall( link)[0] item = xbmcgui.ListItem(path=finalurl) xbmcplugin.setResolvedUrl(pluginhandle, True, item) if (vhd.getSetting('download') == '0'): dia = xbmcgui.Dialog() ret = dia.select('Streaming Options', ['Play', 'Download']) if (ret == 0): item = xbmcgui.ListItem(path=finalurl) xbmcplugin.setResolvedUrl(pluginhandle, True, item) elif (ret == 1): path = xbmc.translatePath( os.path.join(vhd.getSetting('download_path'), name)) Download(finalurl, path + name + '.avi') else: return elif (vhd.getSetting('download') == '1'): item = xbmcgui.ListItem(path=finalurl) xbmcplugin.setResolvedUrl(pluginhandle, True, item) elif (vhd.getSetting('download') == '2'): path = xbmc.translatePath( os.path.join(vhd.getSetting('download_path'), name)) Download(finalurl, path + name + '.avi') else: return
import re
import pdb
import urllib
import urllib2
import urlparse
import cookielib
from lxml import etree

headers = {
    "Accept":
    "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "fr,fr-fr;q=0.8,en-us;q=0.5,en;q=0.3",
    "User-Agent":
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0",
}

parser = etree.HTMLParser()
url_opener = urllib2.build_opener(
    urllib2.HTTPCookieProcessor(cookielib.CookieJar()))


def go_to_next_page(parameters, response, form_element, form_to_take=1):
    response_url = response.geturl()
    request = urllib2.Request(urlparse.urljoin(response_url,
                                               form_element.get('action')),
                              headers=headers)
    try:
        response = url_opener.open(request,
                                   urllib.urlencode(parameters, doseq=True))
    except urllib2.HTTPError as response:
        print "Error", response.code
        print response.read()
        raise
    html_page = response.read()
def send(self, original_payload, additional_handlers=[]): # Generate session id and referrers session_id, referrers_data = self._prepare(original_payload) cj = cookielib.CookieJar() additional_handlers.append(urllib2.HTTPCookieProcessor(cj)) opener = urllib2.build_opener(*additional_handlers) # When core.conf contains additional cookies, carefully merge # the new headers killing the needed ones additional_headers = [] additional_ua = '' additional_cookie = '' for h in self.additional_headers: if h[0].lower() == 'user-agent' and h[1]: additional_ua = h[1] if h[0].lower() == 'cookie' and h[1]: cookies = h[1].rstrip(';').split('; ') for cookie in cookies: name, value = cookie.split('=') cj.set_cookie( cookielib.Cookie(version=0, name=name, value=value, port=None, port_specified=False, domain='', domain_specified=True, domain_initial_dot=True, path='/', path_specified=True, secure=False, expires=None, discard=True, comment=None, comment_url=None, rest={'HttpOnly': None})) elif h[0].lower() in ('accept', 'accept-language', 'referer'): # Skip sensible headers pass else: additional_headers.append(h) for referrer_index, referrer_data in enumerate(referrers_data): accept_language_header = self._generate_header_accept_language( referrer_data[1], session_id) accept_header = self._generate_header_accept() opener.addheaders = [ ('Referer', referrer_data[0]), ('Accept-Language', accept_language_header), ('Accept', accept_header), ('User-Agent', (additional_ua if additional_ua else random.choice(self.agents))) ] + additional_headers dlog.debug('[H %i/%i]\n%s\n[C] %s' % (referrer_index, len(referrers_data) - 1, '\n'.join( '> %s: %s' % (h[0], h[1]) for h in opener.addheaders), cj)) url = (self.url if not config.add_random_param_nocache else utils.http.add_random_url_param(self.url)) try: response = opener.open(url).read() except httplib.BadStatusLine as e: # TODO: add this check to the other channels log.warn('Connection closed unexpectedly, aborting command.') return if not response: continue # Multiple debug string may have been printed, using findall matched_debug = self.re_debug.findall(response) if matched_debug: dlog.debug('\n'.join(matched_debug)) matched = self.re_response.search(response) if matched and matched.group(1): return zlib.decompress( utils.strings.sxor(base64.b64decode(matched.group(1)), self.shared_key))
def get_data(host, query, idx, limit, debug, threshold=300, ckey=None,
             cert=None, das_headers=True):
    """Contact DAS server and retrieve data for given DAS query"""
    params = {'input': query, 'idx': idx, 'limit': limit}
    path = '/das/cache'
    pat = re.compile('http[s]{0,1}://')
    if not pat.match(host):
        msg = 'Invalid hostname: %s' % host
        raise Exception(msg)
    url = host + path
    client = '%s (%s)' % (DAS_CLIENT, os.environ.get('USER', ''))
    headers = {"Accept": "application/json", "User-Agent": client}
    encoded_data = urllib.urlencode(params, doseq=True)
    url += '?%s' % encoded_data
    req = urllib2.Request(url=url, headers=headers)
    if ckey and cert:
        ckey = fullpath(ckey)
        cert = fullpath(cert)
        http_hdlr = HTTPSClientAuthHandler(ckey, cert, debug)
    else:
        http_hdlr = urllib2.HTTPHandler(debuglevel=debug)
    proxy_handler = urllib2.ProxyHandler({})
    cookie_jar = cookielib.CookieJar()
    cookie_handler = urllib2.HTTPCookieProcessor(cookie_jar)
    opener = urllib2.build_opener(http_hdlr, proxy_handler, cookie_handler)
    fdesc = opener.open(req)
    data = fdesc.read()
    fdesc.close()

    pat = re.compile(r'^[a-z0-9]{32}')
    if data and isinstance(data, str) and pat.match(data) and len(data) == 32:
        pid = data
    else:
        pid = None
    iwtime = 2  # initial waiting time in seconds
    wtime = 20  # final waiting time in seconds
    sleep = iwtime
    time0 = time.time()
    while pid:
        params.update({'pid': data})
        encoded_data = urllib.urlencode(params, doseq=True)
        url = host + path + '?%s' % encoded_data
        req = urllib2.Request(url=url, headers=headers)
        try:
            fdesc = opener.open(req)
            data = fdesc.read()
            fdesc.close()
        except urllib2.HTTPError as err:
            return {"status": "fail", "reason": str(err)}
        if data and isinstance(data, str) and pat.match(data) and len(data) == 32:
            pid = data
        else:
            pid = None
        time.sleep(sleep)
        if sleep < wtime:
            sleep *= 2
        elif sleep == wtime:
            sleep = iwtime  # start new cycle
        else:
            sleep = wtime
        if (time.time() - time0) > threshold:
            reason = "client timeout after %s sec" % int(time.time() - time0)
            return {"status": "fail", "reason": reason}
    jsondict = json.loads(data)
    return jsondict
def uploader(USER_NAME, PASSWORD, FILE_LOCATION, DEBUG="NO"):
    SERVER_NAME = 'w3-connections.ibm.com'
    print "\nAttempting to log in to (%s)..." % (SERVER_NAME)

    # Create authenticated server opener
    cookieProcessor = urllib2.HTTPCookieProcessor(LWPCookieJar())
    opener = urllib2.build_opener(cookieProcessor)

    # encoded parameters sent in a POST method (over secure connection)
    encodedForm = urlencode({"j_username": USER_NAME, "j_password": PASSWORD})

    # in this test case we used the port numbers, depending on the server, you can ignore these
    urlform = "https://" + SERVER_NAME + "/wikis/j_security_check"
    request = urllib2.Request(urlform, encodedForm)
    opener.addheaders = [("User-agent", "Mozilla/5.0")]

    # Read the response from the server
    loggedIn = opener.open(request).read()

    # Check if the response contains a redirection to the login page
    # Or check if the response is the login page
    if string.find(loggedIn, 'window.location.replace') == -1 and string.find(
            loggedIn, 'X-LConn-Login') == -1:
        print 'Logged in successfully!'
    else:
        print 'Failed to log in.'
        exit()

    excelFile = open(FILE_LOCATION, 'r')

    if DEBUG.upper() == "YES":
        url = 'https://w3-connections.ibm.com/files/basic/api/userlibrary/'
        url += 'b8e5e0c0-a38e-1033-9149-ac5876bd6d0c/document/a0c2ab4d-b530-458c-a1f1-6ee6d8c3acfb/media'
        print "RUNNING IN DEBUG MODE"
    elif DEBUG.upper() == "NO":
        exit()  # exit() here means the URL assignment below is never reached
        url = "https://w3-connections.ibm.com/files/basic/api/"
        url += "communitylibrary/214fe5d2-2471-4a42-ba99-f53de8dbe081/document/fdc2f92b-03c9-4f77-b83c-d8f96f3c491b/media"
    else:
        print "\nInvalid argument, either set the DEBUG argument as \"YES\" or leave it blank."
        exit()

    # convert raw file to string
    mmapped_file_as_string = mmap.mmap(excelFile.fileno(), 0,
                                       access=mmap.ACCESS_READ)
    request2 = urllib2.Request(url, mmapped_file_as_string)
    # this is a hack I found online to make the downloader in python work as an uploader
    contenttype = mimetypes.guess_type(FILE_LOCATION)[0]
    request2.add_header('Content-Type', contenttype)
    request2.get_method = lambda: 'PUT'
    opener.open(request2)
    print "File appears to have been uploaded successfully!"
    excelFile.close()
def openAnything(source, etag=None, lastmodified=None, agent=USER_AGENT,
                 post_data=None, files=None):
    """URL, filename, or string --> stream

    This function lets you define parsers that take any input source
    (URL, pathname to local or network file, or actual data as a string)
    and deal with it in a uniform manner.  Returned object is guaranteed
    to have all the basic stdio read methods (read, readline, readlines).
    Just .close() the object when you're done with it.

    If the etag argument is supplied, it will be used as the value of an
    If-None-Match request header.

    If the lastmodified argument is supplied, it must be a formatted
    date/time string in GMT (as returned in the Last-Modified header of
    a previous request).  The formatted date/time will be used
    as the value of an If-Modified-Since request header.

    If the agent argument is supplied, it will be used as the value of a
    User-Agent request header.
    """
    if hasattr(source, 'read'):
        return source

    if source == '-':
        return sys.stdin

    if isinstance(post_data, dict):
        post_data_dict = post_data
        post_data = []
        for key in post_data_dict.keys():
            post_data.append((key, post_data_dict[key]))

    protocol = urlparse.urlparse(source)[0]
    if protocol == 'http' or protocol == 'https':
        # open URL with urllib2
        request = urllib2.Request(source)
        request.add_header('User-Agent', agent)
        if lastmodified:
            request.add_header('If-Modified-Since', lastmodified)
        if etag:
            request.add_header('If-None-Match', etag)
        if post_data and files:
            content_type, body = encode_multipart_formdata(post_data, files)
            request.add_header('Content-Type', content_type)
            request.add_data(body)
        elif post_data:
            request.add_data(encode_post_data(post_data))
        request.add_header('Accept-encoding', 'gzip')
        opener = urllib2.build_opener(SmartRedirectHandler(),
                                      DefaultErrorHandler(),
                                      urllib2.HTTPCookieProcessor(cj))
        return opener.open(request)

    # try to open with native open function (if source is a filename)
    try:
        return open(source)
    except (IOError, OSError):
        pass

    # treat source as string
    return StringIO(str(source))
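# A hedged usage sketch for openAnything() above: fetch a URL once, remember
# the ETag / Last-Modified validators from the response, and pass them back on
# the next call so the server can reply 304 Not Modified (handled by the
# DefaultErrorHandler the function assumes). The feed URL is a placeholder.
stream = openAnything('http://example.com/feed.xml')
data = stream.read()
etag = stream.info().getheader('ETag')
lastmodified = stream.info().getheader('Last-Modified')
stream.close()

# Conditional re-fetch: the body is only transferred if the resource changed.
stream = openAnything('http://example.com/feed.xml',
                      etag=etag, lastmodified=lastmodified)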
def downloadpage(url, post=None, headers=None, timeout=None, follow_redirects=True, cookies=True, replace_headers=False, add_referer=False, only_headers=False, bypass_cloudflare=True, count_retries=0, random_headers=False, ignore_response_code=False): """ Abre una url y retorna los datos obtenidos @param url: url que abrir. @type url: str @param post: Si contiene algun valor este es enviado mediante POST. @type post: str @param headers: Headers para la petición, si no contiene nada se usara los headers por defecto. @type headers: dict, list @param timeout: Timeout para la petición. @type timeout: int @param follow_redirects: Indica si se han de seguir las redirecciones. @type follow_redirects: bool @param cookies: Indica si se han de usar las cookies. @type cookies: bool @param replace_headers: Si True, los headers pasados por el parametro "headers" sustituiran por completo los headers por defecto. Si False, los headers pasados por el parametro "headers" modificaran los headers por defecto. @type replace_headers: bool @param add_referer: Indica si se ha de añadir el header "Referer" usando el dominio de la url como valor. @type add_referer: bool @param only_headers: Si True, solo se descargarán los headers, omitiendo el contenido de la url. @type only_headers: bool @param random_headers: Si True, utiliza el método de seleccionar headers aleatorios. @type random_headers: bool @param ignore_response_code: Si es True, ignora el método para WebErrorException para error como el error 404 en veseriesonline, pero es un data funcional @type ignore_response_code: bool @return: Resultado de la petición @rtype: HTTPResponse Parametro Tipo Descripción ---------------------------------------------------------------------------------------------------------------- HTTPResponse.sucess: bool True: Peticion realizada correctamente | False: Error al realizar la petición HTTPResponse.code: int Código de respuesta del servidor o código de error en caso de producirse un error HTTPResponse.error: str Descripción del error en caso de producirse un error HTTPResponse.headers: dict Diccionario con los headers de respuesta del servidor HTTPResponse.data: str Respuesta obtenida del servidor HTTPResponse.time: float Tiempo empleado para realizar la petición """ response = {} # Headers por defecto, si no se especifica nada request_headers = default_headers.copy() # Headers pasados como parametros if headers is not None: if not replace_headers: request_headers.update(dict(headers)) else: request_headers = dict(headers) if add_referer: request_headers["Referer"] = "/".join(url.split("/")[:3]) if random_headers or HTTPTOOLS_DEFAULT_RANDOM_HEADERS: request_headers['User-Agent'] = random_useragent() url = urllib.quote(url, safe="%/:=&?~#+!$,;'@()*[]") # Limitar tiempo de descarga si no se ha pasado timeout y hay un valor establecido en la variable global if timeout is None and HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT is not None: timeout = HTTPTOOLS_DEFAULT_DOWNLOAD_TIMEOUT if timeout == 0: timeout = None logger.info("----------------------------------------------") logger.info("downloadpage Alfa: %s" % __version) logger.info("----------------------------------------------") logger.info("Timeout: %s" % timeout) logger.info("URL: " + url) logger.info("Dominio: " + urlparse.urlparse(url)[1]) if post: logger.info("Peticion: POST") else: logger.info("Peticion: GET") logger.info("Usar Cookies: %s" % cookies) logger.info("Descargar Pagina: %s" % (not only_headers)) logger.info("Fichero de Cookies: " + ficherocookies) 
logger.info("Headers:") for header in request_headers: logger.info("- %s: %s" % (header, request_headers[header])) # Handlers handlers = [urllib2.HTTPHandler(debuglevel=False)] if not follow_redirects: handlers.append(NoRedirectHandler()) if cookies: handlers.append(urllib2.HTTPCookieProcessor(cj)) opener = urllib2.build_opener(*handlers) logger.info("Realizando Peticion") # Contador inicio = time.time() req = urllib2.Request(url, post, request_headers) try: if urllib2.__version__ == "2.4": import socket deftimeout = socket.getdefaulttimeout() if timeout is not None: socket.setdefaulttimeout(timeout) handle = opener.open(req) socket.setdefaulttimeout(deftimeout) else: handle = opener.open(req, timeout=timeout) except urllib2.HTTPError, handle: response["sucess"] = False response["code"] = handle.code response["error"] = handle.__dict__.get("reason", str(handle)) response["headers"] = handle.headers.dict if not only_headers: response["data"] = handle.read() else: response["data"] = "" response["time"] = time.time() - inicio response["url"] = handle.geturl()
def get_opener():
    """Return a url opener that handles cookies."""
    cookie_jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar))
    return opener
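# Possible usage: keep one opener for the whole session so cookies set by the
# first response are sent with later requests (URLs are placeholders).
opener = get_opener()
opener.open('http://example.com/login')
page = opener.open('http://example.com/account').read()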
def give_me_cookie():
    cookie = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie),
                                  urllib2.HTTPHandler())
    return opener
def read_body_and_headers(url, post=None, headers=[], follow_redirects=False, timeout=30): _log("read_body_and_headers "+url) if post is not None: _log("read_body_and_headers post="+post) if len(headers)==0: headers.append(["User-Agent","Mozilla/5.0 (Macintosh; Intel Mac OS X 10.8; rv:18.0) Gecko/20100101 Firefox/18.0"]) # Start cookie lib ficherocookies = os.path.join( get_data_path(), 'cookies.dat' ) _log("read_body_and_headers cookies_file="+ficherocookies) cj = None ClientCookie = None cookielib = None # Let's see if cookielib is available try: _log("read_body_and_headers importing cookielib") import cookielib except ImportError: _log("read_body_and_headers cookielib no disponible") # If importing cookielib fails # let's try ClientCookie try: _log("read_body_and_headers importing ClientCookie") import ClientCookie except ImportError: _log("read_body_and_headers ClientCookie not available") # ClientCookie isn't available either urlopen = urllib2.urlopen Request = urllib2.Request else: _log("read_body_and_headers ClientCookie available") # imported ClientCookie urlopen = ClientCookie.urlopen Request = ClientCookie.Request cj = ClientCookie.MozillaCookieJar() else: _log("read_body_and_headers cookielib available") # importing cookielib worked urlopen = urllib2.urlopen Request = urllib2.Request cj = cookielib.MozillaCookieJar() # This is a subclass of FileCookieJar # that has useful load and save methods if cj is not None: # we successfully imported # one of the two cookie handling modules _log("read_body_and_headers Cookies enabled") if os.path.isfile(ficherocookies): _log("read_body_and_headers Reading cookie file") # if we have a cookie file already saved # then load the cookies into the Cookie Jar try: cj.load(ficherocookies, ignore_discard=True) except: _log("read_body_and_headers Wrong cookie file, deleting...") os.remove(ficherocookies) # Now we need to get our Cookie Jar # installed in the opener; # for fetching URLs if cookielib is not None: _log("read_body_and_headers opener using urllib2 (cookielib)") # if we use cookielib # then we get the HTTPCookieProcessor # and install the opener in urllib2 if not follow_redirects: opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=http_debug_log_enabled),urllib2.HTTPCookieProcessor(cj),NoRedirectHandler()) else: opener = urllib2.build_opener(urllib2.HTTPHandler(debuglevel=http_debug_log_enabled),urllib2.HTTPCookieProcessor(cj)) urllib2.install_opener(opener) else: _log("read_body_and_headers opener using ClientCookie") # if we use ClientCookie # then we get the HTTPCookieProcessor # and install the opener in ClientCookie opener = ClientCookie.build_opener(ClientCookie.HTTPCookieProcessor(cj)) ClientCookie.install_opener(opener) # ------------------------------------------------- # Cookies instaladas, lanza la petición # ------------------------------------------------- # Contador inicio = time.clock() # Diccionario para las cabeceras txheaders = {} # Construye el request if post is None: _log("read_body_and_headers GET request") else: _log("read_body_and_headers POST request") # Añade las cabeceras _log("read_body_and_headers ---------------------------") for header in headers: _log("read_body_and_headers header %s=%s" % (str(header[0]),str(header[1])) ) txheaders[header[0]]=header[1] _log("read_body_and_headers ---------------------------") req = Request(url, post, txheaders) if timeout is None: handle=urlopen(req) else: #Disponible en python 2.6 en adelante --> handle = urlopen(req, timeout=timeout) #Para todas las versiones: 
try: import socket deftimeout = socket.getdefaulttimeout() socket.setdefaulttimeout(timeout) handle=urlopen(req) socket.setdefaulttimeout(deftimeout) except: import sys for line in sys.exc_info(): _log( "%s" % line ) # Actualiza el almacén de cookies cj.save(ficherocookies, ignore_discard=True) # Lee los datos y cierra if handle.info().get('Content-Encoding') == 'gzip': buf = StringIO( handle.read()) f = gzip.GzipFile(fileobj=buf) data = f.read() else: data=handle.read() info = handle.info() _log("read_body_and_headers Response") returnheaders=[] _log("read_body_and_headers ---------------------------") for header in info: _log("read_body_and_headers "+header+"="+info[header]) returnheaders.append([header,info[header]]) handle.close() _log("read_body_and_headers ---------------------------") ''' # Lanza la petición try: response = urllib2.urlopen(req) # Si falla la repite sustituyendo caracteres especiales except: req = urllib2.Request(url.replace(" ","%20")) # Añade las cabeceras for header in headers: req.add_header(header[0],header[1]) response = urllib2.urlopen(req) ''' # Tiempo transcurrido fin = time.clock() _log("read_body_and_headers Downloaded in %d seconds " % (fin-inicio+1)) _log("read_body_and_headers body="+data) return data,returnheaders
#!/usr/bin/env python
"""
Copyright (c) 2004 Dustin Sallings <*****@*****.**>
"""

import sys
import urllib2
import traceback

try:
    import cookielib
    cookieJar = cookielib.CookieJar()
    cookieProcessor = urllib2.HTTPCookieProcessor(cookieJar)
    openerFactory = urllib2.build_opener
except ImportError:
    import ClientCookie
    cookieJar = ClientCookie.MozillaCookieJar()
    cookieProcessor = ClientCookie.HTTPCookieProcessor(cookieJar)
    openerFactory = ClientCookie.build_opener


class ErrorHandler(urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        print "*** Got an error %d ***" % (code, )
        # print self, req, fp, code, msg, headers
        return fp


if __name__ == '__main__':
    headers = {'SOAPAction': 'Inform', 'Content-type': 'text/xml'}
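    # Hedged sketch of a possible continuation of this truncated __main__
    # block: build one opener from the pieces above and POST a SOAP body.
    # The endpoint URL and payload below are placeholders, not from the
    # original script.
    opener = openerFactory(cookieProcessor, ErrorHandler())
    body = '<soapenv:Envelope></soapenv:Envelope>'
    req = urllib2.Request('http://example.com/acs', body, headers)
    try:
        print opener.open(req).read()
    except Exception:
        traceback.print_exc()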
#!/usr/bin/python
import urllib
import urllib2
import cookielib
import requests

url = 'http://10.1.16.65:8000/login/'
login_data = {'username': '******', 'password': '******'}
data = urllib.urlencode(login_data)

#'http://10.1.16.65:8000/upload/'
cookie = cookielib.CookieJar()
cookieProc = urllib2.HTTPCookieProcessor(cookie)
opener = urllib2.build_opener(cookieProc)
urllib2.install_opener(opener)
urllib2.urlopen(url, data)

tt = {'server': 'test', 'submit': 'select', 'time': '2015-08-18 14:57:06'}
data1 = urllib.urlencode(tt)
print urllib2.urlopen('http://10.1.16.65:8000/altertime/', data1).read()
def prepare_compare_data():
    # Issue the same request once per category (1-4) and store each result
    # in the compare data dir.
    for category in (1, 2, 3, 4):
        param = {
            'utype': 2,
            'uid': 'imei',
            'category': category,
            'offset': 102,
            'count': 4
        }
        req_url = default_server + '/info/'
        print 'the req url is %s' % (req_url)
        client = urllib2.Request(req_url)
        opener = urllib2.build_opener(urllib2.HTTPCookieProcessor())
        response = opener.open(client, json.JSONEncoder().encode(param))
        js_data = json.loads(response.read())
        store_text(category, js_data['post'], 1)  # 1 means store into a compare data dir
    return 0