def init_data():
    br = BasicRequests()
    br.select_user_agent('firefox')
    s = open('af').read(64 * 1024)
    odata = []
    for m in re.finditer('<a (.*?)>(.*?)</a>', s, re.S):
        name = m.group(2)
        attrs = m.group(1)
        url = None
        prov = None
        for m1 in re.finditer('([a-z][a-z0-9]+)="(.*?)"', attrs, re.S):
            n, v = m1.group(1), m1.group(2)
            if n == 'href':
                url = v
            if n == 'prov':
                prov = v
        con = br.request_url(url)
        siteurl = 'unknown'
        if con is not None:
            siteurl = con.request.url
        # print siteurl rather than con.request.url so a failed request
        # (con is None) does not raise AttributeError
        print name, siteurl
        odata.append({'name': name, 'url': siteurl, 'imgurl': '', 'prov': prov})
    print json.dumps(odata, ensure_ascii=0, indent=4)
def post_for_proxy():
    req = BasicRequests()
    con = req.request_url(
        'http://dev.kuaidaili.com/api/getproxy/?orderid=925817981728018&num=50&b_pcchrome=1&b_pcie=1&b_pcff=1&protocol=1&method=2&an_an=1&an_ha=1&sp1=1&quality=1&sort=1&format=json&sep=1'
    )
    if con:
        # the API is asked for JSON (format=json), so parse with json.loads
        # instead of eval, which would execute arbitrary response content
        return json.loads(con.text)
def resolve(self, getimg, fc=None):
    while True:
        imgcon = getimg(dbgdata=fc)
        f = StringIO.StringIO()
        f.write(imgcon)
        f.seek(0)
        h = BasicRequests()
        imgcode = None
        if fc is not None and isinstance(fc, dict) and "type" in fc:
            response = h.request_url(self.server, files={'file': imgcon},
                                     data={"province": self._type})
            result = response.text.strip()
            if '"valid":true' in result:
                try:
                    result = json.loads(result)
                    imgcode = result["answer"]
                    if imgcode is None or imgcode == "":
                        print "captcha recognition failed: imgcode is None or ''"
                        continue
                except Exception as err:
                    print "captcha recognition failed, retrying..., result:", result, "reason:", err
                    time.sleep(1)
                    continue
        else:
            url = "%s?type=%s" % (self.server, self._type)
            response = h.request_url(url, files={'file': f})
            imgcode = response.text.strip()
            if imgcode == '<fail>':
                print "captcha recognition failed: imgcode == <fail>"
                continue
        if isinstance(fc, dict):
            fc['content'] = imgcon
            fc['code'] = imgcode
        return imgcode
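# Hypothetical usage sketch for resolve(): it assumes a resolver object with
# self.server and self._type already set, plus a callback returning raw
# captcha image bytes. fetch_captcha and its URL are illustrative, not from
# the source; the .content attribute is assumed to behave as in requests.
def fetch_captcha(dbgdata=None):
    rq = BasicRequests()
    con = rq.request_url('http://example.com/captcha.jpg')  # placeholder URL
    return con.content if con else ''

# fc collects the raw image and the recognized code for later inspection:
#   fc = {}
#   code = resolver.resolve(fetch_captcha, fc)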
def find_ipin_proxy():
    ff = IpinFactory()
    prs = ff.getProxyList()
    s = BasicRequests()
    # print json.dumps(prs, ensure_ascii=0).encode('utf-8')
    res = {}
    for p in prs:
        print "trying", p
        auth, proxies = ff.genRequestsParam(p)
        con = s.request_url("http://ip.cn/", auth=auth, proxies=proxies, timeout=6)
        if con is None:
            continue
        m = re.search("<code>(.*?)</code>", con.text)
        if m:
            sys.stderr.write("%s %s\n" % (p['host'], m.group(1)))
            res[m.group(1)] = "%s:%s:%s" % (p.get('host'), p.get('port'), p.get('password'))
        p2, proxies = ff.genTinyProxy(p)
        print proxies
        con = s.request_url("http://ip.cn/", proxies=proxies, timeout=5)
        if con is None:
            continue
        m = re.search("<code>(.*?)</code>", con.text)
        if m:
            sys.stderr.write("%s %s\n" % (p['host'], m.group(1)))
            res[m.group(1)] = p2
    print "\n".join(res.values())
def __init__(self, sf='jobs'):
    BasicRequests.__init__(self)
    self._main_url = 'http://www.zjsfgkw.cn/Document/JudgmentBook'
    self._court_search_url = 'http://www.zjsfgkw.cn/Judges/GetCountByCountId'
    self._book_search_url = 'http://www.zjsfgkw.cn/document/JudgmentSearch'
    self.start_date = '19700101'
    self.end_date = time.strftime('%Y%m%d', time.localtime())
    self.save_file = sf
def get_search_url(opts):
    # SF_1_1_27=0 selects Chinese resumes and SF_1_1_27=1 English resumes,
    # but judging from the site's JS, English resumes cannot actually be searched.
    b = BasicRequests()
    # surl = 'http://rdsearch.zhaopin.com/Home/ResultForCustom?SF_1_1_7=7,9&orderBy=DATE_MODIFIED,1&pageSize=60&SF_1_1_27=0&exclude=1'
    surl = 'http://rdsearch.zhaopin.com/Home/ResultForCustom?SF_1_1_7=8,9&orderBy=DATE_MODIFIED,1&pageSize=60&SF_1_1_27=0&exclude=1'
    for name, value in opts.items():
        surl = b.compose_url(surl, name, value)
    return surl
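# Usage sketch: option names are query parameters appended to the search URL
# above via compose_url; the names and values shown here are made up for
# illustration.
#   opts = {'SF_1_1_5': '1,2', 'pageIndex': 2}
#   print get_search_url(opts)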
def test_ps():
    ps = PageStore51()
    ps.testmode = True
    br = BasicRequests()
    br.select_user_agent('firefox')
    url = "http://jobs.51job.com/beijing-hdq/70320056.html?s=0"
    con = br.request_url(url)
    ps.save(int(time.time()), "jd_51job://", url, con.text)
def test_login(self):
    req = BasicRequests()
    # pass the form fields via data=, not the misspelled date=
    con = req.request_url(self.main_url + 'login',
                          data={'username': '******', 'password': '******'})
    print con.headers
    print con.text
def test_parse_time():
    request = BasicRequests()
    con = request.request_url(
        'http://www.zjsfgkw.cn/attachment/documentbook/2016-04-05/0225-0229/html/671a34a7-b068-4025-af13-d9fe4c28ce6a.html'
    )
    # match dates written with Chinese numerals, e.g. 二〇一六年四月五日
    m = re.search(
        ur'[一二三四五六七八九〇零○十]{4}年[一二三四五六七八九〇十○]{1,2}月[一二三四五六七八九〇零○十]{1,3}日',
        con.text)
    if m:
        print date_cs2num(m.group())
def thread_init(self, tid):
    # self.proxyq is thread-safe
    proxy = self.proxyq.get(True)
    basicreq = BasicRequests()
    basicreq.sp_proxies[proxy] = 0
    basicreq._cur_proxy_index = 0
    basicreq._auto_change_proxy = False
    setattr(self._tls, "req", basicreq)
    with self.locker:
        Log.info("Thread%d's request prepared..Proxy:%s" % (tid, proxy))
def try_proxy(proxy, url='http://gaokao.chsi.com.cn', tag=u'阳光高考'):
    req = BasicRequests()
    req.set_proxy(proxy, 0, False)
    # con = req.request_url('http://gk.chsi.com.cn/recruit/listSpecBySchool.do?yxdm=11055&start=0 ')
    con = req.request_url(url, timeout=5)
    if con:
        m = re.search(r'<title>[^<]*<\/title>', con.text)
        if m:
            print m.group()
        return re.search(tag, con.text)
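# Usage sketch: try_proxy returns a truthy match object when the proxied
# request succeeds and the page contains the expected tag. The proxy string
# below is a made-up example.
#   if try_proxy('1.2.3.4:8080'):
#       print 'proxy ok'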
def sub_pages(url, con):
    count = CVZhilianUtil.get_count(url, con)
    # the listing is capped at 4000 results, 60 per page
    if count > 4000:
        count = 4000
    npages = (count + 60 - 1) / 60
    if npages >= 2:
        b = BasicRequests()
        for p in range(2, npages + 1):
            url1 = b.compose_url(url, 'pageIndex', p)
            yield url1
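# Usage sketch: sub_pages() yields the URLs of pages 2..N for a result
# listing whose first page was already fetched; url/con come from an earlier
# request_url call, and enqueue below is hypothetical.
#   for page_url in sub_pages(url, con):
#       enqueue(page_url)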
def test_extract_inner_paper_url():
    rq = BasicRequests()
    con = rq.request_url(
        'http://www.zjsfgkw.cn/document/JudgmentDetail/4177773')
    content = re.search(r'<div class="books_detail_header">.*</IFRAME>',
                        con.text, re.S)
    m = re.search(r'src="([^"]+)"', content.group())
    if m:
        print m.group(1)
    else:
        print content
def test_search():
    a = BasicRequests()
    while True:
        data = {
            'docids1': "1,2,3,4,5,6,6109234,6110168,11070364",
            "keywords": "武汉"
        }
        con = a.request_url("http://localhost:4096/search?hehe=1", data=data)
        if con is not None:
            print con.code, con.text
        time.sleep(10)
def filter_with_speed(proxies, url='http://www.baidu.com', timeout=10):
    results = []
    req = BasicRequests()
    for proxy in proxies:
        req.set_proxy(proxy, len(req.sp_proxies), False)
        try:
            con = req.request_url(url, timeout=timeout)
        except Exception:
            con = None
        if con:
            results.append(proxy)
    return results
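# Usage sketch: keep only the proxies that answered within the timeout.
# The proxy strings are made-up examples.
#   good = filter_with_speed(['1.2.3.4:8080', '5.6.7.8:3128'], timeout=5)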
def get_child_court(self, court):
    req = BasicRequests()
    req.set_proxy(self.proxy)
    time.sleep(1)
    print 'fetching child court', court['key']
    con = req.request_url('http://wenshu.court.gov.cn/Index/GetChildAllCourt',
                          data={'keyCodeArrayStr': court['key']})
    # bail out on a failed request or an HTML (non-JSON) response
    if con is None or '<' in con.text:
        return
    court = self.parse_results(con)
    for c in court:
        self.child_courts.append(c)
def get_court(self):
    req = BasicRequests()
    req.set_proxy(self.proxy)
    for p in self.provinces:
        time.sleep(1)
        print 'fetch province', p['name']
        con = req.request_url('http://wenshu.court.gov.cn/Index/GetCourt',
                              data={'province': p['name']})
        # skip failed requests and HTML (non-JSON) responses
        if con is None or '<' in con.text:
            print 'invalid response'
            continue
        court = self.parse_results(con)
        for c in court:
            self.courts.append(c)
def runjs(url):
    rq = BasicRequests()
    con = rq.request_url(url, data={})
    if con:
        print con.text
        m = re.findall(r'<script[^>]*>(.+?)</script>', con.text, re.S)
        if m:
            for js in m:
                if js == '':
                    continue
                print js
                # stub out just enough of the DOM for the page's scripts to run
                sc = "document = {set cookie(a){console.log(a);}}, window = {innerWidth: 1024, innerHeight: 768, screenX: 200, screenY: 100, screen: {width: 1024, height: 768}}\n"
                sc += js
                rv = spider.util.runjs(sc)
                print 'my results:'
                print rv
def _do_requests(self, url, **kwargs):
    rv = BasicRequests._do_requests(self, url, **kwargs)
    # TODO: replace SimpleCookie with something better.
    if rv is not None:
        # merge the response cookies into this thread's cookie jar
        curlckjar = getattr(self._curltls, 'cookies', None)
        if curlckjar is None:
            curlckjar = CurlCookieJar()
        curlckjar.add_list(rv.cookies)
        setattr(self._curltls, 'cookies', curlckjar)
    return rv
def test_proxy(proxy, url, count=10):
    c = count
    rq = BasicRequests()
    rq.set_proxy(proxy['p'])
    total = 0
    success = 0
    while c > 0:
        try:
            s = time.time()
            con = rq.request_url(url)
            t = time.time() - s
        except Exception:
            con = None
            t = 0
        c -= 1
        if con:
            success += 1
            total += t
    if success > 0:
        # record the average latency of the successful requests
        proxy['v'] = total / success
def test_proxy_speed(url, proxy, t=60):
    req = BasicRequests()
    req.set_proxy(proxy, 0, False)
    s = time.time()
    count = 0
    while time.time() - s <= t:
        try:
            req.request_url(url, timeout=5)
            count += 1
        except Exception:
            pass
    return count
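# Usage sketch: rank candidate proxies by how many requests each completes
# within the 60-second window. The proxies list is a made-up example.
#   scores = dict((p, test_proxy_speed('http://www.baidu.com', p)) for p in proxies)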
#!/usr/bin/env python
# -*- coding:utf8 -*-
from spider.httpreq import BasicRequests

if "__main__" == __name__:
    rq = BasicRequests()
    rq.set_proxy('106.75.134.190:18888:ipin:ipin1234')
    con = rq.request_url('http://www.zjsfgkw.cn/document/JudgmentDetail/4062962')
    if con:
        print con.text
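# The proxy strings in this codebase appear to be colon-separated: host:port,
# optionally followed by credentials (compare the "%s:%s:%s" formatting in
# find_ipin_proxy and the four-field string above). A small helper to split
# them; this layout is an assumption drawn from the examples, not a
# documented format.
def split_proxy(s):
    parts = s.split(':')
    host, port = parts[0], int(parts[1])
    creds = parts[2:]  # [] for bare host:port, else [password] or [user, password]
    return host, port, creds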
def __init__(self):
    BasicRequests.__init__(self)
    self.url = 'http://www.shenpan.cn/cpws/writopenlist.aspx?typeString='
    self.params = {}
    self.count = 0
def __init__(self):
    BasicRequests.__init__(self)
    self._url_format = 'http://gaokao.chsi.com.cn/zyk/pub/myd/schAppraisalTop.action?start=%d'
    self._url_format2 = 'http://gaokao.chsi.com.cn/zsgs/zhangcheng/listVerifedZszc--method-index,lb-1,start-%d.dhtml'
    self._schools = []
def __init__(self, url):
    BasicRequests.__init__(self)
    self.url = url
    'format': 'http://gaokao.chsi.com.cn/zsgs/zhangcheng/listVerifedZszc--method-index,lb-1,start-%d.dhtml',
    'url_reg': r'<a href="\/zsgs\/zhangcheng\/listZszc\-\-schId\-\d+\.dhtml".*>([^<]*)<\/a>',
    'page_reg': r'<a[^>]*start\-\d+[^>]*>(\d+)<',
    'pagesize': 100
}]

url_format = [
    'http://gaokao.chsi.com.cn/zyk/pub/myd/schAppraisalTop.action?start=%d',
    'http://gaokao.chsi.com.cn/zsgs/zhangcheng/listVerifedZszc--method-index,lb-1,start-%d.dhtml'
]
url_pattern = [
    r'<a href="#" onclick="doDialog\(\'\d+\',\'([^\']+)\'\);',
    r'<a href="\/zsgs\/zhangcheng\/listZszc\-\-schId\-\d+\.dhtml".*>([^<]*)<\/a>'
]
page_pattern = [
    r'<a[^>]*start=\d+[^>]*>(\d+)<',
    r'<a[^>]*start\-\d+[^>]*>(\d+)<'
]

request = BasicRequests()
schools = []
for site in sites:
    schools += GetSchoolName.fetch_schools(request, site['format'], site['pagesize'],
                                           site['url_reg'], site['page_reg'])
# deduplicate while preserving order, then save the deduplicated list
ss = []
for c in schools:
    if c not in ss:
        ss.append(c)
GetSchoolName.save(ss)
def __init__(self):
    BasicRequests.__init__(self)
    self.url = 'http://www.ftcourt.gov.cn/cpwspt/writopenlist.aspx?cls=0'
    self.params = {}
    self.count = 0
def __init__(self):
    BasicRequests.__init__(self)
    self.session = requests.Session()
def test_find(self):
    req = BasicRequests()
    con = req.request_url(self.main_url + '?key=' + 'mumas')
    print con.text
    con = req.request_url(self.main_url + '?key=' + 'skiloop')
    print con.text
def load_url(url):
    br = BasicRequests()
    con = br.request_url(url)
    print con.text