def init_data():
    """Resolve the links listed in the local file ``af`` and print them as JSON.

    Reads at most the first 64 KiB of ``af``, extracts every
    ``<a ...>...</a>`` element and, for each one, requests its ``href`` with
    a Firefox user agent.  One record per anchor is collected with the keys
    ``name`` (anchor text), ``url`` (``con.request.url`` of the response, or
    ``'unknown'`` when no response was obtained), ``imgurl`` (always empty)
    and ``prov`` (value of the anchor's ``prov`` attribute, or ``None``).

    The records are printed to stdout as an indented JSON array; nothing is
    returned.
    """
    br = BasicRequests()
    br.select_user_agent('firefox')

    # Context manager so the handle is released even if the read fails.
    with open('af') as fh:
        s = fh.read(64 * 1024)

    odata = []
    for m in re.finditer('<a (.*?)>(.*?)</a>', s, re.S):
        name = m.group(2)
        # Only lower-case, double-quoted attributes are recognised; when an
        # attribute is repeated the last occurrence wins.
        attrs = dict(re.findall('([a-z][a-z0-9]+)="(.*?)"', m.group(1), re.S))
        url = attrs.get('href')
        prov = attrs.get('prov')

        # An anchor without href has nothing to fetch; it is still recorded,
        # with the 'unknown' placeholder URL.
        con = br.request_url(url) if url is not None else None

        siteurl = 'unknown'
        if con is not None:
            siteurl = con.request.url
            # Progress output only for successful fetches: a failed request
            # has no ``con.request`` to read.  A single parenthesised
            # argument keeps this valid under the Python 2 print statement.
            print('%s %s' % (name, siteurl))

        odata.append({
            'name': name,
            'url': siteurl,
            'imgurl': '',
            'prov': prov,
        })

    print(json.dumps(odata, ensure_ascii=False, indent=4))
def test_ps():
    """Fetch one fixed 51job page and hand it to a test-mode PageStore51.

    The page is requested with a Firefox user agent and saved with the
    current Unix time (whole seconds), the ``"jd_51job://"`` prefix, the
    page URL and the response text.
    """
    target = "http://jobs.51job.com/beijing-hdq/70320056.html?s=0"

    store = PageStore51()
    store.testmode = True

    fetcher = BasicRequests()
    fetcher.select_user_agent('firefox')
    response = fetcher.request_url(target)

    fetched_at = int(time.time())
    store.save(fetched_at, "jd_51job://", target, response.text)
#!/usr/bin/env python
# -*- coding:utf8 -*-

import io
import tesseract_ocr
import pyocr
from PIL import Image
from court.util import Captcha
from spider.httpreq import BasicRequests

if __name__ == '__main__':
    # Manual check: download one captcha image through the proxy, print the
    # text Tesseract reads from it, then store it as ``a.jpeg`` and print
    # whatever ``Captcha.resolve`` returns for that file.
    fetcher = BasicRequests()
    fetcher.select_user_agent('firefox')
    fetcher.set_proxy('106.75.134.191:18888:ipin:ipin1234')

    # Alternative captcha endpoint: http://ssfw.szcourt.gov.cn/yzm.jsp
    response = fetcher.request_url('http://www.bjcourt.gov.cn/yzm.jpg')
    if response:
        ocr = tesseract_ocr.Tesseract()
        recognized = ocr.text_for_bytes(response.content)
        print(recognized)

        with open('a.jpeg', 'wb') as image_file:
            image_file.write(response.content)
        print(Captcha.resolve('a.jpeg', '1'))
    else:
        print('failed to fetch image')
def get_area_code():
    """Build a lookup from region name to a four-character code prefix.

    Downloads the code table page from stats.gov.cn, takes the
    ``TRS_PreAppend`` div, replaces every tag with the result of
    ``sv(match)`` and parses the remaining text line by line as
    ``<code> <name>`` pairs.

    Returns:
        dict: unicode region name -> first four characters of its code.
        Two kinds of keys are present:

        * short names (as produced by ``get_short_name``) for which a single
          prefix could be chosen, and
        * full names that occur exactly once in the table.

        When a short name and a full name collide, the full-name entry wins
        because it is written last.

    Raises:
        AssertionError: if a short name maps to several prefixes and more
            than one of its codes ends in ``'00'``.
    """
    nr = BasicRequests()
    nr.select_user_agent('firefox')
    con = nr.request_url(
        'http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201504/t20150415_712722.html'
    )
    # NOTE(review): request_url presumably returns None on failure, in which
    # case ``con.text`` raises AttributeError here - confirm desired handling.
    xx = spider.util.htmlfind(con.text, '<div class="TRS_PreAppend"', 0)
    shtml = xx.get_node()
    stext = re.sub('<.*?>', sv, shtml)
    # NOTE(review): pattern and replacement look identical here; presumably
    # the pattern was originally a non-breaking/full-width space - confirm.
    stext = re.sub(' ', ' ', stext)

    # Generic district labels that say nothing about which region they are in.
    generic_names = (u'市辖区', u'区', u'县', u'矿区', u'郊区', u'城区')

    cclist = {}      # short name -> {prefix -> [[code, full name], ...]}
    fulloutmap = {}  # full name  -> [prefix, ...], one item per occurrence
    for line in stext.split("\n"):
        cns = re.split(r'\s+', line)
        if len(cns) <= 1:
            continue
        code, name = cns
        name = name.decode('utf-8').strip()
        if u'直辖县级行政区划' in name or name in generic_names:
            continue

        prefix = code[0:4]
        fulloutmap.setdefault(name, []).append(prefix)

        name1 = get_short_name(name)
        if name1 is None or name1 == name:
            continue
        cclist.setdefault(name1, {}).setdefault(prefix, []).append([code, name])

    # Short names whose prefix is fixed by hand, bypassing the selection below.
    fixed_prefixes = {u'吉林': "2202", u"海南": "4600"}

    outmap = {}
    for key, by_prefix in cclist.items():
        if key in fixed_prefixes:
            outmap[key] = fixed_prefixes[key]
        elif len(by_prefix) == 1:
            outmap[key] = next(iter(by_prefix))
        else:
            # Several prefixes share this short name: accept it only when
            # exactly one of the candidate codes ends in '00'.
            preflist = [
                cand_code
                for entries in by_prefix.values()
                for cand_code, _cand_name in entries
                if cand_code[-2:] == '00'
            ]
            if len(preflist) == 1:
                outmap[key] = preflist[0][0:4]
            elif len(preflist) > 1:
                # Raised explicitly (not via ``assert``) so the check is not
                # stripped when Python runs with -O.
                raise AssertionError(
                    "ambiguous short name %r: several codes end in '00': %r"
                    % (key, preflist)
                )
            # No candidate at all: the short name is simply left out.

    fout = dict(outmap)
    for k, v in fulloutmap.items():
        # Only full names seen exactly once are unambiguous.
        if len(v) == 1:
            fout[k] = v[0]
    return fout