def __init__(self, info):
    SessionRequests.__init__(self)
    self.info = info
    self.select_user_agent('firefox')
    self._con = None
    self.onl = OnlineOCR(info['prov'])
    # Derive the OCR service port from the province code, e.g. prov "440000" -> 8044.
    port = int(info['prov']) / 10000 + 8000
    #self.onl.server = "http://win.haohaogame.com:%d/codeocr" % port
    self.onl.server = "http://192.168.0.10:%d/codeocr" % port
    self.code = None
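# A quick sanity check of the port mapping above; a sketch that assumes
# 6-digit numeric province codes such as "440000" and Python 2 integer
# division. _ocr_port is an illustrative helper, not part of the codebase.
def _ocr_port(prov):
    return int(prov) / 10000 + 8000

assert _ocr_port("440000") == 8044
assert _ocr_port("110000") == 8011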
def test_fetch_paper(url):
    rq = SessionRequests()
    con = rq.request_url(url)
    print con.headers
    print con.cookies
    print con.text
    con = rq.request_url(url)
    print con.text
    print con.headers
    print con.cookies
def check_yantian_cookies():
    sq = SessionRequests()
    con = sq.request_url(
        'http://www.shenpan.cn/cpws/writopenlist.aspx?typeString=')
    print con.headers
    print con.cookies
    # print con.content
    yt = YantianGenQueries()
    yt.get_form_values()
    yt.show()
def __init__(self, thcnt):
    Spider.__init__(self, thcnt)
    self.request = SessionRequests()
    self.view_state = None
    self.event_valid = None
    self.rand = None
    self.loc = "浙江"
    self.data_file = FileSaver("浙江_data.txt")
    self.have_get_url_file = FileSaver("浙江_get_url.txt")
    self.init_already()
    self.login("38037395", "773950")
def load_url_with_cookies(url, cookies):
    rq = SessionRequests()
    for c in cookies:
        rq.add_cookie(c['domain'], c['name'], c['value'],
                      path=c['path'], secure=c['secure'])
    con = rq.request_url(url)
    if con:
        print con.text
        print con.cookies
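# A minimal usage sketch for load_url_with_cookies. The cookie dict shape
# (domain/name/value/path/secure) mirrors the keys the function reads; the
# URL and cookie values here are made-up placeholders.
if __name__ == '__main__':
    cookies = [{
        'domain': '.example.com',
        'name': 'JSESSIONID',
        'value': 'abc123',
        'path': '/',
        'secure': False,
    }]
    load_url_with_cookies('http://www.example.com/', cookies)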
def init_req(self, proxies={}):
    # Cache one SessionRequests instance per worker thread on the
    # thread-local object, initializing it against a random seed URL.
    req = getattr(self._curltls, "req", None)
    if req is None:
        req = SessionRequests()
        url = urls[random.randrange(0, len(urls))]
        while True:
            con_init = req.request_url(url, proxies=proxies)
            if con_init is None or con_init.code != 200:
                print "...init failed, retrying..."
            else:
                break
        setattr(self._curltls, "req", req)
        setattr(self._curltls, "init_url", url)
    return req
def get_session_request(self):
    sr = SessionRequests()
    with self.locker:
        if not isinstance(self.networker.sp_proxies, dict) or len(self.networker.sp_proxies.keys()) == 0:
            return sr
        if self.networker._auto_change_proxy:
            # Round-robin over the proxy pool, skipping proxies with more
            # than 10 recorded failures.
            prs = self.networker.sp_proxies.keys()
            for i in range(0, len(prs)):
                self.networker._cur_proxy_index = (self.networker._cur_proxy_index + 1) % len(prs)
                selproxy = prs[self.networker._cur_proxy_index]
                if self.networker.sp_proxies.get(selproxy, 0) <= 10:
                    sr.set_proxy(selproxy, index=0, auto_change=False)
                    break
        elif self.networker._cur_proxy_index < 0:
            # Don't auto change proxy, and the index < 0: no proxy is used,
            # but don't report an error.
            pass
        else:
            # Pin the currently selected proxy on the new session.
            prs = self.networker.sp_proxies.keys()
            selproxy = prs[self.networker._cur_proxy_index % len(prs)]
            sr.set_proxy(selproxy, index=0, auto_change=False)
    return sr
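# A standalone sketch of the rotation policy above: advance a shared index
# round-robin and skip proxies whose failure count exceeds a threshold.
# The names here (pick_proxy, failures) are illustrative, not part of the
# SessionRequests API.
def pick_proxy(proxies, failures, cur_index, max_failures=10):
    """proxies: list of proxy URLs; failures: dict proxy -> failure count.
    Returns (proxy_or_None, new_index)."""
    for _ in range(len(proxies)):
        cur_index = (cur_index + 1) % len(proxies)
        p = proxies[cur_index]
        if failures.get(p, 0) <= max_failures:
            return p, cur_index
    return None, cur_index

assert pick_proxy(["a", "b"], {"a": 11}, -1) == ("b", 1)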
def test_post_list():
    ll = []
    with open('jobs', 'r') as f:
        for s in f:
            ll.append(s)
    l = ll[3]
    print l
    v = eval(l.encode('utf-8'))
    if not isinstance(v, dict):
        return
    p = copy.deepcopy(v['param'])
    if not isinstance(p, dict):
        return
    print p.keys()
    # p['__EVENTARGUMENT'] = ''
    p['hdPageIndex'] = '10'
    # p.pop('hdPageIndex')
    # p.pop('__VIEWSTATE')
    if '__LASTFOCUS' in p:
        p.pop('__LASTFOCUS')
    # p.pop('__EVENTVALIDATION')
    p['__EVENTTARGET'] = 'btnNext'
    print len(p)
    rq = SessionRequests()
    # con = rq.request_url(v['url'])
    # if con and con.text:
    #     m = re.search(r'<title.*', con.text)
    #     if m:
    #         print m.group()
    #     else:
    #         print len(con.text)
    # data = urlencode(p)  # unused; request_url takes the dict directly
    con = rq.request_url(v['url'], data=p)
    print p
    if con and con.text:
        m = re.search(r'<title.*', con.text)
        if m:
            print m.group()
        print con.text
def test_post(self):
    req = SessionRequests()
    res = req.request_url(self.main_url + 'update', data={
        'encrypt': 'LuJAxGaUMqnDOARGzY9zIe0Rd41opkL7',
        'key': 'mumas',
        'value': '192.168.1.251'
    })
    print res.headers
    print res.text
    res = req.request_url(self.main_url + 'update', data={
        'encrypt': 'LuJAxGaUMqnDOARGzY9zIe0Rd41opkL7',
        'key': 'mumaas',
        'value': '192.168.1.251'
    })
    print res.headers
    print res.text
class check(object):
    def __init__(self):
        self.sessionReq = SessionRequests()
        self.sessionReq.load_proxy('../_51job/proxy')

    def start(self):
        print 'begin crawler!'
        url = r"http://gaokao.chsi.com.cn/zsjh/search.do?ccdm=&jhxzdm=&kldm=&method=majorList&sySsdm=11&year=2016&yxdm=10001"
        header = {
            "User-Agent": r"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0",
            "Host": r"gaokao.chsi.com.cn",
            "Referer": r"http://gaokao.chsi.com.cn/zsjh/"
        }
        while True:
            i = random.randint(1, 60)
            print i
            con = self.sessionReq.request_url(url, headers=header)
            if con is None or con.text.strip() == "":
                print "Nothing back! readd job"
                time.sleep(i)
                continue
            if r"302 Found" in con.text:
                print "302 Found!"
                time.sleep(i)
                continue
            if r"403 Forbidden" in con.text:
                print "403 Forbidden!"
                time.sleep(i)
                continue
            if "专业名称" not in con.text or "计划数" not in con.text:
                print "阳光高考 not updated yet!"
                time.sleep(i)
                continue
            # Both marker strings are present: the site has been updated.
            print "阳光高考 has been updated!"
            break
        spider.util.sendmail(['*****@*****.**'], '阳光高考可以爬了!',
                             "请迅速去抓该网站!")
def test_session():
    cnt = 10
    random.seed(int(time.time()))  # seed the shared RNG
    threads = []
    req = SessionRequests()
    for i in range(0, cnt):
        t = threading.Thread(target=_session_test, args=(req, i))
        threads.append(t)
    for t in threads:
        t.start()
    time.sleep(5)
    for t in threads:
        t.join()
def get_code(image_content):
    retry = 3
    while retry >= 0:
        try:
            # Lianzhong captcha-solving account and password.
            user = "******"
            pwd = "15004622415"
            # Software key, or the author's account key.
            soft_key = "fpTof8NxP4FTFOTp6Tfi3ik6TxtfEOkE68nE3foK"
            s = SessionRequests()
            lz = Lianzhong(s, user, pwd, soft_key)
            vcode, vid = lz.create(image_content)
            return vcode
        except:
            traceback.print_exc()
            retry -= 1
    raise RuntimeError()
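# A minimal usage sketch: feed get_code the raw bytes of a captcha image.
# The file name here is a placeholder.
if __name__ == '__main__':
    with open('captcha.jpeg', 'rb') as f:
        print get_code(f.read())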
class X315Spider(Spider):
    def init_req(self):
        self.sesnreq.load_proxy("../gongshangju/proxy_all.txt")
        self.sesnreq.select_user_agent("firefox")
        self.sesnreq.request_url("http://www.x315.com/index")
        con = self.sesnreq.request_url(
            "http://s4.cnzz.com/stat.php?id=1256666136&show=pic1")
        jsessionid = con.cookies[0].split("\t")[-1]
        # Evaluate the CNZZ stat script under node to recover the tracking
        # cookie it writes into document.cookie.
        jsscript = "var document={};var window = {};" + con.text + "console.log(document.cookie);"
        f = open("login.js", "w+b")
        f.write(jsscript)
        f.close()
        os.system("nodejs login.js>cookie.txt")
        f = open("cookie.txt", "r+b")
        cookiestr = f.read()
        self.cookiestr = urllib.unquote(
            re.search("(CNZZDATA.*?;)", cookiestr).group(1)
            + "JSESSIONID=" + jsessionid + ";")
        print self.cookiestr

    def __init__(self, threadcnt):
        Spider.__init__(self, threadcnt)
        self.sesnreq = SessionRequests()
        self.sesnreq.load_proxy("../gongshangju/proxy1.txt", 0, False)
        self.sesnreq.select_user_agent("firefox")
        # self.init_req()

    def dispatch(self):
        currline = 0
        skipto = 0
        endline = 100000
        with open(os.environ["HOME"] + "/r1.txt", "rb") as f:
            while currline < skipto:
                line = f.readline()
                currline += 1
            while currline < endline:
                line = f.readline().strip()
                key = line.split(" ")[2]
                job = {"type": "t1", "key": key, "line": line, "lineno": currline}
                self.add_main_job(job)
                currline += 1
        self.add_main_job(None)

    def jobrunner1(self, job):
        con = self.sesnreq.request_url(
            "http://www.x315.com/",
            headers={"Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"})
        # self.sesnreq.request_url("http://s4.cnzz.com/stat.php?id=1256666136&show=pic1")
        # Millisecond timestamp built from the integer and fractional parts
        # of time.time().
        url = r"http://www.x315.com/quicksearch?qk=%s&t=1&z=&timestamp=%s" % (
            "富士康",
            str(time.time()).split(".")[0] + str(time.time()).split(".")[1][:3])
        header = {
            "Accept": r"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Connection": "Keep-alive",
            "Content-Type": "application/x-www-form-urlencoded",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Referer": "http://www.x315.com/",
            "X-Requested-With": "XMLHttpRequest"
        }
        con = self.sesnreq.request_url(url, headers=header)
        print con.text
        if ur"查询过于频繁" in con.text:
            if not self.re_add_job(job):
                Log.error("readd job failed.==>" + utf8str(job))
            Log.info("query rate-limited, sleep 10 s.")
            time.sleep(10)
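# The timestamp concatenation in jobrunner1 rebuilds a millisecond timestamp
# from time.time()'s string form, which is fragile when the fractional part
# prints with fewer than three digits. An equivalent sketch:
import time

def _ms_timestamp():
    return str(int(time.time() * 1000))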
def __init__(self):
    self.sessionReq = SessionRequests()
    self.sessionReq.load_proxy('../_51job/proxy')
class GK66(Spider):
    """Rebuilt gk66 crawler on top of our own multithreaded framework. It does
    not handle all provinces; it can only crawl with the account and password
    for a specific province, and captchas are entered by hand rather than
    solved automatically."""

    def __init__(self, thcnt):
        Spider.__init__(self, thcnt)
        self.request = SessionRequests()
        self.view_state = None
        self.event_valid = None
        self.rand = None
        self.loc = "浙江"
        self.data_file = FileSaver("浙江_data.txt")
        self.have_get_url_file = FileSaver("浙江_get_url.txt")
        self.init_already()
        self.login("38037395", "773950")

    def init_already(self):
        cnt = 0
        with open("浙江_get_url.txt") as f:
            for line in f:
                line = line.strip()
                have_get_url.add(line)
                cnt += 1
        print "loaded already-crawled urls:", cnt

    def login(self, username, password):
        # Fetch the captcha image and read the answer from stdin.
        captcha_url = "http://14wj.gk66.cn/ashx/code.ashx"
        rand = None
        while True:
            res = self.request.request_url(captcha_url)
            if res is not None and res.code == 200:
                spider.util.FS.dbg_save_file("captcha.jpeg", res.content)
                rand = raw_input("login ---> enter captcha: ")  # get_code(res.content)
                # if rand == "retry": continue
                login_url = "http://14wj.gk66.cn/ashx/login.ashx"
                data = {"username": username,
                        "password": password,
                        "rand": rand,
                        "rempass": "******"}
                while True:
                    con = self.request.request_url(login_url, data=data)
                    if con is None or con.code != 200:
                        continue
                    content = con.content
                    r_type = eval(content)[0]["type"]
                    if r_type == "2":
                        print "login failed:", spider.util.utf8str(content)
                        if u"验证码错误".encode("gb2312") in content:
                            break  # wrong captcha: fetch a new image
                        continue
                    else:
                        self.view_state, self.event_valid = self.prepare_param()
                        self.rand = self.prepare_rand()
                        if self.view_state is None or self.event_valid is None or self.rand is None:
                            print "--- failed to get view_state / event_valid / rand ---"
                            continue
                        print "login ok..."
                        return
                continue
            else:
                print "login: fetching captcha image failed,", "res is None" if res is None else "res.code = %d" % res.code

    def prepare_param(self):
        search_url = "http://14wj.gk66.cn/wj/fs.aspx"
        res = self.request.request_url(search_url)
        if res is not None and res.code == 200:
            fs_page = res.content
            soup = BeautifulSoup(fs_page, 'html5lib')
            view_state = soup.find(attrs={'id': "__VIEWSTATE"}).get("value")
            event_valid = soup.find(attrs={'id': "__EVENTVALIDATION"}).get("value")
            print "view_state=", view_state, "event_valid=", event_valid
            return view_state, event_valid
        return None, None

    def prepare_rand(self):
        global FILE_NAME_1
        captcha_url_2 = "http://14wj.gk66.cn/ashx/codewj.ashx"
        while True:
            con = self.request.request_url(captcha_url_2)
            if con is None or con.code != 200:
                print "prepare_rand request error...", "result is None" if con is None else "http code = %d" % con.code
                continue
            spider.util.FS.dbg_save_file("captcha2.jpeg", con.content)
            rand = raw_input("prepare_rand ---> enter captcha: ")  # get_code(con.content)
            if rand == "retry":
                continue
            return rand

    def logout(self):
        logout_url = "http://www.gk66.cn/loginout.aspx"
        self.request.request_url(logout_url)

    def wait_q_breakable(self):
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        nf_list = ["14"]  # ["06", "07", "08", "09", "10", "11"]  # years
        wl_list = ["w", "l"]  # arts / science track
        bz_list = ["b", "z"]  # bachelor / associate
        for fs in range(732, 810):
            print "score:", fs
            for nf in nf_list:
                for wl in wl_list:
                    for bz in bz_list:
                        data = {"fs": fs, "nf": nf, "wl": wl, "bz": bz, "pc": "",
                                "ImageButton1.x": 98, "ImageButton1.y": 13}
                        # print "request data:", data
                        job = {"data": data}
                        self.add_main_job(job)
                        time.sleep(0.1)
        self.wait_q_breakable()
        self.add_job(None, True)

    def run_job(self, job):
        data = job["data"]
        data["__VIEWSTATE"] = self.view_state
        data["__EVENTVALIDATION"] = self.event_valid
        data["rand"] = self.rand
        retry = 3
        while retry >= 0:
            try:
                self.loop_exec(data)
                break
            except:
                traceback.print_exc()
                print "error, sleep 1s"
                time.sleep(1)
                retry -= 1
                try:
                    self.logout()
                except:
                    pass
                self.login("38037395", "773950")

    def loop_exec(self, data):
        try:
            while True:
                url = self.build_search_url(data)
                if url is not None:
                    break
                else:
                    self.login("38037395", "773950")
        except Exception as e:
            print "build_search_url failure ...", e
            return
        page_break = False
        last_v = {}
        if url in have_get_url:
            print "already crawled, skip"
            return
        if "http://14wj.gk66.cn/login.aspx?" in url:
            raise RuntimeError()
        for page in range(1, 1000):
            if page_break:
                break
            exec_url = url + "&s=0&page=" + str(page)
            print "fetching:", exec_url
            datas = self.get_score_data(exec_url, page_break=page_break)
            if len(datas) < 20:
                page_break = True
            for v in datas:
                if v is None:
                    page_break = True
                    break
                v["location"] = self.loc
                v["year"] = data["nf"]
                v["wl"] = data["wl"]
                v["bz"] = data["bz"]
                if str(last_v) == str(v):
                    page_break = True
                    break
                last_v = v
                k = {"location": v["location"],
                     "school": v["school"],
                     "spec": v["spec"],
                     "batch": v["batch"],  # admission round
                     "score": v["score"],
                     "year": v["year"],
                     "wl": v["wl"],
                     "bz": v["bz"]}
                print v
                self.data_file.append(spider.util.utf8str(v))
                # store_score(k, v)
        self.recorde_spided(url)

    def recorde_spided(self, url):
        self.have_get_url_file.append(url)
        have_get_url.add(url)

    def get_score_data(self, data_url, page_break=False):
        try:
            page_content = None
            while True:
                res = self.request.request_url(data_url)
                if res is not None and res.code == 200:
                    page_content = res.content
                    if u"对不起,请先登录".encode("gb2312") in page_content:
                        self.logout()
                        self.login("38037395", "773950")
                        continue
                    break
                else:
                    print "fetch page error >..", "res is None" if res is None else "res.code == %d " % res.code
                    continue
            datas = []
            if string.find(page_content, u"相近分数".encode("gb2312")) > 0:
                print "no data on this page"
                return datas
            soup = BeautifulSoup(page_content, 'html5lib')
            rows = soup.findAll("tr")
            if rows is not None and len(rows) > 0:
                if len(rows) != 20:
                    page_break = True
                for row in rows:
                    cols = row.findAll("td")
                    if cols is not None and len(cols) == 13:
                        data = {}
                        data["school"] = cols[0].getText()
                        data["spec"] = cols[1].getText()
                        data["rank"] = cols[2].getText()
                        data["score"] = cols[3].getText()
                        data["batch"] = cols[4].getText()
                        data["score_number"] = cols[5].getText()
                        data["spec_number"] = cols[6].getText()
                        data["high_score"] = cols[7].getText()
                        data["high_score_rank"] = cols[8].getText()
                        data["low_score"] = cols[9].getText()
                        data["low_score_rank"] = cols[10].getText()
                        data["average_score"] = cols[11].getText()
                        data["average_score_rank"] = cols[12].getText()
                        datas.append(data)
                return datas
            else:
                print "empty page:", page_content
        except Exception as e:
            print "get_score_data exception:", e
        return None

    def build_search_url(self, data):
        search_url = "http://14wj.gk66.cn/wj/fs.aspx"
        headers = {
            "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET4.0C; .NET4.0E)"
        }
        resp = self.request.request_url(search_url, data=data, headers=headers)
        headers = resp.headers
        m = re.search("Location:(.*)\r\n", headers)
        if m:
            location = m.group(1).strip()
            return "http://14wj.gk66.cn" + location
        else:
            return None
        # location = resp.headers["Location"]

    def store_score(self, value):
        print filename + ' being written -->', value
        obj = Score_gk66.objects(
            location=value["location"], year=value["year"], bz=value["bz"],
            wl=value["wl"], school=value['school'], spec=value['spec'],
            rank=value['rank'], score=value['score'], batch=value["batch"],
            score_number=value['score_number'], spec_number=value['spec_number'],
            high_score=value['high_score'], high_score_rank=value['high_score_rank'],
            low_score=value['low_score'], low_score_rank=value['low_score_rank'],
            average_score=value['average_score'],
            average_score_rank=value['average_score_rank']).no_cache().timeout(False).first()
        if not obj:
            obj = Score_gk66(location=value["location"], year=value["year"],
                             bz=value["bz"], wl=value["wl"],
                             school=value['school'], spec=value['spec'],
                             rank=value['rank'], score=value['score'],
                             batch=value["batch"],
                             score_number=value['score_number'],
                             spec_number=value['spec_number'],
                             high_score=value['high_score'],
                             high_score_rank=value['high_score_rank'],
                             low_score=value['low_score'],
                             low_score_rank=value['low_score_rank'],
                             average_score=value['average_score'],
                             average_score_rank=value['average_score_rank'])
            obj.save()
            self.num_count += 1
            print "saved:", value
        else:
            print u"record already exists"

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)
class YggaokaoRetry(Spider):
    def __init__(self, threadcnt):
        super(YggaokaoRetry, self).__init__(threadcnt)
        self.sessionReq = SessionRequests()

    def dispatch(self):
        f = open("prov_list", "r+b")
        currline = 0
        skipto = 0
        endline = 10000
        for line in f:
            currline += 1
            if currline >= skipto:
                sySsdm = line.split(" ")[0].strip()
                job = {"sySsdm": sySsdm, "year": "2014", "start": 0, "type": "u1"}
                self.add_main_job(job)
            if currline >= endline:
                break
        self.wait_q()
        self.add_job(None, True)

    def retry(self, con, job):
        if re.search(u'<h1>An error occurred.</h1>', con.text) or re.search(u'Tinyproxy was unable to', con.text):
            # Proxy error page: reload it.
            if int(job["retrycnt"]) < 5:
                job["retrycnt"] = int(job["retrycnt"]) + 1
                self.add_job(job)
                return True
        return False

    def save_sch_list(self, job, res):
        with self.locker:
            fr = open("prov/" + job["sySsdm"] + ".txt", "r+b")
            # Read the existing list once; reading inside the loop would
            # return "" after the first pass.
            existing = fr.read()
            fr.close()
            f = open("prov/" + job["sySsdm"] + ".txt", "a+b")
            schlist = re.findall(
                r'<tr bgcolor="#FFFFFF" onMouseOver="this.style.background=\'#FFFFEE\'" onMouseOut=\"this.'
                r'style.background=\'#ffffff\'">(.*?)</tr>', res, re.S)
            for schinfo in schlist:
                schstr = ""
                tds = re.findall(r"<td.*?>(.*?)</td>", schinfo, re.S)
                if len(tds) == 0:
                    spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " match error! No td tag! Readd..\n")
                    self.re_add_job(job)
                    ferr = open("errors/" + job["sySsdm"] + "_" + str(job["start"]) + ".html", "w+b")
                    ferr.write(res)
                    ferr.close()
                    f.close()
                    return
                schnamelist = re.findall(r'dhtml">(.*?)</a>', tds[0], re.S)
                if len(schnamelist) == 0:
                    schnamelist = []
                    schnamelist.append(tds[0].strip())
                if schnamelist[0] == "":
                    spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " match error! No school name! Readd..\n")
                    self.re_add_job(job)
                    ferr = open("errors/" + job["sySsdm"] + "_" + str(job["start"]) + ".html", "w+b")
                    ferr.write(res)
                    ferr.close()
                    f.close()
                    return
                schname = schnamelist[0]
                if schname in existing:
                    print 'skip...', schname, ", in", job["sySsdm"], ".txt"
                    f.close()
                    return
                schstr += schname
                if r'span985">985</span>' in tds[0]:
                    schstr += " 985"
                if r'span211">211</span>' in tds[0]:
                    schstr += " 211"
                if r'spanyan">研</span>' in tds[0]:
                    schstr += " 研"
                for i in range(len(tds))[1:(len(tds) - 1)]:
                    schstr += " " + tds[i]
                stucnt = re.findall(r"doDialog.*?\">(.*?)</a>", tds[len(tds) - 1], re.S)[0].strip()
                schstr += " " + stucnt
                f.write(schstr + "\n")
                f.flush()
            f.close()

    def save_sch_detail(self, job, res):
        if not os.path.exists("detail/" + job["sySsdm"]):
            os.makedirs("detail/" + job["sySsdm"])
        f = open("detail/" + job["sySsdm"] + "/" + job["yxdm"] + ".html", "w+b")
        f.write(res)
        f.flush()
        f.close()

    def check_should_fetch(self, job):
        if job["type"] == "u1":
            return True
        if os.path.exists("detail/" + job["sySsdm"] + r"/" + job["yxdm"] + ".html"):
            return False
        return True

    def run_job(self, job):
        if job["type"] == "u1":
            print "searching %s, start at %d" % (job["sySsdm"], job["start"])
            url = "http://gaokao.chsi.com.cn/zsjh/searchZsjh.do?ccdm=&jhxzdm=&kldm=&searchType=1&ssdm=&" \
                  "sySsdm=%s&year=%s&yxmc=&start=%d" % (job["sySsdm"], job["year"], job["start"])
            header = {
                "User-Agent": r"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36",
                "Referer": r"http://gaokao.chsi.com.cn/zsjh/zsjh2014.jsp",
                "Origin": r"http://gaokao.chsi.com.cn",
                "X-Requested-With": r"XMLHttpRequest",
                "Pragma": r"no-cache"
            }
            con = self.sessionReq.request_url(url, headers=header)
            if con is None or con.text.strip() == "":
                spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " nothing return ! Readd..\n")
                time.sleep(10)
                self.re_add_job(job)
                return
            res = con.text
            if r"302 Found" in res:
                spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " 302 Found ! Readd..\n")
                self.re_add_job(job)
                firstCraw = True
                setattr(self._curltls, 'firstCraw', firstCraw)
                return
            elif re.search(u'无查询结果', res):
                spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " no data! Readd..\n")
                self.re_add_job(job)
                return
            elif re.search(ur'<H1>错误</H1>', res):
                time.sleep(3)
                spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " error occur ! Readd..\n")
                self.re_add_job(job)
                firstCraw = True
                setattr(self._curltls, 'firstCraw', firstCraw)
                return
            else:
                if int(job["start"]) == 0:
                    m = re.search(r"if \(Num > (\d+)", res)
                    if m:
                        pgcnt = int(m.group(1))
                        while pgcnt > 1:
                            jobb = {"sySsdm": job["sySsdm"], "year": job["year"],
                                    "start": (pgcnt - 1) * 20, "type": "u1"}
                            self.add_job(jobb)
                            pgcnt -= 1
                    else:
                        spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " no more page! Readd!\n")
                        self.re_add_job(job)
                        return
                yxdms = re.findall(r"doDialog\('(\d+)'", res, re.S)
                if len(yxdms) == 0:
                    spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " no url! Readd.\n")
                    self.re_add_job(job)
                    return
                for yxdm in yxdms:
                    job2 = {"yxdm": yxdm, "sySsdm": job["sySsdm"], "year": "2014", "type": "u2"}
                    if not self.check_should_fetch(job2):
                        print "skip...", job['sySsdm'], "/", job["yxdm"]
                    else:
                        self.add_job(job2)
                self.save_sch_list(job, res)
        elif job["type"] == "u2":
            url = r"http://gaokao.chsi.com.cn/zsjh/search.do?" \
                  r"ccdm=&jhxzdm=&kldm=&method=majorList&sySsdm=%s&year=%s&yxdm=%s" % (job["sySsdm"], job["year"], job["yxdm"])
            header = {
                "User-Agent": r"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/47.0.2526.73 Safari/537.36"
            }
            con = self.sessionReq.request_url(url, headers=header)
            if con is None or con.text.strip() == "":
                print "Nothing back! readd job" + job.__str__()
                time.sleep(10)
                self.re_add_job(job)
                return
            if r"302 Found" in con.text:
                spider.runtime.Log.error(job.__str__() + " 302 Found ! Readd..\n")
                self.re_add_job(job)
                return
            res = con.text
            self.save_sch_detail(job, res)
def __init__(self, threadcnt):
    Spider.__init__(self, threadcnt)
    self.sesnreq = SessionRequests()
    self.sesnreq.load_proxy("../gongshangju/proxy1.txt", 0, False)
    self.sesnreq.select_user_agent("firefox")
def __init__(self, threadcnt):
    super(Yggaokao, self).__init__(threadcnt)
    self.sessionReq = SessionRequests()
    self.sessionReq.load_proxy('proxy')
class Yggaokao(Spider):
    def __init__(self, threadcnt):
        super(Yggaokao, self).__init__(threadcnt)
        self.sessionReq = SessionRequests()
        self.sessionReq.load_proxy('proxy')

    def dispatch(self):
        f = open("prov_list", "r+b")
        currline = 0
        skipto = 0
        endline = 100
        for line in f:
            currline += 1
            if currline >= skipto:
                sySsdm = line.split(" ")[0].strip()
                job = {"sySsdm": sySsdm, "year": "2015", "start": 0, "type": "u1"}
                self.add_main_job(job)
            if currline >= endline:
                break
        self.wait_q()
        self.add_job(None, True)

    def check_sch_list(self, url, flag=True):
        # flag=True: ask whether this list url is already recorded.
        # flag=False: record the url as crawled.
        if not os.path.exists("prov"):
            os.makedirs("prov")
        if not os.path.exists("prov/check.txt"):
            f = open('prov/check.txt', 'w')
            f.close()
        if flag:
            with open("prov/check.txt") as file_:
                for line in file_:
                    if url in line:
                        return False
            return True
        f = open("prov/check.txt", "a+")
        f.write(url + "\n")
        f.close()

    def check_sch_detail(self, job):
        if not os.path.exists("detail/" + job["sySsdm"]):
            os.makedirs("detail/" + job["sySsdm"])
            return True
        files = os.listdir("detail/" + job["sySsdm"])
        if files.count(job["yxdm"] + ".html") == 0:
            return True
        return False

    def retry(self, con, job):
        if re.search(u'<h1>An error occurred.</h1>', con.text) or re.search(u'Tinyproxy was unable to', con.text):
            # Proxy error page: reload it.
            if int(job["retrycnt"]) < 5:
                job["retrycnt"] = int(job["retrycnt"]) + 1
                self.add_job(job)
                return True
        return False

    def save_sch_list(self, job, res):
        if not os.path.exists("prov"):
            os.makedirs("prov")
        with self.locker:
            f = open("prov/" + job["sySsdm"] + ".txt", "a+b")
            schlist = re.findall(
                r'<tr bgcolor="#FFFFFF" onMouseOver="this.style.background=\'#FFFFEE\'" onMouseOut=\"this.'
                r'style.background=\'#ffffff\'">(.*?)</tr>', res, re.S)
            for schinfo in schlist:
                schstr = ""
                tds = re.findall(r"<td.*?>(.*?)</td>", schinfo, re.S)
                if len(tds) == 0:
                    spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " match error! No td tag! Readd..\n")
                    self.re_add_job(job)
                    ferr = open("errors/" + job["sySsdm"] + "_" + str(job["start"]) + ".html", "w+b")
                    ferr.write(res)
                    ferr.close()
                    f.close()
                    return
                schnamelist = re.findall(r'dhtml">(.*?)</a>', tds[0], re.S)
                if len(schnamelist) == 0:
                    schnamelist = []
                    schnamelist.append(tds[0].strip())
                if schnamelist[0] == "":
                    spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " match error! No school name! Readd..\n")
                    self.re_add_job(job)
                    ferr = open("errors/" + job["sySsdm"] + "_" + str(job["start"]) + ".html", "w+b")
                    ferr.write(res)
                    ferr.close()
                    f.close()
                    return
                schname = schnamelist[0]
                schstr += schname
                if r'span985">985</span>' in tds[0]:
                    schstr += " 985"
                if r'span211">211</span>' in tds[0]:
                    schstr += " 211"
                if r'spanyan">研</span>' in tds[0]:
                    schstr += " 研"
                for i in range(len(tds))[1:(len(tds) - 1)]:
                    schstr += " " + tds[i]
                stucnt = re.findall(r"doDialog.*?\">(.*?)</a>", tds[len(tds) - 1], re.S)[0].strip()
                schstr += " " + stucnt
                f.write(schstr + "\n")
                f.flush()
            f.close()

    def save_sch_detail(self, job, res):
        if not os.path.exists("detail/" + job["sySsdm"]):
            os.makedirs("detail/" + job["sySsdm"])
        f = open("detail/" + job["sySsdm"] + "/" + job["yxdm"] + ".html", "w+b")
        f.write(res)
        f.flush()
        f.close()

    def run_job(self, job):
        i = random.randint(15, 30)
        if job["type"] == "u1":
            print "searching %s, start at %d" % (job["sySsdm"], job["start"])
            url = "http://gaokao.chsi.com.cn/zsjh/searchZsjh--year-%s,searchType-1,sySsdm-%s,start-%d.dhtml" % (
                job["year"], job["sySsdm"], job["start"])
            # url = "http://gaokao.chsi.com.cn/zsjh/searchZsjh.do?ccdm=&jhxzdm=&kldm=&searchType=1&ssdm=&" \
            #       "sySsdm=%s&year=%s&yxmc=&start=%d" % (job["sySsdm"], job["year"], job["start"])
            header = {
                "User-Agent": r"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0",
                # "Referer": r"http://gaokao.chsi.com.cn/zsjh/",
                # "Origin": r"http://gaokao.chsi.com.cn",
                "Host": r"gaokao.chsi.com.cn",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.5",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "Cache-Control": "max-age=0"
                # "X-Requested-With": r"XMLHttpRequest",
                # "Pragma": r"no-cache"
            }
            con = self.sessionReq.request_url(url, headers=header)
            if con is None or con.text.strip() == "":
                print "result is None!"
                spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " nothing return ! Readd..\n")
                time.sleep(i)
                self.re_add_job(job)
                return
            res = con.text
            if r"302 Found" in res:
                spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " 302 Found ! Readd..\n")
                time.sleep(i)
                self.re_add_job(job)
                firstCraw = True
                setattr(self._curltls, 'firstCraw', firstCraw)
                return
            elif r"403 " in con.text:
                print "403 Forbidden (request rejected), list page"
                spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + "403 Forbidden ! Readd..\n")
                time.sleep(i)
                self.re_add_job(job)
                firstCraw = True
                setattr(self._curltls, 'firstCraw', firstCraw)
                return
            elif re.search(u'无查询结果', res):
                print "no results!"
                spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " no data! Readd..\n")
                time.sleep(i)
                self.re_add_job(job)
                firstCraw = True
                setattr(self._curltls, 'firstCraw', firstCraw)
                return
            elif re.search(ur'<H1>错误</H1>', res):
                print "<H1>错误</H1>!"
                time.sleep(i)
                spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " error occur ! Readd..\n")
                self.re_add_job(job)
                firstCraw = True
                setattr(self._curltls, 'firstCraw', firstCraw)
                return
            else:
                if int(job["start"]) == 0:
                    m = re.search(r"if \(Num > (\d+)", res)
                    if m:
                        pgcnt = int(m.group(1))
                        while pgcnt > 1:
                            jobb = {"sySsdm": job["sySsdm"], "year": job["year"],
                                    "start": (pgcnt - 1) * 20, "type": "u1"}
                            self.add_job(jobb)
                            pgcnt -= 1
                    # else:
                    #     spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " no more page! Readd!\n")
                    #     self.re_add_job(job)
                yxdms = re.findall(r"doDialog\('(\d+)'", res, re.S)
                if len(yxdms) == 0:
                    spider.runtime.Log.error(job["sySsdm"] + ", start at " + str(job["start"]) + " no url! Readd.\n")
                    time.sleep(i)
                    self.re_add_job(job)
                    return
                for yxdm in yxdms:
                    job2 = {"yxdm": yxdm, "sySsdm": job["sySsdm"], "year": "2015", "type": "u2"}
                    self.add_job(job2)
                if self.check_sch_list(url):
                    self.save_sch_list(job, res)
                    self.check_sch_list(url, False)
                else:
                    print "list page already crawled!"
        elif job["type"] == "u2":
            url = r"http://gaokao.chsi.com.cn/zsjh/search.do?" \
                  r"ccdm=&jhxzdm=&kldm=&method=majorList&sySsdm=%s&year=%s&yxdm=%s" % (job["sySsdm"], job["year"], job["yxdm"])
            header = {
                "User-Agent": r"Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:39.0) Gecko/20100101 Firefox/39.0",
                "Host": "gaokao.chsi.com.cn",
                "Referer": "http://gaokao.chsi.com.cn/zsjh/",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "zh-CN,zh;q=0.5",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
                "Cache-Control": "max-age=0"
            }
            if self.check_sch_detail(job):
                con = self.sessionReq.request_url(url, headers=header)
                if con is None or con.text.strip() == "":
                    print "Nothing back! readd job" + job.__str__()
                    time.sleep(i)
                    self.re_add_job(job)
                    return
                if r"302 Found" in con.text:
                    print "302 Found!"
                    spider.runtime.Log.error(job.__str__() + " 302 Found ! Readd..\n")
                    time.sleep(i)
                    self.re_add_job(job)
                    return
                if r"403 " in con.text:
                    print "403 Forbidden (request rejected), detail page"
                    spider.runtime.Log.error(job.__str__() + " 403 Forbidden ! Readd..\n")
                    time.sleep(i)
                    self.re_add_job(job)
                    return
                res = con.text
                self.save_sch_detail(job, res)
            else:
                print job["sySsdm"] + ":" + str(job["yxdm"]) + " ----- school already crawled!"