# -*- coding: utf-8 -*-
# Python 2 spiders that scrape admission-score tables from youzy.cn.
# The imports below are assumptions about this repo's layout: Spider,
# AccountErrors, the saver classes, YouzyLogin, and the fixed/pfcolumn
# header lists are all project-local and defined elsewhere.
import sys
import time
import json

from lxml import html

import spider.util
from spider import Spider, AccountErrors          # assumed project-local base class
from youzy_login import YouzyLogin                # assumed login/session helper
from savers import FileSaver, CsvSaver, BinSaver  # assumed output helpers
from columns import fixed, pfcolumn               # assumed CSV header lists


class YouzyNPSpider(Spider):
    def __init__(self, thcnt):
        Spider.__init__(self, thcnt)
        self.sessionReq = YouzyLogin()
        self.sessionReq.do_login(1)
        self.num_count = 0
        self.parse_count = 0
        self.savefile = CsvSaver("spider_url_zhuanke_np.csv", fixed + pfcolumn)
        self.__fail_urls = FileSaver("spider_url_fail_zhuanke_np.txt")

    def wait_q_breakable(self):
        # Block until all three job queues are drained and no worker is running.
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        # Re-dispatch URLs that failed in a previous run.
        # f = open("spider_fail_np.txt", "r")
        f = open("spider_zhuanke_fail_np.txt", "r")
        while True:
            line = f.readline().strip()
            if line:
                job = {"url": line, "retry": 0, "time": 0}
                self.add_job(job, True)
            else:
                break
        f.close()
        self.wait_q_breakable()
        self.add_job(None, True)

    def get_fail_cnt(self, addv):
        # Consecutive-failure counter kept in per-thread (curl TLS) storage.
        fc = getattr(self._curltls, 'failcount', 0)
        if addv:
            fc += addv
            setattr(self._curltls, 'failcount', fc)
        return fc

    def run_job(self, jobid):
        url = jobid.get("url")
        res = self.sessionReq.request_url(url)
        self.num_count += 1
        if res is None or res.code == 404:
            # Retrying is disabled here: dead URLs go straight to the fail file.
            print "url= [%s] response %s " % (url, 'is None' if res is None else "code is 404")
            self.__fail_urls.append(url)
        elif res.code in (500, 502, 503, 504):
            # Transient server error: requeue and back off briefly.
            print "%s ------ %d " % (url, res.code)
            self.add_job(jobid)
            time.sleep(1)
        elif res.code == 200:
            if 'codeId' in url:
                self.parse(res.content, url, int(jobid.get("retry")))
            else:
                # A page without codeId lists sub-codes in a <select>;
                # fan out one job per code, otherwise parse the page itself.
                doc = html.fromstring(res.content)
                values = doc.xpath('//select[@id="ddlCodes"]/option/@value')
                if values:
                    for value in values:
                        self.add_job({"url": url + "&codeId=" + value, "retry": 0})
                else:
                    self.parse(res.content, url, int(jobid.get("retry")))
        else:
            print "%s ########UNKNOWN ERROR###### [ %d ]" % (url, res.code)

    def parse(self, text, url, retry):
        doc = html.fromstring(text)
        fixed_line = []
        name = doc.xpath('//div[@class="box"]/h2')
        if len(name) < 1:
            # Retrying is disabled here as well: empty pages go to the fail file.
            print "page content is none, write failure file, url =[%s]" % url
            self.__fail_urls.append(url)
            return False
        fixed_line.append(name[0].text_content().encode('utf-8'))
        options = doc.xpath('//select[@id]/option[@selected]')
        for option in options:
            fixed_line.append(option.text_content().encode('utf-8'))
        headers = doc.xpath('//table[@class="table table-hover table-bordered table-th-gray"]/thead/tr/th')
        header_column = []
        for header in headers:
            header_column.append(header.text_content().encode('utf-8'))
        bodys = doc.xpath('//table[@class="table table-hover table-bordered table-th-gray"]/tbody')
        body_list = []
        for body in bodys:
            if body.attrib.get('style', None):
                continue
            trs = body.xpath('tr')
            first = ''
            rowspan = 0
            fo = 0
            for tr in trs:
                fo += 1
                tds = tr.xpath('td')
                if len(tds) == len(header_column):
                    # A full-width row opens a rowspan group: remember its first cell.
                    first = tds[0].text_content().encode('utf-8')
                    rowspan = int(tds[0].attrib.get('rowspan', 1) or 1)
                    fo = 1
                    print "---first=%s,rowspan=%d" % (first, rowspan)
                td_list = []
                if 1 < fo <= rowspan:
                    # Continuation rows omit the spanned first cell; copy it back in.
                    td_list.append(first)
                for td in tds:
                    if td.attrib.get('style', None):
                        continue
                    td_list.append(td.text_content().encode('utf-8'))
                body_list.append(td_list)
        self.handle_line(body_list, header_column)
        for tr in body_list:
            self.savefile.writerline(fixed_line + tr)
        self.parse_count += 1
        return True

    def handle_line(self, body_list, header_column):
        # Pad every row out to the full pfcolumn layout: where this page's
        # table lacks a column, insert 'null' at that column's position.
        for tr in body_list:
            for h in pfcolumn:
                if h not in header_column:
                    tr.insert(pfcolumn.index(h), 'null')

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)
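# --- A minimal sketch of the handle_line padding, using hypothetical column
# names (the real pfcolumn list is defined elsewhere in this project):
#
#     pfcolumn      = ['min', 'avg', 'max']
#     header_column = ['min', 'max']     # this page's table has no 'avg' column
#     tr            = ['540', '602']
#     # handle_line inserts 'null' at pfcolumn.index('avg') == 1:
#     tr -> ['540', 'null', '602']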
class YouzySpiderPF(Spider):
    def __init__(self, thcnt):
        Spider.__init__(self, thcnt)
        self.sessionReq = YouzyLogin()
        self.sessionReq.do_login(5)
        self.savefile = CsvSaver("spider_zhuanke_pf.csv", fixed + pfcolumn)
        self.__fail_urls = FileSaver("spider_zhuanke_fail_pf.txt")
        self.id_count = 0
        self.url_count = 0
        self.parse_count = 0

    def wait_q_breakable(self):
        # Block until all three job queues are drained and no worker is running.
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        f = open("zhuanke.txt", "r")
        while True:
            line = f.readline().strip()
            if line:
                job = {"id": line, "retry": 0}
                self.add_job(job, True)
            else:
                break
        f.close()
        self.wait_q_breakable()
        self.add_job(None, True)

    def run_job(self, jobid):
        if self.id_count % 11 == 0:
            print "NOW ========== [id_count=%d],[url_count=%d],[parse_count=%d]" % (
                self.id_count, self.url_count, self.parse_count)
        cid = jobid.get("id")
        baseUrl = "http://www.youzy.cn/college/pfraction?courseType=-1&year=0&Id=" + cid
        # Province ids known to the site; every college id is queried per province.
        province = {
            "1", "842", "843", "851", "1128", "845", "834", "844", "848",
            "847", "855", "849", "850", "859", "837", "846", "839", "852",
            "860", "854", "840", "841", "1120", "857", "856", "862", "835"
        }
        self.id_count += 1
        for pro in province:
            url1 = baseUrl + "&provinceId=" + pro
            res = self.sessionReq.request_url(url1)
            self.url_count += 1
            if res is None:
                retry = int(jobid.get("retry"))
                if retry < 5:
                    retry += 1
                    self.id_count -= 1
                    print "id %s retrying %d..." % (cid, retry)
                    self.add_job({"id": cid, "retry": retry})
                else:
                    self.__fail_urls.append(url1)
            elif res.code == 200:
                doc = html.fromstring(res.content)
                values = doc.xpath('//select[@id="ddlCodes"]/option/@value')
                if values:
                    # Fan out one synchronous fetch per sub-code.
                    for value in values:
                        self.req_local(url1 + "&codeId=" + value, 0)
                else:
                    self.parse(res.content, url1)
            elif res.code in (500, 502, 503, 504):
                time.sleep(1)
                self.id_count -= 1
                self.add_job(jobid)
            elif res.code == 404:
                self.__fail_urls.append(url1)
                print "page [ %s ] is not found ! " % url1

    def parse(self, text, url):
        doc = html.fromstring(text)
        fixed_line = []
        name = doc.xpath('//div[@class="box"]/h2')
        if len(name) < 1:
            print "page content is none, url = %s" % url
            self.__fail_urls.append(url)
            return False
        fixed_line.append(name[0].text_content().encode('utf-8'))
        options = doc.xpath('//select[@id]/option[@selected]')
        for option in options:
            fixed_line.append(option.text_content().encode('utf-8'))
        headers = doc.xpath('//table[@class="table table-hover table-bordered table-th-gray"]/thead/tr/th')
        header_column = []
        for header in headers:
            if header.attrib.get('style', None):
                continue
            header_column.append(header.text_content().encode('utf-8'))
        bodys = doc.xpath('//table[@class="table table-hover table-bordered table-th-gray"]/tbody')
        body_list = []
        for body in bodys:
            if body.attrib.get('style', None):
                continue
            for tr in body.xpath('tr'):
                td_list = []
                for td in tr.xpath('td'):
                    if td.attrib.get('style', None):
                        continue
                    td_list.append(td.text_content().encode('utf-8'))
                body_list.append(td_list)
        self.handle_line(body_list, header_column)
        for tr in body_list:
            self.savefile.writerline(fixed_line + tr)
        self.parse_count += 1
        return True

    def handle_line(self, body_list, header_column):
        # Same padding as in YouzyNPSpider: insert 'null' for missing columns.
        for tr in body_list:
            for h in pfcolumn:
                if h not in header_column:
                    tr.insert(pfcolumn.index(h), 'null')

    def req_local(self, url, retry):
        # Fetch one codeId sub-page inline, retrying recursively:
        # up to five times on a dead connection, indefinitely on 5xx.
        res = self.sessionReq.request_url(url)
        self.url_count += 1
        if res is None:
            if retry < 5:
                self.url_count -= 1
                self.req_local(url, retry + 1)
            else:
                self.__fail_urls.append(url)
        elif res.code == 404:
            self.__fail_urls.append(url)
            print "%s ------ 404" % url
        elif res.code in (500, 502, 503, 504):
            print "%s ------ %d " % (url, res.code)
            time.sleep(1)
            self.url_count -= 1
            self.req_local(url, retry)
        elif res.code == 200:
            self.parse(res.content, url)
        else:
            print "##### UNKNOWN ERROR ##### [ %d ]" % res.code
            # raise AccountErrors.NoAccountError('fatal error')

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)
class YouzySpider(Spider):
    def __init__(self, thcnt):
        Spider.__init__(self, thcnt)
        self.sessionReq = YouzyLogin()
        self.sessionReq.do_login()
        self.num_count = 0
        self.savefile = FileSaver("youzy.txt")
        self.__fail_urls = FileSaver("fail_urls.txt")

    def wait_q_breakable(self):
        # Block until all three job queues are drained and no worker is running.
        lt = 0
        while True:
            if not self.job_queue.empty() or not self.job_queue2.empty() or not self.job_queue3.empty():
                time.sleep(5)
            if time.time() < lt + 1 and self._running_count == 0:
                return True
            time.sleep(2)
            lt = time.time()
            if self._worker_count == 0:
                return False

    def dispatch(self):
        self.bs = BinSaver("youzy_job.bin")
        f = open("url_cfraction918-.txt", "r")
        while True:
            line = f.readline()
            if line.strip():
                self.add_job({"url": line.strip()}, True)
            else:
                break
        f.close()
        self.wait_q_breakable()
        self.add_job(None, True)

    def get_fail_cnt(self, addv):
        # Consecutive-failure counter kept in per-thread (curl TLS) storage.
        fc = getattr(self._curltls, 'failcount', 0)
        if addv:
            fc += addv
            setattr(self._curltls, 'failcount', fc)
        return fc

    def run_job(self, jobid):
        url = jobid.get("url")
        print "url == ", url
        tid = self.get_tid()  # thread id; fetched but currently unused
        res = self.sessionReq.request_url(url)
        self.num_count += 1
        if res is None:
            # Requeue until this thread has failed ten times in a row,
            # then record the URL and abort the thread.
            if self.get_fail_cnt(1) < 10:
                self.add_job(jobid)
            else:
                self.__fail_urls.append(url)
                raise AccountErrors.NoAccountError("failcount = [ %d ]" % self.get_fail_cnt(0))
            return
        setattr(self._curltls, 'failcount', 0)
        if res.code == 404:
            print "%s ------ 404" % url
        elif res.code in (500, 502, 503, 504):
            print "%s ------ %d " % (url, res.code)
            self.add_job(jobid)
            time.sleep(0.8)
        elif res.code == 200:
            print "%s ------ saving " % url
            self.parse(res.content)
            # parse() only prints the page; what gets persisted is a stub record.
            record = {"key": []}
            self.savefile.append(json.dumps(record))
            # self.bs.append(fn, res.text)
        else:
            print "##### UNKNOWN ERROR ##### [ %d ]" % res.code
            # Log.error("unknown error...")
            # Log.errorbin("%s" % url, res.text)
            raise AccountErrors.NoAccountError('fatal error')

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail('*****@*****.**', '%s DONE' % sys.argv[0], msg)

    def parse(self, text):
        # Debug-dump the selected filter options, the table header, and the
        # cell texts; nothing is returned or stored.
        doc = html.fromstring(text)
        part1_template = '{}\t\t\t\t\t{}\t\t\t\t\t{}\t\t\t\t\t{}'
        part1_list = []
        for el in doc.xpath('//select[@id]/option[@selected]'):
            print el
            part1_list.append(el.text_content().encode('utf-8'))
        print part1_template.format(*part1_list)
        print "=" * 81  # visual separator
        header_template = '{}\t\t\t\t\t{}\t\t\t\t\t{}\t\t\t\t\t{}\t\t\t\t\t{}'
        header_list = []
        headers = doc.xpath('//table[@class="table table-hover table-bordered table-th-gray"]/thead/tr/th')
        for header in headers:
            if header.attrib.get('style', None):
                continue
            header_list.append(header.text_content().encode('utf-8'))
        print header_template.format(*header_list)
        bodys = doc.xpath('//table[@class="table table-hover table-bordered table-th-gray"]/tbody')
        td_list = []
        for body in bodys:
            if body.attrib.get('style', None):
                continue
            for tr in body.xpath('tr'):
                for td in tr.xpath('td'):
                    if td.attrib.get('style', None):
                        continue
                    td_list.append(td.text_content().encode('utf-8'))
        # As in the original, this assumes at least five visible cells;
        # fewer raise IndexError in str.format.
        print header_template.format(*td_list)
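# This file does not show how the spiders are launched. A minimal sketch,
# assuming the Spider base class exposes a run() entry point and that 20
# worker threads are wanted (both are assumptions, hence commented out):
#
# if __name__ == '__main__':
#     s = YouzySpiderPF(20)
#     s.run()   # hypothetical entry point on the Spider base class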