# -*- coding: utf-8 -*-
# Standard-library imports used by the spiders below (Python 2). Project-local
# names (BaseGkChsiFsxSpider, LinkSaver, the *PaperStore classes, spider.util)
# are assumed to be imported elsewhere in this module.
import copy
import logging
import re
import time
import HTMLParser


class GkChsiSpecialSpider(BaseGkChsiFsxSpider):
    """
    Single-account, single-thread spider for the provincial score-line pages on
    gk.chsi.com.cn (学信网阳光高考). Crawls the (school, admission level)
    listings -- school name, school code, level -- and every major (专业) page
    under each school.
    CSV seed line format: province, subject category, subject-category code,
    level, level code, year, school code, school name.
    """

    def __init__(self, threadcnt, account, prefix, proxy=None, sleep=0,
                 captcha_limit=50000000, kldms=None, seeds='spec_seeds',
                 recover=False, sleep_max=5, ua='firefox'):
        super(GkChsiSpecialSpider, self).__init__(threadcnt, account, prefix,
                                                  proxy, sleep, captcha_limit,
                                                  sleep_max, ua)
        self.special_saver = GkChsiSpecialPaperStore('yggk_spec_' + prefix)
        self.detail_saver = GkChsiDetailPaperStore('yggk_detail_' + prefix)
        self.prefix = prefix
        self.seeds = seeds
        if proxy:
            self.set_proxy(proxy)
        self.recover = recover
        self.kldms = kldms
        self.parser = HTMLParser.HTMLParser()
        self.curl_share = None
        self.login()
        self.info_saver = LinkSaver(prefix + '_spec_data')
        self.detail_url_format = 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=%s&yxdm=%s&zydm=%s&start=%s'

    def dispatch(self):
        with open(self.seeds, 'r') as f:
            for l in f:
                param = l.strip().split(',')
                if len(param) != 8:
                    logging.warn('invalid seeds %s', l)
                    continue
                self.add_main_job({
                    'wclx': 1,
                    'yxdm': param[6],
                    'kldm': param[2],
                    'bkcc': param[4],
                    'start': 0,
                    'years': param[5],
                    'yxmc': param[7].decode('utf-8')
                })
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    def handle_job(self, jobid):
        # listSpecBySchool.do only accepts the school code and page offset; the
        # rest of the query lives in the server-side session, so first hit
        # listSchByYxmc.do with the full query to establish that state.
        url = 'http://gk.chsi.com.cn/recruit/listSpecBySchool.do'
        url1 = 'http://gk.chsi.com.cn/recruit/listSchByYxmc.do'
        con = self.request_url(url1, data={
            'wclx': 1,
            'yxmc': jobid['yxmc'],
            'kldm': jobid['kldm'],
            'bkcc': jobid['bkcc'],
            'start': jobid['start'],
            'years': jobid['years']
        })
        if not con or not con.text:
            self.on_work_failed(None, jobid, url1)
            return
        con = self.request_url(url, data={
            'yxdm': jobid['yxdm'],
            'start': jobid['start']
        })
        if not con or not con.text:
            self.on_work_failed(None, jobid, url)
            return
        jtitle = '%s/%s/%s/%s/%s/%s' % (jobid['yxdm'], jobid['years'],
                                        jobid['kldm'], jobid['bkcc'],
                                        jobid['wclx'], jobid['start'])
        self.special_saver.save(int(time.time()), jtitle, url, con.text)
        if 0 == jobid['start']:
            # "共 N 页" means "N pages in total"; list pages step by 20 rows.
            m = re.search(ur'共 (\d+) 页', con.text)
            if m:
                pages = int(m.group(1))
                logging.info('found %d pages for %s', pages, str(jobid))
                for page in range(1, pages):
                    data = copy.deepcopy(jobid)
                    data['start'] = page * 20
                    self.add_job(data)
            else:
                logging.warn('failed to parse pages %s', str(jobid))
        specials = self.extract_special(con.text)
        logging.info('found %d specials', len(specials))
        for zydm, zymc in specials:
            content = self.extract_detail(zydm, zymc, jobid, jtitle, 0)
            if content is None:
                continue
            m = re.search(ur'共 (\d+) 页', content)
            if not m:
                continue
            page_cnt = int(m.group(1))
            if page_cnt <= 1:
                continue
            for p in range(1, page_cnt):
                self.extract_detail(zydm, zymc, jobid, jtitle, p)

    def extract_detail(self, zydm, zymc, jobid, jtitle, page):
        # Detail (位次) pages hold 10 rows each, hence start = page * 10.
        logging.info('parsing special %s,%s', zymc, zydm)
        detail_url = self.detail_url_format % (jobid['years'], jobid['yxdm'],
                                               zydm, page * 10)
        detail_content = self.request_url(detail_url)
        if not detail_content or not detail_content.text:
            logging.error('fail to fetch %s', detail_url)
            self.info_saver.append('detail failed:%s,%s' % (str(jobid), detail_url))
            return None
        self.detail_saver.save(int(time.time()),
                               '%s/%s/%s' % (jtitle, zydm, page),
                               detail_url, detail_content.text)
        return detail_content.text

    def extract_special(self, content):
        # Each major row carries a radio button whose value is the major code
        # (zydm), followed by the major name.
        return re.findall(
            r'<input type="radio" name="zydm" value="([\d\w]*)" class="radio">([^<]*)</td>',
            content)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "saved: %d\n" % self.special_saver.saved_count
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
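
# A minimal driver sketch for GkChsiSpecialSpider. It assumes the base
# framework exposes a run() entry point and that `account` is whatever
# credential object login() expects; both the method name and the account
# shape are assumptions, not confirmed by this module, so the sketch is
# left commented out.
#
# if __name__ == '__main__':
#     s = GkChsiSpecialSpider(threadcnt=1, account=('user', 'password'),
#                             prefix='gd', seeds='spec_seeds')
#     s.run()  # hypothetical entry point of BaseGkChsiFsxSpider
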
class GkChsiSchoolSpider(BaseGkChsiFsxSpider):
    """
    Single-account, single-thread spider for the provincial score-line pages on
    gk.chsi.com.cn (学信网阳光高考). Crawls the (school, admission level)
    listings -- school name, school code, level -- by querying 50-point
    score bands.
    """

    def __init__(self, threadcnt, account, prefix, proxy=None, sleep=0.0,
                 highscore=750, captcha_limit=50000, kldms=None, seeds=None,
                 recover=False, sleep_max=5, ua='firefox'):
        super(GkChsiSchoolSpider, self).__init__(threadcnt, account, prefix,
                                                 proxy, sleep, captcha_limit,
                                                 sleep_max, ua)
        if kldms is None:
            kldms = [1, 5]
        self.pagestore = GkChsiSchoolPaperStore('yggk_sch_' + prefix)
        self.prefix = prefix
        if proxy:
            self.set_proxy(proxy)
        self.highscore = highscore
        self.minscore = {}
        self.recover = recover
        self.kldms = kldms
        self.parser = HTMLParser.HTMLParser()
        self.curl_share = None
        self.login()
        self.info_saver = LinkSaver(prefix + '_data')
        self.seeds = seeds

    def __del__(self):
        self.logout()

    def dispatch(self):
        # Refresh the subject-category codes (kldm) from the site; fall back to
        # the constructor defaults unless the page yields exactly two.
        kldms = self.fetch_kldms()
        self.info_saver.append('kldm:' + str(kldms) + '\n')
        if len(kldms) == 2:
            self.kldms[0] = str(kldms[0][0])
            self.kldms[1] = str(kldms[1][0])
        if self.seeds:
            # Recovery mode: re-queue only the seeds not yet in the page store.
            # Seed lines are Python dict literals, hence the eval().
            seeds = []
            with open(self.seeds, 'r') as job_saver:
                lines = job_saver.readlines()
            for l in lines:
                if not self.recover or not self.pagestore.find_any(
                        self.pagestore.channel + '://' + str(l.strip())):
                    seeds.append(eval(l))
            for seed in seeds:
                self.add_main_job(seed)
            print 'recover %d jobs' % len(seeds)
        else:
            # Enumerate 50-point score bands from highscore down to 0 for every
            # (kldm, bkcc) combination; see the _score_bands sketch after this
            # class.
            s = self.highscore
            while s >= 50:
                for kldm in self.kldms:
                    for bkcc in [1, 2]:
                        self.add_main_job({'highscore': s, 'lowscore': s - 50,
                                           'bkcc': bkcc, 'kldm': kldm,
                                           'years': 15, 'start': 0})
                s -= 50
            if s > 0:
                for kldm in self.kldms:
                    for bkcc in [1, 2]:
                        self.add_main_job({'highscore': s, 'lowscore': 0,
                                           'bkcc': bkcc, 'kldm': kldm,
                                           'years': 15, 'start': 0})
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    def handle_job(self, jobid):
        url = 'http://gk.chsi.com.cn/recruit/listRecruitSchool.do'
        con = self.request_url(url, data=jobid)
        if not con or not con.text:
            self.on_work_failed(None, jobid, url)
            return
        # "您输入的数据不符合要求..." = "the submitted data is invalid"; treat
        # it as a transient rejection and re-queue the job.
        if re.search(u'您输入的数据不符合要求,请按照下面的提示信息进行修改', con.text):
            logging.info('re add job,%s', str(jobid))
            self.re_add_job(jobid)
            print 'query error', str(jobid)
            return
        self.pagestore.save(int(time.time()), str(jobid), url, con.text)
        if 0 == jobid['start']:
            # "共 N 页" means "N pages in total"; list pages step by 20 rows.
            m = re.search(ur'共 (\d+) 页', con.text)
            if m:
                pages = int(m.group(1))
                logging.info('found %d pages for %s', pages, str(jobid))
                for page in range(1, pages):
                    data = copy.deepcopy(jobid)
                    data['start'] = page * 20
                    self.add_job(data)
            else:
                logging.warn('failed to parse pages %s', str(jobid))

    def fetch_kldms(self):
        # Scrape the <select name="kldm"> options from the query form to learn
        # the current subject-category codes and labels.
        if self.login_time < -1:
            raise Exception('failed to login')
        con = self.request_url('http://gk.chsi.com.cn/recruit/queryByScore.do')
        if con and con.text:
            m = re.search(r'<select name="kldm">.*?</select>', con.text, re.S)
            if m:
                return re.findall(
                    r'<option value=["\'](\d+)["\'][^>]*>(.*?)</option>',
                    m.group())
        return []

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
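
# The 50-point score-band walk in GkChsiSchoolSpider.dispatch() is easy to get
# wrong by one band, so here is the same enumeration as a standalone sketch;
# the helper name _score_bands is illustrative and not part of the spider.
def _score_bands(highscore):
    """Yield (lowscore, highscore) pairs exactly as dispatch() enumerates them."""
    s = highscore
    while s >= 50:
        yield s - 50, s  # full 50-point band
        s -= 50
    if s > 0:
        yield 0, s  # remainder band when highscore is not a multiple of 50

# e.g. list(_score_bands(120)) == [(70, 120), (20, 70), (0, 20)]
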
class GkChsiDetailSpider(BaseGkChsiFsxSpider):
    """
    Single-account, single-thread spider for the per-major placement (位次)
    detail pages on gk.chsi.com.cn (学信网阳光高考).
    CSV seed line format: province, subject category, subject-category code,
    level, level code, year, school code, major code, major name.
    Seed lines starting with '{' are instead parsed as job dict literals.
    """

    def __init__(self, threadcnt, account, prefix, proxy=None, sleep=0.0,
                 captcha_limit=50000000, seeds='detail_seeds', recover=False,
                 sleep_max=5, ua='firefox', year='15', bkccs=None, kldms=None):
        super(GkChsiDetailSpider, self).__init__(threadcnt, account, prefix,
                                                 proxy, sleep, captcha_limit,
                                                 sleep_max, ua)
        if kldms is None:
            kldms = ['5', '1']
        if bkccs is None:
            bkccs = ['1', '2']
        self.pagestore = GkChsiDetailPaperStore('yggk_detail_' + prefix)
        self.prefix = prefix
        self.seeds = seeds
        if proxy:
            self.set_proxy(proxy)
        self.kldms = kldms
        self.bkccs = bkccs
        self.recover = recover
        self.parser = HTMLParser.HTMLParser()
        self.info_saver = LinkSaver(prefix + '_detail_data')
        self.year = year
        self.detail_url_format = 'http://gk.chsi.com.cn/recruit/listWeiciBySpec.do?year=%s&yxdm=%s&zydm=%s&start=%s'

    def dispatch(self):
        for kldm in self.kldms:
            for bkcc in self.bkccs:
                # Prime the server-side session with the current subject
                # category and level before queuing the matching seeds.
                self.post_kldm_bkcc_for_session(kldm, bkcc)
                seeds = []
                with open(self.seeds, 'r') as f:
                    for l in f:
                        if l[0] == '{':
                            # Seed is a job dict literal dumped on one line.
                            data = eval(l.strip())
                        else:
                            # 9 CSV fields; fields 8 and 9 are the major code
                            # and major name (the original check of 8 would
                            # make param[8] raise IndexError).
                            param = l.strip().split(',')
                            if len(param) != 9:
                                logging.warn('invalid seeds %s', l)
                                continue
                            data = {
                                'wclx': 1,
                                'yxdm': param[6],
                                'kldm': param[2],
                                'bkcc': param[4],
                                'start': 0,
                                'years': param[5],
                                'zydm': param[7],
                                'zymc': param[8].decode('utf-8')
                            }
                        if (data['kldm'] == kldm and bkcc == data['bkcc']
                                and self.year == data['years']):
                            if self.recover and self.pagestore.find_any(
                                    self.pagestore.channel + '://' +
                                    self.get_jobid(data)):
                                continue
                            seeds.append(data)
                for seed in seeds:
                    self.add_main_job(seed)
                print 'add', len(seeds), 'jobs'
        time.sleep(2)
        self.wait_q()
        self.add_job(None)

    def handle_job(self, jobid):
        content = self.extract_detail(jobid)
        if 0 == jobid['start']:
            if content is None:
                return
            m = re.search(ur'共 (\d+) 页', content)
            if not m:
                return
            page_cnt = int(m.group(1))
            if page_cnt <= 1:
                return
            # Detail pages hold 10 rows each, so start steps by 10.
            for p in range(1, page_cnt):
                job = copy.deepcopy(jobid)
                job['start'] = p * 10
                self.add_job(job)

    def get_jobid(self, jobid):
        return '%s/%s/%s/%s/%s/%s/%s/%s' % (
            jobid['yxdm'], jobid['years'], jobid['kldm'], jobid['bkcc'],
            jobid['wclx'], jobid['start'], jobid['zydm'],
            int(jobid['start']) / 10)

    def extract_detail(self, jobid):
        logging.info('parsing special %s,%s', jobid['zymc'], jobid['zydm'])
        detail_url = self.detail_url_format % (jobid['years'], jobid['yxdm'],
                                               jobid['zydm'], jobid['start'])
        detail_content = self.request_url(detail_url)
        if not detail_content or not detail_content.text:
            logging.error('fail to fetch %s', detail_url)
            self.info_saver.append('detail failed:%s,%s' % (str(jobid), detail_url))
            return None
        jtitle = '%s/%s/%s/%s/%s/%s' % (jobid['yxdm'], jobid['years'],
                                        jobid['kldm'], jobid['bkcc'],
                                        jobid['wclx'], jobid['start'])
        self.pagestore.save(
            int(time.time()),
            '%s/%s/%s' % (jtitle, jobid['zydm'], int(jobid['start']) / 10),
            detail_url, detail_content.text)
        return detail_content.text

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**'], '%s DONE' % self._name, msg)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
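
# Worked seed example for GkChsiDetailSpider (all values illustrative): the
# CSV line
#     北京,理科,5,本科一批,1,15,10001,080901,计算机科学与技术
# parses into the job dict
#     {'wclx': 1, 'yxdm': '10001', 'kldm': '5', 'bkcc': '1', 'start': 0,
#      'years': '15', 'zydm': '080901', 'zymc': u'计算机科学与技术'}
# and page p of its placement listing is fetched from detail_url_format with
# start = p * 10, e.g. start=20 for the third page.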