class CVZhilianSplit(GenQueries):
    """Query-space splitter for Zhaopin (zhilian) resume search.

    Enumerates search filter combinations and asks the site how many
    resumes each combination matches, so that over-full queries
    (>= 4000 hits, the site's result cap) can be split further.
    """

    def __init__(self, thcnt, ac):
        GenQueries.__init__(self, thcnt)
        self._last_time = 0.0
        # Managed pool of logged-in Zhaopin accounts.
        self.zlm = MRLManager(ac, new_ZLLogin)
        self.headers = {'Referer': 'http://rdsearch.zhaopin.com/Home/ResultForCustom'}
        # Searches performed with the current account; used for rotation.
        self.search_cnt = 0

    def init_conditions(self):
        # Update date fixed to "recent"; split axes are:
        # age, gender, education, residence registration, work state,
        # current city, company type, company size.
        CVZhilianData.add(self, 'SF_1_1_7', [['4,9', '最近一个月']])
        CVZhilianData.add(self, 'SF_1_1_8', CVZhilianData.agelist)
        CVZhilianData.add(self, 'SF_1_1_9', CVZhilianData.gender)
        CVZhilianData.add(self, 'SF_1_1_6', qdata.provs)  # current city
        CVZhilianData.add(self, 'SF_1_1_5', CVZhilianData.edugr)
        CVZhilianData.add(self, 'SF_1_1_10', qdata.provs)  # registered residence
        CVZhilianData.add(self, 'SF_1_1_29', CVZhilianData.workstate)
        CVZhilianData.add(self, 'SF_1_1_31', CVZhilianData.corp_type)
        CVZhilianData.add(self, 'SF_1_1_30', CVZhilianData.corp_size)
        # Verify up front that the account is allowed to search at all.
        self.zlm.ensure_login_do(None, lambda n: 1, None)
        cansearch = self.zlm.cur_worker().cansearch
        self.zlm.release_obj()
        if not cansearch:
            raise RuntimeError("this account can't search!")

    def search_cnt_checker(self, net):
        """Pre-request hook: rotate the account after 380 searches.

        Raises AccountHoldError so the manager swaps in another login.
        """
        self.search_cnt += 1
        if self.search_cnt > 380:
            self.search_cnt = 0
            raise LoginErrors.AccountHoldError()

    def need_split(self, opts, level, isLast):
        """Return truthy when the query `opts` matches >= 4000 resumes.

        Returns 0 for an empty result set; retries indefinitely on
        fetch failure (404 is treated as a failed fetch).
        """
        url = CVZhilianUtil.get_search_url(opts)
        # Retry loop instead of recursion: the original recursed on every
        # failure, which could exhaust the stack during a long outage.
        while True:
            con = self.zlm.el_request(url, headers=self.headers,
                                      prechecker=self.search_cnt_checker)
            # Guard against None before touching con.code (el_request may
            # return None on network failure).
            if con is not None and con.code == 404:
                con = None
            if con is not None:
                break
            Log.warning('请求搜索页失败', url)
            time.sleep(5)
        cnt = CVZhilianUtil.get_count(url, con)
        if cnt == 0:
            return 0
        return cnt >= 4000
class CVLPSpider(Spider): def __init__(self, thcnt, acs, type=1): Spider.__init__(self, thcnt) self._name = 'cvlpspider' self.lpm = MRLManager(acs, new_LPQYLogin) if type == 2: self.lpm = MRLManager(acs, new_LPLTLogin) self.pagestore = LPCVStore() self.hasher = spider.util.LocalHashChecker() self.lpm.ensure_login_do(None, lambda n:1, None) self.lpm.release_obj() self.imgcnt = 0 self._type = type self.url_prefix = 'https://lpt.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0' if self._type == 2: self.url_prefix = 'https://h.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0' self.stat = spider.runtime.StatDict() def run_job(self, jobd): if jobd.get('type') == 'cvurl': cvid = jobd.get('jobid') url = self.url_prefix.format(cvid) qstring = "liepincv://"+cvid if self.pagestore.check_should_fetch(qstring): self.stat.inc('cv') o = self.lpm.el_request(url, headers=Cdata.headers, allow_redirects=True) if o is None: self.add_job(jobd) return None self.pagestore.save(time.time(), cvid, url, o.text) time.sleep(3) else: print '======%s has downloaded=====' % qstring def dispatch(self): with open("res.spider.txt", 'rb') as f: for line in f: line = line.split("\t") if not line: continue self.add def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': spider.util.sendmail(['*****@*****.**', '*****@*****.**'], '%s DONE' % sys.argv[0], msg)
class CVZhilianGetCV(Spider2):
    """Downloads individual Zhaopin (zhilian) resumes, solving the
    site's captcha via OCR and rotating accounts on repeated failure.
    Jobs are pushed externally through push_job()."""

    def __init__(self, thcnt, cfgname, acs):
        Spider2.__init__(self, thcnt)
        self._name = 'cvzlgetcv_%s' % cfgname
        # Managed pool of logged-in Zhaopin accounts.
        self.zlm = MRLManager(acs, new_ZLLogin)
        self.pagestore = CVZLPageStore()
        self.hasher = spider.util.LocalHashChecker()
        # Fail fast if no account can log in.
        self.zlm.ensure_login_do(None, lambda n: 1, None)
        self.zlm.release_obj()
        # Count of consecutive captcha challenges; triggers account rotation.
        self.imgcnt = 0

    def init_jobs(self):
        # No self-seeded jobs: work arrives via push_job().
        return

    def wait_job(self):
        return self.wait_job_by_condition()

    def push_job(self, j):
        """Enqueue a job dict; a None job marks end-of-input."""
        if j is None:
            self._no_more_wait_job = True
        else:
            self.add_job(j)

    def _get_image(self, refurl):
        """Fetch the captcha image bytes, retrying until it succeeds."""
        imgurl = "http://rd2.zhaopin.com/s/loginmgr/monitorvalidatingcode.asp?t=" + str(
            int(time.time()) * 1000)
        con = self.zlm.el_request(imgurl, headers={'Referer': refurl})
        if con is None:
            Log.warning("fetch image failed, sleep 1s")
            time.sleep(1)
            return self._get_image(refurl)
        return con.content

    def get_cv(self, url):
        # Example: http://rd.zhaopin.com/resumepreview/resume/viewone/2/JM622670859R90250000000_1_1?searchresume=1
        """Fetch one resume page.

        Returns the response on success, or None when the caller should
        retry later (network failure, captcha interstitial, or account
        rotated via set_nologin)."""
        con = self.zlm.el_request(url)
        if con is None:
            return None
        if u"您需要输入验证码才能继续后续的操作" in con.text:
            self.imgcnt += 1
            if self.imgcnt > 10:
                # Too many captcha challenges on this account: rotate it.
                self.imgcnt = 0
                self.zlm.set_nologin()
                return None
            # Up to 5 OCR attempts at solving the captcha.
            for i in range(0, 5):
                code = OnlineOCR('zhilian2').resolve(
                    lambda dbgdata=None: self._get_image(url))
                purl = "http://rd.zhaopin.com/resumePreview/resume/_CheackValidatingCode?validatingCode=" + code
                con = self.zlm.el_request(purl, data={'validatingCode': code},
                                          headers={'Referer': url})
                if con is not None:
                    if re.search('true', con.text, re.I):
                        # Captcha accepted; return None so the job is
                        # requeued and refetched without the interstitial.
                        time.sleep(5)
                        return None
                Log.warning('验证码输入失败')
                time.sleep(2)
            # 5 consecutive failures: switch to another account.
            self.zlm.set_nologin()
            self.imgcnt = 0
            return None
        return con

    def run_job(self, jobid):
        # Job shape: {'type':'cv', 'url':'http://rd.zhaopin.com/resumepreview/resume/viewone/2/JM321509749R90250002000_1_1?searchresume=1'}
        if self.get_job_type(jobid) == 'cv':
            url = jobid['url']
            # Resume id is the token before the _x_y suffix in the URL path.
            m = re.search(ur'/([0-9A-Z]+)_\d+_\d+', url)
            if m is None:
                Log.error('invalid cv url', url)
                return
            jdid = m.group(1)
            if self.pagestore.check_should_fetch(jdid):
                con = self.get_cv(url)
                if con is None:
                    # Transient failure or captcha: requeue for retry.
                    self.add_job(jobid)
                    return
                # Permanent conditions: resume deleted by its owner / by site.
                if u"该简历已被求职者删除" in con.text:
                    return
                if u"抱歉,该简历已被删除" in con.text:
                    return
                if u"由于系统繁忙,一会再来看一下吧" in con.text:
                    Log.warning("url %s 繁忙不可获得" % url)
                    return
                getime = int(time.time())
                self.pagestore.save(getime, jdid, url, con.text)
            else:
                Log.errinfo("跳过拉取简历%s" % jdid)

    def event_handler(self, evt, msg, **kwargs):
        """Mail a completion summary (with saved-page count) when done."""
        if evt == 'DONE':
            title = ' '.join(sys.argv) + ' DONE'
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**', '*****@*****.**'], title, msg)
class CVLPSpider(Spider): def __init__(self, thcnt, acs, type=1, process_num=0, max_process_cnt=1): Spider.__init__(self, thcnt) self._name = 'cvlpspider' self.lpm = MRLManager(acs, new_LPQYLogin) if type == 2: self.lpm = MRLManager(acs, new_LPLTLogin) self.pagestore = LPCVStore() self.hasher = spider.util.LocalHashChecker() self.lpm.ensure_login_do(None, lambda n: 1, None) self.lpm.release_obj() self.imgcnt = 0 self._type = type self._process_num = process_num self._max_process_cnt = max_process_cnt self._spider_cnt = 0 self._start_time = datetime.datetime.today() self.url_prefix = 'https://lpt.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0' if self._type == 2: self.url_prefix = 'https://h.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0' self.stat = spider.runtime.StatDict() self._limit_cnt = 200 def run_job(self, cvid): url = self.url_prefix.format(cvid) qstring = "liepincv://" + cvid if self.pagestore.check_should_fetch(qstring): self.stat.inc('cv') o = self.lpm.el_request(url, headers=Cdata.headers, allow_redirects=True) if o is None: self.add_job(cvid) return None self.pagestore.save(time.time(), cvid, url, o.text) time.sleep(5) self._spider_cnt += 1 self._check_if_stop() print "start: %s - now: %s || spider cnt: %d" % ( self._start_time, datetime.datetime.today(), self._spider_cnt) else: print '======%s has downloaded=====' % qstring def _check_if_stop(self): if self._spider_cnt % self._limit_cnt == 0: Log.info("spider %d pages, sleep 60*5s today" % self._spider_cnt) time.sleep(60 * 5) def dispatch(self): with open(Cdata.IDS_FILE, 'rb') as f: for index, line in enumerate(f): if index % self._max_process_cnt != self._process_num: continue line = line.strip() if not line: continue if self.pagestore.find_any("%s://%s" % ("cv_liepin", line)): continue if not self._is_needed_cv(line): continue self.add_main_job(line) self.add_main_job(None) def _is_needed_cv(self, line): if not hasattr(self, 'not_need_cvs'): self.not_need_cvs = set() 
if os.path.exists(LPCVConfig.NOT_NEED_CV_FN): with open(LPCVConfig.NOT_NEED_CV_FN, 'rb') as f: for line in f: line = line.strip() if not line: continue self.not_need_cvs.add(line) if os.path.exists(LPCVConfig.NOT_ACCESS_BY_QIYE): with open(LPCVConfig.NOT_ACCESS_BY_QIYE, 'rb') as f: for line in f: line = line.strip() if not line: continue self.not_need_cvs.add(line) if line in self.not_need_cvs: return False return True def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': spider.util.sendmail( ['*****@*****.**'], 'CVLPSpider process %d, DONE' % self._process_num, msg + '\n saved: %d' % self.pagestore.saved_count)