class CVZhilianSplit(GenQueries):
    """Query-splitter for zhaopin.com resume search.

    For each combination of search filters it probes the result count and
    reports whether that combination must be split further (the site caps
    visible results, so combinations with >= 4000 hits need narrowing).
    """

    def __init__(self, thcnt, ac):
        """:param thcnt: worker-thread count passed to GenQueries.
        :param ac: account list handed to the login manager."""
        GenQueries.__init__(self, thcnt)
        self._last_time = 0.0
        # Rotating account/login manager for zhaopin requests.
        self.zlm = MRLManager(ac, new_ZLLogin)
        self.headers = {
            'Referer': 'http://rdsearch.zhaopin.com/Home/ResultForCustom'
        }
        # Searches issued on the current account; reset on account switch.
        self.search_cnt = 0

    def init_conditions(self):
        """Register the filter dimensions used to split queries.

        Update-date is pinned to one fixed value; the splittable dimensions
        are: age, gender, education, current residence, registered residence
        (hukou), work state, company type and company size.
        """
        CVZhilianData.add(self, 'SF_1_1_7', [['4,9', '最近一个月']])
        CVZhilianData.add(self, 'SF_1_1_8', CVZhilianData.agelist)
        CVZhilianData.add(self, 'SF_1_1_9', CVZhilianData.gender)
        CVZhilianData.add(self, 'SF_1_1_6', qdata.provs)  # current residence
        CVZhilianData.add(self, 'SF_1_1_5', CVZhilianData.edugr)
        CVZhilianData.add(self, 'SF_1_1_10', qdata.provs)  # registered residence (hukou)
        CVZhilianData.add(self, 'SF_1_1_29', CVZhilianData.workstate)
        CVZhilianData.add(self, 'SF_1_1_31', CVZhilianData.corp_type)
        CVZhilianData.add(self, 'SF_1_1_30', CVZhilianData.corp_size)
        # Verify up front that the account is allowed to use resume search.
        self.zlm.ensure_login_do(None, lambda n: 1, None)
        cansearch = self.zlm.cur_worker().cansearch
        self.zlm.release_obj()
        if not cansearch:
            raise RuntimeError("this account can't search!")

    def search_cnt_checker(self, net):
        """Pre-request hook: force an account switch after too many searches.

        Raises AccountHoldError once the per-account search budget (380) is
        exhausted, which makes the manager rotate to another login.
        """
        self.search_cnt += 1
        if self.search_cnt > 380:
            self.search_cnt = 0
            raise LoginErrors.AccountHoldError()

    def need_split(self, opts, level, isLast):
        """Return whether the query described by `opts` must be split further.

        Returns 0 when the query has no results, otherwise a boolean that is
        True when the result count reaches the 4000-result cap.

        Retries failed requests in a loop (the original recursed here, which
        could exhaust the stack under prolonged network failure), and checks
        `con is None` before touching `con.code` so a failed request retries
        instead of raising AttributeError.
        """
        url = CVZhilianUtil.get_search_url(opts)
        while True:
            con = self.zlm.el_request(url, headers=self.headers,
                                      prechecker=self.search_cnt_checker)
            # Treat a 404 the same as a failed request.
            if con is not None and con.code == 404:
                con = None
            if con is not None:
                break
            Log.warning('请求搜索页失败', url)
            time.sleep(5)
        cnt = CVZhilianUtil.get_count(url, con)
        if cnt == 0:
            return 0
        return cnt >= 4000
class CVZhilianSearch(Spider2): PAGE_TEMPLATE = "http://rd.zhaopin.com/resumepreview/resume/viewone/1/%s_1_1" CRAWLER_RANGE_MAP = { '3d': '1,9', #最近三天 '1w': '2,9', #最近一周 '2w': '3,9', #最近两周 '1m': '4,9', #最近一个月 '2m': '5,9', #最近2个月 '3m': '6,9', #最近3个月 '6m': '7,9', #最近6个月 '1y': '8,9', #最近1年 } def __init__(self, thcnt, acs): Spider2.__init__(self, thcnt) self._name = 'cv_zhilian' self.jobpusher = None self.zlm = MRLManager(acs, new_ZLLogin) self.headers = { 'Referer': 'http://rdsearch.zhaopin.com/Home/ResultForCustom' } self.search_cnt = 0 self.crawler_range = None def init_jobs(self): # fn = 'cv_zhilian.queries.txt' # fn = 'split1y.txt' fn = "one_month_splitly.txt" self.add_main_job_file({'type': 'main'}, fn) def search_cnt_checker(self, net): # 当搜索次数到达一定数量时, 必须换帐号登录, 否则可能被封. self.search_cnt += 1 if self.search_cnt > 500: self.search_cnt = 0 raise LoginErrors.AccountHoldError() def run_job(self, job): jt = self.get_job_type(job) if jt == 'main': joburl = CVZhilianUtil.get_search_url(json.loads(job['line'])) # if this account can't search, then giveup. con = self.zlm.el_request(joburl, headers=self.headers, hint='search', prechecker=self.search_cnt_checker) if con.code == 404: con = None if con is None: Log.warning('请求搜索页失败', joburl) self.add_job(job) return for su in CVZhilianUtil.sub_pages(joburl, con): self.add_job({'type': 'search', 'url': su}) self.parse_page(joburl, con) elif jt == 'search': joburl = job['url'] # if self.crawler_range: # joburl = CVZhilianUtil.get_count() con = self.zlm.el_request(joburl, headers=self.headers, hint='search') if con.code == 404: con = None if con is None: Log.warning('请求搜索页失败', joburl) self.add_job(job) return self.parse_page(joburl, con) def parse_page(self, url, con): if u"请修改或适当减少搜索项再进行搜索" in con.text: if not self.zlm.cur_worker().cansearch: # Account BLOCKED ?? 
self.zlm.cur_worker().isvalid = False raise RuntimeError("AccountBlocked") Log.error("NO_RESULT_OR_BLOCK", url) return try: hf = spider.util.htmlfind(con.text, 'div class="resumes-list"', 0) node = hf.get_node() a = re.findall(ur'''tag="([a-zA-Z0-9]+)_1"''', node) a = spider.util.unique_list(a) for i in a: # print "found_cv", i self.jobpusher({ 'type': 'cv', 'url': CVZhilianSearch.PAGE_TEMPLATE % i }) except: msg = "unknown search result %s" % url Log.error(msg, "sleep 5s.") Log.errorbin(msg, con.text) time.sleep(5) def event_handler(self, evt, msg, **kwargs): if evt == 'DONE': self.jobpusher(None) elif evt == 'STARTED': #spider.misc.stacktracer.trace_start('res.trace.html') pass