Example #1
class CVZhilianSplit(GenQueries):
    def __init__(self, thcnt, ac):
        GenQueries.__init__(self, thcnt)
        self._last_time = 0.0
        self.zlm = MRLManager(ac, new_ZLLogin)
        self.headers = {
            'Referer': 'http://rdsearch.zhaopin.com/Home/ResultForCustom'
        }
        self.search_cnt = 0

    def init_conditions(self):
        # Update date filter: '4,9' selects resumes updated within the last
        # month (labelled '最近一个月' below).
        # Remaining filters: age, gender, education, registered residence
        # (hukou), current work status, current residence, company type,
        # company size.
        CVZhilianData.add(self, 'SF_1_1_7', [['4,9', '最近一个月']])
        CVZhilianData.add(self, 'SF_1_1_8', CVZhilianData.agelist)
        CVZhilianData.add(self, 'SF_1_1_9', CVZhilianData.gender)
        CVZhilianData.add(self, 'SF_1_1_6', qdata.provs)  # current residence
        CVZhilianData.add(self, 'SF_1_1_5', CVZhilianData.edugr)
        CVZhilianData.add(self, 'SF_1_1_10', qdata.provs)  # registered residence (hukou)
        CVZhilianData.add(self, 'SF_1_1_29', CVZhilianData.workstate)
        CVZhilianData.add(self, 'SF_1_1_31', CVZhilianData.corp_type)
        CVZhilianData.add(self, 'SF_1_1_30', CVZhilianData.corp_size)
        # Log in once and check whether this account is allowed to use the
        # resume search at all; abort early if it is not.
        self.zlm.ensure_login_do(None, lambda n: 1, None)
        cansearch = self.zlm.cur_worker().cansearch
        self.zlm.release_obj()
        if not cansearch:
            raise RuntimeError("this account can't search!")

    def search_cnt_checker(self, net):
        # After a certain number of searches the account must be rotated,
        # otherwise it may get blocked.
        self.search_cnt += 1
        if self.search_cnt > 380:
            self.search_cnt = 0
            raise LoginErrors.AccountHoldError()

    def need_split(self, opts, level, isLast):
        url = CVZhilianUtil.get_search_url(opts)
        con = self.zlm.el_request(url,
                                  headers=self.headers,
                                  prechecker=self.search_cnt_checker)
        if con is None or con.code == 404:
            Log.warning('failed to request the search page', url)
            time.sleep(5)
            return self.need_split(opts, level, isLast)  # retry after a short pause
        cnt = CVZhilianUtil.get_count(url, con)
        if cnt == 0:
            return 0
        # Split this query further when it matches 4000 or more resumes.
        return cnt >= 4000
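
The retry path in need_split() above calls itself recursively whenever the search page fails to load, which keeps growing the call stack if the failure persists. Below is a minimal, self-contained sketch of the same retry-with-delay idea written as a bounded loop; fetch_with_retry, its fetch parameter, the retry limit and the 5-second delay are illustrative assumptions, not part of the original framework.

# Sketch only: a bounded retry loop mirroring the recursive retry in
# need_split(). 'fetch' stands in for self.zlm.el_request; the retry limit
# and delay are assumed values, not taken from the original code.
import time

def fetch_with_retry(fetch, url, max_retries=5, delay=5.0):
    """Return a response object, or None after max_retries failed attempts."""
    for attempt in range(max_retries):
        con = fetch(url)
        if con is not None and con.code != 404:
            return con
        time.sleep(delay)
    return None

A bounded loop makes the failure case explicit (the caller receives None) instead of relying on unbounded recursion.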
Example #2
class CVZhilianSearch(Spider2):

    PAGE_TEMPLATE = "http://rd.zhaopin.com/resumepreview/resume/viewone/1/%s_1_1"
    CRAWLER_RANGE_MAP = {
        '3d': '1,9',  # last 3 days
        '1w': '2,9',  # last week
        '2w': '3,9',  # last 2 weeks
        '1m': '4,9',  # last month
        '2m': '5,9',  # last 2 months
        '3m': '6,9',  # last 3 months
        '6m': '7,9',  # last 6 months
        '1y': '8,9',  # last year
    }

    def __init__(self, thcnt, acs):
        Spider2.__init__(self, thcnt)
        self._name = 'cv_zhilian'
        self.jobpusher = None
        self.zlm = MRLManager(acs, new_ZLLogin)
        self.headers = {
            'Referer': 'http://rdsearch.zhaopin.com/Home/ResultForCustom'
        }
        self.search_cnt = 0

        self.crawler_range = None

    def init_jobs(self):
        # fn = 'cv_zhilian.queries.txt'
        # fn = 'split1y.txt'
        fn = "one_month_splitly.txt"
        self.add_main_job_file({'type': 'main'}, fn)

    def search_cnt_checker(self, net):
        # After a certain number of searches the account must be rotated
        # (log in with a different account), otherwise it may get blocked.
        self.search_cnt += 1
        if self.search_cnt > 500:
            self.search_cnt = 0
            raise LoginErrors.AccountHoldError()

    def run_job(self, job):
        jt = self.get_job_type(job)
        if jt == 'main':
            joburl = CVZhilianUtil.get_search_url(json.loads(job['line']))
            # If this account cannot search, give up.
            con = self.zlm.el_request(joburl,
                                      headers=self.headers,
                                      hint='search',
                                      prechecker=self.search_cnt_checker)
            if con is None or con.code == 404:
                Log.warning('failed to request the search page', joburl)
                self.add_job(job)  # requeue and retry later
                return
            # Queue every result page of this query, then parse the first page.
            for su in CVZhilianUtil.sub_pages(joburl, con):
                self.add_job({'type': 'search', 'url': su})
            self.parse_page(joburl, con)
        elif jt == 'search':
            joburl = job['url']
            # if self.crawler_range:
            #     joburl = CVZhilianUtil.get_count()
            con = self.zlm.el_request(joburl,
                                      headers=self.headers,
                                      hint='search')
            if con is None or con.code == 404:
                Log.warning('failed to request the search page', joburl)
                self.add_job(job)  # requeue and retry later
                return
            self.parse_page(joburl, con)

    def parse_page(self, url, con):
        if u"请修改或适当减少搜索项再进行搜索" in con.text:
            if not self.zlm.cur_worker().cansearch:
                # Account BLOCKED ??
                self.zlm.cur_worker().isvalid = False
                raise RuntimeError("AccountBlocked")
            Log.error("NO_RESULT_OR_BLOCK", url)
            return
        try:
            # Locate the resume-list block and collect the unique resume IDs
            # embedded in tag="<id>_1" attributes.
            hf = spider.util.htmlfind(con.text, 'div class="resumes-list"', 0)
            node = hf.get_node()
            a = re.findall(ur'''tag="([a-zA-Z0-9]+)_1"''', node)
            a = spider.util.unique_list(a)
            for i in a:
                # print "found_cv", i
                self.jobpusher({
                    'type': 'cv',
                    'url': CVZhilianSearch.PAGE_TEMPLATE % i
                })
        except Exception:
            # The expected result block was not found; log the raw page and
            # back off briefly before the next job.
            msg = "unknown search result %s" % url
            Log.error(msg, "sleep 5s.")
            Log.errorbin(msg, con.text)
            time.sleep(5)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            # Signal the downstream consumer that no more CV jobs will come.
            self.jobpusher(None)
        elif evt == 'STARTED':
            #spider.misc.stacktracer.trace_start('res.trace.html')
            pass
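
parse_page() above extracts resume IDs by matching tag="<id>_1" attributes inside the resumes-list block and pushes one detail-page job per unique ID. The snippet below is a minimal, standalone sketch of that extraction step on a hand-written HTML fragment; the fragment itself is an assumption about the page layout, and the order-preserving de-duplication stands in for spider.util.unique_list.

# Sketch only: the ID-extraction step from parse_page() applied to an
# assumed HTML fragment (not captured site output).
import re

PAGE_TEMPLATE = "http://rd.zhaopin.com/resumepreview/resume/viewone/1/%s_1_1"

sample_html = (
    '<div class="resumes-list">'
    '<span tag="AB12CD34_1">...</span>'
    '<span tag="AB12CD34_1">...</span>'  # duplicate entry, kept only once
    '<span tag="ZZ99XX00_1">...</span>'
    '</div>'
)

ids = re.findall(r'tag="([a-zA-Z0-9]+)_1"', sample_html)
seen, unique_ids = set(), []
for cv_id in ids:            # de-duplicate while preserving order,
    if cv_id not in seen:    # much like spider.util.unique_list
        seen.add(cv_id)
        unique_ids.append(cv_id)

for cv_id in unique_ids:
    print(PAGE_TEMPLATE % cv_id)  # each ID becomes one 'cv' job URL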