Beispiel #1
0
class CVZhilianSplit(GenQueries):
    """Registers Zhilian resume-search conditions and sizes candidate queries.

    ``need_split`` fetches the search page for a set of options and reports
    whether the result count reaches 4000.
    """

    def __init__(self, thcnt, ac):
        """Initialise the base class, the login manager and request state.

        ``thcnt`` is forwarded to ``GenQueries.__init__``; ``ac`` is handed to
        ``MRLManager`` together with the ``new_ZLLogin`` factory.
        """
        GenQueries.__init__(self, thcnt)
        self._last_time = 0.0
        self.zlm = MRLManager(ac, new_ZLLogin)
        self.headers = {
            'Referer': 'http://rdsearch.zhaopin.com/Home/ResultForCustom'
        }
        # Searches issued since the counter was last reset
        # (see search_cnt_checker).
        self.search_cnt = 0

    def init_conditions(self):
        """Register the search conditions and verify the account can search.

        Raises:
            RuntimeError: if the current worker's ``cansearch`` flag is falsy.
        """
        # Update date.
        # NOTE(review): the original comment said "fixed at 6 months", but the
        # label passed below reads "last month" -- confirm which is intended.
        # Remaining dimensions: age, gender, education, registered residence,
        # current work status, current residence, company type, company size.
        CVZhilianData.add(self, 'SF_1_1_7', [['4,9', '最近一个月']])
        CVZhilianData.add(self, 'SF_1_1_8', CVZhilianData.agelist)
        CVZhilianData.add(self, 'SF_1_1_9', CVZhilianData.gender)
        CVZhilianData.add(self, 'SF_1_1_6', qdata.provs)  # current residence
        CVZhilianData.add(self, 'SF_1_1_5', CVZhilianData.edugr)
        CVZhilianData.add(self, 'SF_1_1_10', qdata.provs)  # registered residence
        CVZhilianData.add(self, 'SF_1_1_29', CVZhilianData.workstate)
        CVZhilianData.add(self, 'SF_1_1_31', CVZhilianData.corp_type)
        CVZhilianData.add(self, 'SF_1_1_30', CVZhilianData.corp_size)
        # Run a no-op action through the login manager (presumably to force a
        # login up front -- confirm against MRLManager), then read the flag.
        self.zlm.ensure_login_do(None, lambda n: 1, None)
        try:
            cansearch = self.zlm.cur_worker().cansearch
        finally:
            # Always hand the worker back, even if reading the flag fails.
            self.zlm.release_obj()
        if not cansearch:
            raise RuntimeError("this account can't search!")

    def search_cnt_checker(self, net):
        """Pre-request hook: count a search and cap each run at 380.

        ``net`` is accepted for the hook signature but not used.

        Raises:
            LoginErrors.AccountHoldError: on the 381st search since the last
                reset; the counter is reset to zero before raising.
        """
        self.search_cnt += 1
        if self.search_cnt > 380:
            self.search_cnt = 0
            raise LoginErrors.AccountHoldError()

    def need_split(self, opts, level, isLast):
        """Report whether the query described by ``opts`` is too large.

        The search page is requested until a usable response arrives; a
        missing response or an HTTP 404 is logged and retried after 5 seconds.
        ``level`` and ``isLast`` are not used here.

        Returns:
            ``0`` when the search reports no results, otherwise the boolean
            ``cnt >= 4000``.
        """
        # Retry with a loop rather than recursion so that a long outage cannot
        # exhaust the interpreter's recursion limit.
        while True:
            url = CVZhilianUtil.get_search_url(opts)
            con = self.zlm.el_request(url,
                                      headers=self.headers,
                                      prechecker=self.search_cnt_checker)
            # Test for None *before* touching con.code: a failed request
            # yields None, and reading .code on it would raise AttributeError.
            if con is not None and con.code != 404:
                break
            Log.warning('请求搜索页失败', url)
            time.sleep(5)
        cnt = CVZhilianUtil.get_count(url, con)
        if cnt == 0:
            return 0
        return cnt >= 4000
Beispiel #2
0
class CVLPSpider(Spider):
    """Spider that downloads Liepin resume detail pages and stores them."""

    def __init__(self, thcnt, acs, type=1):
        """Create the login manager, page store and URL template.

        Args:
            thcnt: forwarded to ``Spider.__init__``.
            acs: accounts handed to ``MRLManager``.
            type: ``2`` selects ``new_LPLTLogin`` and the ``h.liepin.com``
                URL; any other value selects ``new_LPQYLogin`` and the
                ``lpt.liepin.com`` URL.
        """
        Spider.__init__(self, thcnt)
        self._name = 'cvlpspider'
        # Pick the login factory first so only one manager is ever built.
        login_factory = new_LPLTLogin if type == 2 else new_LPQYLogin
        self.lpm = MRLManager(acs, login_factory)
        self.pagestore = LPCVStore()
        self.hasher = spider.util.LocalHashChecker()
        # Run a no-op action through the login manager (presumably to force a
        # login up front -- confirm against MRLManager), then release it.
        self.lpm.ensure_login_do(None, lambda n: 1, None)
        self.lpm.release_obj()
        self.imgcnt = 0
        self._type = type

        if self._type == 2:
            self.url_prefix = 'https://h.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
        else:
            self.url_prefix = 'https://lpt.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
        self.stat = spider.runtime.StatDict()

    def run_job(self, jobd):
        """Fetch and store one resume for a ``{'type': 'cvurl', 'jobid': ...}`` job.

        Jobs of any other type are ignored. When the request yields no
        response the job is queued again.
        """
        if jobd.get('type') != 'cvurl':
            return
        cvid = jobd.get('jobid')
        url = self.url_prefix.format(cvid)

        qstring = "liepincv://" + cvid
        if not self.pagestore.check_should_fetch(qstring):
            print('======%s has downloaded=====' % qstring)
            return

        self.stat.inc('cv')
        o = self.lpm.el_request(url, headers=Cdata.headers, allow_redirects=True)
        if o is None:
            # No response: put the job back so it is tried again later.
            self.add_job(jobd)
            return None
        self.pagestore.save(time.time(), cvid, url, o.text)
        # Pause between successful downloads.
        time.sleep(3)

    def dispatch(self):
        """Queue one 'cvurl' job per non-empty line of ``res.spider.txt``.

        The original body ended in a bare ``self.add`` expression, so nothing
        was ever queued. The job shape below is the one ``run_job`` reads.
        """
        with open("res.spider.txt", 'rb') as f:
            for line in f:
                fields = line.rstrip("\r\n").split("\t")
                # Assumes the resume id is the first tab-separated column
                # -- TODO confirm against the producer of res.spider.txt.
                cvid = fields[0].strip()
                if not cvid:
                    continue
                self.add_main_job({'type': 'cvurl', 'jobid': cvid})
        # NOTE(review): a trailing None job is assumed to mark the end of the
        # main job stream, as other spiders on this base class do -- confirm.
        self.add_main_job(None)

    def event_handler(self, evt, msg, **kwargs):
        """Send a notification mail when the 'DONE' event is received."""
        if evt == 'DONE':
            spider.util.sendmail(['*****@*****.**', '*****@*****.**'], '%s DONE' % sys.argv[0], msg)
Beispiel #3
0
class CVZhilianGetCV(Spider2):
    """Spider that downloads Zhilian resume pages, solving captchas on demand."""

    def __init__(self, thcnt, cfgname, acs):
        """Create the login manager and page store.

        Args:
            thcnt: forwarded to ``Spider2.__init__``.
            cfgname: suffix used to build the spider name.
            acs: accounts handed to ``MRLManager`` with ``new_ZLLogin``.
        """
        Spider2.__init__(self, thcnt)
        self._name = 'cvzlgetcv_%s' % cfgname
        self.zlm = MRLManager(acs, new_ZLLogin)
        self.pagestore = CVZLPageStore()
        self.hasher = spider.util.LocalHashChecker()
        # Run a no-op action through the login manager (presumably to force a
        # login up front -- confirm against MRLManager), then release it.
        self.zlm.ensure_login_do(None, lambda n: 1, None)
        self.zlm.release_obj()
        # Number of captcha pages seen since the counter was last reset.
        self.imgcnt = 0

    def init_jobs(self):
        """No initial jobs; work arrives through ``push_job``."""
        return

    def wait_job(self):
        """Delegate to ``wait_job_by_condition``."""
        return self.wait_job_by_condition()

    def push_job(self, j):
        """Queue job ``j``; ``None`` sets the no-more-jobs flag instead."""
        if j is None:
            self._no_more_wait_job = True
        else:
            self.add_job(j)

    def _get_image(self, refurl):
        """Download the captcha image, retrying every second until it arrives.

        Returns the raw response content.
        """
        # Loop instead of recursing so that a long outage cannot exhaust the
        # interpreter's recursion limit.
        while True:
            # The URL carries a fresh timestamp on every attempt.
            imgurl = "http://rd2.zhaopin.com/s/loginmgr/monitorvalidatingcode.asp?t=" + str(
                int(time.time()) * 1000)
            con = self.zlm.el_request(imgurl, headers={'Referer': refurl})
            if con is not None:
                return con.content
            Log.warning("fetch image failed, sleep 1s")
            time.sleep(1)

    def get_cv(self, url):
        """Fetch a resume page, handling the captcha challenge if one appears.

        Example url:
        http://rd.zhaopin.com/resumepreview/resume/viewone/2/JM622670859R90250000000_1_1?searchresume=1

        Returns:
            The response when the page came back without a captcha prompt;
            ``None`` in every other case (request failed, captcha solved,
            or captcha attempts exhausted).
        """
        con = self.zlm.el_request(url)
        if con is None:
            return None

        if u"您需要输入验证码才能继续后续的操作" not in con.text:
            return con

        self.imgcnt += 1
        if self.imgcnt > 10:
            # Too many captcha pages in a row: mark the account logged out.
            self.imgcnt = 0
            self.zlm.set_nologin()
            return None

        for _ in range(5):
            code = OnlineOCR('zhilian2').resolve(
                lambda dbgdata=None: self._get_image(url))
            purl = "http://rd.zhaopin.com/resumePreview/resume/_CheackValidatingCode?validatingCode=" + code
            con = self.zlm.el_request(purl,
                                      data={'validatingCode': code},
                                      headers={'Referer': url})
            if con is not None and re.search('true', con.text, re.I):
                # Captcha accepted; return None so the caller re-queues the
                # job and fetches the page afresh.
                time.sleep(5)
                return None
            Log.warning('验证码输入失败')
            time.sleep(2)
        # Five consecutive failures: mark the account logged out.
        self.zlm.set_nologin()
        self.imgcnt = 0
        return None

    def run_job(self, jobid):
        """Download and store the resume described by a 'cv' job.

        Example job:
        {'type':'cv', 'url':'http://rd.zhaopin.com/resumepreview/resume/viewone/2/JM321509749R90250002000_1_1?searchresume=1'}
        """
        if self.get_job_type(jobid) != 'cv':
            return
        url = jobid['url']
        # Same pattern text as before; a plain raw string parses on both
        # Python 2 and Python 3 (the former ``ur`` prefix is Python 2 only).
        m = re.search(r'/([0-9A-Z]+)_\d+_\d+', url)
        if m is None:
            Log.error('invalid cv url', url)
            return
        jdid = m.group(1)
        if not self.pagestore.check_should_fetch(jdid):
            Log.errinfo("跳过拉取简历%s" % jdid)
            return

        con = self.get_cv(url)
        if con is None:
            # Nothing usable came back: queue the job again.
            self.add_job(jobid)
            return
        # Pages whose text says the resume was deleted are dropped.
        if u"该简历已被求职者删除" in con.text:
            return
        if u"抱歉,该简历已被删除" in con.text:
            return
        # The "system busy" page is logged and dropped without a retry.
        if u"由于系统繁忙,一会再来看一下吧" in con.text:
            Log.warning("url %s 繁忙不可获得" % url)
            return
        getime = int(time.time())
        self.pagestore.save(getime, jdid, url, con.text)

    def event_handler(self, evt, msg, **kwargs):
        """On 'DONE', mail the command line and the saved-page count."""
        if evt == 'DONE':
            title = ' '.join(sys.argv) + ' DONE'
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**', '*****@*****.**'],
                                 title, msg)
Beispiel #4
0
class CVLPSpider(Spider):
    """Spider that downloads Liepin resume detail pages, sharded by process."""

    def __init__(self, thcnt, acs, type=1, process_num=0, max_process_cnt=1):
        """Create the login manager, page store, URL template and counters.

        Args:
            thcnt: forwarded to ``Spider.__init__``.
            acs: accounts handed to ``MRLManager``.
            type: ``2`` selects ``new_LPLTLogin`` and the ``h.liepin.com``
                URL; any other value selects ``new_LPQYLogin`` and the
                ``lpt.liepin.com`` URL.
            process_num: index of this process; ``dispatch`` keeps only the
                id-file lines whose index modulo ``max_process_cnt`` equals it.
            max_process_cnt: modulus used for that line selection.
        """
        Spider.__init__(self, thcnt)
        self._name = 'cvlpspider'
        # Pick the login factory first so only one manager is ever built.
        login_factory = new_LPLTLogin if type == 2 else new_LPQYLogin
        self.lpm = MRLManager(acs, login_factory)
        self.pagestore = LPCVStore()
        self.hasher = spider.util.LocalHashChecker()
        # Run a no-op action through the login manager (presumably to force a
        # login up front -- confirm against MRLManager), then release it.
        self.lpm.ensure_login_do(None, lambda n: 1, None)
        self.lpm.release_obj()
        self.imgcnt = 0
        self._type = type

        self._process_num = process_num
        self._max_process_cnt = max_process_cnt

        # Pages saved so far and the time this spider object was created.
        self._spider_cnt = 0
        self._start_time = datetime.datetime.today()
        if self._type == 2:
            self.url_prefix = 'https://h.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
        else:
            self.url_prefix = 'https://lpt.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
        self.stat = spider.runtime.StatDict()

        # A long pause is taken after every ``_limit_cnt`` saved pages.
        self._limit_cnt = 200

    def run_job(self, cvid):
        """Fetch and store the resume with id ``cvid``.

        When the request yields no response the id is queued again.
        """
        url = self.url_prefix.format(cvid)
        qstring = "liepincv://" + cvid
        if not self.pagestore.check_should_fetch(qstring):
            print('======%s has downloaded=====' % qstring)
            return

        self.stat.inc('cv')
        o = self.lpm.el_request(url,
                                headers=Cdata.headers,
                                allow_redirects=True)
        if o is None:
            # No response: put the id back so it is tried again later.
            self.add_job(cvid)
            return None
        self.pagestore.save(time.time(), cvid, url, o.text)
        time.sleep(5)
        self._spider_cnt += 1
        self._check_if_stop()
        print("start: %s - now: %s || spider cnt: %d" % (
            self._start_time, datetime.datetime.today(), self._spider_cnt))

    def _check_if_stop(self):
        """Sleep five minutes each time the saved count hits a multiple of the limit."""
        if self._spider_cnt % self._limit_cnt == 0:
            Log.info("spider %d pages, sleep 60*5s today" % self._spider_cnt)
            time.sleep(60 * 5)

    def dispatch(self):
        """Queue this process's share of resume ids from ``Cdata.IDS_FILE``.

        Ids already found in the page store, or listed in the exclusion
        files, are skipped. A final ``None`` job is queued after the file.
        """
        with open(Cdata.IDS_FILE, 'rb') as f:
            for index, line in enumerate(f):
                # Keep only the lines assigned to this process.
                if index % self._max_process_cnt != self._process_num:
                    continue

                line = line.strip()
                if not line:
                    continue

                # NOTE(review): this lookup uses the "cv_liepin://" prefix
                # while run_job checks "liepincv://" -- confirm both match
                # the keys the page store actually uses.
                if self.pagestore.find_any("%s://%s" % ("cv_liepin", line)):
                    continue
                if not self._is_needed_cv(line):
                    continue

                self.add_main_job(line)

        self.add_main_job(None)

    @staticmethod
    def _read_id_lines(path):
        """Return the set of non-blank, stripped lines of ``path`` (empty if absent)."""
        ids = set()
        if not os.path.exists(path):
            return ids
        with open(path, 'rb') as f:
            for raw in f:
                entry = raw.strip()
                if entry:
                    ids.add(entry)
        return ids

    def _is_needed_cv(self, line):
        """Return False when ``line`` is listed in either exclusion file.

        The exclusion set is loaded once, on first use, into
        ``self.not_need_cvs``. Loading uses its own local names so that the
        ``line`` argument is not overwritten while the files are read
        (previously the first call tested the last line of the file rather
        than the requested id).
        """
        if not hasattr(self, 'not_need_cvs'):
            excluded = set()
            excluded.update(self._read_id_lines(LPCVConfig.NOT_NEED_CV_FN))
            excluded.update(self._read_id_lines(LPCVConfig.NOT_ACCESS_BY_QIYE))
            self.not_need_cvs = excluded

        return line not in self.not_need_cvs

    def event_handler(self, evt, msg, **kwargs):
        """On 'DONE', mail the process number and the saved-page count."""
        if evt == 'DONE':
            spider.util.sendmail(
                ['*****@*****.**'],
                'CVLPSpider process %d, DONE' % self._process_num,
                msg + '\n saved: %d' % self.pagestore.saved_count)