Example #1
class CV51ContactSpider(downloader_base.CVContactSpiderBase):

    REAL_URL_TEMPLATE = "http://ehire.51job.com/Candidate/ResumeView.aspx?hidUserID=%s&hidEvents=23&hidKey=303514e18dbc5600b3ecfff9abb76510"

    def __init__(self, thcnt, cvaccs):
        self.cv51nm = MRLManager(cvaccs, new_51Login)
        downloader_base.CVContactSpiderBase.__init__(self, thcnt, 'cv_51job')
        self.page_store = cv_download_page.CV51DownloadPageStore()

        self.log = log_util.MLog(self.__class__.__name__, config.LOGGING_FILE)

    def run_job(self, job):

        realUrl = job.get('realUrl', '')
        cvId = job['cvId']

        indexUrl = "%s://%s" % ("cv_51job", cvId)
        if not realUrl or 'fake' in realUrl:
            realUrl = self.get_real_url(cvId)

        # mark this CV as downloading
        self._cv_status.update({indexUrl: config.StatusCode.DOWNLOADING})
        data = {
            "doType": "SearchToCompanyHr",
            "userId": cvId,
            "strWhere": '',
        }
        content, status = self.with_check_request(self._download_url,
                                                  data=data,
                                                  realUrl=realUrl)

        if not content:
            self.log.info('failed to download cv page: %s, re-adding to the queue' %
                          indexUrl)
            self.re_add_job(job)
            return

        status = self.page_store.save(time.time(), cvId, realUrl, content)

        # save failed: retry
        if status == config.StatusCode.AFTER_DOWNLOADING_FAIL:
            self.log.warn("cv %s download fail, readd to the queue" % indexUrl)
            self.re_add_job(job)

        self._cv_status.update({indexUrl: status})

    def with_check_request(self, url, data, realUrl):
        res = self.cv51nm.el_request(url, data=data)

        # if u'简历已在公司人才夹中' in res.text:  # "resume is already in the company talent folder"
        res = self.cv51nm.el_request(realUrl)

        if u"此人简历保密" in res.text:
            return res.text, config.StatusCode.CV_CLOSED

        return res.text, ''

    def get_real_url(self, cvId):

        return CV51ContactSpider.REAL_URL_TEMPLATE % cvId
Example #2
class ChinaHrCVGet(Spider):
    def __init__(self, thread_cnt):
        Spider.__init__(self, thread_cnt)
        self._name = 'chinahr_cv_get'
        self.lgm = MRLManager(HrConfig.ac, new_ChinaHrLogin)
        self.page_store = CVChinahrStore()

    def dispatch(self):
        with open('res.cv_chinahr.txt') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                line = line.split("||")[0]
                self.add_main_job({
                    "url": json.loads(line),
                    "type": 'search',
                    'page': '1'
                })

        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):

        type_ = jobid.get("type", "")
        if not type_:
            return

        if "search" == type_:
            params = jobid.get('url')
            params.update({"page": jobid.get('page')})
            real_url = spider.util.compose_url_param(CData.SEARCH_URL, params)
            res = self.lgm.el_request(real_url)
            find = re.findall(r'cvId="(.*?)"', res.text, re.S)
            for cvid in find:
                self.add_job({"cvid": cvid, 'type': 'cvpage'})

            if jobid.get('page') == '1':
                self.parse_next(res.text, params)

        if "cvpage" == type_:
            real_url = CData.CV_PAGE_URL_TMPLATE.format(jobid.get('cvid'))
            res = self.lgm.el_request(real_url)
            self.page_store.save(time.time(), jobid.get('cvid'), real_url,
                                 res.text)

    def parse_next(self, content, params):
        p = copy.deepcopy(params)
        del p['page']
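        # the result page renders the total hit count as "搜索到 ...<span>N</span>"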
        find = re.search(u'搜索到.*?<span>(\d+).*?</span>', content, re.S)
        if find:
            # 20 results per page; page 1 is already fetched, so queue pages 2..ceil(count / 20)
            totalpage = (int(find.group(1)) + 19) / 20
            for page in range(2, totalpage + 1):
                self.add_job({'url': p, 'type': 'search', 'page': str(page)})

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail(['*****@*****.**'], 'cv_chinahr done', msg)
Example #3
class CVZhilianSplit(GenQueries):
    def __init__(self, thcnt, ac):
        GenQueries.__init__(self, thcnt)
        self._last_time = 0.0
        self.zlm = MRLManager(ac, new_ZLLogin)
        self.headers = {
            'Referer': 'http://rdsearch.zhaopin.com/Home/ResultForCustom'
        }
        self.search_cnt = 0

    def init_conditions(self):
        # refresh date: fixed to six months
        # age, gender, education, hukou location, current work status, current residence, company type, company size
        CVZhilianData.add(self, 'SF_1_1_7', [['4,9', '最近一个月']])  # '最近一个月' == "last month"
        CVZhilianData.add(self, 'SF_1_1_8', CVZhilianData.agelist)
        CVZhilianData.add(self, 'SF_1_1_9', CVZhilianData.gender)
        CVZhilianData.add(self, 'SF_1_1_6', qdata.provs)  # current residence
        CVZhilianData.add(self, 'SF_1_1_5', CVZhilianData.edugr)
        CVZhilianData.add(self, 'SF_1_1_10', qdata.provs)  # hukou (household registration) location
        CVZhilianData.add(self, 'SF_1_1_29', CVZhilianData.workstate)
        CVZhilianData.add(self, 'SF_1_1_31', CVZhilianData.corp_type)
        CVZhilianData.add(self, 'SF_1_1_30', CVZhilianData.corp_size)
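        # force one login up front so we can check whether the account is allowed to search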
        self.zlm.ensure_login_do(None, lambda n: 1, None)
        cansearch = self.zlm.cur_worker().cansearch
        self.zlm.release_obj()
        if not cansearch:
            raise RuntimeError("this account can't search!")

    def search_cnt_checker(self, net):
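        # rotate to another account after ~380 searches; staying on one account risks a block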
        self.search_cnt += 1
        if self.search_cnt > 380:
            self.search_cnt = 0
            raise LoginErrors.AccountHoldError()

    def need_split(self, opts, level, isLast):
        url = CVZhilianUtil.get_search_url(opts)
        con = self.zlm.el_request(url,
                                  headers=self.headers,
                                  prechecker=self.search_cnt_checker)
        if con.code == 404:
            con = None
        if con is None:
            Log.warning('failed to request the search page', url)
            time.sleep(5)
            return self.need_split(opts, level, isLast)
        cnt = CVZhilianUtil.get_count(url, con)
        if cnt == 0:
            return 0
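        # 4000+ hits means this query is too broad and must be split into narrower ones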
        return cnt >= 4000
Example #4
class CVLPSpider(Spider):
    def __init__(self, thcnt, acs, type=1):
        Spider.__init__(self, thcnt)
        self._name = 'cvlpspider'
        # type 2 accounts log in through new_LPLTLogin instead of new_LPQYLogin
        if type == 2:
            self.lpm = MRLManager(acs, new_LPLTLogin)
        else:
            self.lpm = MRLManager(acs, new_LPQYLogin)
        self.pagestore = LPCVStore()
        self.hasher = spider.util.LocalHashChecker()
        self.lpm.ensure_login_do(None, lambda n: 1, None)
        self.lpm.release_obj()
        self.imgcnt = 0
        self._type = type

        self.url_prefix = 'https://lpt.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
        if self._type == 2:
            self.url_prefix = 'https://h.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
        self.stat = spider.runtime.StatDict()

    def run_job(self, jobd):
        if jobd.get('type') == 'cvurl':
            cvid = jobd.get('jobid')
            url = self.url_prefix.format(cvid)

            qstring = "liepincv://"+cvid
            if self.pagestore.check_should_fetch(qstring):
                self.stat.inc('cv')
                o = self.lpm.el_request(url, headers=Cdata.headers, allow_redirects=True)
                if o is None:
                    self.add_job(jobd)
                    return None
                self.pagestore.save(time.time(), cvid, url, o.text)
                time.sleep(3)
            else:
                print '======%s already downloaded=====' % qstring

    def dispatch(self):
        with open("res.spider.txt", 'rb') as f:
            for line in f:
                fields = line.strip().split("\t")
                if not fields[0]:
                    continue
                # assumption: the first tab-separated field holds the encoded resume id
                self.add_main_job({'type': 'cvurl', 'jobid': fields[0]})
        self.add_main_job(None)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail(['*****@*****.**', '*****@*****.**'], '%s DONE' % sys.argv[0], msg)
Example #5
class QccSpider2(Spider):
    def __init__(self, thcnt):
        Spider.__init__(self, thcnt)
        account = [{'qqno': '285259106', 'qqpwd': '123@456'}]
        self.pagestore = QccPageStore()
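        # shared=True presumably lets all worker threads reuse the single logged-in account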
        self.qcc_acc_manager = MRLManager(account, self._newqcc, shared=True)

    def _newqcc(self, ac):
        a = QccLogin(ac)
        a.load_proxy('curproxy0', index=1, auto_change=False)
        return a

    def dispatch(self):
        # self.qcclogin.do_login()
        f = open("r1.txt", "rb")

        currline = 0
        skipto = 363000
        endline = 1000000

        while currline < skipto:
            f.readline()
            currline += 1

        while currline < endline:
            line = f.readline()
            if not line:
                break
            line = line.strip()
            currline += 1
            key = line.split(" ")[-1].strip()
            job = {"kw": key, "page": "1", "type": "u1", 'line': currline}
            self.add_main_job(job)
        f.close()
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):
        tp = self.get_job_type(jobid)
        if tp == 'u1':
            print "searching", jobid['kw'], "line:", jobid['line']
            data = {
                'key': jobid['kw'],
                # the token is md5 of a fixed salt string plus the search keyword
                'token': hashlib.md5('f625a5b661058ba5082ca508f99ffe1b' +
                                     jobid['kw']).hexdigest(),
                'type': 0,
            }
            url = 'http://qichacha.com/gongsi_getList'
            con = self.qcc_acc_manager.el_request(
                url, data=data, headers={'Referer': 'http://qichacha.com/'})
            if con is None:
                time.sleep(10)
                self.add_job(jobid)
                return
            try:
                if con.text.strip() == 'null':
                    print "NO DATA", jobid['kw'], "line:", jobid['line']
                    return
                j = json.loads(con.text)
                for job in j:
                    ## [{"KeyNo":"b37b1d9b84ad1ac179ddfcef5d0d533d","Name":"\u6d1b\u9633\u7261\u4e39\u901a\u8baf\u80a1\u4efd\u6709\u9650\u516c\u53f8"}]
                    kid = job["KeyNo"]
                    name = job["Name"]
                    self.add_job({'type': 'u2', 'kid': kid, 'name': name})
            except Exception:  # unexpected/garbled JSON: dump the body for inspection
                Log.errorbin(jobid['kw'], con.text)
        elif tp == 'u2':
            kid = jobid['kid']
            url = 'http://qichacha.com/firm_CN_' + kid
            if self.pagestore.check_should_fetch(kid):
                con = self.request_url(url)
                if con is None:
                    self.add_job(jobid)
                    return
                if self.pagestore.save(int(time.time()), kid, url, con.text):
                    print jobid['name'], kid, "saved"
            else:
                print "skip", kid
Example #6
class CVZLContactSpider(downloader_base.CVContactSpiderBase):
    def __init__(self, thcnt, cvaccs):
        self.cvaccs = cvaccs
        self.zlgm = MRLManager(cvaccs, new_ZLLogin)
        downloader_base.CVContactSpiderBase.__init__(self, thcnt, 'cv_zhilian')
        self.page_store = cv_download_page.CVZLDownloadPageStore()

        self.log = log_util.MLog(self.__class__.__name__, config.LOGGING_FILE)

    def run_job(self, job):

        realUrl = job['realUrl']
        cvId = job['cvId']
        indexUrl = "%s://%s" % (self.channel, cvId)

        # mark this CV as downloading
        self._cv_status.update({indexUrl: config.StatusCode.DOWNLOADING})

        page_template = config.CV_PAGE_TEMPLATE.get('cv_zhilian')
        cv_page_url = page_template.format(cvId)
        data = self.get_post_data(cvId, cv_page_url)
        content, status = self.with_check_request(self._download_url,
                                                  data=data,
                                                  realUrl=cv_page_url)

        if not content:
            self.log.warn('failed to download cv page: %s, re-adding to the queue' %
                          indexUrl)
            self.re_add_job(job)
            return

        status = self.page_store.save(time.time(), cvId, realUrl, content)

        # save failed: retry
        if status == config.StatusCode.AFTER_DOWNLOADING_FAIL:
            self.log.warn("cv %s download fail, readd to the queue" % indexUrl)
            self.re_add_job(job)
            return

        self._cv_status.update({indexUrl: status})

    def get_post_data(self, cvId, cv_page_url):

        res = self.zlgm.el_request(cv_page_url)
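        # the cv name follows the label "简历名称" ("resume name") on the preview page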
        find = re.search(ur'简历名称.*?<strong[^<>]*>(.*?)</strong>', res.text,
                         re.S)
        if not find:
            Log.errinfo("find zhilian cvname exception")
            return None

        cvname = find.group(1)
        cvname = re.sub(ur'&#160;&#160;', '', cvname)
        data = {
            "extID": cvId,
            "versionNumber": 1,
            "favoriteID": "113460230",
            "resumeName": cvname,
            "dType": 0
        }
        return data

    def try_next_proxy(self):
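        # rebuilding the manager forces a fresh login, assumed to come up on a different proxy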
        self.zlgm = MRLManager(self.cvaccs, new_ZLLogin)

    def with_check_request(self, url, data, realUrl):
        res = self.zlgm.el_request(url, data=data)

        if re.search(ur'您的登录IP(.*)存在异常行为,已被暂时冻结', res.text):  # login IP temporarily frozen
            print "trying next proxy ...."
            self.try_next_proxy()
            return self.with_check_request(url, data, realUrl)

        if ur'此应聘者的简历已被下载' in res.text:  # "this applicant's resume was already downloaded"
            Log.info("already downloaded, url = %s" % realUrl)

        # the caller unpacks (content, status), so always return a pair (mirrors Example #1)
        return res.text, ''
Example #7
class CVZhilianGetCV(Spider2):
    def __init__(self, thcnt, cfgname, acs):
        Spider2.__init__(self, thcnt)
        self._name = 'cvzlgetcv_%s' % cfgname
        self.zlm = MRLManager(acs, new_ZLLogin)
        self.pagestore = CVZLPageStore()
        self.hasher = spider.util.LocalHashChecker()
        self.zlm.ensure_login_do(None, lambda n: 1, None)
        self.zlm.release_obj()
        self.imgcnt = 0

    def init_jobs(self):
        return

    def wait_job(self):
        return self.wait_job_by_condition()

    def push_job(self, j):
        if j is None:
            self._no_more_wait_job = True
        else:
            self.add_job(j)

    def _get_image(self, refurl):
        imgurl = "http://rd2.zhaopin.com/s/loginmgr/monitorvalidatingcode.asp?t=" + str(
            int(time.time()) * 1000)
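        # the millisecond timestamp busts caching so each request yields a fresh captcha image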
        con = self.zlm.el_request(imgurl, headers={'Referer': refurl})
        if con is None:
            Log.warning("fetch image failed, sleep 1s")
            time.sleep(1)
            return self._get_image(refurl)
        return con.content

    def get_cv(self, url):
        #http://rd.zhaopin.com/resumepreview/resume/viewone/2/JM622670859R90250000000_1_1?searchresume=1
        con = self.zlm.el_request(url)
        if con is None:
            return None

        if u"您需要输入验证码才能继续后续的操作" in con.text:
            self.imgcnt += 1
            if self.imgcnt > 10:
                self.imgcnt = 0
                self.zlm.set_nologin()
                return None

            for i in range(0, 5):  # up to 5 captcha attempts through the OCR service
                code = OnlineOCR('zhilian2').resolve(
                    lambda dbgdata=None: self._get_image(url))
                purl = "http://rd.zhaopin.com/resumePreview/resume/_CheackValidatingCode?validatingCode=" + code
                con = self.zlm.el_request(purl,
                                          data={'validatingCode': code},
                                          headers={'Referer': url})
                if con is not None:
                    if re.search('true', con.text, re.I):
                        time.sleep(5)
                        return None
                Log.warning('captcha attempt failed')
                time.sleep(2)
            # failed 5 times in a row, switch accounts!!
            self.zlm.set_nologin()
            self.imgcnt = 0
            return None

        return con

    def run_job(self, jobid):
        # {'type':'cv', 'url':'http://rd.zhaopin.com/resumepreview/resume/viewone/2/JM321509749R90250002000_1_1?searchresume=1'}
        if self.get_job_type(jobid) == 'cv':
            url = jobid['url']
            m = re.search(ur'/([0-9A-Z]+)_\d+_\d+', url)
            if m is None:
                Log.error('invalid cv url', url)
                return
            jdid = m.group(1)
            if self.pagestore.check_should_fetch(jdid):
                con = self.get_cv(url)
                if con is None:
                    self.add_job(jobid)
                    return
                if u"该简历已被求职者删除" in con.text:
                    return
                if u"抱歉,该简历已被删除" in con.text:
                    return
                if u"由于系统繁忙,一会再来看一下吧" in con.text:
                    Log.warning("url %s 繁忙不可获得" % url)
                    return
                getime = int(time.time())
                self.pagestore.save(getime, jdid, url, con.text)
            else:
                Log.errinfo("跳过拉取简历%s" % jdid)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            title = ' '.join(sys.argv) + ' DONE'
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**', '*****@*****.**'],
                                 title, msg)
Example #8
class CVZhilianSearch(Spider2):

    PAGE_TEMPLATE = "http://rd.zhaopin.com/resumepreview/resume/viewone/1/%s_1_1"
    CRAWLER_RANGE_MAP = {
        '3d': '1,9',  # last 3 days
        '1w': '2,9',  # last week
        '2w': '3,9',  # last 2 weeks
        '1m': '4,9',  # last month
        '2m': '5,9',  # last 2 months
        '3m': '6,9',  # last 3 months
        '6m': '7,9',  # last 6 months
        '1y': '8,9',  # last year
    }

    def __init__(self, thcnt, acs):
        Spider2.__init__(self, thcnt)
        self._name = 'cv_zhilian'
        self.jobpusher = None
        self.zlm = MRLManager(acs, new_ZLLogin)
        self.headers = {
            'Referer': 'http://rdsearch.zhaopin.com/Home/ResultForCustom'
        }
        self.search_cnt = 0

        self.crawler_range = None

    def init_jobs(self):
        # fn = 'cv_zhilian.queries.txt'
        # fn = 'split1y.txt'
        fn = "one_month_splitly.txt"
        self.add_main_job_file({'type': 'main'}, fn)

    def search_cnt_checker(self, net):
        # after enough searches the account must be rotated, otherwise it may get blocked
        self.search_cnt += 1
        if self.search_cnt > 500:
            self.search_cnt = 0
            raise LoginErrors.AccountHoldError()

    def run_job(self, job):
        jt = self.get_job_type(job)
        if jt == 'main':
            joburl = CVZhilianUtil.get_search_url(json.loads(job['line']))
            # if this account can't search, then giveup.
            con = self.zlm.el_request(joburl,
                                      headers=self.headers,
                                      hint='search',
                                      prechecker=self.search_cnt_checker)
            if con.code == 404:
                con = None
            if con is None:
                Log.warning('failed to request the search page', joburl)
                self.add_job(job)
                return
            for su in CVZhilianUtil.sub_pages(joburl, con):
                self.add_job({'type': 'search', 'url': su})
            self.parse_page(joburl, con)
        elif jt == 'search':
            joburl = job['url']
            # if self.crawler_range:
            #     joburl = CVZhilianUtil.get_count()
            con = self.zlm.el_request(joburl,
                                      headers=self.headers,
                                      hint='search')
            if con.code == 404:
                con = None
            if con is None:
                Log.warning('failed to request the search page', joburl)
                self.add_job(job)
                return
            self.parse_page(joburl, con)

    def parse_page(self, url, con):
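        # u"请修改或适当减少搜索项再进行搜索" == "please narrow the search": shown for zero results, and also when the account is blocked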
        if u"请修改或适当减少搜索项再进行搜索" in con.text:
            if not self.zlm.cur_worker().cansearch:
                # Account BLOCKED ??
                self.zlm.cur_worker().isvalid = False
                raise RuntimeError("AccountBlocked")
            Log.error("NO_RESULT_OR_BLOCK", url)
            return
        try:
            hf = spider.util.htmlfind(con.text, 'div class="resumes-list"', 0)
            node = hf.get_node()
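            # each result row carries tag="<cvid>_1"; collect the unique cv ids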
            a = re.findall(ur'''tag="([a-zA-Z0-9]+)_1"''', node)
            a = spider.util.unique_list(a)
            for i in a:
                # print "found_cv", i
                self.jobpusher({
                    'type': 'cv',
                    'url': CVZhilianSearch.PAGE_TEMPLATE % i
                })
        except:
            msg = "unknown search result %s" % url
            Log.error(msg, "sleep 5s.")
            Log.errorbin(msg, con.text)
            time.sleep(5)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            self.jobpusher(None)
        elif evt == 'STARTED':
            #spider.misc.stacktracer.trace_start('res.trace.html')
            pass
Example #9
class QccSpider(Spider):
    def __init__(self, threadcnt, acc_file):
        # self.qcclogin = QccLogin(acc)
        self.pagestore = QccPageStore()
        self.qcc_acc_manager = MRLManager(
            QccData(acc_file).get_accounts(), QccLogin)
        super(QccSpider, self).__init__(threadcnt)

    def _do_requests(self, url, **kwargs):
        r = Spider._do_requests(self, url, **kwargs)
        if r is None:
            return r
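        # an empty response body is treated as the site blocking the exit ip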
        if r.text.strip() == u"":
            raise ProxyError('ip blocked.')
        return r

    def dispatch(self):
        # self.qcclogin.do_login()
        f = open("r1.txt", "rb")

        currline = 0
        # if len(sys.argv) is 4:
        #     skipto = int(sys.argv[1].strip())
        #     endline = int(sys.argv[2].strip())
        #     Log.warning("skipto %d, endline %d. account file is %s."% (skipto, endline, sys.argv[3]))
        # else:
        #     raise RuntimeError("please use command-line arguments. arg[1]=skipto, arg[2]=endline, arg[3]=account_file_path")
        skipto = 0
        endline = 20000
        for line in f:
            currline += 1
            if currline >= skipto:
                key = line.split(" ")[-1].strip()
                job = {"kw": key, "page": "1", "type": "u1"}
                self.add_main_job(job)
            if currline >= endline:
                break
        f.close()
        self.wait_q()
        self.add_job(None, True)

    def retry(self, con, job):
        if re.search(u'<h1>An error occurred.</h1>', con.text) or re.search(
                u'Tinyproxy was unable to', con.text):
            #should reload this page.
            if int(job["retrycnt"]) < 5:
                job["retrycnt"] = int(job["retrycnt"]) + 1
                self.add_job(job)
                return True
        return False

    def run_job(self, job):
        if job["type"] is "u1":
            key = job["kw"]
            page = str(job["page"])
            url = "http://qichacha.com/search?key=" + key + "&index=name&" + "p=" + page
            # con = self.qcclogin.request_url(url)
            con = self.qcc_acc_manager.el_request(url)
            res = con.text
            if res.strip() == "":
                time.sleep(10)
                self.add_job(job)
                return
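            # u'小查还没找到数据' means the site found no data for this keyword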
            elif re.search(u'小查还没找到数据', res):
                Log.error("key=" + key + ", page=" + page + ", no data!\n")
            else:
                Log.error("searching %s" % key)
                urls = self._match(
                    res, r'<h3 class="site-list-title"><a href="(.*?)"')
                if len(urls) == 0:
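                    # zero result links on a non-empty page usually means the account or session is dead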
                    Log.errorbin("%s %s" % (key, url), con.text)
                    raise AccountErrors.NoAccountError(key)
                for u in urls:
                    job2 = {"url": u, "type": "u2", "retrycnt": "0"}
                    self.add_job(job2)
                # catch page 1 only
                # if page is '1':
                #     corp_count = int(self._match(res, r'<span class="search-key">(.*?)</span>')[0])
                #     pg_count = (corp_count + 9)/10
                #     #not vip limit in 10 pages
                #     if pg_count >= 10:
                #         pg_count = 10
                #     for i in range(2, pg_count+1):
                #         job3 = {"kw": key, "page": str(i), "type": "u1"}
                #         self.add_job(job3)

        elif job["type"] is "u2":
            url = "http://qichacha.com" + job["url"]
            cpid = job["url"][1:]

            if self.pagestore.check_should_fetch(cpid):
                con = self.request_url(url)
                if con is None or self.retry(con, job):
                    return
                self.pagestore.save(int(time.time()), cpid, url, con.text)
            else:
                Log.warning("skip ", cpid)

    def _match(self, content, pattern1, pattern2=''):
        matches = re.findall(pattern1, content, re.S)
        if pattern2 != '':
            # second pass: apply pattern2 inside every first-pass match
            result_list = []
            for m in matches:
                for sub in re.findall(pattern2, m, re.S):
                    result_list.append(sub.strip())
            return result_list
        return [m.strip() for m in matches]

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            # spider.util.sendmail(['*****@*****.**', '*****@*****.**'], '%s DONE' % sys.argv[0], msg)
            pass
        elif evt == 'STARTED':
            #spider.misc.stacktracer.trace_start('res.trace.html')
            pass
Example #10
class GenQ(GenQueriesLT):
    def __init__(self, thcnt):
        GenQueriesLT.__init__(self, thcnt)
        self.m_db = CVChinahrStore()
        self._name = 'cv_chinahr'
        self.lgm = MRLManager(HrConfig.ac, new_ChinaHrLogin)

        self._tls = threading.local()

    def init_conditions(self):
        CData.add(self, 'live', qdata.city_data)  # must be the first condition, or results differ from opening the url directly
        CData.add(self, 'reFreshTime', CData.reFreshTime)
        CData.add(self, 'degree', CData.degree_data)
        CData.add(self, 'sex', CData.gender_data)
        CData.add(self, 'age', CData.age_data)
        CData.add(self, 'workStatus', CData.workStatus)
        CData.add(self, 'salary', CData.salary)
        # resume completeness

        CData.add(self, 'jobType', CData.jobType)
        CData.add(self, 'hasPhoto', CData.hasPhoto)
        CData.add(self, 'workPlace', qdata.city_data)
        CData.add(self, 'jobs', qdata.job_data)

        self.select_user_agent('firefox')

    def translate_data(self, o):
        url = {}
        if 'age' in o:
            m = re.split('-', o['age'])
            url.update({'minAge': m[0]})
            url.update({'maxAge': m[1]})

        if 'salary' in o:
            m = re.split('-', o['salary'])
            url.update({'minSalary': m[0]})
            url.update({'maxSalary': m[1]})

        for key in [
                'degree', 'jobs', 'sex', 'workPlace', 'reFreshTime', 'live',
                'jobType', 'workStatus', 'hasPhoto'
        ]:
            if key in o:
                url.update({key: o[key]})
        return url

    def need_split(self, url, level, isLast):
        params = self.translate_data(url)

        real_url = spider.util.compose_url_param(CData.SEARCH_URL, params)

        res = self.lgm.el_request(real_url)
        count = 0
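        # the total hit count is rendered as "搜索到 ...<span>N</span>" on the results page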
        find = re.search(u'搜索到.*?<span>(\d+).*?</span>', res.text, re.S)
        if find:
            count = int(find.group(1))
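            # more than 2999 hits: the query is too broad and must be split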
            if count > 2999:
                return True

            if count:
                setattr(self._tls, '_count', count)

            print "real_url: %s || count: %d" % (real_url, count)

        return False

    def log_url(self, url):

        if isinstance(url, dict):
            url = self.translate_data(url)
            url = json.dumps(url)

        count = getattr(self._tls, "_count", None)
        if count:
            self.fs.append(" %s|| %d" % (url, count))
Example #11
class CVLPSpider(Spider):
    def __init__(self, thcnt, acs, type=1, process_num=0, max_process_cnt=1):
        Spider.__init__(self, thcnt)
        self._name = 'cvlpspider'
        # type 2 accounts log in through new_LPLTLogin instead of new_LPQYLogin
        if type == 2:
            self.lpm = MRLManager(acs, new_LPLTLogin)
        else:
            self.lpm = MRLManager(acs, new_LPQYLogin)
        self.pagestore = LPCVStore()
        self.hasher = spider.util.LocalHashChecker()
        self.lpm.ensure_login_do(None, lambda n: 1, None)
        self.lpm.release_obj()
        self.imgcnt = 0
        self._type = type

        self._process_num = process_num
        self._max_process_cnt = max_process_cnt

        self._spider_cnt = 0
        self._start_time = datetime.datetime.today()
        self.url_prefix = 'https://lpt.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
        if self._type == 2:
            self.url_prefix = 'https://h.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
        self.stat = spider.runtime.StatDict()

        self._limit_cnt = 200

    def run_job(self, cvid):

        url = self.url_prefix.format(cvid)
        qstring = "liepincv://" + cvid
        if self.pagestore.check_should_fetch(qstring):
            self.stat.inc('cv')
            o = self.lpm.el_request(url,
                                    headers=Cdata.headers,
                                    allow_redirects=True)
            if o is None:
                self.add_job(cvid)
                return None
            self.pagestore.save(time.time(), cvid, url, o.text)
            time.sleep(5)
            self._spider_cnt += 1
            self._check_if_stop()
            print "start: %s - now: %s || spider cnt: %d" % (
                self._start_time, datetime.datetime.today(), self._spider_cnt)
        else:
            print '======%s has downloaded=====' % qstring

    def _check_if_stop(self):
        # throttle: after every _limit_cnt pages, pause for five minutes
        if self._spider_cnt % self._limit_cnt == 0:
            Log.info("spidered %d pages, sleeping 60*5s" % self._spider_cnt)
            time.sleep(60 * 5)

    def dispatch(self):
        with open(Cdata.IDS_FILE, 'rb') as f:

            for index, line in enumerate(f):
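                # shard the input: this process takes every _max_process_cnt-th line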
                if index % self._max_process_cnt != self._process_num:
                    continue

                line = line.strip()
                if not line:
                    continue

                if self.pagestore.find_any("%s://%s" % ("cv_liepin", line)):
                    continue
                if not self._is_needed_cv(line):
                    continue

                self.add_main_job(line)

        self.add_main_job(None)

    def _is_needed_cv(self, line):
        if not hasattr(self, 'not_need_cvs'):
            self.not_need_cvs = set()

            # note: the loop variable must not be named "line", or it would
            # clobber the cv id passed in as the argument
            if os.path.exists(LPCVConfig.NOT_NEED_CV_FN):
                with open(LPCVConfig.NOT_NEED_CV_FN, 'rb') as f:
                    for cvid in f:
                        cvid = cvid.strip()
                        if not cvid:
                            continue

                        self.not_need_cvs.add(cvid)

            if os.path.exists(LPCVConfig.NOT_ACCESS_BY_QIYE):
                with open(LPCVConfig.NOT_ACCESS_BY_QIYE, 'rb') as f:
                    for cvid in f:
                        cvid = cvid.strip()
                        if not cvid:
                            continue
                        self.not_need_cvs.add(cvid)

        if line in self.not_need_cvs:
            return False

        return True

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail(
                ['*****@*****.**'],
                'CVLPSpider process %d, DONE' % self._process_num,
                msg + '\n saved: %d' % self.pagestore.saved_count)