class CV51ContactSpider(downloader_base.CVContactSpiderBase):
    REAL_URL_TEMPLATE = "http://ehire.51job.com/Candidate/ResumeView.aspx?hidUserID=%s&hidEvents=23&hidKey=303514e18dbc5600b3ecfff9abb76510"

    def __init__(self, thcnt, cvaccs):
        self.cv51nm = MRLManager(cvaccs, new_51Login)
        downloader_base.CVContactSpiderBase.__init__(self, thcnt, 'cv_51job')
        self.page_store = cv_download_page.CV51DownloadPageStore()
        self.log = log_util.MLog(self.__class__.__name__, config.LOGGING_FILE)

    def run_job(self, job):
        realUrl = job.get('realUrl', '')
        cvId = job['cvId']
        indexUrl = "%s://%s" % ("cv_51job", cvId)
        if not realUrl or 'fake' in realUrl:
            realUrl = self.get_real_url(cvId)
        # mark this CV as downloading
        self._cv_status.update({indexUrl: config.StatusCode.DOWNLOADING})
        data = {
            "doType": "SearchToCompanyHr",
            "userId": cvId,
            "strWhere": '',
        }
        content, status = self.with_check_request(self._download_url, data=data, realUrl=realUrl)
        if not content:
            self.log.info('failed to download cv page: %s, re-adding to the queue' % indexUrl)
            self.re_add_job(job)
            return
        status = self.page_store.save(time.time(), cvId, realUrl, content)
        # retry on post-download failure
        if status == config.StatusCode.AFTER_DOWNLOADING_FAIL:
            self.log.warn("cv %s download failed, re-adding to the queue" % indexUrl)
            self.re_add_job(job)
        self._cv_status.update({indexUrl: status})

    def with_check_request(self, url, data, realUrl):
        res = self.cv51nm.el_request(url, data=data)
        # if u'简历已在公司人才夹中' in res.text:
        res = self.cv51nm.el_request(realUrl)
        if u"此人简历保密" in res.text:
            return res.text, config.StatusCode.CV_CLOSED
        return res.text, ''

    def get_real_url(self, cvId):
        return CV51ContactSpider.REAL_URL_TEMPLATE % cvId
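# Standalone illustration of the two urls the 51job spider works with: the
# real contact page comes from REAL_URL_TEMPLATE with the cv id substituted in,
# and the page-store key is the 'cv_51job://<cvId>' index url. The cv id below
# is made up for the example.
_example_cv_id = '123456789'
_example_real_url = ("http://ehire.51job.com/Candidate/ResumeView.aspx?"
                     "hidUserID=%s&hidEvents=23&hidKey=303514e18dbc5600b3ecfff9abb76510"
                     % _example_cv_id)
_example_index_url = "cv_51job://%s" % _example_cv_id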
class ChinaHrCVGet(Spider):
    def __init__(self, thread_cnt):
        Spider.__init__(self, thread_cnt)
        self._name = 'chinahr_cv_get'
        self.lgm = MRLManager(HrConfig.ac, new_ChinaHrLogin)
        self.page_store = CVChinahrStore()

    def dispatch(self):
        with open('res.cv_chinahr.txt') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                line = line.split("||")[0]
                self.add_main_job({"url": json.loads(line), "type": 'search', 'page': '1'})
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):
        type_ = jobid.get("type", "")
        if not type_:
            return
        if "search" == type_:
            params = jobid.get('url')
            params.update({"page": jobid.get('page')})
            real_url = spider.util.compose_url_param(CData.SEARCH_URL, params)
            res = self.lgm.el_request(real_url)
            find = re.findall(r'cvId="(.*?)"', res.text, re.S)
            for cvid in find:
                self.add_job({"cvid": cvid, 'type': 'cvpage'})
            if jobid.get('page') == '1':
                self.parse_next(res.text, params)
        if "cvpage" == type_:
            real_url = CData.CV_PAGE_URL_TMPLATE.format(jobid.get('cvid'))
            res = self.lgm.el_request(real_url)
            self.page_store.save(time.time(), jobid.get('cvid'), real_url, res.text)

    def parse_next(self, content, params):
        p = copy.deepcopy(params)
        del p['page']
        find = re.search(u'搜索到.*?<span>(\d+).*?</span>', content, re.S)
        if find:
            # 20 results per page; derive the number of extra search pages to queue
            totalpage = (int(find.group(1)) + 20) / 20
            for page in range(2, totalpage):
                self.add_job({'url': p, 'type': 'search', 'page': str(page)})

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail(['*****@*****.**'], 'cv_chinahr done', msg)
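# Standalone sketch of the pagination arithmetic behind parse_next above:
# chinahr shows 20 results per page, so the page count follows from the total
# result count parsed off the first page. Ceiling division is shown here for
# clarity; the counts are made-up examples, not values from the site.
def _example_total_pages(total_results, page_size=20):
    # e.g. 40 results -> 2 pages, 41 results -> 3 pages
    return (total_results + page_size - 1) // page_size

# pages beyond the first are then queued as additional 'search' jobs, since
# page 1 has already been fetched when parse_next runs.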
class CVZhilianSplit(GenQueries):
    def __init__(self, thcnt, ac):
        GenQueries.__init__(self, thcnt)
        self._last_time = 0.0
        self.zlm = MRLManager(ac, new_ZLLogin)
        self.headers = {'Referer': 'http://rdsearch.zhaopin.com/Home/ResultForCustom'}
        self.search_cnt = 0

    def init_conditions(self):
        # refresh date: fixed, pinned to the last month below
        # split conditions: age, gender, education, registered residence (hukou),
        # current work status, current residence, company type, company size
        CVZhilianData.add(self, 'SF_1_1_7', [['4,9', '最近一个月']])
        CVZhilianData.add(self, 'SF_1_1_8', CVZhilianData.agelist)
        CVZhilianData.add(self, 'SF_1_1_9', CVZhilianData.gender)
        CVZhilianData.add(self, 'SF_1_1_6', qdata.provs)   # current residence
        CVZhilianData.add(self, 'SF_1_1_5', CVZhilianData.edugr)
        CVZhilianData.add(self, 'SF_1_1_10', qdata.provs)  # registered residence (hukou)
        CVZhilianData.add(self, 'SF_1_1_29', CVZhilianData.workstate)
        CVZhilianData.add(self, 'SF_1_1_31', CVZhilianData.corp_type)
        CVZhilianData.add(self, 'SF_1_1_30', CVZhilianData.corp_size)
        self.zlm.ensure_login_do(None, lambda n: 1, None)
        cansearch = self.zlm.cur_worker().cansearch
        self.zlm.release_obj()
        if not cansearch:
            raise RuntimeError("this account can't search!")

    def search_cnt_checker(self, net):
        self.search_cnt += 1
        if self.search_cnt > 380:
            self.search_cnt = 0
            raise LoginErrors.AccountHoldError()

    def need_split(self, opts, level, isLast):
        url = CVZhilianUtil.get_search_url(opts)
        con = self.zlm.el_request(url, headers=self.headers, prechecker=self.search_cnt_checker)
        if con is not None and con.code == 404:
            con = None
        if con is None:
            Log.warning('failed to request search page', url)
            time.sleep(5)
            return self.need_split(opts, level, isLast)
        cnt = CVZhilianUtil.get_count(url, con)
        if cnt == 0:
            return 0
        return cnt >= 4000
class CVLPSpider(Spider):
    def __init__(self, thcnt, acs, type=1):
        Spider.__init__(self, thcnt)
        self._name = 'cvlpspider'
        self.lpm = MRLManager(acs, new_LPQYLogin)
        if type == 2:
            self.lpm = MRLManager(acs, new_LPLTLogin)
        self.pagestore = LPCVStore()
        self.hasher = spider.util.LocalHashChecker()
        self.lpm.ensure_login_do(None, lambda n: 1, None)
        self.lpm.release_obj()
        self.imgcnt = 0
        self._type = type
        self.url_prefix = 'https://lpt.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
        if self._type == 2:
            self.url_prefix = 'https://h.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
        self.stat = spider.runtime.StatDict()

    def run_job(self, jobd):
        if jobd.get('type') == 'cvurl':
            cvid = jobd.get('jobid')
            url = self.url_prefix.format(cvid)
            qstring = "liepincv://" + cvid
            if self.pagestore.check_should_fetch(qstring):
                self.stat.inc('cv')
                o = self.lpm.el_request(url, headers=Cdata.headers, allow_redirects=True)
                if o is None:
                    self.add_job(jobd)
                    return None
                self.pagestore.save(time.time(), cvid, url, o.text)
                time.sleep(3)
            else:
                print '======%s has downloaded=====' % qstring

    def dispatch(self):
        with open("res.spider.txt", 'rb') as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue
                # take the first tab-separated field as the cv id and queue it
                # as a 'cvurl' job, which is the shape run_job expects
                cvid = line.split("\t")[0]
                self.add_main_job({'type': 'cvurl', 'jobid': cvid})
        self.wait_q()
        self.add_main_job(None)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail(['*****@*****.**', '*****@*****.**'], '%s DONE' % sys.argv[0], msg)
class QccSpider2(Spider):
    def __init__(self, thcnt):
        Spider.__init__(self, thcnt)
        account = [{'qqno': '285259106', 'qqpwd': '123@456'}]
        self.pagestore = QccPageStore()
        self.qcc_acc_manager = MRLManager(account, self._newqcc, shared=True)

    def _newqcc(self, ac):
        a = QccLogin(ac)
        a.load_proxy('curproxy0', index=1, auto_change=False)
        return a

    def dispatch(self):
        # self.qcclogin.do_login()
        f = open("r1.txt", "rb")
        currline = 0
        skipto = 363000
        endline = 1000000
        while currline < skipto:
            f.readline()
            currline += 1
        while currline < endline:
            line = f.readline()
            if not line:
                break
            line = line.strip()
            currline += 1
            key = line.split(" ")[-1].strip()
            job = {"kw": key, "page": "1", "type": "u1", 'line': currline}
            self.add_main_job(job)
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):
        tp = self.get_job_type(jobid)
        if tp == 'u1':
            print "searching", jobid['kw'], "line:", jobid['line']
            data = {
                'key': jobid['kw'],
                'token': hashlib.md5('f625a5b661058ba5082ca508f99ffe1b' + jobid['kw']).hexdigest(),
                'type': 0
            }
            url = 'http://qichacha.com/gongsi_getList'
            con = self.qcc_acc_manager.el_request(url, data=data, headers={'Referer': 'http://qichacha.com/'})
            if con is None:
                time.sleep(10)
                self.add_job(jobid)
                return
            try:
                if con.text.strip() == 'null':
                    print "NO DATA", jobid['kw'], "line:", jobid['line']
                    return
                j = json.loads(con.text)
                # response is a list of dicts, e.g.
                # [{"KeyNo":"b37b1d9b84ad1ac179ddfcef5d0d533d","Name":"\u6d1b\u9633\u7261\u4e39\u901a\u8baf\u80a1\u4efd\u6709\u9650\u516c\u53f8"}]
                for job in j:
                    kid = job["KeyNo"]
                    name = job["Name"]
                    self.add_job({'type': 'u2', 'kid': kid, 'name': name})
            except:
                Log.errorbin(jobid['kw'], con.text)
        elif tp == 'u2':
            kid = jobid['kid']
            url = 'http://qichacha.com/firm_CN_' + kid
            if self.pagestore.check_should_fetch(kid):
                con = self.request_url(url)
                if con is None:
                    self.add_job(jobid)
                    return
                if self.pagestore.save(int(time.time()), kid, url, con.text):
                    print jobid['name'], kid, "saved"
            else:
                print "skip", kid
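# Standalone illustration (hypothetical keyword) of the request token posted to
# qichacha's gongsi_getList endpoint above: md5 over a fixed salt concatenated
# with the search keyword, hex-encoded. The salt string is the one used in run_job.
import hashlib

def _example_qcc_token(keyword, salt='f625a5b661058ba5082ca508f99ffe1b'):
    # token = md5(salt + keyword)
    return hashlib.md5(salt + keyword).hexdigest()

# _example_qcc_token('tencent') -> a 32-character hex digest sent as the 'token' field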
class CVZLContactSpider(downloader_base.CVContactSpiderBase):
    def __init__(self, thcnt, cvaccs):
        self.cvaccs = cvaccs
        self.zlgm = MRLManager(cvaccs, new_ZLLogin)
        downloader_base.CVContactSpiderBase.__init__(self, thcnt, 'cv_zhilian')
        self.page_store = cv_download_page.CVZLDownloadPageStore()
        self.log = log_util.MLog(self.__class__.__name__, config.LOGGING_FILE)

    def run_job(self, job):
        realUrl = job['realUrl']
        cvId = job['cvId']
        indexUrl = "%s://%s" % (self.channel, cvId)
        # mark this CV as downloading
        self._cv_status.update({indexUrl: config.StatusCode.DOWNLOADING})
        page_template = config.CV_PAGE_TEMPLATE.get('cv_zhilian')
        cv_page_url = page_template.format(cvId)
        data = self.get_post_data(cvId, cv_page_url)
        content, status = self.with_check_request(self._download_url, data=data, realUrl=cv_page_url)
        if not content:
            self.log.warn('failed to download cv page: %s, re-adding to the queue' % indexUrl)
            self.re_add_job(job)
            return
        status = self.page_store.save(time.time(), cvId, realUrl, content)
        # retry on post-download failure
        if status == config.StatusCode.AFTER_DOWNLOADING_FAIL:
            self.log.warn("cv %s download failed, re-adding to the queue" % indexUrl)
            self.re_add_job(job)
            return
        self._cv_status.update({indexUrl: status})

    def get_post_data(self, cvId, cv_page_url):
        res = self.zlgm.el_request(cv_page_url)
        find = re.search(ur'简历名称.*?<strong[^<>]*>(.*?)</strong>', res.text, re.S)
        if not find:
            Log.errinfo("find zhilian cvname exception")
            return None
        cvname = find.group(1)
        # strip &nbsp; entities from the resume name
        cvname = re.sub(ur'&nbsp;', '', cvname)
        data = {
            "extID": cvId,
            "versionNumber": 1,
            "favoriteID": "113460230",
            "resumeName": cvname,
            "dType": 0
        }
        return data

    def try_next_proxy(self):
        self.zlgm = MRLManager(self.cvaccs, new_ZLLogin)

    def with_check_request(self, url, data, realUrl):
        res = self.zlgm.el_request(url, data=data)
        if re.search(ur'您的登录IP(.*)存在异常行为,已被暂时冻结', res.text):
            print "trying next proxy ...."
            self.try_next_proxy()
            return self.with_check_request(url, data, realUrl)
        if ur'此应聘者的简历已被下载' in res.text:
            Log.info("already downloaded, url = %s" % realUrl)
        # return page content and an empty status so the caller can save it
        return res.text, ''
class CVZhilianGetCV(Spider2):
    def __init__(self, thcnt, cfgname, acs):
        Spider2.__init__(self, thcnt)
        self._name = 'cvzlgetcv_%s' % cfgname
        self.zlm = MRLManager(acs, new_ZLLogin)
        self.pagestore = CVZLPageStore()
        self.hasher = spider.util.LocalHashChecker()
        self.zlm.ensure_login_do(None, lambda n: 1, None)
        self.zlm.release_obj()
        self.imgcnt = 0

    def init_jobs(self):
        return

    def wait_job(self):
        return self.wait_job_by_condition()

    def push_job(self, j):
        if j is None:
            self._no_more_wait_job = True
        else:
            self.add_job(j)

    def _get_image(self, refurl):
        imgurl = "http://rd2.zhaopin.com/s/loginmgr/monitorvalidatingcode.asp?t=" + str(int(time.time()) * 1000)
        con = self.zlm.el_request(imgurl, headers={'Referer': refurl})
        if con is None:
            Log.warning("fetch image failed, sleep 1s")
            time.sleep(1)
            return self._get_image(refurl)
        return con.content

    def get_cv(self, url):
        # e.g. http://rd.zhaopin.com/resumepreview/resume/viewone/2/JM622670859R90250000000_1_1?searchresume=1
        con = self.zlm.el_request(url)
        if con is None:
            return None
        if u"您需要输入验证码才能继续后续的操作" in con.text:
            self.imgcnt += 1
            if self.imgcnt > 10:
                self.imgcnt = 0
                self.zlm.set_nologin()
                return None
            for i in range(0, 5):
                code = OnlineOCR('zhilian2').resolve(lambda dbgdata=None: self._get_image(url))
                purl = "http://rd.zhaopin.com/resumePreview/resume/_CheackValidatingCode?validatingCode=" + code
                con = self.zlm.el_request(purl, data={'validatingCode': code}, headers={'Referer': url})
                if con is not None:
                    if re.search('true', con.text, re.I):
                        time.sleep(5)
                        return None
                Log.warning('captcha verification failed')
                time.sleep(2)
            # failed 5 times in a row, switch to another account!
            self.zlm.set_nologin()
            self.imgcnt = 0
            return None
        return con

    def run_job(self, jobid):
        # {'type': 'cv', 'url': 'http://rd.zhaopin.com/resumepreview/resume/viewone/2/JM321509749R90250002000_1_1?searchresume=1'}
        if self.get_job_type(jobid) == 'cv':
            url = jobid['url']
            m = re.search(ur'/([0-9A-Z]+)_\d+_\d+', url)
            if m is None:
                Log.error('invalid cv url', url)
                return
            jdid = m.group(1)
            if self.pagestore.check_should_fetch(jdid):
                con = self.get_cv(url)
                if con is None:
                    self.add_job(jobid)
                    return
                if u"该简历已被求职者删除" in con.text:
                    return
                if u"抱歉,该简历已被删除" in con.text:
                    return
                if u"由于系统繁忙,一会再来看一下吧" in con.text:
                    Log.warning("url %s is busy, cv not available" % url)
                    return
                getime = int(time.time())
                self.pagestore.save(getime, jdid, url, con.text)
            else:
                Log.errinfo("skip fetching cv %s" % jdid)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            title = ' '.join(sys.argv) + ' DONE'
            msg += "saved: %d\n" % self.pagestore.saved_count
            spider.util.sendmail(['*****@*****.**', '*****@*****.**'], title, msg)
class CVZhilianSearch(Spider2):
    PAGE_TEMPLATE = "http://rd.zhaopin.com/resumepreview/resume/viewone/1/%s_1_1"
    CRAWLER_RANGE_MAP = {
        '3d': '1,9',  # last 3 days
        '1w': '2,9',  # last week
        '2w': '3,9',  # last 2 weeks
        '1m': '4,9',  # last month
        '2m': '5,9',  # last 2 months
        '3m': '6,9',  # last 3 months
        '6m': '7,9',  # last 6 months
        '1y': '8,9',  # last year
    }

    def __init__(self, thcnt, acs):
        Spider2.__init__(self, thcnt)
        self._name = 'cv_zhilian'
        self.jobpusher = None
        self.zlm = MRLManager(acs, new_ZLLogin)
        self.headers = {'Referer': 'http://rdsearch.zhaopin.com/Home/ResultForCustom'}
        self.search_cnt = 0
        self.crawler_range = None

    def init_jobs(self):
        # fn = 'cv_zhilian.queries.txt'
        # fn = 'split1y.txt'
        fn = "one_month_splitly.txt"
        self.add_main_job_file({'type': 'main'}, fn)

    def search_cnt_checker(self, net):
        # after a certain number of searches the account must be swapped out,
        # otherwise it may get blocked
        self.search_cnt += 1
        if self.search_cnt > 500:
            self.search_cnt = 0
            raise LoginErrors.AccountHoldError()

    def run_job(self, job):
        jt = self.get_job_type(job)
        if jt == 'main':
            joburl = CVZhilianUtil.get_search_url(json.loads(job['line']))
            # if this account can't search, then give up.
            con = self.zlm.el_request(joburl, headers=self.headers, hint='search',
                                      prechecker=self.search_cnt_checker)
            if con is not None and con.code == 404:
                con = None
            if con is None:
                Log.warning('failed to request search page', joburl)
                self.add_job(job)
                return
            for su in CVZhilianUtil.sub_pages(joburl, con):
                self.add_job({'type': 'search', 'url': su})
            self.parse_page(joburl, con)
        elif jt == 'search':
            joburl = job['url']
            # if self.crawler_range:
            #     joburl = CVZhilianUtil.get_count()
            con = self.zlm.el_request(joburl, headers=self.headers, hint='search')
            if con is not None and con.code == 404:
                con = None
            if con is None:
                Log.warning('failed to request search page', joburl)
                self.add_job(job)
                return
            self.parse_page(joburl, con)

    def parse_page(self, url, con):
        if u"请修改或适当减少搜索项再进行搜索" in con.text:
            if not self.zlm.cur_worker().cansearch:
                # account blocked??
                self.zlm.cur_worker().isvalid = False
                raise RuntimeError("AccountBlocked")
            Log.error("NO_RESULT_OR_BLOCK", url)
            return
        try:
            hf = spider.util.htmlfind(con.text, 'div class="resumes-list"', 0)
            node = hf.get_node()
            a = re.findall(ur'''tag="([a-zA-Z0-9]+)_1"''', node)
            a = spider.util.unique_list(a)
            for i in a:
                # print "found_cv", i
                self.jobpusher({'type': 'cv', 'url': CVZhilianSearch.PAGE_TEMPLATE % i})
        except:
            msg = "unknown search result %s" % url
            Log.error(msg, "sleep 5s.")
            Log.errorbin(msg, con.text)
            time.sleep(5)

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            self.jobpusher(None)
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
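# Standalone illustration of how parse_page above turns result-list markup into
# cv preview jobs: ids are pulled out with the tag="<id>_1" regex and formatted
# into PAGE_TEMPLATE. The HTML snippet below is made up for the example.
import re

_example_html = '<div class="resumes-list"><a tag="JM123456789R90250000000_1">...</a></div>'
_example_ids = re.findall(r'tag="([a-zA-Z0-9]+)_1"', _example_html)
# -> ['JM123456789R90250000000']
_example_jobs = [{'type': 'cv',
                  'url': "http://rd.zhaopin.com/resumepreview/resume/viewone/1/%s_1_1" % i}
                 for i in _example_ids]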
class QccSpider(Spider):
    def __init__(self, threadcnt, acc_file):
        # self.qcclogin = QccLogin(acc)
        self.pagestore = QccPageStore()
        self.qcc_acc_manager = MRLManager(QccData(acc_file).get_accounts(), QccLogin)
        super(QccSpider, self).__init__(threadcnt)

    def _do_requests(self, url, **kwargs):
        r = Spider._do_requests(self, url, **kwargs)
        if r is None:
            return r
        if r.text.strip() == u"":
            raise ProxyError('ip blocked.')
        return r

    def dispatch(self):
        # self.qcclogin.do_login()
        f = open("r1.txt", "rb")
        currline = 0
        # if len(sys.argv) is 4:
        #     skipto = int(sys.argv[1].strip())
        #     endline = int(sys.argv[2].strip())
        #     Log.warning("skipto %d, endline %d. account file is %s." % (skipto, endline, sys.argv[3]))
        # else:
        #     raise RuntimeError("please use command-line arguments. arg[1]=skipto, arg[2]=endline, arg[3]=account_file_path")
        skipto = 0
        endline = 20000
        for line in f:
            currline += 1
            if currline >= skipto:
                key = line.split(" ")[-1].strip()
                job = {"kw": key, "page": "1", "type": "u1"}
                self.add_main_job(job)
            if currline >= endline:
                break
        self.wait_q()
        self.add_job(None, True)

    def retry(self, con, job):
        # reload the page when the proxy returned an error page instead of content
        if re.search(u'<h1>An error occurred.</h1>', con.text) or re.search(u'Tinyproxy was unable to', con.text):
            if int(job["retrycnt"]) < 5:
                job["retrycnt"] = int(job["retrycnt"]) + 1
                self.add_job(job)
            return True
        return False

    def run_job(self, job):
        if job["type"] == "u1":
            key = job["kw"]
            page = str(job["page"])
            url = "http://qichacha.com/search?key=" + key + "&index=name&" + "p=" + page
            # con = self.qcclogin.request_url(url)
            con = self.qcc_acc_manager.el_request(url)
            res = con.text
            if res.strip() == "":
                time.sleep(10)
                self.add_job(job)
                return
            elif re.search(u'小查还没找到数据', res):
                Log.error("key=" + key + ", page=" + page + ", no data!\n")
            else:
                Log.error("searching %s" % key)
                urls = self._match(res, r'<h3 class="site-list-title"><a href="(.*?)"')
                if len(urls) == 0:
                    Log.errorbin("%s %s" % (key, url), con.text)
                    raise AccountErrors.NoAccountError(key)
                for u in urls:
                    job2 = {"url": u, "type": "u2", "retrycnt": "0"}
                    self.add_job(job2)
                # catch page 1 only
                # if page == '1':
                #     corp_count = int(self._match(res, r'<span class="search-key">(.*?)</span>')[0])
                #     pg_count = (corp_count + 9) / 10
                #     # non-vip accounts are limited to 10 pages
                #     if pg_count >= 10:
                #         pg_count = 10
                #     for i in range(2, pg_count + 1):
                #         job3 = {"kw": key, "page": str(i), "type": "u1"}
                #         self.add_job(job3)
        elif job["type"] == "u2":
            url = "http://qichacha.com" + job["url"]
            cpid = job["url"][1:]
            if self.pagestore.check_should_fetch(cpid):
                con = self.request_url(url)
                if con is None or self.retry(con, job):
                    return
                self.pagestore.save(int(time.time()), cpid, url, con.text)
            else:
                Log.warning("skip ", cpid)

    def _match(self, content, pattern1, pattern2=''):
        matches = re.findall(pattern1, content, re.S)
        result_list = []
        if pattern2 != '':
            for i in range(len(matches)):
                tlist = re.findall(pattern2, matches[i], re.S)
                for j in range(len(tlist)):
                    result_list.append(tlist[j].strip())
            return result_list
        else:
            for i in range(len(matches)):
                matches[i] = matches[i].strip()
            return matches

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            # spider.util.sendmail(['*****@*****.**', '*****@*****.**'], '%s DONE' % sys.argv[0], msg)
            pass
        elif evt == 'STARTED':
            # spider.misc.stacktracer.trace_start('res.trace.html')
            pass
class GenQ(GenQueriesLT):
    def __init__(self, thcnt):
        GenQueriesLT.__init__(self, thcnt)
        self.m_db = CVChinahrStore()
        self._name = 'cv_chinahr'
        self.lgm = MRLManager(HrConfig.ac, new_ChinaHrLogin)
        self._tls = threading.local()

    def init_conditions(self):
        # 'live' must be the first condition, otherwise the results differ from
        # what the url shows when visited directly
        CData.add(self, 'live', qdata.city_data)
        CData.add(self, 'reFreshTime', CData.reFreshTime)
        CData.add(self, 'degree', CData.degree_data)
        CData.add(self, 'sex', CData.gender_data)
        CData.add(self, 'age', CData.age_data)
        CData.add(self, 'workStatus', CData.workStatus)
        CData.add(self, 'salary', CData.salary)
        # resume completeness
        CData.add(self, 'jobType', CData.jobType)
        CData.add(self, 'hasPhoto', CData.hasPhoto)
        CData.add(self, 'workPlace', qdata.city_data)
        CData.add(self, 'jobs', qdata.job_data)
        self.select_user_agent('firefox')

    def translate_data(self, o):
        url = {}
        if 'age' in o:
            m = re.split('-', o['age'])
            url.update({'minAge': m[0]})
            url.update({'maxAge': m[1]})
        if 'salary' in o:
            m = re.split('-', o['salary'])
            url.update({'minSalary': m[0]})
            url.update({'maxSalary': m[1]})
        for key in ['degree', 'jobs', 'sex', 'workPlace', 'reFreshTime', 'live',
                    'jobType', 'workStatus', 'hasPhoto']:
            if key in o:
                url.update({key: o[key]})
        return url

    def need_split(self, url, level, isLast):
        params = self.translate_data(url)
        real_url = spider.util.compose_url_param(CData.SEARCH_URL, params)
        res = self.lgm.el_request(real_url)
        count = 0
        find = re.search(u'搜索到.*?<span>(\d+).*?</span>', res.text, re.S)
        if find:
            count = int(find.group(1))
        if count > 2999:
            return True
        if count:
            setattr(self._tls, '_count', count)
            print "real_url: %s || count: %d" % (real_url, count)
        return False

    def log_url(self, url):
        if isinstance(url, dict):
            url = self.translate_data(url)
            url = json.dumps(url)
        count = getattr(self._tls, "_count", None)
        if count:
            url = " %s|| %d" % (url, count)
        self.fs.append(url)
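# Standalone sketch of the translate_data mapping above (the condition values
# here are hypothetical): range conditions like age and salary are split into
# min/max params, and the remaining known keys are copied through, ready for
# compose_url_param.
def _example_translate(cond):
    params = {}
    if 'age' in cond:
        lo, hi = cond['age'].split('-')
        params.update({'minAge': lo, 'maxAge': hi})
    if 'salary' in cond:
        lo, hi = cond['salary'].split('-')
        params.update({'minSalary': lo, 'maxSalary': hi})
    for key in ('degree', 'jobs', 'sex', 'workPlace', 'reFreshTime', 'live',
                'jobType', 'workStatus', 'hasPhoto'):
        if key in cond:
            params[key] = cond[key]
    return params

# _example_translate({'age': '25-30', 'sex': '1'})
# -> {'minAge': '25', 'maxAge': '30', 'sex': '1'}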
class CVLPSpider(Spider):
    def __init__(self, thcnt, acs, type=1, process_num=0, max_process_cnt=1):
        Spider.__init__(self, thcnt)
        self._name = 'cvlpspider'
        self.lpm = MRLManager(acs, new_LPQYLogin)
        if type == 2:
            self.lpm = MRLManager(acs, new_LPLTLogin)
        self.pagestore = LPCVStore()
        self.hasher = spider.util.LocalHashChecker()
        self.lpm.ensure_login_do(None, lambda n: 1, None)
        self.lpm.release_obj()
        self.imgcnt = 0
        self._type = type
        self._process_num = process_num
        self._max_process_cnt = max_process_cnt
        self._spider_cnt = 0
        self._start_time = datetime.datetime.today()
        self.url_prefix = 'https://lpt.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
        if self._type == 2:
            self.url_prefix = 'https://h.liepin.com/resume/showresumedetail/?res_id_encode={}&isBatch=0'
        self.stat = spider.runtime.StatDict()
        self._limit_cnt = 200

    def run_job(self, cvid):
        url = self.url_prefix.format(cvid)
        qstring = "liepincv://" + cvid
        if self.pagestore.check_should_fetch(qstring):
            self.stat.inc('cv')
            o = self.lpm.el_request(url, headers=Cdata.headers, allow_redirects=True)
            if o is None:
                self.add_job(cvid)
                return None
            self.pagestore.save(time.time(), cvid, url, o.text)
            time.sleep(5)
            self._spider_cnt += 1
            self._check_if_stop()
            print "start: %s - now: %s || spider cnt: %d" % (self._start_time, datetime.datetime.today(), self._spider_cnt)
        else:
            print '======%s has downloaded=====' % qstring

    def _check_if_stop(self):
        # pause after every _limit_cnt pages to slow the crawl down
        if self._spider_cnt % self._limit_cnt == 0:
            Log.info("fetched %d pages, sleeping 60*5s" % self._spider_cnt)
            time.sleep(60 * 5)

    def dispatch(self):
        with open(Cdata.IDS_FILE, 'rb') as f:
            for index, line in enumerate(f):
                # shard the id file across processes by line number
                if index % self._max_process_cnt != self._process_num:
                    continue
                line = line.strip()
                if not line:
                    continue
                if self.pagestore.find_any("%s://%s" % ("cv_liepin", line)):
                    continue
                if not self._is_needed_cv(line):
                    continue
                self.add_main_job(line)
        self.add_main_job(None)

    def _is_needed_cv(self, line):
        # lazily load the exclusion lists once, then filter against them
        if not hasattr(self, 'not_need_cvs'):
            self.not_need_cvs = set()
            if os.path.exists(LPCVConfig.NOT_NEED_CV_FN):
                with open(LPCVConfig.NOT_NEED_CV_FN, 'rb') as f:
                    for l in f:
                        l = l.strip()
                        if l:
                            self.not_need_cvs.add(l)
            if os.path.exists(LPCVConfig.NOT_ACCESS_BY_QIYE):
                with open(LPCVConfig.NOT_ACCESS_BY_QIYE, 'rb') as f:
                    for l in f:
                        l = l.strip()
                        if l:
                            self.not_need_cvs.add(l)
        if line in self.not_need_cvs:
            return False
        return True

    def event_handler(self, evt, msg, **kwargs):
        if evt == 'DONE':
            spider.util.sendmail(['*****@*****.**'],
                                 'CVLPSpider process %d, DONE' % self._process_num,
                                 msg + '\n saved: %d' % self.pagestore.saved_count)
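# Standalone sketch of the modulo sharding used in dispatch above: each of
# max_process_cnt processes takes every N-th line of the id file, selected by
# its process_num. The file path and counts here are hypothetical.
def _example_shard_lines(path, process_num, max_process_cnt):
    ids = []
    with open(path) as f:
        for index, line in enumerate(f):
            if index % max_process_cnt != process_num:
                continue  # this line belongs to another process
            line = line.strip()
            if line:
                ids.append(line)
    return ids

# e.g. with max_process_cnt=4, process 0 reads lines 0, 4, 8, ...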