class LinkedInSpider(Spider):
    def __init__(self, thread_cnt, latest_type=None):
        super(LinkedInSpider, self).__init__(thread_cnt)
        self.page_store = PageStoreLinkedIn()
        self.speed_control_requests = SpeedControlRequests()
        self.latest_type = latest_type
        self.page_store.testmode = False

    def dispatch(self):
        if self.latest_type:
            LinkedInConfig.URL_TMPLATE += "&f_TP={}".format(self.latest_type)
        # enumerate the location (locId) / industry (indId) facet combinations
        # expected by URL_TMPLATE
        for locId in range(8876, 9046):
            for indId in range(0, 149):
                if indId == 2:
                    continue
                url = LinkedInConfig.URL_TMPLATE.format(locId, indId)
                self.add_main_job({"type": "search", "url": url})
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):
        if not isinstance(jobid, dict):
            return
        if jobid.get("type", None) == "search":
            self.parse_page(jobid.get("url"))
        elif jobid.get("type", None) == "jdurl":
            url = jobid.get("url")
            jobid = jobid.get("jobid", None)
            if not jobid:
                return
            res = self.speed_control_requests.with_sleep_requests(url, 0.5)
            if res is not None:
                self.page_store.save(int(time.time()), jobid, url, res.text)

    def parse_page(self, url):
        # search results are paged 25 at a time; stop at the first empty page
        for page_num in range(1, 41):
            real_url = url + "&start={}&count=25".format(25 * (page_num - 1))
            page = self.speed_control_requests.with_sleep_requests(real_url, 0.5)
            if page is None:
                return
            jobids = re.findall(r'linkedin.com/jobs2/view/(\d+)', page.text, re.S)
            jobids = set(jobids)
            if not jobids:
                return
            for jobid in jobids:
                url_page = LinkedInConfig.PAGE_TMPLATE.format(jobid)
                self.add_job({"type": "jdurl", "url": url_page, "jobid": jobid}, False)

    def event_handler(self, evt, msg, **kwargs):
        if "DONE" == evt:
            spider.util.sendmail(["<*****@*****.**>"], "linkedin jd爬取",
                                 msg + '\nsaved: %d' % self.page_store.saved_count)
            return
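# --- Hedged sketch (not part of the original source) -------------------------
# Every spider in this module delegates fetching to
# SpeedControlRequests.with_sleep_requests, whose implementation is not shown
# here (nor are the module-level imports such as re, time, json, spider.util).
# The class below is only an illustration of what such a throttled fetcher
# could look like: pause before each attempt, return the requests.Response on
# success and None on failure, matching how the callers test `res is not None`.
# The class name, retry count and timeout are assumptions, not the real API.
import time

import requests


class SpeedControlRequestsSketch(object):
    def with_sleep_requests(self, url, sleep=0.1, retries=3):
        """Fetch `url` after sleeping `sleep` seconds; None after `retries` failures."""
        for _ in range(retries):
            time.sleep(sleep)
            try:
                return requests.get(url, timeout=30)
            except requests.RequestException:
                continue
        return None
# -----------------------------------------------------------------------------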
class JobWealinkSpider(Spider):
    def __init__(self, thread_cnt):
        super(JobWealinkSpider, self).__init__(thread_cnt)
        self.page_store = PageStoreWL()
        self.speed_control_request = SpeedControlRequests()
        self.page_store.testmode = False

    def dispatch(self):
        for i in xrange(28261133, 31000000):
            self.add_main_job(str(i))
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):
        if not self.page_store.check_should_fetch(jobid):
            return
        url = "http://www.wealink.com/zhiwei/view/{}/".format(jobid)
        res = self.speed_control_request.with_sleep_requests(url, sleep=0.1)
        if res is None:
            self.re_add_job(jobid)
            Log.error("failed get url", url)
            return
        if res.code == 404:
            print "jobid: {} match nothing".format(jobid)
            return
        self.page_store.save(int(time.time()), jobid, url, res.text)
class JobLagouSpider(Spider):
    def __init__(self, thread_cnt):
        super(JobLagouSpider, self).__init__(thread_cnt)
        self.page_store = PageStoreLG()
        self.speed_control_request = SpeedControlRequests()
        self.page_store.testmode = False

    def dispatch(self):
        self.bs = BinSaver('joblagou.bin')
        for i in xrange(0, 1500000):
            self.add_main_job(str(i))
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):
        if not self.page_store.check_should_fetch(jobid):
            return
        url = "http://www.lagou.com/jobs/{}.html".format(jobid)
        res = self.speed_control_request.with_sleep_requests(url, sleep=0.1)
        if res is None:
            self.re_add_job(jobid)
            Log.error("failed get url", url)
            return
        # a position_del block means the posting has been taken down
        if htmlfind.findTag(res.text, 'div', 'position_del'):
            print "jobid: {} match nothing".format(jobid)
            return
        self.page_store.save(int(time.time()), jobid, url, res.text)
    def __init__(self, thread_cnt, company):
        super(LagouBycompany, self).__init__(thread_cnt)
        self.page_store = PageStoreLG()
        self.speed_control_requests = SpeedControlRequests()
        self.page_store.testmode = False
        self.list = []
        # `company` is a path to a file with one company identifier per line
        with open(company) as file_:
            for line in file_:
                self.list.append(line.strip())
class Jd58Spider(Spider):
    def __init__(self, thread_cnt):
        super(Jd58Spider, self).__init__(thread_cnt)
        self.page_store = Jd58PageStore()
        self.speed_control_request = SpeedControlRequests()
        self.page_store.testmode = False
        self.get_latest = None

    def real_dispatch(self):
        for url in urls:
            for ind in inds:
                i = 1
                while 1:
                    realUrl = url + ind + '/pn{}/'.format(i)
                    if self.get_latest:
                        # restrict listings to the last `get_latest` days via a
                        # postdate=YYYYMMDD_YYYYMMDD filter
                        l_time = spider.util.TimeHandler.getTimeOfNDayBefore(self.get_latest) / 1000
                        l_time_local = time.localtime(l_time)
                        l_time_str = '%04d%02d%02d' % (l_time_local[0], l_time_local[1], l_time_local[2])
                        h_time_local = time.localtime(time.time())
                        h_time_str = '%04d%02d%02d' % (h_time_local[0], h_time_local[1], h_time_local[2])
                        realUrl += "?postdate={}_{}".format(l_time_str, h_time_str)
                    # self.add_main_job({"urlpart": realUrl, "type": "loadPage"})
                    has_next = self.parse_html(realUrl)
                    if not has_next:
                        break
                    i += 1

    def parse_html(self, url):
        res = self.speed_control_request.with_sleep_requests(url, sleep=0.05)
        if not res:
            return True
        els = re.findall(r'entityId=(\d+)', res.text)
        if not els:
            return False
        part = url.split("pn")[0]
        for el in els:
            self.add_main_job({"urlpart": part, "jobid": el, "type": "jdPage"})
        # "新信息较少" marks the tail of the listing, so stop paging
        if re.search(ur'新信息较少', res.text):
            return False
        return True
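# --- Hedged sketch (not part of the original source) -------------------------
# Jd58Spider.real_dispatch builds the postdate=YYYYMMDD_YYYYMMDD window by hand
# from time.localtime tuples. Assuming getTimeOfNDayBefore returns a
# millisecond timestamp N days in the past, the same strings can be produced
# with time.strftime; the helper name below is illustrative only.
import time


def postdate_window(days_back):
    """Return (start, end) as YYYYMMDD strings, `end` being today."""
    now = time.time()
    start = time.strftime('%Y%m%d', time.localtime(now - days_back * 86400))
    end = time.strftime('%Y%m%d', time.localtime(now))
    return start, end

# e.g. realUrl += "?postdate={}_{}".format(*postdate_window(3))
# -----------------------------------------------------------------------------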
class LiepinSpider(Spider):
    def __init__(self, thread_cnt):
        super(LiepinSpider, self).__init__(thread_cnt)
        self.page_store = PageStoreLP()
        self.speed_control_requests = SpeedControlRequests()
        self.page_store.testmode = False

    def dispatch(self):
        for i in range(3362419 + 1, 9999999):
            self.add_main_job(i)
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):
        if isinstance(jobid, int):
            jobid = str(jobid)
        url = "http://job.liepin.com/{}_{}/".format(jobid[:3], jobid)
        res = self.speed_control_requests.with_sleep_requests(url, 0.1)
        if res is None:
            print "%s failed, sleeping 2 secs." % jobid
            time.sleep(2)
            self.add_job(jobid)
            return
        if re.search(u'您访问的页面不存在或已删除', res.text):
            print jobid, "match nothing"
        elif re.search(u'该职位已结束', res.text):
            print jobid, "match ending"
        elif re.search(u'您查看的职位已过期', res.text):
            print jobid, "match timeout"
        else:
            print "saving %s ..." % jobid
            self.page_store.save(int(time.time()), jobid, url, res.text)
class BaiduSchoolSpider(Spider):
    def __init__(self, threadcnt):
        super(BaiduSchoolSpider, self).__init__(threadcnt)
        self.speed_control_requests = SpeedControlRequests()
        self.wb = openpyxl.Workbook()
        self.ws = self.wb.active
        self.sheet_list = [
            u'高校', u'院校分类', u'办学性质', '211', '985', u'研究生院',
            u'院校隶属', u'办学类型', u'学历层次', u'标签'
        ]
        self.ws.append(self.sheet_list)

    def get_ids(self):
        url = 'http://baike.baidu.com/wikitag/api/getlemmas'
        form_data = {
            'limit': 30,
            'timeout': 3000,
            'filterTags': [0, 0, 0, 0, 0, 0, 0],
            'tagId': 60829,
            'fromLemma': 'false',
            'contentLength': 40,
            'page': 0
        }
        total_page = 81
        while form_data['page'] <= total_page:
            res = self.request_url(url, data=form_data)
            form_data['page'] += 1
            json_resp = json.loads(res.text)
            for item in json_resp['lemmaList']:
                lis = item['lemmaUrl'].split('/')
                # 6 means the url format is http://baike.baidu.com/subview/d1/d2.htm
                if len(lis) == 6:
                    id_lis = [str(lis[4]), str(lis[5].split('.')[0])]
                    yield {'id': id_lis}
                else:
                    yield {'id': str(lis[4].split('.')[0])}

    def dispatch(self):
        for jobid in self.get_ids():
            self.add_main_job(jobid)
        self.wait_q()
        self.add_main_job(None)

    def get_info(self, content):
        # extract school attributes from the Baidu Baike tag list and append
        # one row per school to the worksheet
        raw_info = {
            u'高校': u'不确定', u'院校分类': u'不确定', u'办学性质': u'不确定',
            '211': u'否', '985': u'否', u'研究生院': u'否',
            u'院校隶属': u'不确定', u'办学类型': u'不确定',
            u'学历层次': u'不确定', u'标签': u'不确定'
        }
        doc = html.fromstring(content)
        tag_list = doc.xpath(
            '//*[@id="open-tag-item"]/span/a/text()|//*[@id="open-tag-item"]/span/text()'
        )
        tag_list = [i.strip() for i in tag_list]
        tag = ' ' + ' '.join(tag_list) + ' '
        raw_info[u'标签'] = tag
        if u'211高校' in tag:
            raw_info['211'] = u'是'
        if u'985高校' in tag:
            raw_info['985'] = u'是'
        if u'研究生院高校' in tag:
            raw_info[u'研究生院'] = u'是'
        gaoxiao = doc.xpath('//h1/text()')
        if gaoxiao:
            raw_info[u'高校'] = gaoxiao[0]
        fenlei = re.findall(ur'\s([\u4e00-\u9fa5]*?类)高校\s', tag)
        if fenlei:
            raw_info[u'院校分类'] = fenlei[0]
        xingzhi = re.findall(ur'\s([\u4e00-\u9fa5]*?办)高校\s', tag)
        if xingzhi:
            raw_info[u'办学性质'] = xingzhi[0]
        lishu = re.findall(ur'\s([\u4e00-\u9fa5]*?)隶属高校\s', tag)
        if lishu:
            raw_info[u'院校隶属'] = lishu[0]
        elif u'地方所属高校' in tag:
            raw_info[u'院校隶属'] = u'地方所属'
        if u'本科' in tag:
            raw_info[u'学历层次'] = u'本科'
        else:
            raw_info[u'学历层次'] = u'专科'
        if u' 大学 ' in tag:
            raw_info[u'办学类型'] = u'大学'
        elif u' 学院 ' in tag:
            raw_info[u'办学类型'] = u'学院'
        elif u'高等专科院校' in tag:
            raw_info[u'办学类型'] = u'高等专科院校'
        elif u'高等职业技术院校' in tag:
            raw_info[u'办学类型'] = u'高等职业技术院校'
        elif u'独立学院' in tag:
            raw_info[u'办学类型'] = u'独立学院'
        elif u'成人高等院校' in tag:
            raw_info[u'办学类型'] = u'成人高等院校'
        elif u'短期职业大学' in tag:
            raw_info[u'办学类型'] = u'短期职业大学'
        elif u'管理干部学院' in tag:
            raw_info[u'办学类型'] = u'管理干部学院'
        elif u'教育学院' in tag:
            raw_info[u'办学类型'] = u'教育学院'
        elif u'高等学校分校' in tag:
            raw_info[u'办学类型'] = u'高等学校分校'
        else:
            raw_info[u'办学类型'] = u'其他'
        new_list = [raw_info[i] for i in self.sheet_list]
        self.ws.append(new_list)

    def run_job(self, jobid):
        if isinstance(jobid['id'], list):
            url = 'http://baike.baidu.com/subview/{}/{}.htm'.format(
                jobid['id'][0], jobid['id'][1])
            res = self.speed_control_requests.with_sleep_requests(url, 0.1)
            jobid_str = '&'.join(jobid['id'])
            if res is not None:
                print "saving %s ..." % jobid_str
                self.get_info(res.text)
                # self.page_store.save(int(time.time()), jobid_str, url, res.text)
            else:
                print "%s failed, sleeping 2 secs." % jobid_str
                time.sleep(2)
                self.add_job(jobid)
        elif isinstance(jobid['id'], str):
            url = 'http://baike.baidu.com/view/{}.htm'.format(jobid['id'])
            res = self.speed_control_requests.with_sleep_requests(url, 0.1)
            if res is not None:
                print "saving %s ..." % jobid['id']
                self.get_info(res.text)
                # self.page_store.save(int(time.time()), jobid, url, res.text)
            else:
                print "%s failed, sleeping 2 secs." % jobid['id']
                time.sleep(2)
                self.add_job(jobid)
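# --- Hedged usage sketch (not part of the original source) -------------------
# BaiduSchoolSpider only appends rows to the in-memory openpyxl worksheet; the
# snippet never persists it, so presumably the caller saves the workbook once
# the crawl finishes. The thread count, output filename and the blocking run()
# entry point on the Spider base class are assumptions here.
if __name__ == '__main__':
    crawler = BaiduSchoolSpider(10)
    crawler.run()                          # assumed blocking entry point
    crawler.wb.save('baidu_schools.xlsx')  # openpyxl Workbook.save
# -----------------------------------------------------------------------------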
class CV58Spider(Spider):
    def __init__(self, thread_cnt):
        super(CV58Spider, self).__init__(thread_cnt)
        self.page_store = CV58PageStore()
        self.speed_control_request = SpeedControlRequests()
        self.page_store.testmode = False
        self.get_latest = 3

    def real_dispatch(self):
        for url in urls:
            for ind in inds:
                i = 1
                while 1:
                    realUrl = "{}qz{}/pn{}".format(url, ind, i)
                    if self.get_latest:
                        # restrict listings to the last `get_latest` days via a
                        # postdate=YYYYMMDD000000_YYYYMMDD000000 filter
                        l_time = TimeHandler.getTimeOfNDayBefore(self.get_latest) / 1000
                        l_time_local = time.localtime(l_time)
                        l_time_str = '%04d%02d%02d' % (l_time_local[0], l_time_local[1], l_time_local[2])
                        h_time_local = time.localtime(time.time())
                        h_time_str = '%04d%02d%02d' % (h_time_local[0], h_time_local[1], h_time_local[2])
                        realUrl += "?postdate={}000000_{}000000".format(l_time_str, h_time_str)
                    # self.add_main_job({"urlpart": realUrl, "type": "loadPage"})
                    has_next = self.parse_html(realUrl)
                    if not has_next:
                        break
                    i += 1

    def parse_html(self, url):
        res = self.speed_control_request.with_sleep_requests(url, sleep=0.05)
        if res is None:
            # fetch failed: keep paging rather than crash on res.text
            return True
        els = re.findall(r'resume/(\d+)', res.text)
        els = set(els)
        if not els:
            return False
        for el in els:
            self.add_main_job({"jobid": el})
        return True

    def dispatch(self):
        self.real_dispatch()
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):
        url = "http://jianli.m.58.com/resume/{}/".format(jobid['jobid'])
        if not self.page_store.check_should_fetch(jobid['jobid']):
            return
        res = self.speed_control_request.with_sleep_requests(url, sleep=0.2)
        if res is not None:
            self.page_store.save(int(time.time()), jobid['jobid'], url, res.text)
        else:
            self.re_add_job(jobid)
            spider.util.Log.error(("failed get url", url))

    def event_handler(self, evt, msg, **kwargs):
        if "START" == evt:
            spider.util.sendmail(["<*****@*****.**>"], "58 jd爬取", msg)
            return
        if "DONE" == evt:
            spider.util.sendmail(["<*****@*****.**>"], "58 jd爬取", msg)
            return
class LatestLagouSpider(Spider):
    def __init__(self, thread_cnt):
        super(LatestLagouSpider, self).__init__(thread_cnt)
        self.page_store = PageStoreLG()
        self.speed_control_requests = SpeedControlRequests()
        self.page_store.testmode = False

    def getIds(self, q):
        url = "http://www.lagou.com/jobs/positionAjax.json"
        hasNext = True
        pageIndex = 0
        total_num = 100
        while hasNext and pageIndex <= total_num:
            pageIndex += 1
            q["pn"] = pageIndex
            res = self.request_url(url, data=q)
            json_resp = json.loads(res.text)
            if "content" in json_resp and "positionResult" in json_resp["content"] \
                    and "result" in json_resp["content"]["positionResult"]:
                # if pageIndex == 1:
                #     total_num = json_resp["content"]["totalPageCount"]
                if not json_resp["content"]["positionResult"]["result"]:
                    hasNext = False
                elif json_resp["content"]["positionResult"]["result"]:
                    hasNext = True
                    for item in json_resp["content"]["positionResult"]["result"]:
                        create_time = item['createTimeSort']
                        # stop paging once a posting older than two days appears;
                        # postings from before yesterday are not wanted
                        if TimeHandler.isBeforeNDay(create_time, 2):
                            yield item["positionId"]
                            break
                        yield item["positionId"]

    def dispatch(self):
        self.bs = BinSaver('joblagou.bin')
        for query in q:
            try:
                for jobid in self.getIds(query):
                    if isinstance(jobid, int):
                        jobid = str(jobid)
                    self.add_main_job(jobid)
            except Exception as e:
                continue
        self.wait_q()
        self.add_main_job(None)

    def run_job(self, jobid):
        if not self.page_store.check_should_fetch(jobid):
            return
        url = "http://www.lagou.com/jobs/{}.html".format(jobid)
        res = self.speed_control_requests.with_sleep_requests(url, sleep=0.1)
        if res is None:
            self.re_add_job(jobid)
            Log.error("failed get url", url)
            return
        if htmlfind.findTag(res.text, 'div', 'position_del'):
            print "jobid: {} match nothing".format(jobid)
        self.page_store.save(int(time.time()), jobid, url, res.text)

    def event_handler(self, evt, msg, **kwargs):
        if "DONE" == evt:
            spider.util.sendmail(["<*****@*****.**>"], "lagou jd爬取",
                                 msg + '\nsaved: %d' % self.page_store.saved_count)
            return
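# --- Hedged launch sketch (not part of the original source) ------------------
# LatestLagouSpider.dispatch iterates over a module-level list `q` of query
# payloads for positionAjax.json; neither `q` nor the launch code is shown
# above. The thread count and the blocking run() entry point (which would
# eventually fire the "DONE" event handled by event_handler) are assumptions.
if __name__ == '__main__':
    crawler = LatestLagouSpider(5)
    crawler.run()  # assumed entry point; triggers the "DONE" mail when finished
# -----------------------------------------------------------------------------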