def list_page(self, response):
    """Parse a Lagou job list page."""
    if isinstance(response.save, str):
        # Workaround: response.save is sometimes passed in as a plain str
        response.save = {"type": response.save}
    # A dead proxy causes a 599 fetch error, so the page has to be re-crawled
    recrawl = True if response.error else False
    if recrawl:
        ip = response.save.get("ip")
        port = response.save.get("port")
        proxyObj.delete(ip, port)  # drop the dead proxy from the pool
    if "login" in response.url or recrawl:
        # Redirected to the login page or the fetch failed: retry with fresh headers and a fresh proxy
        headers = getHeaders()
        ip, port = getProxy()
        response.save["ip"] = ip
        response.save["port"] = port
        self.crawl(response.orig_url,
                   save=response.save,
                   callback=self.list_page,
                   headers=headers,
                   proxy=proxyFormat % (ip, port),
                   itag=str(time.time()))
        return
    for tag_div in response.doc('div#s_position_list > ul.item_con_list > li div.position').items():
        # Link to the job detail page
        link = tag_div("div.p_top > a.position_link").attr.href
        # Work experience and education requirement
        work_experi, edu_bg = stripTag(str(tag_div("div.p_bot span").next()), ["i", ""]).split("/")
        headers = getHeaders()
        save = {
            "type": response.save.get("type"),
            "work_experi": work_experi,
            "edu_bg": edu_bg
        }
        self.crawl(link, save=save, callback=self.detail_page, headers=headers)
    # Pagination; pass the whole save dict, passing only the type str caused the bug handled above
    link_next = response.doc('div.pager_container > a:last').attr.href
    headers = getHeaders()
    self.crawl(link_next, save=response.save, callback=self.list_page, headers=headers)
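# stripTag is not defined in this snippet. A minimal sketch of what the call
# above appears to assume: strip the listed tag names from an HTML fragment
# (an empty entry strips whatever tags remain), leaving plain text that can
# then be split on "/". The name comes from the call site; the signature and
# behaviour below are assumptions, not the original helper.
import re

def stripTag(html, tags):
    for tag in tags:
        if tag:
            # remove opening/closing tags of this name, e.g. <i ...> and </i>
            html = re.sub(r'</?%s(\s[^>]*)?>' % re.escape(tag), '', html)
        else:
            # an empty entry means "strip any tag that is left"
            html = re.sub(r'<[^>]+>', '', html)
    return html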
def on_start(self):
    headers = getHeaders()
    ip, port = getProxy()
    self.crawl("http://sou.zhaopin.com/",
               callback=self.index_page,
               headers=headers,
               proxy=proxyFormat % (ip, port))
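# getProxy, proxyObj and proxyFormat are not defined in this snippet. The
# handlers use them as follows: getProxy() returns an (ip, port) pair,
# proxyObj.delete(ip, port) drops a proxy that caused a 599 fetch error, and
# proxyFormat % (ip, port) builds the string passed to self.crawl(proxy=...).
# Below is a minimal in-memory sketch under those assumptions; the real
# project presumably backs this with a proper proxy pool.
import random

proxyFormat = "%s:%s"  # pyspider accepts proxies in "host:port" form

class ProxyPool(object):
    def __init__(self, proxies):
        # proxies: list of (ip, port) tuples
        self.proxies = list(proxies)

    def get(self):
        return random.choice(self.proxies)

    def delete(self, ip, port):
        # remove a proxy that failed, if it is still in the pool
        if (ip, port) in self.proxies:
            self.proxies.remove((ip, port))

proxyObj = ProxyPool([("127.0.0.1", "8888")])  # placeholder entry

def getProxy():
    return proxyObj.get()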
def index_page(self, response):
    """Parse the Lagou homepage, extract the job-category links and crawl them."""
    for tag_a in response.doc('div.menu_box > div.menu_sub.dn > dl > dd > a').items():
        # Job category, e.g. "java" or "python"
        _type = tag_a.text()
        headers = getHeaders()
        # headers.update(**requests.utils.dict_from_cookiejar(requests.get("https://www.lagou.com/").cookies))
        self.crawl(tag_a.attr.href,
                   save={"type": _type},
                   callback=self.list_page,
                   headers=headers)  # fetch_type='js'
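# getHeaders is not defined in this snippet. Judging from its use, it builds a
# fresh set of request headers (most likely rotating the User-Agent) for every
# self.crawl call. A minimal sketch under that assumption:
import random

USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0",
]

def getHeaders():
    return {
        "User-Agent": random.choice(USER_AGENTS),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
    }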
def index_page(self, response):
    """Parse the Zhilian job search page, extract the job-category links and crawl them."""
    for tag_a in response.doc('#search_bottom_content_demo > div.clearfixed > h1 > a').items():
        # Rewrite the location parameter in the search URL to jl=0
        link = re.sub(re.compile('jl=.*?&', re.S), 'jl=0&', tag_a.attr.href)
        headers = getHeaders()
        ip, port = getProxy()
        self.crawl(link,
                   callback=self.list_page,
                   headers=headers,
                   proxy=proxyFormat % (ip, port))
def detail_page(self, response):
    """Parse a Lagou job detail page and extract its fields."""
    if isinstance(response.save, str):
        # Workaround: response.save is sometimes passed in as a plain str
        response.save = {}
    recrawl = True if response.error else False
    if recrawl:
        ip = response.save.get("ip")
        port = response.save.get("port")
        proxyObj.delete(ip, port)  # drop the dead proxy from the pool
    if "login" in response.url or recrawl:
        headers = getHeaders()
        ip, port = getProxy()
        response.save["ip"] = ip
        response.save["port"] = port
        self.crawl(response.orig_url,
                   save=response.save,
                   callback=self.detail_page,
                   headers=headers,
                   proxy=proxyFormat % (ip, port),
                   itag=str(time.time()))
        return
    position = response.doc('div.position-head div.job-name > span').text()  # job title
    salary = response.doc('div.position-head dd.job_request > p > span:nth-child(1)').text()  # salary
    require = response.doc('div.position-head dd.job_request > p:nth-child(1)').text().replace(salary, '')  # requirements
    release_time = response.doc('div.position-head dd.job_request > p.publish_time').text().replace('发布于拉勾网', '')  # publish time
    welfare = response.doc('dl#job_detail > dd.job-advantage > p').text()  # perks
    job_description = response.doc('dl#job_detail > dd.job_bt > div').text()  # job description
    company = response.doc('dl#job_company > dt h2').text().replace(response.doc('dl#job_company > dt h2 span').text(), '')  # company
    industry = response.doc('dl#job_company > dd li:nth-child(1)').text().replace("领域", "")  # industry
    _type = response.save.get("type")  # job category
    addr = response.doc('dl#job_detail > dd.job-address.clearfix > div.work_addr').text().replace('查看地图', '')  # work address
    work_experi = response.save.get("work_experi")  # work experience
    edu_bg = response.save.get("edu_bg")  # education requirement
    fetch_time = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())  # crawl time
    return {
        'url': response.url,
        '职位': position,
        '工资': salary,
        '要求': require.strip(),
        '发布时间': release_time.strip(),
        '职位诱惑': welfare.strip(),
        '职位描述': job_description.strip(),
        '公司': company.strip(),
        '行业,领域': industry.strip(),
        '类型': _type,
        '工作地址': addr.strip(),
        "工作经验": work_experi.strip(),
        "学历要求": edu_bg.strip(),
        '抓取时间': fetch_time
    }
def list_page(self, response):
    """Parse a Zhilian job list page."""
    for tag_a in response.doc("table.newlist tr:nth-child(1) > td.zwmc > div > a:nth-child(1)").items():
        link = tag_a.attr.href
        headers = getHeaders()
        ip, port = getProxy()
        self.crawl(link,
                   callback=self.detail_page,
                   headers=headers,
                   proxy=proxyFormat % (ip, port))
    # Pagination
    ip, port = getProxy()
    self.crawl(response.doc('.next-page').attr.href,
               callback=self.list_page,
               proxy=proxyFormat % (ip, port))
def on_start(self):
    self.crawl('https://user.qzone.qq.com/111111111/infocenter',
               headers=getHeaders(),
               callback=self.index_page,
               validate_cert=False,
               fetch_type='js',
               js_script=scroll,
               itag=str(time.time()))
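# scroll is not defined in this snippet. pyspider's js_script parameter takes a
# JavaScript snippet that runs in the rendered page; for a QZone feed the
# intent is presumably to scroll so lazily-loaded items are rendered before
# parsing. A rough sketch of such a script, not the original:
scroll = """
function() {
    // scroll down a few times so lazily-loaded feed items are rendered
    var i = 0;
    var timer = setInterval(function() {
        window.scrollTo(0, document.body.scrollHeight);
        if (++i >= 10) clearInterval(timer);
    }, 500);
}
"""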
def on_start(self):
    headers = getHeaders()
    self.crawl('https://www.lagou.com/',
               callback=self.index_page,
               headers=headers)
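# The handlers above come from several pyspider scripts (Lagou, Zhilian and
# QZone each have their own on_start/index_page/list_page/detail_page). Each
# script wraps its methods in a pyspider Handler class, roughly as sketched
# below; the imports and decorator values follow pyspider's default project
# template rather than the original source.
import re
import time

from pyspider.libs.base_handler import *


class Handler(BaseHandler):
    crawl_config = {}

    @every(minutes=24 * 60)
    def on_start(self):
        # entry point: seed the crawl as in the on_start methods above
        pass

    @config(age=10 * 24 * 60 * 60)
    def index_page(self, response):
        pass

    @config(priority=2)
    def detail_page(self, response):
        pass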