def parse(self, response):
    # Each <li> under the job list holds one job posting
    for each in response.xpath('//div[@class="job-list"]/ul//li'):
        item = ZhipinspiderItem()
        # Note: .extract()[0] raises IndexError when a node is missing;
        # extract_first() (used in later variants) returns None instead
        item['title'] = each.xpath(
            './div/div[@class="info-primary"]/h3/a/div[@class="job-title"]/text()'
        ).extract()[0]
        item['salary'] = each.xpath(
            './div/div[@class="info-primary"]/h3/a/span/text()').extract()[0]
        item['company'] = each.xpath(
            './div/div[@class="info-company"]/div/h3/a/text()').extract()[0]
        item['url'] = each.xpath(
            './div/div[@class="info-primary"]/h3/a/@href').extract()[0]
        item['work_addr'] = each.xpath(
            './div/div[@class="info-primary"]/p/text()').extract()[0]
        item['industry'] = each.xpath(
            './div/div[@class="info-company"]/div/p/text()').extract()[0]
        item['company_size'] = each.xpath(
            './div/div[@class="info-company"]/div/p/text()').extract()[-1]
        item['recruiter'] = each.xpath(
            './div/div[@class="info-publis"]/h3/text()').extract()[0]
        yield item

    # Follow the "next page" link, if there is one
    next_pages = response.xpath(
        '//div[@class="page"]/a[@class="next"]/@href').extract()
    if next_pages:
        next_page = next_pages[0]
        yield scrapy.Request("https://www.zhipin.com" + next_page,
                             callback=self.parse)
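# A note on the pagination tail above: hard-coding the host works here, but
# Scrapy responses also provide urljoin() and follow(), which resolve the
# relative href against the current page. A minimal sketch of the same tail
# using response.follow (the method name parse_with_follow is hypothetical;
# the selector is the one used above):
def parse_with_follow(self, response):
    # ... item extraction as above ...
    next_page = response.xpath(
        '//div[@class="page"]/a[@class="next"]/@href').extract_first()
    if next_page:
        # follow() resolves the relative URL and builds the Request
        yield response.follow(next_page, callback=self.parse_with_follow)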
def parse(self, response):
    # Iterate over every //div[@class="job-primary"] node on the page
    for job_primary in response.xpath('//div[@class="job-primary"]'):
        item = ZhipinspiderItem()
        # Match the ./div[@class="info-primary"] node under the
        # //div[@class="job-primary"] node
        info_primary = job_primary.xpath('./div[@class="info-primary"]')
        item['title'] = info_primary.xpath(
            './div[@class="primary-wrapper"]/div[@class="primary-box"]'
            '/div[@class="job-title"]/span[@class="job-name"]/a/text()'
        ).extract_first()
        item['salary'] = info_primary.xpath(
            './div[@class="primary-wrapper"]/div[@class="primary-box"]'
            '/div[@class="job-limit clearfix"]/span[@class="red"]/text()'
        ).extract_first()
        item['work_addr'] = info_primary.xpath(
            './div[@class="primary-wrapper"]/div[@class="primary-box"]'
            '/div[@class="job-title"]/span[@class="job-area-wrapper"]'
            '/span[@class="job-area"]/text()'
        ).extract_first()
        item['url'] = info_primary.xpath('./h3/a/@href').extract_first()
        company_text = job_primary.xpath(
            './div[@class="info-company"]/div[@class="company-text"]')
        item['company'] = company_text.xpath('./h3/a/text()').extract_first()
        company_info = company_text.xpath('./p/text()').extract()
        if company_info:
            item['industry'] = company_info[0]
        if len(company_info) > 2:
            item['company_size'] = company_info[2]
        info_publish = job_primary.xpath('./div[@class="info-publish"]')
        item['recruiter'] = info_publish.xpath('./h3/text()').extract_first()
        item['publish_date'] = info_publish.xpath('./p/text()').extract_first()
        yield item
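# Every variant in this file assigns fields on a ZhipinspiderItem. A minimal
# sketch of the item definition those assignments imply (the field names are
# read off the code above; the project's actual items.py may differ, and the
# variants below that use other names, e.g. company_name or work_spot, would
# need matching fields):
import scrapy

class ZhipinspiderItem(scrapy.Item):
    title = scrapy.Field()          # job title
    salary = scrapy.Field()         # salary range
    company = scrapy.Field()        # company name
    url = scrapy.Field()            # URL of the job's detail page
    work_addr = scrapy.Field()      # work location
    industry = scrapy.Field()       # company industry
    company_size = scrapy.Field()   # company headcount
    recruiter = scrapy.Field()      # recruiter's name
    publish_date = scrapy.Field()   # date the posting was published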
def parse(self, response):
    # Collect all job postings on the current page
    node_list = response.xpath('//div[@id="main"]/div/div[2]/ul/li')
    # Loop over the nodes and parse each posting
    for node in node_list:
        item = ZhipinspiderItem()
        item['name'] = node.xpath('.//div[@class="job-title"]/span[1]/a/text()').extract()[0]
        item['salary'] = node.xpath('.//div[@class="job-limit clearfix"]/span/text()').extract()[0]
        item['address'] = node.xpath('.//div[@class="job-title"]/span[2]/span/text()').extract()[0]
        item['years'] = node.xpath('.//div[@class="job-limit clearfix"]/p/text()').extract()[0]
        item['education'] = node.xpath('.//div[@class="job-limit clearfix"]/p/text()').extract()[1]
        item['company_name'] = node.xpath('.//div[@class="company-text"]/h3/a/text()').extract()[0]
        item['company_type'] = node.xpath('.//div[@class="company-text"]/p/a/text()').extract()[0]
        company = node.xpath('.//div[@class="company-text"]/p/text()').extract()
        item['company_scale'] = company[0]
        item['company_finance'] = company[1]
        yield item

    self.current_page += 1
    # Crawl two pages at most
    if self.current_page > 1:
        print("All data has been crawled!")
    else:
        time.sleep(5)
        # Use yield to hand the object back to the Scrapy engine. return is
        # not usable here: it would end the whole method, so the loop could
        # not continue, whereas yield creates a generator.
        yield self.next_request()
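# The variant above calls self.next_request() and reads self.current_page,
# neither of which is shown; it also needs "import time" at module level for
# time.sleep(). A plausible sketch of the missing pieces, assuming the listing
# paginates through a ?page=N query parameter (the class name, attribute
# names, and URL pattern here are assumptions for illustration):
import scrapy

class PagedZhipinSketch(scrapy.Spider):
    name = 'zhipin_paged_sketch'
    base_url = 'https://www.zhipin.com/c101280100/?page={}'  # assumed pattern
    current_page = 1  # parse() increments this after finishing a page

    def next_request(self):
        # parse() has already incremented current_page, so request that page
        return scrapy.Request(self.base_url.format(self.current_page),
                              callback=self.parse)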
def parse(self, response):
    for job_primary in response.xpath(
            '//div[@id="main"]/div/div[2]/ul/li/div'):
        item = ZhipinspiderItem()
        info_primary = job_primary.xpath('./div[@class="info-primary"]')
        # extract_first() is required here: without it the item would store
        # the raw SelectorList rather than the title text
        item['title'] = info_primary.xpath(
            './h3/a/div[@class="job-title"]/text()').extract_first()
        yield item
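# Why extract_first() matters in the fix above: xpath() returns a
# SelectorList; extract() turns it into a list of strings (so indexing an
# empty result raises IndexError), while extract_first() returns the first
# string or None. A self-contained illustration using a standalone Selector:
from scrapy.selector import Selector

sel = Selector(text='<div class="job-title">Python Engineer</div>')
assert sel.xpath('//div[@class="job-title"]/text()').extract_first() == 'Python Engineer'
assert sel.xpath('//div[@class="missing"]/text()').extract_first() is None
assert sel.xpath('//div[@class="missing"]/text()').extract() == []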
def parse(self, response): for job_primary in response.xpath('//div[@class="job_primary"]'): item = ZhipinspiderItem() info_primary = job_primary.xpath('./div[@class="info_primary"]') item['title'] = info_primary.xpath('./h3/a/div[@class="job_title"]/text()').extract_first() item['salary'] = info_primary.xpath('./h3/a/span[@class="red"]/text()').exract_first() item['work_addr'] = info_primary.xpath('./p/text()').exract_first() item['url'] = info_primary.xpath('./h3/a/@herf').exract_first() yield item
def parse(self, response):
    # Parse the "next page" link first
    new_links = response.xpath(
        '//div[@class="page"]/a[@class="next"]/@href').extract()
    if new_links:
        # Get the link to the next page
        new_link = new_links[0]
        # Send another request to fetch the next page
        yield scrapy.Request("https://www.zhipin.com" + new_link,
                             callback=self.parse)

    # Iterate over every //div[@class="job-primary"] node on the page
    for job_primary in response.xpath('//div[@class="job-primary"]'):
        # Create a fresh item per job; reusing one item across iterations
        # (as the original did) lets later jobs overwrite earlier ones
        item = ZhipinspiderItem()
        # Match the ./div[@class="info-primary"] node under
        # //div[@class="job-primary"], i.e. the <div.../> holding the job info
        info_primary = job_primary.xpath('./div[@class="info-primary"]')
        item['title'] = info_primary.xpath(
            './h3/a/div[@class="job-title"]/text()').extract_first()
        item['salary'] = info_primary.xpath(
            './h3/a/span[@class="red"]/text()').extract_first()
        item['work_addr'] = info_primary.xpath('./p/text()').extract_first()
        item['url'] = info_primary.xpath('./h3/a/@href').extract_first()
        # Match ./div[@class="info-company"]/div[@class="company-text"] under
        # //div[@class="job-primary"], i.e. the <div.../> holding the
        # company info
        company_text = job_primary.xpath(
            './div[@class="info-company"]/div[@class="company-text"]')
        item['company'] = company_text.xpath('./h3/a/text()').extract_first()
        company_info = company_text.xpath('./p/text()').extract()
        if company_info:
            item['industry'] = company_info[0]
        if len(company_info) > 2:
            item['company_size'] = company_info[2]
        # Match ./div[@class="info-publis"] under //div[@class="job-primary"],
        # i.e. the <div.../> holding the publisher info
        info_publis = job_primary.xpath('./div[@class="info-publis"]')
        item['recruiter'] = info_publis.xpath('./h3/text()').extract_first()
        item['publish_date'] = info_publis.xpath('./p/text()').extract_first()
        yield item
def parse(self, response):
    # Iterate over every //div[@class="job-primary"] node on the page
    for job_primary in response.xpath('//div[@class="job-primary"]'):
        item = ZhipinspiderItem()
        # Match the ./div[@class="info-primary"] node under
        # //div[@class="job-primary"], i.e. the <div.../> holding the job info
        info_primary = job_primary.xpath('./div[@class="info-primary"]')
        item['title'] = info_primary.xpath(
            './h3/a/div[@class="job-title"]/text()').extract_first()
        item['salary'] = info_primary.xpath(
            './h3/a/span[@class="red"]/text()').extract_first()
        item['work_addr'] = info_primary.xpath('./p/text()').extract_first()
        item['url'] = info_primary.xpath('./h3/a/@href').extract_first()
        # Match ./div[@class="info-company"]/div[@class="company-text"] under
        # //div[@class="job-primary"], i.e. the <div.../> holding the
        # company info
        company_text = job_primary.xpath(
            './div[@class="info-company"]/div[@class="company-text"]')
        item['company'] = company_text.xpath('./h3/a/text()').extract_first()
        company_info = company_text.xpath('./p/text()').extract()
        if company_info:
            item['industry'] = company_info[0]
        # Guard with > 2 since index 2 is what is read (the original checked
        # > 1, which could raise IndexError)
        if len(company_info) > 2:
            item['company_size'] = company_info[2]
        # Match the ./div[@class="info-publis"] node under
        # //div[@class="job-primary"], i.e. the <div.../> holding the
        # publisher info
        info_publis = job_primary.xpath('./div[@class="info-publis"]')
        item['recruiter'] = info_publis.xpath('./h3/text()').extract_first()
        yield item

    # Parse the "next page" link
    new_links = response.xpath(
        '//div[@class="page"]/a[@class="next"]/@href').extract()
    if new_links:
        # Get the link to the next page
        new_link = new_links[0]
        # Send another request to fetch the next page
        yield scrapy.Request("https://www.zhipin.com" + new_link,
                             callback=self.parse)
def parse(self, response):
    # Iterate over every //div[@class="job-primary"] node on the page. Each
    # node holds one job posting: build one ZhipinspiderItem per node and
    # fill it with the fields this project cares about.
    for job_primary in response.xpath('//div[@class="job-primary"]'):
        item = ZhipinspiderItem()
        # Match the ./div[@class="info-primary"] node under
        # //div[@class="job-primary"], i.e. the div element holding the
        # job info
        info_primary = job_primary.xpath('./div[@class="info-primary"]')
        item['title'] = info_primary.xpath(
            './h3/a/div[@class="job-title"]/text()').extract_first()
        item['salary'] = info_primary.xpath(
            './h3/a/span[@class="red"]/text()').extract_first()
        item['work_addr'] = info_primary.xpath('./p/text()').extract_first()
        item['url'] = info_primary.xpath('./h3/a/@href').extract_first()
        # Match ./div[@class="info-company"]/div[@class="company-text"] under
        # //div[@class="job-primary"], i.e. the div element holding the
        # company info
        company_text = job_primary.xpath(
            './div[@class="info-company"]/div[@class="company-text"]')
        item['company'] = company_text.xpath('./h3/a/text()').extract_first()
        company_info = company_text.xpath('./p/text()').extract()
        if company_info:
            item['industry'] = company_info[0]
        if len(company_info) > 2:
            item['company_size'] = company_info[2]
        # Match the ./div[@class="info-publis"] node under
        # //div[@class="job-primary"], i.e. the div element holding the
        # publisher info
        info_publis = job_primary.xpath('./div[@class="info-publis"]')
        item['recruiter'] = info_publis.xpath('./h3/text()').extract_first()
        item['publish_date'] = info_publis.xpath('./p/text()').extract_first()
        # Use yield to hand the item back to the Scrapy engine. return is not
        # usable here: it would end the whole method, so the loop could not
        # continue, whereas yield creates a generator.
        yield item

    # Parse the link to the next page
    new_links = response.xpath(
        '//div[@class="page"]/a[@class="next"]/@href').extract()
    if new_links:
        # Get the link to the next page and send another request for it
        new_link = new_links[0]
        yield scrapy.Request("https://www.zhipin.com" + new_link,
                             callback=self.parse)
def parse(self, response):
    # response is the target page's response as fetched by the Scrapy
    # downloader; it behaves exactly like the response object in a
    # scrapy shell session.
    # Each job_box holds one job posting
    for job_box in response.xpath(
            '//div[@class="common-tab-box job-tab-box"]/ul/li'):
        # Create one item object per job
        item = ZhipinspiderItem()
        # Get the div holding the job info
        job_info = job_box.xpath('./div[@class="sub-li"]')
        # Extract the text nodes once and index into them, rather than
        # re-running the same XPath for every field
        texts = job_info.xpath('./a/p/text()').extract()
        item['title'] = texts[0]
        item['work_spot'] = texts[1]
        item['experience'] = texts[2]
        item['edu'] = texts[3]
        item['company'] = texts[4]
        span_texts = job_info.xpath('./a/p/span/text()').extract()
        item['salary'] = span_texts[0]
        item['recruiter'] = span_texts[1]
        yield item
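# Each parse() variant above is a method of a Scrapy spider class. A minimal
# skeleton they could plug into (the spider name, start URL, and import path
# are assumptions for illustration; zhipin.com tends to reject the default
# Scrapy User-Agent, so tutorials typically also set a browser-like
# User-Agent in settings.py):
import scrapy
from ZhipinSpider.items import ZhipinspiderItem  # assumed project layout

class JobSpider(scrapy.Spider):
    name = 'job_position'
    allowed_domains = ['zhipin.com']
    start_urls = ['https://www.zhipin.com/c101280100/']  # assumed start page

    # def parse(self, response):
    #     ... one of the variants above goes here ...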