def parse(self, response):
    """Yield one BossItem per job row, then request the next listing page.

    For every ``<li>`` under ``div.job-list`` a fresh item receives the job
    title texts (``job``), the midpoint of the advertised salary range as a
    one-element list of strings (``salary``) and the company link texts
    (``company``).  Afterwards ``self.page`` is incremented and, if the
    pager contains a link to that page and ``self.page <= 2``, a request
    for it is yielded with this method as callback.
    """
    for node in response.xpath('//div[@class="job-list"]/ul/li'):
        # A new item per row: re-using a single instance would let later
        # rows overwrite items that were already handed to the pipeline.
        item = BossItem()
        item['job'] = node.xpath(
            './/div[@class="info-primary"]//div[@class="job-title"]/text()'
        ).extract()

        salary = node.xpath(
            './/span[@class="red"]/text()').extract_first(default='')
        # Read both bounds with a regex instead of cutting the text at six
        # characters, which turned "100-150K" into "100-15".  Any suffix
        # after the range (and an optional K after the lower bound) is
        # ignored.
        bounds = re.match(r'\s*(\d+)\s*[Kk]?\s*-\s*(\d+)', salary)
        if bounds:
            low, high = bounds.groups()
            item['salary'] = [str((int(low) + int(high)) / 2)]
        else:
            # Not a numeric range: keep the row and leave the salary empty
            # rather than aborting the whole page.
            item['salary'] = []

        item['company'] = node.xpath(
            './/div[@class="job-primary"]/div[@class="info-company"]//a/text()'
        ).extract()
        print(item['job'], item['salary'], item['company'])
        yield item  # hand the item over to the item pipeline

    # Crawl the next page.
    self.page += 1
    next_link = response.xpath(
        '//div[@class="page"]/a[@href="/c100010000-p100109/?page={0}"]'
        .format(self.page)).extract_first()
    if next_link is not None and self.page <= 2:
        nextpage = 'https://www.zhipin.com/c100010000-p100109/?page={0}'.format(
            self.page)
        # The bound method itself is passed as the callback.
        yield scrapy.Request(nextpage, callback=self.parse)
def parse(self, response):
    """Emit a BossItem for each listed job and request the following page.

    Each ``<li>`` of ``div.job-list`` contributes the job title, salary,
    absolute detail URL, company name and release time.  When the pager's
    ``page-next`` link carries an ``href``, ``self.pageNum`` is incremented
    and ``self.url + str(self.pageNum)`` is requested with this method as
    callback.
    """
    rows = response.xpath('//div[@class="job-list"]/ul/li')
    # Serves as the termination condition of the page-by-page recursion.
    next_href = response.xpath(
        '//div[@class="page"]/a[@ka="page-next"]/@href').extract_first()

    for row in rows:
        item = BossItem()
        item['job_title'] = row.xpath(
            './div/div[1]/h3/a/div[1]/text()')[0].extract()
        item['salary'] = row.xpath('./div/div[1]/h3/a/span/text()')[0].extract()
        item['url'] = 'https://www.zhipin.com' + row.xpath(
            './div/div[1]/h3/a/@href').extract_first()
        item['company'] = row.xpath(
            './div/div[2]/div[1]/h3/a/text()').extract_first()
        item['release_time'] = row.xpath(
            './div/div[3]/p/text()').extract_first()
        yield item

    if not next_href:
        # The "next" link has no href any more: stop here.
        return

    # Original note: paging starts from the second page.
    self.pageNum += 1
    following_url = self.url + str(self.pageNum)
    # Same method handles the next page.
    yield scrapy.Request(url=following_url, callback=self.parse)
def parse_job(self, response):
    """Extract one job posting from its detail page and yield it as a BossItem.

    The page body is printed first.  Name and salary come from the
    ``info-primary`` header, city / work years / education are the first
    three text fragments of its paragraph, and the company is the
    ``title`` attribute of the first link in the sidebar company box.  A
    short summary is printed before the item is yielded.
    """
    print(response.text)
    name = response.xpath('//div[@class="info-primary"]/div/h1/text()'
                          ).extract_first().strip()
    # extract() returns a list, which has no strip(); take the first match
    # the same way the name is read.
    salary = response.xpath(
        '//div[@class="info-primary"]/div/span/text()').extract_first().strip()
    infos = response.xpath(
        '//div[contains(@class,"job-primary")]/div[@class="info-primary"]/p/text()'
    ).extract()
    city = infos[0]
    work_years = infos[1]
    education = infos[2]
    company = response.xpath(
        '//div[@class="sider-company"]//div[@class="company-info"]/a[1]/@title'
    ).extract_first().strip()

    item = BossItem(name=name,
                    salary=salary,
                    city=city,
                    work_years=work_years,
                    education=education,
                    company=company)
    print('=' * 60)
    print(name, salary, company)
    print('=' * 60)
    yield item
def parse_job(self, response):
    """Yield a BossItem for a job detail page, using '无' for missing fields.

    Name, salary and company are stripped when present and replaced by
    '无' when the XPath yields nothing (or an empty string).  City, work
    years and education are the first three text fragments of the
    ``info-primary`` paragraph, or '无' when no fragment is found.
    """

    def cleaned(text):
        # Strip a found value; fall back to the placeholder otherwise.
        return text.strip() if text else '无'

    # Pull the fields out of the page with XPath.
    name = cleaned(response.xpath("//div[@class='name']/h1/text()").get())
    salary = cleaned(response.xpath("//span[@class='salary']/text()").get())

    job_info = response.xpath(
        '//div[contains(@class, "job-primary ")]/div[@class="info-primary"]/p//text()'
    ).getall()
    if not job_info:
        city = work_years = education = '无'
    else:
        city, work_years, education = job_info[0], job_info[1], job_info[2]

    company = cleaned(
        response.xpath("//div[@class='info-company']//a/text()").get())

    yield BossItem(name=name,
                   salary=salary,
                   city=city,
                   work_years=work_years,
                   education=education,
                   company=company)
def parse_job(self, response):
    """Load a BossItem from a job detail page through BossItemLoader.

    The page is parsed with pyquery and each field is added to the loader
    inside one ``try`` block.  If any step raises, the page body, the
    traceback and the exception text are written to an ``.html`` file under
    ``log_path``.  Whatever was collected up to that point is always
    loaded and returned.
    """
    doc = pq(response.text)
    item_loader = BossItemLoader(item=BossItem(), response=response)
    try:
        # NOTE(review): re.findall needs a string; if start_urls is a list
        # this raises TypeError and every page ends up in the except
        # branch -- confirm how start_urls is defined on the spider.
        item_loader.add_value('job_tag',
                              re.findall('query=(.*?)&', self.start_urls))
        item_loader.add_value('url_id', get_md5(response.url))
        item_loader.add_value('city', doc('.text-city').text())
        item_loader.add_value('job_title', doc('title').text())
        item_loader.add_value('job_describe',
                              transfer_json(doc('.job-sec .text').text()))
        item_loader.add_value('job_address', doc('.location-address').text())
        item_loader.add_value('job_url', response.url)
        item_loader.add_value(
            'job_createtime',
            doc('.sider-company .gray').text().split(':')[-1])

        # Query the salary text once and reuse it below.
        salary_text = doc('.salary').text()
        item_loader.add_value('salary', str(salary_text.strip('K').split('-')))
        if '·' in salary_text:
            item_loader.add_value('salary_multiple',
                                  salary_text.strip('薪').split('·')[-1])

        item_loader.add_value('company', doc('.job-sec .name').text())
        item_loader.add_value(
            'company_createtime',
            doc('.level-list .res-time').text().split(':')[-1])

        # Run the registered-fund pattern once instead of twice.
        fund_match = re.match('.*注册资金:(.*)万', doc('.level-list').text(),
                              re.S)
        item_loader.add_value('company_registered_fund',
                              fund_match.group(1) if fund_match else None)

        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        item_loader.add_value(
            'company_people',
            re.findall(r'\d.*人', doc('.sider-company p').text()))
        item_loader.add_value(
            'company_industry',
            doc('.sider-company a[ka=job-detail-brandindustry]').text())
        item_loader.add_value('company_describe',
                              doc('.job-sec.company-info .text').text())
        item_loader.add_value('create_time',
                              datetime.now().strftime('%Y-%m-%d %X'))
    except Exception as e:
        # Dump the offending page together with the error for later study.
        with open(log_path +
                  str(datetime.now().strftime('%Y-%m%d %X') + '.html'),
                  'w',
                  encoding='utf-8') as f:
            f.write(response.text)
            f.write('\r\n')
            f.write(traceback.format_exc())
            f.write('\r\n')
            f.write(str(e))
    finally:
        # NOTE(review): returning from ``finally`` also discards errors
        # raised while writing the dump above; kept so the callback stays
        # best-effort exactly as before.
        job_item = item_loader.load_item()
        return job_item
def parse_job(self, response):
    """Collect the full description of a posting and yield it as a BossItem.

    City, work experience and education are the parts after ':' of the
    three ``info-primary`` paragraph fragments.  Tags, job description,
    company description and the ``level-list`` texts are comma-joined; the
    last of these additionally has newlines, whitespace and commas
    removed.  The item also carries the page URL and its ``get_md5`` hash.
    """
    position_name = response.xpath('//div[@class="name"]/h1/text()').get()
    salary = response.xpath(
        '//div[@class="info-primary"]//span[@class="badge"]/text()').get()

    info = response.xpath(
        '//div[@class="job-primary detail-box"]//div[@class="info-primary"]/p/text()'
    ).getall()
    # Each fragment is expected to look like "label:value"; keep the value.
    city, work_experience, education = [part.split(":")[1] for part in info]

    tag_texts = response.xpath('//div[@class="job-tags"]/span/text()').getall()
    tags = ",".join(tag_texts)

    describe_parts = response.xpath(
        '//div[@class="job-sec"]/div[@class="text"]//text()').getall()
    describe = ",".join(describe_parts).replace("\n", "").strip()

    company_parts = response.xpath(
        '//div[@class="job-sec company-info"]/div[@class="text"]/text()'
    ).getall()
    company_describe = ",".join(company_parts).replace("\n", "").strip()

    level_texts = response.xpath(
        '//div[@class="job-sec"]//div[@class="level-list"]//text()').getall()
    info_content = ",".join(level_texts)
    information = re.sub(r"[\n\s,,]", "", info_content)

    work_location = response.xpath(
        '//div[@class="location-address"]/text()').get()
    company_name = response.xpath('//h3[@class="name"]/a/text()').get()
    company_url = response.xpath(
        '//div[@class="info-company"]/p[last()]/text()').get()

    yield BossItem(
        position_name=position_name,
        salary=salary,
        city=city,
        work_experience=work_experience,
        education=education,
        tags=tags,
        describes=describe,
        company_describe=company_describe,
        information=information,
        work_location=work_location,
        company_name=company_name,
        company_url=company_url,
        url=response.url,
        url_object_id=get_md5(response.url),
    )
def parse_boss(self, response):
    """Build a BossItem from a job detail page, print it and yield it.

    Job name and salary come from the ``name`` box of the ``job-banner``
    block, the company from the fourth child ``div`` of ``detail-content``,
    and working address / working age / education are the first three text
    fragments of the banner's ``info-primary`` paragraph.  ``detail`` is
    the first text node of the job description.
    """
    job_name_text = response.xpath(
        "//div[@class='job-banner']//div[@class='name']/h1/text()").get()
    job_name = job_name_text.strip()

    company_text = response.xpath(
        "//div[@class='detail-content']/div[4]//div[@class='name']/text()"
    ).get()
    company = company_text.strip()

    salary_text = response.xpath(
        "//div[@class='job-banner']//div[@class='name']/span/text()").get()
    salary = salary_text.strip()

    # One query feeds all three positional fields.
    primary_texts = response.xpath(
        "//div[@class='job-banner']//div[@class='info-primary']/p//text()"
    ).getall()
    working_address = primary_texts[0]
    working_age = primary_texts[1]
    education = primary_texts[2]

    detail = response.xpath(
        "//div[@class='job-box']//div[@class='job-detail']//div[@class='text']/text()"
    ).get()

    item = BossItem(job_name=job_name,
                    company=company,
                    salary=salary,
                    working_address=working_address,
                    working_age=working_age,
                    education=education,
                    detail=detail)
    print(item)
    yield item
def parse_job(self, response):
    """Yield a BossItem with title, salary, city, work year and education.

    Title and salary are read from the ``h1.name`` header; the remaining
    three fields are the first three text fragments of the
    ``info-primary`` paragraph inside the ``job-primary`` block.
    """
    title = response.xpath("//h1[@class= 'name']/text()").get().strip()
    salary = response.xpath("//h1[@class = 'name']/span/text()").get().strip()
    # getall() keeps every fragment; get().strip() returned a single string,
    # so indexing it produced individual characters.  The container class is
    # spelled "job-primary" (hyphen), not "job_primary".
    job_info = response.xpath(
        "//div[@class= 'job-primary']/div[@class= 'info-primary']/p//text()"
    ).getall()
    city = job_info[0]
    work_year = job_info[1]
    education = job_info[2]
    item = BossItem(title=title,
                    salary=salary,
                    city=city,
                    work_year=work_year,
                    education=education)
    yield item
def parse(self, response):
    """Yield a BossItem (link and name) for each of the first ten posts.

    Only the first ten ``div.post-meta`` blocks are used; for each one the
    ``href`` and the text of its ``a.archive-title`` link are stored, the
    item is printed and then yielded.
    """
    first_ten = response.css('div.post-meta')[:10]
    for meta in first_ten:
        link = meta.css('a.archive-title::attr(href)').extract_first()
        name = meta.css('a.archive-title::text').extract_first()

        item = BossItem()
        item['link'] = link
        item['name'] = name
        print(item)
        yield item
def parse(self, response):
    """Scrape the job rows of a listing page and schedule one unvisited page.

    A page whose URL is not yet in ``Demo.url_over_set`` is recorded there
    and every ``<li>`` of ``div.job-list`` is turned into a BossItem.  All
    pager links that are neither ``javascript`` placeholders nor already
    visited are added to ``Demo.url_set``; one URL is then popped from that
    set and requested with this method as callback.
    """
    hxs = HtmlXPathSelector(response)
    if response.url not in Demo.url_over_set:
        Demo.url_over_set.add(response.url)
        # Collect the data.
        for row in hxs.xpath('//div[@class="job-list"]/ul/li'):
            bossItem = BossItem()
            bossItem['jobName'] = row.xpath(
                './div[@class="job-primary"]/div[@class="info-primary"]/h3/a/text()'
            ).extract()[0]
            bossItem['salary'] = row.xpath(
                './div[@class="job-primary"]/div[@class="info-primary"]/h3/a/span/text()'
            ).extract()[0]
            bossItem['companyName'] = row.xpath(
                './div[@class="job-primary"]/div[@class="info-company"]//h3/a/text()'
            ).extract()[0]
            # City, experience and education share one paragraph; query it
            # once instead of three times.
            primary_texts = row.xpath(
                './div[@class="job-primary"]/div[@class="info-primary"]/p/text()'
            ).extract()
            bossItem['city'] = primary_texts[0]
            bossItem['life'] = primary_texts[1]
            bossItem['education'] = primary_texts[2]
            bossItem['skill'] = row.xpath(
                './div[@class="job-tags"]/span/text()').extract()
            bossItem['time'] = row.xpath(
                './div[@class="job-time"]/span/text()').extract()[0]
            yield bossItem

    # Afterwards gather the pager addresses.  ``xpath`` replaces the
    # deprecated ``select`` alias so one selector API is used throughout.
    for href in hxs.xpath('//div[@class="page"]/a/@href').extract():
        url = "http://" + Demo.allowed_domains[0] + href
        if 'javascript' not in url and url not in Demo.url_over_set:
            Demo.url_set.add(url)
    print(Demo.url_set)

    if not Demo.url_set:
        # set.pop() raises KeyError on an empty set; nothing left to crawl.
        return
    next_url = Demo.url_set.pop()
    # NOTE(review): time.sleep blocks Scrapy's event loop; the
    # DOWNLOAD_DELAY setting is the usual way to throttle requests.
    time.sleep(1)
    yield Request(next_url, callback=self.parse)
def parse_job(self, response):
    """Yield a BossItem for the posting shown on a job detail page.

    Title and salary are read from the ``name`` box of the ``job-banner``
    block, city / work years / education are the first three text
    fragments of the banner paragraphs, and the company is the ``title``
    attribute of the first link in the sidebar ``company-info`` box.
    """
    title_text = response.xpath(
        "//div[@class='job-banner']//div[@class='name']/h1/text()").get()
    title = title_text.strip()

    salary_text = response.xpath(
        "//div[@class='job-banner']//div[@class='name']/span/text()").get()
    salary = salary_text.strip()

    banner_texts = response.xpath(
        "//div[@class='job-banner']//p//text()").getall()
    city, work_years, education = (banner_texts[0], banner_texts[1],
                                   banner_texts[2])

    company_title = response.xpath(
        "//div[@class='sider-company']//div[@class='company-info']/a[1]/@title"
    ).get()
    company = company_title.strip()

    yield BossItem(name=title,
                   salary=salary,
                   city=city,
                   work_years=work_years,
                   education=education,
                   company=company)
def parse_job(self, response):
    """Yield a BossItem built from the ``h1.name`` header of a job page.

    Name and salary are stripped header texts; city, work years and
    education are the first three text fragments of the ``info-primary``
    paragraph; the company is the first link text inside ``info-company``
    (left as ``None`` when absent).
    """
    header_text = response.xpath("//h1[@class='name']/text()").get()
    name = header_text.strip()
    salary_text = response.xpath("//h1[@class='name']/span/text()").get()
    salary = salary_text.strip()

    fragments = response.xpath(
        "//div[@class='job-primary']/div[@class='info-primary']/p//text()"
    ).getall()
    city, work_years, education = fragments[0], fragments[1], fragments[2]

    company = response.xpath("//div[@class='info-company']//a/text()").get()

    yield BossItem(name=name,
                   salary=salary,
                   city=city,
                   work_years=work_years,
                   education=education,
                   company=company)
def parse_item(self, response):
    """Yield a BossItem holding the page title; print any error instead of raising.

    The title is the first ``h1`` text inside ``div.name``.  The item is
    printed before it is yielded.  Any ``Exception`` raised along the way
    is printed and swallowed.
    """
    item = BossItem()
    try:
        titles = response.xpath('//div[@class="name"]/h1/text()').extract()
        item["title"] = titles[0]
        print(item)
        yield item
    except Exception as exc:
        print(exc)
def parse_data(self, response):
    """Yield a BossItem with title, salary, company and three labelled fields.

    City, work year and education are taken from the first three paragraph
    fragments of the ``job-primary detail-box`` block, keeping only the
    part after the ':' in each.
    """

    def after_colon(fragment):
        # "label:value" -> "value"
        return fragment.split(':')[1]

    title = response.xpath('//div[@class="name"]/h1/text()').get()
    badge_text = response.xpath('//span[@class="badge"]/text()').get()
    salary = badge_text.strip()
    fragments = response.xpath(
        '//div[@class="job-primary detail-box"]/div[2]/p/text()').getall()
    company = response.xpath('//h3[@class="name"]//text()').get()

    city = after_colon(fragments[0])
    work_year = after_colon(fragments[1])
    education = after_colon(fragments[2])

    yield BossItem(title=title,
                   salary=salary,
                   company=company,
                   city=city,
                   work_year=work_year,
                   education=education)
def parse(self, response):
    """Yield a BossItem per ``.job-primary`` card, then follow the next page.

    Each card supplies title, wage, the stripped first text of its
    ``info-primary`` paragraph (``site``) and the company name.  If the
    pager's ``.next`` link has a real href, that page is followed with
    this method as callback.
    """
    for card in response.css(".job-primary"):
        item = BossItem()
        item["title"] = card.css(".job-title::text").extract()[0]
        item["wage"] = card.css(".red::text").extract()[0]
        item["site"] = card.css(
            ".info-primary p::text").extract_first().strip()
        item["name"] = card.css(
            ".company-text .name a::text").extract_first()
        yield item

    # Pagination.  The selector previously lacked the closing parenthesis
    # of ``attr(href)``; extract_first() returns None when there is no next
    # link, where ``extract()[0]`` raised IndexError.
    next_page = response.css(".page .next::attr(href)").extract_first()
    # Skip a missing link and "javascript:" placeholder hrefs.
    if next_page and not next_page.startswith("javascript"):
        yield response.follow('https://www.zhipin.com' + next_page,
                              callback=self.parse)
def next2(self, response):
    """Return a BossItem whose ``txt`` is the page text with most ASCII removed.

    The request's ``Cookie`` headers are printed.  All text nodes of the
    page are stringified as a list, ASCII letters and the listed
    punctuation are removed by regex, spaces and single quotes are dropped,
    and the result is printed and stored under ``txt``.
    """
    # Cookie headers that were sent with the request.
    request_cookies = response.request.headers.getlist('Cookie')
    print(request_cookies)

    # Original note: this response is the "personal center" page.
    text_nodes = response.xpath('//text()').extract()
    cleaned = re.sub(r'[a-zA-Z",:{}\\.(\)!;%&?$@#><+|*/[\]_=-]', "",
                     str(text_nodes))
    cleaned = cleaned.replace(" ", "").replace("'", "")
    print(cleaned)

    item = BossItem()
    item['txt'] = cleaned
    return item
def parse_next_page(self, response):
    """Yield a BossItem for every ``div.job-primary`` block on the page.

    Position name and salary are read from the ``info-primary`` header,
    company name and company type from the ``info-company`` block.  The
    position name is printed after each item has been consumed.
    """
    sel = Selector(response)
    for site in sel.xpath('//div[@class="job-primary"]'):
        item = BossItem()
        item["position_name"] = site.xpath(
            'div[@class="info-primary"]/h3/text()').extract()[0]
        item["salary"] = site.xpath(
            'div[@class="info-primary"]/h3/span/text()').extract()[0]
        # The class is spelled "info-company"; the former "info-comapny"
        # matched nothing, so the [0] lookups always raised IndexError.
        item["company"] = site.xpath(
            'div[@class="info-company"]/div/h3/text()').extract()[0]
        item["company_type"] = site.xpath(
            'div[@class="info-company"]/div/p/text()').extract()[0]
        yield item
        # Function-call form prints identically on Python 2 and 3.
        print(item["position_name"])
def parse(self, response):
    """Yield a BossItem per ``div.job-primary`` card of a listing page.

    Title, price, company name, publish time and chat link are the
    concatenated matches of their XPaths.  ``comp_line`` is the first text
    fragment of the company paragraph; ``per_num`` is its third fragment
    when there are exactly three fragments and its second one otherwise.
    ``publis_name`` is the first ``h3`` text of the ``info-publis`` block.
    """
    for card in response.selector.xpath('//div[@class="job-primary"]'):
        boss = BossItem()

        job_title = ''.join(
            card.xpath('.//div[@class="job-title"]/text()').extract())
        job_price = ''.join(
            card.xpath('.//h3[@class="name"]/a/span/text()').extract())
        comp_name = ''.join(
            card.xpath('.//div[@class="company-text"]/h3/a/text()').extract())

        company_texts = card.xpath(
            './/div[@class="company-text"]/p/text()').extract()
        comp_line = company_texts[0]
        per_num = (company_texts[2]
                   if len(company_texts) == 3 else company_texts[1])

        publis_name = card.xpath(
            './/div[@class="info-publis"]/h3/text()').extract()[0]
        publis_time = ''.join(
            card.xpath('.//div[@class="info-publis"]/p/text()').extract())
        # Example value:
        # https://www.zhipin.com/geek/new/index/chat?id=1ff819389ed8632e0nVz29S8FFM~
        src = ''.join(
            card.xpath(
                './/a[@class="btn btn-startchat"]/@redirect-url').extract())

        boss['job_title'] = job_title
        boss['job_price'] = job_price
        boss['comp_name'] = comp_name
        boss['comp_line'] = comp_line
        boss['per_num'] = per_num
        boss['publis_name'] = publis_name
        boss['publis_time'] = publis_time
        boss['src'] = src
        yield boss
def parse_detail(self, response):
    """Yield a BossItem describing one posting, including description and tags.

    ``describes`` is the stripped concatenation of the text nodes directly
    under the first ``job-sec`` block's child ``div``s.  ``tags`` joins,
    with commas, the text nodes of the first ``tag-more`` block's
    ``tag-all`` container, leaving out the first and last node.
    """
    company_text = response.xpath(
        "//div[@class='company-info']/a[@ka='job-detail-company_custompage']/text()"
    ).get()
    company = company_text.strip()
    position = response.xpath("//div[@class='name']/h1/text()").get()
    salary = response.xpath("//span[@class='salary']/text()").get()

    texts = response.xpath("//div[@class='info-primary']/p/text()").getall()
    city, experience, education = texts[0], texts[1], texts[2]

    description_parts = response.xpath(
        "(//div[@class='job-sec'])[1]/div/text()").getall()
    describes = "".join(description_parts).strip()
    origin_url = response.url

    tag_nodes = response.xpath(
        "(//div[@class='tag-more'])[1]/div[contains(@class, 'tag-all')]//text()"
    ).getall()
    tags = ",".join(tag_nodes[1:-1]).strip()

    yield BossItem(company=company,
                   position=position,
                   salary=salary,
                   city=city,
                   experience=experience,
                   education=education,
                   describes=describes,
                   tags=tags,
                   origin_url=origin_url)
def parse_detail(self, response):
    """Yield a BossItem with title, salary, company and three info fields.

    City, work years and education are the first three text fragments of
    the paragraphs inside ``info-primary``; only the salary is stripped.
    """
    title = response.xpath("//div[@class='name']/h1/text()").get()
    salary_text = response.xpath("//span[@class='salary']/text()").get()
    salary = salary_text.strip()
    company = response.xpath(
        "//div[@class='detail-content']//div[@class='name']/text()").get()

    fragments = response.xpath(
        "//div[@class='info-primary']//p/text()").getall()
    city, work_years, education = fragments[0], fragments[1], fragments[2]

    yield BossItem(title=title,
                   salary=salary,
                   company=company,
                   city=city,
                   work_years=work_years,
                   education=education)
def parse_job(self, response):
    """Yield a BossItem for a job page; company is the last ``company-info`` link.

    Name, salary and company are stripped texts.  City, work years and
    education are the first three text fragments of the first paragraph of
    the ``info-primary`` block.
    """
    name_text = response.xpath("//div[@class='name']/h1/text()").get()
    name = name_text.strip()
    salary_text = response.xpath("//div[@class='name']/span/text()").get()
    salary = salary_text.strip()

    fragments = response.xpath(
        "//div[contains(@class,'job-primary')]/div[@class='info-primary']/p[1]/text()"
    ).getall()
    city, work_years, education = fragments[0], fragments[1], fragments[2]

    company_text = response.xpath(
        "//div[@class='company-info']/a[last()]/text()").get()
    company = company_text.strip()

    fields = {
        'name': name,
        'salary': salary,
        'city': city,
        'work_years': work_years,
        'education': education,
        'company': company,
    }
    yield BossItem(**fields)
def parse_item(self, response):
    """Return (not yield) a BossItem built from a job detail page.

    The name is used as found; salary and the company link's ``title``
    attribute are stripped.  City, worked years and education are the
    first three text fragments of the ``info-primary`` paragraphs.
    """
    name = response.xpath('//div[@class="name"]/h1/text()').get()
    salary_text = response.xpath('//div[@class="name"]/span/text()').get()
    salary = salary_text.strip()

    fragments = response.xpath(
        '//div[@class="info-primary"]/p/text()').getall()
    city, worked_years, education = fragments[0], fragments[1], fragments[2]

    company_title = response.xpath(
        '//div[@class="company-info"]/a[1]/@title').get()
    company = company_title.strip()

    return BossItem(name=name,
                    salary=salary,
                    city=city,
                    worked_years=worked_years,
                    education=education,
                    company=company)
def parse_job(self, response):
    """Yield a BossItem with title, salary, three info fields and company name.

    City, work years and education are the first three text fragments of
    the ``info-primary`` paragraph inside the ``job-primary detail-box``
    block; the company name is the first link text in ``company-info``.
    """
    title = response.xpath("//div[@class='name']/h1/text()").get().strip()
    salary = response.xpath(
        "//div[@class='name']/span/text()").get().strip()
    # Class name corrected from the mistyped "info-pro=imary", which
    # matched nothing and made the indexing below fail.
    job_info = response.xpath(
        "//div[@class='job-primary detail-box']/div[@class='info-primary']/p//text()"
    ).getall()
    city = job_info[0]
    work_years = job_info[1]
    education = job_info[2]
    # "@class" tests the attribute; a bare "class" looked for a child
    # element named <class>, so get() always returned None.
    company_name = response.xpath(
        "//div[@class='company-info']//a/text()").get().strip()
    item = BossItem(title=title,
                    salary=salary,
                    city=city,
                    work_years=work_years,
                    education=education,
                    company_name=company_name)
    yield item
def parse_item(self, response):
    """Yield a BossItem whose fields are all whitespace-stripped page texts.

    City, work years and education are the first three paragraph texts of
    the ``info-primary`` block; the company is the third text node found
    directly under the links of ``company-info``.
    """
    name_text = response.xpath("//div[@class='name']/h1/text()").get()
    name = name_text.strip()
    salary_text = response.xpath("//div[@class='name']/span/text()").get()
    salary = salary_text.strip()

    info = response.xpath(
        "//div[@class='job-primary detail-box']//div[@class='info-primary']//p/text()"
    ).getall()
    city = info[0].strip()
    work_years = info[1].strip()
    education = info[2].strip()

    link_texts = response.xpath(
        "//div[@class='company-info']/a/text()").getall()
    company = link_texts[2].strip()

    yield BossItem(name=name,
                   salary=salary,
                   jobCity=city,
                   workYear=work_years,
                   education=education,
                   company=company)
def parse(self, response):
    """Yield a BossItem per job card, then request the next listing page.

    Each ``.job-primary`` card provides the job title, salary range, the
    first / second / last text of its ``info-primary`` paragraph and the
    company name plus the first / second / last text of the company
    paragraph.  The pager's ``.next`` href, when usable, is resolved
    against the current URL and requested with this method as callback.
    """
    for job in response.css('.job-box .job-list ul li .job-primary'):
        # Query each paragraph once instead of three times.
        primary_texts = job.css('.info-primary p::text').extract()
        company_texts = job.css('.company-text p::text').extract()

        item = BossItem()
        item['岗位名称'] = job.css('.job-title::text').extract_first()
        item['薪资范围'] = job.css('.red::text').extract_first()
        item['工作地'] = primary_texts[0]
        item['工作经验'] = primary_texts[1]
        item['学历要求'] = primary_texts[-1]
        item['公司名称'] = job.css(
            '.company-text .name a::text').extract_first()
        item['所属行业'] = company_texts[0]
        item['融资阶段'] = company_texts[1]
        item['公司规模'] = company_texts[-1]
        yield item

    # Named next_href so the builtin next() is not shadowed.
    next_href = response.css('.page .next::attr(href)').extract_first()
    # Without this guard a missing href re-requested the current page and a
    # "javascript:" placeholder produced an unusable request.
    if next_href and not next_href.startswith('javascript'):
        yield scrapy.Request(url=response.urljoin(next_href),
                             callback=self.parse)
def parse(self, response):
    """Yield a BossItem per job row using regexes on the row HTML, then paginate.

    Every ``<li>`` of ``div.job-list`` is serialised to HTML and the
    fields are cut out with regular expressions.  If the ``page-next``
    link has a usable href, that page is requested with this method as
    callback.
    """
    for data in response.xpath('//div[@class="job-list"]/ul/li').extract():
        # A fresh item per row: a single shared instance would be
        # overwritten after it had already been handed to the pipeline.
        item = BossItem()
        item["title"] = re.findall('<div class="job-title">(.*?)</div>', data, re.S)[0]
        item["money"] = re.findall('<span class="red">(.*?)</span>', data, re.S)[0]
        item["loc"] = re.findall('<p>(.*?)<em class="vline">', data, re.S)[0]
        url = re.findall('<a href="(/job_detail/.*?)"', data, re.S)[0]
        item["url"] = "https://www.zhipin.com" + url
        item["sb_time"] = re.findall('em class="vline"></em>(.*?)<em', data, re.S)[0]
        # The same pattern supplies the school (first match) and the
        # company size (second match); run it once.
        tail_texts = re.findall(
            '<em class="vline"></em>.*?<em class="vline"></em>(.*?)</p>', data, re.S)
        item["school"] = tail_texts[0]
        item["company"] = re.findall('<h3 class="name"><.*?>(.*?)</a></h3>', data, re.S)[0]
        item["company_rs"] = tail_texts[1]
        yield item

    # extract_first() returns None when the link is absent, where
    # ``extract()[0]`` raised IndexError.
    next_page = response.xpath('//a[@ka="page-next"]/@href').extract_first()
    # Skip empty hrefs and "javascript:" placeholders.
    if next_page and not next_page.startswith("javascript"):
        next_url = "https://www.zhipin.com" + next_page
        yield scrapy.Request(next_url, callback=self.parse)
def parse_item(self, response):
    """Yield a BossItem including the first text node of the job description.

    Name is the first ``h1`` text of the page, salary the ``span.salary``
    text and company the second link text in ``company-info`` (all
    stripped).  City, work year and education are the first three text
    fragments found under the positional path inside ``#main``.
    """
    heading = response.xpath("//h1/text()").get()
    name = heading.strip()
    salary_text = response.xpath("//span[@class='salary']/text()").get()
    salary = salary_text.strip()

    fragments = response.xpath(
        "//*[@id='main']/div[1]/div/div/div[2]/p//text()").getall()
    city, work_year, education = fragments[0], fragments[1], fragments[2]

    # Only the first text node is kept; it is neither joined nor stripped.
    position_text = response.xpath(
        "//div[@class='job-sec']//div[1][@class='text']/text()").get()

    company_text = response.xpath(
        "//div[@class='company-info']//a[2]/text()").get()
    company = company_text.strip()

    yield BossItem(name=name,
                   salary=salary,
                   city=city,
                   work_year=work_year,
                   education=education,
                   company=company,
                   positon_info=position_text)
def parse_job(self, response):
    """Yield a BossItem whose six fields are stripped texts of a job page.

    Salary is the ``badge`` span of the ``name`` box; city, work years and
    education are the first three paragraph texts of ``info-primary``
    inside ``job-primary detail-box``; the company is the link text of the
    ``info-company`` heading.
    """
    title_text = response.xpath("//div[@class='name']/h1/text()").get()
    title = title_text.strip()
    badge_text = response.xpath(
        "//div[@class='name']/span[@class='badge']/text()").get()
    salary = badge_text.strip()

    fragments = response.xpath(
        "//div[@class='job-primary detail-box']//div[@class='info-primary']/p/text()"
    ).getall()
    city = fragments[0].strip()
    work_years = fragments[1].strip()
    education = fragments[2].strip()

    company_text = response.xpath(
        "//div[@class='info-company']/h3[@class='name']/a/text()").get()
    company = company_text.strip()

    # Yielded items go on to the item pipelines.
    yield BossItem(title=title,
                   salary=salary,
                   city=city,
                   work_years=work_years,
                   education=education,
                   company=company)
def parse_job(self, response):
    """Yield a BossItem; the company comes from the ``info`` box of ``company-info``.

    Name, salary and company are stripped of leading and trailing
    whitespace.  City, work years and education are the first three text
    fragments of the ``info-primary`` paragraph in the
    ``job-primary detail-box`` block.
    """
    name_text = response.xpath("//div[@class='name']/h1/text()").get()
    name = name_text.strip()  # drop leading and trailing whitespace
    salary_text = response.xpath("//div[@class='name']/span/text()").get()
    salary = salary_text.strip()

    fragments = response.xpath(
        "//div[@class='job-primary detail-box']/div[@class='info-primary']/p//text()"
    ).getall()
    city, work_years, education = fragments[0], fragments[1], fragments[2]

    # An earlier variant read the company from the link's title attribute.
    company_text = response.xpath(
        "//div[@class='company-info']/div[@class='info']/text()").get()
    company = company_text.strip()

    yield BossItem(name=name,
                   salary=salary,
                   city=city,
                   work_years=work_years,
                   education=education,
                   company=company)
def parse_job(self, response):
    """Yield a BossItem carrying both the raw info list and its first three entries.

    Title and salary are stripped ``h1.name`` texts.  ``job_info`` is the
    full list of text fragments of the ``info-primary`` paragraph, and
    city / work years / education are its first three entries.  The
    company is the first link text found inside any ``info-primary``
    block.  ``1`` and the item are printed before the item is yielded.
    """
    title_text = response.xpath('//h1[@class="name"]/text()').get()
    title = title_text.strip()
    salary_text = response.xpath('//h1[@class="name"]/span/text()').get()
    salary = salary_text.strip()

    job_info = response.xpath(
        '//div[@class="job-primary"]/div[@class="info-primary"]/p//text()'
    ).getall()
    city, work_years, education = job_info[0], job_info[1], job_info[2]

    company = response.xpath(
        '//div[@class="info-primary"]//a/text()').get()

    item = BossItem(title=title,
                    salary=salary,
                    job_info=job_info,
                    city=city,
                    work_years=work_years,
                    education=education,
                    company=company)
    print(1)
    print(item)
    yield item