def parse(self, response):
    """Walk the listing <li> nodes, send one detail-page request per job,
    then schedule the next listing page (first three pages only)."""
    print(response)
    for node in response.xpath('//*[@id="main"]/div/div[3]/ul/li'):
        item = BossproItem()
        title = node.xpath(
            './div/div[1]/div[1]/div//div[1]/span[1]/a/@title').extract_first()
        href = node.xpath(
            './div/div[1]/div[1]/div//div[1]/span[1]/a/@href').extract_first()
        detail_url = 'https://www.zhipin.com' + href
        item['job_name'] = title
        print(title, detail_url)
        # Manual request for the detail page; meta={} hands the partially
        # filled item to the callback.
        yield scrapy.Request(url=detail_url,
                             callback=self.parse_detail,
                             meta={'item': item})
    # Pagination: keep requesting listing pages while page_num <= 3.
    if self.page_num <= 3:
        next_url = format(self.url % self.page_num)
        print(next_url)
        self.page_num += 1
        yield scrapy.Request(url=next_url, callback=self.parse)
def parse(self, response):
    """Parse the job list, dispatch a detail-page request per row, paginate.

    Bug fixes vs. the original:
    * the detail href used ``extract()`` (returns a list), so
      ``'https://www.zhipin.com' + [...]`` raised TypeError — it now uses
      ``extract_first()``;
    * the ``meta`` key was the typo ``'item: '``; the detail callback reads
      ``meta['item']`` (as every sibling spider does).
    """
    count = 0
    li_list = response.xpath('//div[@class="job-list"]/ul/li')
    for li in li_list:
        item = BossproItem()
        # NOTE(review): extract() stores a *list* of text nodes in
        # item['job_name']; extract_first() would store a plain string —
        # confirm which form the pipeline expects.
        job_name = li.xpath(
            './div/div[1]/div[1]/div/div[1]/span[1]/text()').extract()
        item['job_name'] = job_name
        count += 1
        print(count)
        detail_url = 'https://www.zhipin.com' + li.xpath(
            './div/div[1]/div[1]/div/div[1]/span[1]/a/@href').extract_first()
        # Manual request; meta={} forwards the item to the callback.
        yield scrapy.Request(detail_url, callback=self.parse_detail,
                             meta={'item': item})
    # Pagination: first three listing pages.
    if self.page_num <= 3:
        new_url = format(self.url % self.page_num)
        self.page_num += 1
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    """For each listing row: fill a BossproItem with the job name, request
    the detail page (passing the item via meta), then paginate to page 3."""
    for job_node in response.xpath('//*[@id="main"]/div/div[3]/ul/li'):
        item = BossproItem()
        name = job_node.xpath(
            './/span[@class="job-name"]/a/text()').extract_first()
        item['job_name'] = name
        print(name)
        href = job_node.xpath(
            './/span[@class="job-name"]/a/@href').extract_first()
        detail_url = 'https://www.zhipin.com' + href
        # meta={} hands the item to the detail-page callback.
        yield scrapy.Request(url=detail_url,
                             callback=self.parse_detail,
                             meta={'item': item})
    # Follow the next listing page while page_num <= 3.
    if self.page_num <= 3:
        next_page = format(self.url % self.page_num)
        self.page_num += 1
        yield scrapy.Request(next_page, callback=self.parse)
def parse(self, response):
    """Scrape name/salary/company from every listing row, yield the items
    to the pipelines, then manually request pages 2 and 3."""
    rows = response.xpath('//div[@class="job-list"]/ul/li')
    for row in rows:
        anchor = row.xpath('.//div[@class="info-primary"]/h3/a')
        name = anchor.xpath('./div/text()').extract_first()
        pay = anchor.xpath('./span/text()').extract_first()
        firm = row.xpath(
            './/div[@class="company-text"]/h3/a/text()').extract_first()
        # Pack the scraped fields into one item object.
        item = BossproItem()
        item['job_name'] = name
        item['salary'] = pay
        item['company'] = firm
        # Yielding an item (not a Request) routes it to the item pipelines.
        yield item
    if self.page <= 3:
        print('if 执行!!!')
        self.page += 1
        next_url = format(self.url % self.page)
        print(next_url)
        # Manual follow-up request for the next listing page.
        yield scrapy.Request(url=next_url, callback=self.parse)
def parse(self, response):
    """Emit one detail-page request per listing row; crawl listing pages
    while page_num <= 3."""
    for entry in response.xpath('/html/body/div/div[3]/div/div[2]/ul/li'):
        item = BossproItem()
        item['job_name'] = entry.xpath(
            './/div[@class="info-primary"]/h3/a/div[1]/text()').extract_first()
        detail_url = 'https://www.zhipin.com' + entry.xpath(
            './/div[@class="info-primary"]/h3/a/@href').extract_first()
        # meta carries the item into parse_detail for further filling.
        yield scrapy.Request(url=detail_url,
                             callback=self.parse_detail,
                             meta={'item': item})
    if self.page_num <= 3:
        next_url = format(self.url % self.page_num)
        self.page_num += 1
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    """Parse listing rows, print basic fields, request each detail page.

    Bug fixes vs. the original:
    * the price xpath began with ``//`` (document-absolute), so every row
      picked up the first ``span.red`` on the whole page — now ``.//`` so it
      is relative to the current ``li``;
    * the href lives on the ``<a>`` inside ``span.job-name`` (as the sibling
      spiders show), not on the span itself; the old xpath returned None and
      the string concatenation raised TypeError.
    """
    lis = response.xpath('//*[@id="main"]/div/div[3]/ul/li')
    for li in lis:
        item = BossproItem()
        job_name = li.xpath('.//div[@class="job-title"]/span/text()').get()
        price = li.xpath('.//span[@class="red"]/text()').get()
        adders = li.xpath('.//span[@class="job-area"]/text()').get()
        detail_url = 'https://www.zhipin.com' + li.xpath(
            './/span[@class="job-name"]/a/@href').get()
        print(job_name, price, adders)
        print('123456')
        item['job_name'] = job_name
        # NOTE(review): callback name 'parse_deatil' looks like a typo of
        # 'parse_detail' — confirm against the method actually defined on
        # this spider before renaming.
        yield scrapy.Request(detail_url, callback=self.parse_deatil,
                             meta={'item': item})
def parse(self, response):
    """Build a BossproItem (title/salary/company) from each listing row and
    yield it to the item pipelines for persistence."""
    for row in response.xpath('//div[@class="job-list"]/ul/li'):
        anchor = row.xpath('.//div[@class="info-primary"]/h3[@class="name"]/a')
        item = BossproItem()
        # Store the scraped values on the item so the pipeline can persist them.
        item['title'] = anchor.xpath('./div/text()').extract_first()
        item['salary'] = anchor.xpath('./span/text()').extract_first()
        item['company'] = row.xpath(
            './/div[@class="company-text"]/h3/a/text()').extract_first()
        # Yielding the item hands it off to the configured pipelines.
        yield item
def parse(self, response):
    """Parse listing rows, request each detail page, paginate to page 3.

    Fix vs. the original: the base URL had a trailing slash
    ('https://www.zhipin.com/') while the scraped hrefs begin with '/',
    producing '...com//job_detail/...' double-slash URLs — the slash is
    dropped, matching the other spiders in this project.
    """
    li_list = response.xpath('//*[@id="main"]/div/div[3]/ul/li')
    print(li_list)
    for li in li_list:
        item = BossproItem()
        # The primary box holds both the title anchor and its href.
        primary_box = li.xpath('./div/div[1]/div[1]/div')
        job_name = primary_box.xpath(
            './div[1]/span[1]/a/text()').extract_first()
        item['jobname'] = job_name
        # NOTE(review): the title xpath uses span[1] but the href xpath uses
        # span — confirm both are meant to target the same <span>.
        detail_url = 'https://www.zhipin.com' + primary_box.xpath(
            './div[1]/span/a/@href').extract_first()
        # meta={} forwards the item to the detail callback.
        yield scrapy.Request(detail_url, callback=self.parse_detail,
                             meta={'item': item})
    if self.page_num <= 3:
        new_url = format(self.url % self.page_num)
        self.page_num += 1
        yield scrapy.Request(new_url, callback=self.parse)
def parse(self, response):
    """Yield a detail-page request per search result; step the page offset
    by 10 up to 100 (this site paginates in increments of 10)."""
    for li in response.xpath('//*[@id="container"]/div[1]/ul/li'):
        # The title may be split across several text nodes; join them.
        title_parts = li.xpath('./div/h3/a/text()').extract()
        job_name = ''.join(title_parts)
        print(job_name)
        item = BossproItem()
        item['job_name'] = job_name
        detail_url = li.xpath('./div/h3/a/@href').extract_first()
        # meta={} passes the item through to the detail callback.
        yield scrapy.Request(detail_url, callback=self.parse_detail,
                             meta={'item': item})
    # Pagination: offset runs 0,10,20,...,100.
    if self.page_num <= 100:
        next_url = format(self.url % self.page_num)
        self.page_num += 10
        yield scrapy.Request(next_url, callback=self.parse)
def parse(self, response):
    """Parse listing rows using rotating request headers, request each
    detail page, then crawl one more listing page after a 5-second delay."""
    for li in response.xpath('//*[@id="main"]/div/div[2]/ul/li'):
        item = BossproItem()
        name_anchor = li.xpath('.//span[@class="job-name"]/a')
        item['job_name'] = name_anchor.xpath('./text()').extract_first()
        item['job_area'] = li.xpath(
            './/span[@class="job-area"]/text()').extract_first()
        detail_url = 'https://www.zhipin.com' + name_anchor.xpath(
            './@href').extract_first()
        # Fresh header per request; meta={} forwards the item to the callback.
        header = self.get_header()
        yield scrapy.Request(url=detail_url,
                             headers=header,
                             callback=self.parse_detail,
                             meta={'item': item})
    # Pagination: only two pages in total (page_num < 2 before increment).
    if self.page_num < 2:
        self.page_num += 1
        next_url = format(self.url % (self.page_num, self.page_num))
        print('正在爬取:%s' % next_url)
        time.sleep(5)  # throttle before requesting the next listing page
        header = self.get_header()
        yield scrapy.Request(url=next_url,
                             headers=header,
                             callback=self.parse)