def parse_content(self, response):
    # Instantiate the item and populate its fields.
    items = JobboleItem()
    # Article creation time -- this callback runs once per request yielded
    # from the for loop above, so taking the first match is what we want.
    # extract_first('') avoids an AttributeError when nothing matches.
    create_time = response.xpath(
        '//div[@class="entry-meta"]/p/text()').extract_first('').strip()
    create_time = create_time.replace(' ·', '').replace('/', '-')
    items['create_time'] = create_time
    # Body text -- only the contents of <p> tags inside div.entry.
    art_content = response.xpath(
        '//div[@class="entry"]/p/text()').extract()
    items['art_content'] = art_content
    # Collect the values handed down through request.meta.
    items['art_url'] = response.meta['art_url']
    items['art_title'] = response.meta['art_title']
    items['art_img_url'] = response.meta['art_img_url']
    # Return the item to pipelines.py.
    return items
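# The assignments above imply an Item definition roughly like the sketch
# below. This is an assumption: the real items.py may declare more fields
# or attach processors.
import scrapy

class JobboleItem(scrapy.Item):
    art_url = scrapy.Field()      # article URL, passed via request.meta
    art_title = scrapy.Field()    # title collected on the list page
    art_img_url = scrapy.Field()  # cover image URL from the list page
    create_time = scrapy.Field()  # publish date, normalized above
    art_content = scrapy.Field()  # list of <p> text fragments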
def parse(self, response):
    """
    :param response: the response object for the request
    :return:
    """
    print('Request succeeded, got the page source')
    # Status code of the response
    code = response.status
    # # Page source of the response
    # html = response.text
    # URL of the current request
    url = response.url
    # # Binary body of the response
    # b_html = response.body
    # # Response headers
    # response_headers = response.headers
    # # Request object behind the current response
    # request = response.request
    # # Headers of that request
    # request_headers = response.request.headers
    print(code, url)

    # Extract the target data: the article list on this page.
    article_divs = response.xpath('//div[@class="post floated-thumb"]')
    print(len(article_divs))
    for article_div in article_divs:
        articleItem = JobboleItem()
        # Title
        # title = article_div.xpath('.//a[@class="archive-title"]/text()').extract()[0]
        # extract_first() returns None when nothing matches, and it also
        # accepts a default value, which is safer than extract()[0].
        articleItem['title'] = article_div.xpath(
            './/a[@class="archive-title"]/text()').extract_first('')
        # Cover image ('暂无' is a "not available" placeholder)
        articleItem['coverImage'] = article_div.xpath(
            './/div[@class="post-thumb"]/a/img/@src').extract_first('暂无')
        print(articleItem)
        # URL of the detail page
        detailUrl = article_div.xpath(
            './/a[@class="archive-title"]/@href').extract_first()
        if detailUrl:
            # A detail URL exists, so request it.
            """
            url:         the URL to request
            callback:    callback invoked on success (default None)
            method:      HTTP method (default 'GET')
            headers:     request headers, dict
            cookies:     user cookies, dict
            meta:        extra data passed along to the callback, dict
            encoding:    'utf-8' by default, rarely needs changing
            priority:    scheduling priority, rarely needs changing
            dont_filter: False filters duplicate URLs, True disables
                         the duplicate filter
            errback:     callback invoked when the request fails
            """
            yield scrapy.Request(detailUrl,
                                 callback=self.parse_article_data,
                                 meta={'item': articleItem})
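# parse_article_data is referenced above but not shown. A minimal sketch of
# such a callback, assuming the item also declares a hypothetical 'content'
# field for the detail-page body:
def parse_article_data(self, response):
    # Recover the half-filled item handed over via request.meta.
    articleItem = response.meta['item']
    articleItem['content'] = response.xpath(
        '//div[@class="entry"]//text()').extract()
    # Yield the finished item so it flows into the pipelines.
    yield articleItem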
def parse_detail(self, response):
    """
    Fill the item with the scraped data. Here the field is assigned
    directly; an ItemLoader could serialize it instead (see the next
    example).
    """
    item = JobboleItem()
    # The title image URL was handed down through request.meta.
    item['title_image'] = response.meta.get('title_img')
    yield item
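# For response.meta.get('title_img') to return anything, the request that
# led here must carry the value. A sketch of the sending side, under the
# assumption that the list-page callback extracts the image URL itself:
def parse(self, response):
    for node in response.xpath('//div[@class="post floated-thumb"]'):
        title_img = node.xpath('.//img/@src').extract_first('')
        detail_url = node.xpath(
            './/a[@class="archive-title"]/@href').extract_first()
        if detail_url:
            yield scrapy.Request(detail_url,
                                 callback=self.parse_detail,
                                 meta={'title_img': title_img})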
def parse_detail(self, response):
    url = response.url
    # Hash the URL into a fixed-length key for deduplication/storage.
    md5_url = get_md5(url)
    # Despite the name, JobboleInfoItem is used as an ItemLoader here.
    item_loader = JobboleInfoItem(item=JobboleItem(), response=response)
    item_loader.add_value('thumb_image_url',
                          [response.meta.get('thumb_image_url', '')])
    item_loader.add_value('url', url)
    item_loader.add_value('md5_url', md5_url)
    item_loader.add_css('title', '.entry-header h1::text')
    item_loader.add_css('create_date', '.entry-meta-hide-on-mobile::text')
    item_loader.add_css('tag', '.entry-meta-hide-on-mobile a::text')
    item_loader.add_css('content', 'div.entry')
    item_loader.add_css('comments', 'a[href="#article-comment"] span::text')
    item_loader.add_css('collections', '.post-adds span:nth-child(2)::text')
    item_loader.add_css('praise', '.post-adds span:nth-child(1) h10::text')
    article_item = item_loader.load_item()
    yield article_item
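# get_md5 and JobboleInfoItem are imported from elsewhere in the project.
# A plausible sketch of both, assuming JobboleInfoItem is an ItemLoader
# subclass that keeps only the first extracted value per field (import
# paths match Scrapy 1.x, the version this code style suggests):
import hashlib

from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst

def get_md5(url):
    # md5 the URL so it can serve as a fixed-length unique key.
    if isinstance(url, str):
        url = url.encode('utf-8')
    return hashlib.md5(url).hexdigest()

class JobboleInfoItem(ItemLoader):
    # add_css()/add_value() collect lists; TakeFirst() reduces each field
    # to its first non-empty value when load_item() is called.
    default_output_processor = TakeFirst()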