def parse_page(self, response):
    # Split off the query string so the pagination URL can be rebuilt below.
    urlArr = response.url.split('default.aspx?')
    if len(urlArr) > 1:
        baseUrl = urlArr[0]
    else:
        baseUrl = response.url
    links = response.xpath("//a[contains(@id,'TitleUrl')]")
    for i in links:
        item = CnblogsItem()
        item['top'] = int(response.meta['item']['top'])
        item['nickName'] = response.meta['item']['nickName']
        item['userName'] = response.meta['item']['userName']
        item['score'] = int(response.meta['item']['score'])
        item['pageLink'] = response.url
        # Strip the "pinned post" markers from the title.
        item['title'] = i.xpath("./text()").extract()[0].replace(
            u'[置顶]', '').replace('[Top]', '').strip()
        item['articleLink'] = i.xpath("./@href").extract()[0]
        yield scrapy.Request(item['articleLink'],
                             meta={'item': item},
                             callback=self.parse_content)
    # Follow the next listing page as long as the current one still had posts.
    if len(links) > 0:
        response.meta['page'] += 1
        yield scrapy.Request(baseUrl + 'default.aspx?page=' + str(response.meta['page']),
                             meta={'page': response.meta['page'],
                                   'item': response.meta['item']},
                             callback=self.parse_page)
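# parse_page hands each article URL to self.parse_content, which is not
# shown in these snippets. A minimal sketch, assuming it only needs to
# attach the raw page text to the item it received via meta (the body of
# this callback is a guess, not taken from the original code):
def parse_content(self, response):
    item = response.meta['item']
    item['content'] = response.text  # hypothetical field; see parse_details below
    yield item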
def parse_more_page(self, response):
    # Each post on the listing page lives in its own post_item_body block.
    posts = response.xpath('//div[@class="post_item_body"]')
    for post in posts:
        item = CnblogsItem()
        item['title'] = post.xpath('h3/a/text()').extract()
        item['summary'] = post.xpath('p[@class="post_item_summary"]/text()').extract()
        yield item
def parse(self, response):
    # Posts on a blog's archive page are grouped into one "day" block per date.
    papers = response.xpath('//div[@class="day"]')
    for paper in papers:
        item = CnblogsItem()
        item['title'] = paper.xpath(
            './div[@class="postTitle"]/a/text()').extract()[0]
        item['day'] = paper.xpath(
            './div[@class="dayTitle"]/a/text()').extract()[0]
        item['postcon'] = paper.xpath(
            './div[@class="postCon"]/div[@class="c_b_p_desc"]/text()'
        ).extract()[0]
        yield item
def parse(self, response):
    # Queue listing pages 2-5 up front, remembering each URL so it is not
    # crawled twice.
    more_pages = [self.get_url_of_page(p) for p in range(2, 6)]
    for page_url in more_pages:
        CnblogsSpider.crawledLinks.append(page_url)
        yield Request(page_url, self.parse_more_page)
    posts = response.xpath('//div[@class="post_item_body"]')
    print(posts)
    for post in posts:
        item = CnblogsItem()
        item['title'] = post.xpath('h3/a/text()').extract()
        item['summary'] = post.xpath('p[@class="post_item_summary"]/text()').extract()
        yield item
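# The snippet above uses a bare Request, self.get_url_of_page(), and
# CnblogsSpider.crawledLinks, none of which is defined here. A minimal
# sketch of the surrounding spider, assuming the front page paginates as
# "/sitehome/p/<n>" (that URL pattern is an assumption, not taken from the
# original code):
import scrapy
from scrapy import Request

class CnblogsSpider(scrapy.Spider):
    name = 'cnblogs'
    start_urls = ['https://www.cnblogs.com/']
    crawledLinks = []  # listing pages already queued, to avoid duplicates

    def get_url_of_page(self, page):
        # Hypothetical helper: build the listing URL for a page number.
        return 'https://www.cnblogs.com/sitehome/p/%d' % page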
def parse_blog(self, response):
    print("-------- parsing the archive page --------")
    for blog in response.xpath('//div[@class="day"]'):
        blog_item = CnblogsItem()
        blog_item['title'] = blog.xpath('.//div[@class="postTitle"]/a/text()').extract_first()
        blog_item['link'] = response.urljoin(blog.xpath('.//div[@class="postTitle"]/a/@href').extract_first())
        blog_item['desc'] = blog.xpath('.//div[@class="c_b_p_desc"]/text()').extract_first()
        # postDesc contains the read and comment counts as two parenthesised
        # numbers; pull them out by position.
        blog_info = blog.xpath('.//div[@class="postDesc"]/text()').extract_first()
        blog_item['read_num'] = blog_info.split('(')[1].split(')')[0]
        blog_item['comment_num'] = blog_info.split('(')[2].split(')')[0]
        yield blog_item
        print(blog_item['title'])
        print(blog_item['link'])
        print(blog_item['desc'])
        print(blog_item['read_num'])
        print(blog_item['comment_num'])
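# A worked example of the split logic above. The sample string is an
# assumption based on what the split indices require (two parenthesised
# numbers, read count first), not copied from a live page:
blog_info = 'posted @ 2017-01-01 12:00 someone 阅读(123) 评论(45)'
read_num = blog_info.split('(')[1].split(')')[0]     # '123'
comment_num = blog_info.split('(')[2].split(')')[0]  # '45'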
def parse(self, response):
    # Each cell of the ranking table holds one blogger: rank number,
    # nickname link, and a stats tuple.
    for i in response.xpath("//table[@width='90%']//td"):
        item = CnblogsItem()
        item['top'] = i.xpath("./small[1]/text()").extract()[0].split(
            '.')[-2].strip()
        item['nickName'] = i.xpath("./a[1]//text()").extract()[0].strip()
        item['userName'] = i.xpath("./a[1]/@href").extract()[0].split(
            '/')[-2].strip()
        # small[2] is a parenthesised comma-separated tuple whose third
        # field is the blogger's score.
        totalAndScore = i.xpath("./small[2]//text()").extract()[0].lstrip(
            '(').rstrip(')').split(',')
        item['score'] = totalAndScore[2].strip()
        # Crawl this blogger's own blog, starting at page 1.
        yield scrapy.Request(i.xpath("./a[1]/@href").extract()[0],
                             meta={'page': 1, 'item': item},
                             callback=self.parse_page)
def parse_details(self, response):
    title = response.meta['title']
    curl = response.meta['curl']
    # response.body is bytes; use the decoded text rather than str()-ing
    # the bytes object.
    yield CnblogsItem(title=title, curl=curl, content=response.text)
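# None of the snippets defines CnblogsItem. A minimal sketch covering every
# field the examples assign; the snippets clearly come from more than one
# spider, so a real project would likely split these across several Item
# classes:
import scrapy

class CnblogsItem(scrapy.Item):
    # ranking / author fields (parse, parse_page)
    top = scrapy.Field()
    nickName = scrapy.Field()
    userName = scrapy.Field()
    score = scrapy.Field()
    pageLink = scrapy.Field()
    title = scrapy.Field()
    articleLink = scrapy.Field()
    # front-page listing fields (parse_more_page)
    summary = scrapy.Field()
    # archive-page fields (parse, parse_blog)
    day = scrapy.Field()
    postcon = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()
    read_num = scrapy.Field()
    comment_num = scrapy.Field()
    # detail-page fields (parse_details)
    curl = scrapy.Field()
    content = scrapy.Field()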