Exemple #1
0
 def parse_page(self, response):
     """Parse one page of a blogger's post list and follow pagination.

     For every anchor whose ``id`` contains ``TitleUrl`` a ``CnblogsItem``
     is filled from ``response.meta['item']`` plus the anchor's title and
     link, and a request for the article is yielded with
     ``self.parse_content`` as callback.  If the page contained at least
     one such anchor, a request for the next ``default.aspx?page=N`` page
     is yielded too, handled again by this method.

     Reads ``response.meta['page']`` (current page number) and
     ``response.meta['item']`` (mapping with ``top``, ``nickName``,
     ``userName`` and ``score``).
     """
     blogger = response.meta['item']

     # Strip the query of an already paginated URL so the next page URL can
     # be rebuilt from the part in front of 'default.aspx?'.
     url_parts = response.url.split('default.aspx?')
     if len(url_parts) > 1:
         base_url = url_parts[-2]
     else:
         base_url = response.url

     anchors = response.xpath("//a[contains(@id,'TitleUrl')]")
     for anchor in anchors:
         hrefs = anchor.xpath("./@href").extract()
         if not hrefs:
             # No link means no article to request; skip this anchor rather
             # than let an IndexError abort the rest of the page.
             continue
         article_link = hrefs[0]

         # An anchor without a direct text node gets an empty title instead
         # of raising IndexError.
         texts = anchor.xpath("./text()").extract()
         raw_title = texts[0] if texts else u''

         item = CnblogsItem()
         item['top'] = int(blogger['top'])
         item['nickName'] = blogger['nickName']
         item['userName'] = blogger['userName']
         item['score'] = int(blogger['score'])
         item['pageLink'] = response.url
         # Remove the "pinned" markers that are prepended to sticky posts.
         item['title'] = raw_title.replace(
             u'[置顶]', '').replace('[Top]', '').strip()
         item['articleLink'] = article_link
         yield scrapy.Request(article_link,
                              meta={'item': item},
                              callback=self.parse_content)

     # Only keep paginating while pages still list posts.
     if anchors:
         # Computed locally so the incoming response.meta is not mutated.
         next_page = response.meta['page'] + 1
         yield scrapy.Request(base_url + 'default.aspx?page=' +
                              str(next_page),
                              meta={
                                  'page': next_page,
                                  'item': blogger
                              },
                              callback=self.parse_page)
Exemple #2
0
    def parse_more_page(self, response):
        """Yield one ``CnblogsItem`` per ``post_item_body`` div on the page.

        ``title`` and ``summary`` hold the complete lists returned by
        ``extract()``, not single strings.
        """
        title_path = 'h3/a/text()'
        summary_path = 'p[@class="post_item_summary"]/text()'

        for body in response.xpath('//div[@class="post_item_body"]'):
            entry = CnblogsItem()
            entry['title'] = body.xpath(title_path).extract()
            entry['summary'] = body.xpath(summary_path).extract()
            yield entry
Exemple #3
0
 def parse(self, response):
     """Yield a ``CnblogsItem`` for every ``div.day`` block of the page.

     Each item gets ``title``, ``day`` and ``postcon`` from the first text
     node matched below the block's ``postTitle``, ``dayTitle`` and
     ``postCon``/``c_b_p_desc`` divs.  A block missing any of them raises
     ``IndexError``.
     """
     def first_text(node, path):
         # First extracted match; IndexError when nothing matches.
         return node.xpath(path).extract()[0]

     for day_block in response.xpath('//div[@class="day"]'):
         entry = CnblogsItem()
         entry['title'] = first_text(
             day_block, './div[@class="postTitle"]/a/text()')
         entry['day'] = first_text(
             day_block, './div[@class="dayTitle"]/a/text()')
         entry['postcon'] = first_text(
             day_block,
             './div[@class="postCon"]/div[@class="c_b_p_desc"]/text()')
         yield entry
Exemple #4
0
    def parse(self, response):
        """Schedule listing pages 2-5 and scrape the posts of this page.

        For each page number in 2..5 the URL from ``self.get_url_of_page``
        is appended to ``CnblogsSpider.crawledLinks`` and requested with
        ``self.parse_more_page`` as callback.  Afterwards one
        ``CnblogsItem`` is yielded per ``post_item_body`` div of the current
        response; ``title`` and ``summary`` hold the full ``extract()``
        lists.
        """
        # ``range`` instead of ``xrange`` so this runs on Python 2 and 3.
        # All URLs are still built before the first request is yielded.
        more_pages = [self.get_url_of_page(p) for p in range(2, 6)]
        for page_url in more_pages:
            CnblogsSpider.crawledLinks.append(page_url)
            yield Request(page_url, self.parse_more_page)

        posts = response.xpath('//div[@class="post_item_body"]')
        # Call form of print: same output on Python 2, valid on Python 3.
        print(posts)
        for post in posts:
            item = CnblogsItem()
            item['title'] = post.xpath('h3/a/text()').extract()
            item['summary'] = post.xpath('p[@class="post_item_summary"]/text()').extract()
            yield item
    def parse_blog(self, response):
        """Yield a ``CnblogsItem`` for every ``div.day`` block and print it.

        ``title``, ``link`` (made absolute with ``response.urljoin``) and
        ``desc`` come from the first match under the block.  ``read_num``
        and ``comment_num`` are the contents of the first and second
        parenthesised groups in the first ``postDesc`` text node.  They are
        ``None`` when that text is missing or has too few groups.
        """
        print("--------解析目录页面---------")
        for blog in response.xpath('//div[@class="day"]'):
            blog_item = CnblogsItem()
            blog_item['title'] = blog.xpath('.//div[@class="postTitle"]/a/text()').extract_first()
            blog_item['link'] = response.urljoin(blog.xpath('.//div[@class="postTitle"]/a/@href').extract_first())
            blog_item['desc'] = blog.xpath('.//div[@class="c_b_p_desc"]/text()').extract_first()

            # extract_first() returns None when nothing matches. Fall back to
            # an empty string so a block without postDesc text does not abort
            # the whole generator.
            blog_info = blog.xpath('.//div[@class="postDesc"]/text()').extract_first()
            groups = (blog_info or '').split('(')
            # groups[1] / groups[2] start right after the 1st / 2nd '('.
            blog_item['read_num'] = (
                groups[1].split(')')[0] if len(groups) > 1 else None)
            blog_item['comment_num'] = (
                groups[2].split(')')[0] if len(groups) > 2 else None)

            yield blog_item

            # Debug output; runs when the consumer asks for the next item.
            print(blog_item['title'])
            print(blog_item['link'])
            print(blog_item['desc'])
            print(blog_item['read_num'])
            print(blog_item['comment_num'])
Exemple #6
0
 def parse(self, response):
     """Read the blogger table and request each blogger's first page.

     For every ``td`` below the ``width='90%'`` table a ``CnblogsItem`` is
     filled with ``top``, ``nickName``, ``userName`` and ``score``.  The
     cell's first link is then requested with ``meta`` carrying
     ``page=1`` and the item; ``self.parse_page`` handles the result.
     """
     for cell in response.xpath("//table[@width='90%']//td"):
         blogger = CnblogsItem()

         # Second-to-last piece of the first <small> text, split on '.'.
         rank_text = cell.xpath("./small[1]/text()").extract()[0]
         blogger['top'] = rank_text.split('.')[-2].strip()

         name_text = cell.xpath("./a[1]//text()").extract()[0]
         blogger['nickName'] = name_text.strip()

         # Second-to-last path segment of the link is stored as userName;
         # the link itself is the URL requested below.
         home_url = cell.xpath("./a[1]/@href").extract()[0]
         blogger['userName'] = home_url.split('/')[-2].strip()

         # Second <small> is a parenthesised, comma-separated list; the
         # third entry is stored as the score.
         stats_text = cell.xpath("./small[2]//text()").extract()[0]
         stats = stats_text.lstrip('(').rstrip(')').split(',')
         blogger['score'] = stats[2].strip()

         request_meta = {'page': 1, 'item': blogger}
         yield scrapy.Request(home_url,
                              meta=request_meta,
                              callback=self.parse_page)
Exemple #7
0
 def parse_details(self, response):
     """Yield a ``CnblogsItem`` built from request meta and the page body.

     ``title`` and ``curl`` are taken from ``response.meta``; ``content``
     is ``str(response.body)``.
     """
     meta = response.meta
     # NOTE(review): if this runs on Python 3, response.body is presumably
     # bytes and str() would store its repr ("b'...'") -- confirm the
     # target Python version.
     detail = CnblogsItem(title=meta['title'],
                          curl=meta['curl'],
                          content=str(response.body))
     yield detail