Beispiel #1
0
    def parse_blogs(self, response):
        """Parse a blog list page and yield one item per post not stored yet.

        Every ``div`` directly under the element with id ``post_list`` is
        treated as one post entry. The post id is the MD5 hex digest of the
        post URL. Entries without a URL, and entries whose id
        ``self.blogs.isExistById`` reports as already present, are skipped.

        Args:
            response: The downloaded list page; must expose ``xpath()``.

        Yields:
            The result of ``ModelItem.getInstance(blogs)`` for each new post.
        """
        for item in response.xpath('//*[@id="post_list"]/div'):
            page_url = item.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first()
            if not page_url:
                # Without a link there is nothing to derive the id from;
                # skip the entry instead of aborting the whole page.
                continue

            # hashlib only accepts bytes, so text URLs are encoded first.
            if isinstance(page_url, bytes):
                url_bytes = page_url
            else:
                url_bytes = page_url.encode('utf-8')
            post_id = hashlib.md5(url_bytes).hexdigest()

            if self.blogs.isExistById(post_id):
                self.logger.info('id:%s is exist!' % post_id)
                continue

            title = item.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first()
            author = item.xpath('div/div[@class="post_item_foot"]/a/text()').extract_first()
            avatar = item.xpath('div[@class="post_item_body"]/p/a/img/@src').extract_first()
            blog_url = item.xpath('div/div[@class="post_item_foot"]/a/@href').extract_first()
            summary = self.commonParser.trim(''.join(item.xpath('div[@class="post_item_body"]/p/text()').extract()))
            dateStr = self.commonParser.trim(''.join(item.xpath('div[@class="post_item_body"]/div/text()').extract()))
            positive = item.xpath('div[1]/div[1]/span/text()').extract_first()

            # pv / cv are presumably the view and comment counters (TODO
            # confirm). Keep only the first run of digits from the link text;
            # a missing link or digit-less text yields None instead of raising.
            pv_text = item.xpath('div[2]/div/span[2]/a/text()').extract_first()
            cv_text = item.xpath('div[2]/div/span[1]/a/text()').extract_first()
            pv_match = re.search(r'\d+', pv_text or '')
            cv_match = re.search(r'\d+', cv_text or '')
            pv = pv_match.group(0) if pv_match else None
            cv = cv_match.group(0) if cv_match else None

            blogs = Blogs()
            blogs.id = post_id
            blogs.site = self.site
            blogs.title = title
            blogs.label = None
            blogs.author = author
            blogs.summary = summary
            blogs.content = None
            blogs.avatar = avatar
            blogs.page_url = page_url
            blogs.blog_url = blog_url
            blogs.pv = pv
            blogs.cv = cv
            blogs.positive = positive
            blogs.publish_time = parseDateString(dateStr)
            blogs.date_update = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
            yield ModelItem.getInstance(blogs)
Beispiel #2
0
    def parse_csdn(self, response):
        """Parse one CSDN blog list page and follow the link to the next one.

        Each ``dl`` under the ``blog_list_wrap`` element is treated as one
        post entry. The post id is the MD5 hex digest of the post URL.
        Entries without a URL, and entries whose id
        ``self.blogs.isExistById`` reports as already present, are skipped.

        Args:
            response: The downloaded list page; must expose ``xpath()`` and a
                ``meta`` mapping containing the key ``'page'``.

        Yields:
            The result of ``ModelItem.getInstance(blogs)`` for each new post,
            followed by a ``Request`` for the next page when the page
            navigation contains a next-page link.
        """
        contentlist = response.xpath('//*[@class="blog_list_wrap"]/dl')
        self.logger.info(u'-----------第%s页,%d条数据-----------' %
                         (response.meta['page'], len(contentlist)))
        for item in contentlist:
            page_url = item.xpath('dd/h3/a/@href').extract_first()
            if not page_url:
                # Without a link there is nothing to derive the id from;
                # skip the entry instead of aborting the whole page.
                continue

            # hashlib only accepts bytes, so text URLs are encoded first.
            if isinstance(page_url, bytes):
                url_bytes = page_url
            else:
                url_bytes = page_url.encode('utf-8')
            post_id = hashlib.md5(url_bytes).hexdigest()

            if self.blogs.isExistById(post_id):
                self.logger.info('id:%s is exist!' % post_id)
                continue

            title = item.xpath('dd/h3/a/text()').extract_first()
            summary = item.xpath(
                'dd/div[@class="blog_list_c"]/text()').extract_first()
            dateStr = item.xpath(
                'dd/div[@class="blog_list_b clearfix"]/div[@class="blog_list_b_r fr"]/label/text()'
            ).extract_first()
            # All tag link texts are joined into one comma-separated string.
            label = ','.join(
                item.xpath(
                    'dd/div[@class="blog_list_b clearfix"]/div[@class="blog_list_b_l fl"]/span/a//text()'
                ).extract())
            blog_url = item.xpath('dt/a[2]/@href').extract_first()
            author = item.xpath('dt/a[2]/text()').extract_first()
            avatar = item.xpath('dt/a/img/@src').extract_first()
            pv = item.xpath(
                'dd/div[@class="blog_list_b clearfix"]/div[@class="blog_list_b_r fr"]/span/em/text()'
            ).extract_first()
            positive = item.xpath(
                'dd/div[@class="blog_list_b_b"]/span/em/text()').extract_first()

            blogs = Blogs()
            blogs.id = post_id
            blogs.site = self.site
            blogs.title = title
            blogs.label = label
            blogs.author = author
            blogs.summary = summary
            blogs.page_url = page_url
            blogs.blog_url = blog_url
            blogs.avatar = avatar
            blogs.pv = pv
            blogs.positive = positive
            blogs.publish_time = parseDateString(dateStr)
            now = datetime.datetime.now()
            blogs.date_update = now.strftime('%Y-%m-%d %H:%M:%S')

            yield ModelItem.getInstance(blogs)

        next_page_url = response.xpath(
            u'//div[@class="page_nav"]/a[text()="下一页"]/@href').extract_first()
        if next_page_url:
            # The page number is taken from the first run of digits in the
            # link. A link without digits is still followed; the page number
            # is then recorded as None rather than raising.
            page_match = re.search(r'(\d+)', next_page_url)
            pageNo = page_match.group(1) if page_match else None
            yield Request(
                'http://blog.csdn.net%s' % next_page_url,
                meta={
                    'type': 'list',
                    'page': pageNo
                },
                dont_filter=True,
                headers={
                    'User-Agent':
                    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
                })
        else:
            self.logger.info(u'----------CSDN最新博客,一共有%s页----------' %
                             (response.meta['page']))