def parse_blogs(self, response):
    """Yield one item per new post found under the ``post_list`` element.

    Every ``div`` directly below the element with id ``post_list`` is treated
    as one post.  The post id is the MD5 hex digest of its page URL; posts
    for which ``self.blogs.isExistById`` returns a truthy value are skipped.

    Args:
        response: The downloaded listing page; only its ``xpath`` method is
            used here.

    Yields:
        The result of ``ModelItem.getInstance`` for each populated ``Blogs``
        record.
    """

    def first_number(node, query):
        """Return the first run of digits in the selected text, or None."""
        text = node.xpath(query).extract_first()
        # ``text`` is None when the node is missing; search an empty string
        # instead of failing on ``None.strip()`` / ``findall(...)[0]``.
        found = re.search(r'\d+', text or '')
        return found.group(0) if found else None

    for item in response.xpath('//*[@id="post_list"]/div'):
        page_url = item.xpath('div[@class="post_item_body"]/h3/a/@href').extract_first()
        if not page_url:
            # Without a URL there is nothing to hash into an id; skip this
            # entry rather than abort the whole page.
            self.logger.warning('skip post without page url')
            continue

        # md5 needs bytes: encoding keeps the digest unchanged for ASCII URLs
        # and avoids a TypeError on text input under Python 3.
        blog_id = hashlib.md5(page_url.encode('utf-8')).hexdigest()
        title = item.xpath('div[@class="post_item_body"]/h3/a/text()').extract_first()
        if self.blogs.isExistById(blog_id):
            self.logger.info('id:%s is exist!', blog_id)
            continue

        author = item.xpath('div/div[@class="post_item_foot"]/a/text()').extract_first()
        avatar = item.xpath('div[@class="post_item_body"]/p/a/img/@src').extract_first()
        blog_url = item.xpath('div/div[@class="post_item_foot"]/a/@href').extract_first()
        summary = self.commonParser.trim(
            ''.join(item.xpath('div[@class="post_item_body"]/p/text()').extract()))
        dateStr = self.commonParser.trim(
            ''.join(item.xpath('div[@class="post_item_body"]/div/text()').extract()))
        # The counters are embedded in link text; keep only the digits.
        pv = first_number(item, 'div[2]/div/span[2]/a/text()')
        cv = first_number(item, 'div[2]/div/span[1]/a/text()')
        positive = item.xpath('div[1]/div[1]/span/text()').extract_first()

        blogs = Blogs()
        blogs.id = blog_id
        blogs.site = self.site
        blogs.title = title
        blogs.label = None
        blogs.author = author
        blogs.summary = summary
        blogs.content = None
        blogs.avatar = avatar
        blogs.page_url = page_url
        blogs.blog_url = blog_url
        blogs.pv = pv
        blogs.cv = cv
        blogs.positive = positive
        blogs.publish_time = parseDateString(dateStr)
        blogs.date_update = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        yield ModelItem.getInstance(blogs)
def parse_csdn(self, response):
    """Yield items for new posts on a CSDN list page, then the next page.

    Each ``dl`` under the ``blog_list_wrap`` element is treated as one post.
    The post id is the MD5 hex digest of its page URL; posts for which
    ``self.blogs.isExistById`` returns a truthy value are skipped.  After the
    posts, a ``Request`` for the "下一页" (next page) link is yielded when
    that link exists.

    Args:
        response: The downloaded list page.  ``response.meta['page']`` is
            read for log output only.

    Yields:
        The result of ``ModelItem.getInstance`` for each populated ``Blogs``
        record, followed by at most one ``Request`` for the next page.
    """
    contentlist = response.xpath('//*[@class="blog_list_wrap"]/dl')
    # The page number only feeds log lines, so a missing key must not abort
    # parsing.
    page = response.meta.get('page')
    self.logger.info(u'-----------第%s页,%d条数据-----------', page, len(contentlist))

    for item in contentlist:
        page_url = item.xpath('dd/h3/a/@href').extract_first()
        if not page_url:
            # Without a URL there is nothing to hash into an id; skip this
            # entry rather than abort the whole page.
            self.logger.warning('skip post without page url')
            continue

        # md5 needs bytes: encoding keeps the digest unchanged for ASCII URLs
        # and avoids a TypeError on text input under Python 3.
        blog_id = hashlib.md5(page_url.encode('utf-8')).hexdigest()
        title = item.xpath('dd/h3/a/text()').extract_first()
        if self.blogs.isExistById(blog_id):
            self.logger.info('id:%s is exist!', blog_id)
            continue

        summary = item.xpath('dd/div[@class="blog_list_c"]/text()').extract_first()
        dateStr = item.xpath(
            'dd/div[@class="blog_list_b clearfix"]/div[@class="blog_list_b_r fr"]/label/text()'
        ).extract_first()
        label = ','.join(
            item.xpath(
                'dd/div[@class="blog_list_b clearfix"]/div[@class="blog_list_b_l fl"]/span/a//text()'
            ).extract())
        blog_url = item.xpath('dt/a[2]/@href').extract_first()
        author = item.xpath('dt/a[2]/text()').extract_first()
        avatar = item.xpath('dt/a/img/@src').extract_first()
        pv = item.xpath(
            'dd/div[@class="blog_list_b clearfix"]/div[@class="blog_list_b_r fr"]/span/em/text()'
        ).extract_first()
        positive = item.xpath('dd/div[@class="blog_list_b_b"]/span/em/text()').extract_first()

        blogs = Blogs()
        blogs.id = blog_id
        blogs.site = self.site
        blogs.title = title
        blogs.label = label
        blogs.author = author
        blogs.summary = summary
        blogs.page_url = page_url
        blogs.blog_url = blog_url
        blogs.avatar = avatar
        blogs.pv = pv
        blogs.positive = positive
        blogs.publish_time = parseDateString(dateStr)
        now = datetime.datetime.now()
        blogs.date_update = now.strftime('%Y-%m-%d %H:%M:%S')
        yield ModelItem.getInstance(blogs)

    next_page_url = response.xpath(
        u'//div[@class="page_nav"]/a[text()="下一页"]/@href').extract_first()
    if next_page_url:
        # Raw pattern avoids the invalid "\d" escape; a link without digits
        # yields an unknown (None) page number instead of an AttributeError.
        page_match = re.search(r'(\d+)', next_page_url)
        pageNo = page_match.group(1) if page_match else None
        yield Request(
            'http://blog.csdn.net%s' % next_page_url,
            meta={
                'type': 'list',
                'page': pageNo
            },
            dont_filter=True,
            headers={
                'User-Agent':
                "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36"
            })
    else:
        self.logger.info(u'----------CSDN最新博客,一共有%s页----------', page)