def parseContent(self, response):
    """Extract the parts of an article page and yield them as one item.

    The title, paragraph texts and publication time are read from the
    response via XPath; the page URL and a fixed site name are stored
    alongside them.  Each paragraph text is stripped and followed by a
    ``<br />`` separator in the ``content`` field.

    Pages on which the title or the time node cannot be found are logged
    and skipped (nothing is yielded) instead of raising ``IndexError``.

    NOTE(review): every XPath below starts with ``//``.  Per the Scrapy
    selector docs such expressions search the whole document even when
    evaluated on ``article``; they are kept unchanged here -- confirm
    whether ``.//`` (relative to ``article``) was intended.
    """
    # Local import: the file's top-level import block is not visible here.
    import logging

    article = response.selector.xpath('//div[@id="articleContent"]')
    titles = article.xpath('//div[@class="articleTitle"]/h2/text()').extract()
    times = article.xpath('//span[@class="yearMsg"]/text()').extract()
    if not titles or not times:
        logging.getLogger(__name__).warning(
            "Skipping %s: title or time node not found", response.url)
        return

    paragraphs = article.xpath(
        '//div[@class="articleBox mb20 cfix"]/p/text()').extract()

    item = NewsContentItem()
    item['title'] = titles[0]
    # Build the body in one pass instead of repeated string concatenation.
    item['content'] = ''.join(text.strip() + '<br />' for text in paragraphs)
    item['url'] = response.url
    item['time'] = times[0]
    item['site'] = '北青网'
    yield item
def parseContent(self, response):
    """Extract the parts of an article page and yield them as one item.

    The title and paragraph texts are read from the response via XPath;
    the page URL and a fixed site name are stored alongside them.  Each
    paragraph text is stripped and followed by a ``<br />`` separator in
    the ``content`` field.

    The ``time`` field is the local time at which the page is parsed,
    formatted as ``YYYY-MM-DD HH:MM:SS``; it is not read from the page.

    Pages on which the title node cannot be found are logged and skipped
    (nothing is yielded) instead of raising ``IndexError``.

    NOTE(review): every XPath below starts with ``//``.  Per the Scrapy
    selector docs such expressions search the whole document even when
    evaluated on ``article``; they are kept unchanged here -- confirm
    whether ``.//`` (relative to ``article``) was intended.
    """
    # Local import: the file's top-level import block is not visible here.
    import logging

    article = response.selector.xpath('//div[@class="article"]')
    titles = article.xpath('//h1/text()').extract()
    if not titles:
        logging.getLogger(__name__).warning(
            "Skipping %s: title node not found", response.url)
        return

    paragraphs = article.xpath('//div[@class="text"]/p/text()').extract()

    item = NewsContentItem()
    item['title'] = titles[0]
    # Build the body in one pass instead of repeated string concatenation.
    item['content'] = ''.join(text.strip() + '<br />' for text in paragraphs)
    item['url'] = response.url
    # strftime() without a time tuple formats the current local time.
    item['time'] = time.strftime("%Y-%m-%d %H:%M:%S")
    item['site'] = '京郊日报'
    yield item