def parse_area(self, response):
    """Yield one Redis-bound item per (district, page) listing URL.

    Old ershoufang page layout: district links are ``<a href>`` elements
    under ``div[data-role="ershoufang"]``, shaped like
    ``/ershoufang/<district>/``. For each district, pages 1..MAX_PAGE are
    enumerated and a ``MasterRedisItem`` carrying the paginated URL is
    yielded so the scrapy-redis scheduler can store it under the
    ``lianjia:ershoufang`` key.

    :param response: index-page response; ``response.url`` is assumed to
        end in ``/ershoufang/`` — TODO confirm against the spider's
        start_urls.
    """
    base_url = response.url
    sel = scrapy.Selector(response)
    areas_urls = sel.xpath(
        '//div[@data-role="ershoufang"]/div/a/@href').extract()
    for area_url in areas_urls:
        # Hrefs repeat the "/ershoufang/" prefix already present in
        # base_url; strip it so concatenation does not duplicate it.
        area_url = area_url.replace('/ershoufang/', '')
        # Normalize to exactly one trailing slash before appending pgN/.
        area_base = (base_url + area_url).rstrip('/') + '/'
        for page in range(1, MAX_PAGE + 1):
            # BUG FIX: the original re.sub(r'pg\d+/', ...) never matched a
            # freshly built district URL (no "pgN/" segment present), so
            # every page yielded the same URL. Append the page segment
            # explicitly instead.
            # A fresh item per URL avoids yielding one shared, mutated
            # item instance.
            item = MasterRedisItem()
            # Redis key distinguishes this spider's queue from others.
            item['redis_key'] = 'lianjia:ershoufang'
            item['url'] = '%spg%d/' % (area_base, page)
            yield item
def parse_area(self, response):
    """Yield one Redis-bound item per (district, page) listing URL.

    New ershoufang page layout: districts are exposed as
    ``data-district-spell`` attributes (e.g. ``"dongcheng"``) on
    ``ul.district-wrapper > li`` elements rather than as href paths.
    For each district, pages 1..MAX_PAGE are enumerated and a
    ``MasterRedisItem`` carrying the paginated URL is yielded so the
    scrapy-redis scheduler can store it under ``lianjia:ershoufang``.

    :param response: index-page response; ``response.url`` is assumed to
        end in ``/ershoufang/`` — TODO confirm against the spider's
        start_urls.
    """
    base_url = response.url
    sel = scrapy.Selector(response)
    areas_urls = sel.xpath(
        '//ul[@class="district-wrapper"]/li/@data-district-spell').extract()
    for area_url in areas_urls:
        # BUG FIX: the original did `url += re.sub(r'pg\d+/', ...)`.
        # The pattern never matched (the URL has no "pgN/" segment), so
        # re.sub returned the URL unchanged and the += doubled the string
        # on every page iteration, yielding exponentially growing garbage
        # URLs. Build the paginated URL explicitly instead.
        # Normalize to exactly one slash between base URL and district.
        area_base = base_url.rstrip('/') + '/' + area_url.strip('/') + '/'
        for page in range(1, MAX_PAGE + 1):
            # A fresh item per URL avoids yielding one shared, mutated
            # item instance.
            item = MasterRedisItem()
            # Redis key distinguishes this spider's queue from others.
            item['redis_key'] = 'lianjia:ershoufang'
            item['url'] = '%spg%d/' % (area_base, page)
            yield item