Example no. 1
0
    def parse_area(self, response):
        """Yield one request-seed item per (area, page) combination.

        Extracts the area links from the "ershoufang" filter panel of the
        listing page, pairs each area with every page number from 1 to
        MAX_PAGE, and yields a MasterRedisItem carrying the target URL and
        the redis key that routes it into this spider's Redis queue.

        Parameters:
            response: the scrapy Response for the city-level listing page.

        Yields:
            MasterRedisItem with 'redis_key' and 'url' fields populated.
        """
        base_url = response.url
        sel = scrapy.Selector(response)

        # Area hrefs look like "/ershoufang/<area>/"; strip the common
        # prefix so the remainder ("<area>/") can be appended to base_url.
        areas_urls = sel.xpath('//div[@data-role="ershoufang"]/div/a/@href').extract()
        for area_url in areas_urls:
            area_url = area_url.replace('/ershoufang/', '')
            area_base = base_url + area_url
            # Add page numbers.
            for page in range(1, MAX_PAGE + 1):
                # BUG FIX: the old code ran re.sub(r'pg\d+/', ...) on a URL
                # that never contains a "pgN/" segment, so the substitution
                # was a no-op and every page yielded the same URL (and the
                # replacement string also dropped the trailing slash).
                # Append the page segment explicitly instead.
                #
                # Also yield a fresh item per URL: re-yielding one mutated
                # item object lets later iterations clobber earlier ones
                # before downstream processing sees them.
                item = MasterRedisItem()
                # Redis key distinguishes this spider's queue from others.
                item['redis_key'] = 'lianjia:ershoufang'
                item['url'] = '%spg%d/' % (area_base, page)
                yield item
Example no. 2
0
    def parse_area(self, response):
        """Yield one request-seed item per (district, page) combination.

        Reads the district slugs from the district filter list, pairs each
        district with every page number from 1 to MAX_PAGE, and yields a
        MasterRedisItem carrying the target URL and the redis key that
        routes it into this spider's Redis queue.

        Parameters:
            response: the scrapy Response for the city-level listing page.

        Yields:
            MasterRedisItem with 'redis_key' and 'url' fields populated.
        """
        base_url = response.url
        sel = scrapy.Selector(response)

        # data-district-spell values are bare slugs (e.g. "dongcheng"),
        # with no leading or trailing slash.
        areas_urls = sel.xpath(
            '//ul[@class="district-wrapper"]/li/@data-district-spell').extract(
            )
        for area_url in areas_urls:
            # Add page numbers.
            for page in range(1, MAX_PAGE + 1):
                # BUG FIX: the old code did `url += re.sub('pg\d+/', ...)`.
                # The pattern never matched (the URL has no "pgN/" segment),
                # so re.sub returned its input unchanged and `+=` appended a
                # full copy of the URL on every iteration — the URL doubled
                # in length each page and never gained a page number.
                # Build the page URL explicitly instead.
                #
                # NOTE(review): assumes base_url ends with "/" (e.g.
                # ".../ershoufang/") so that "<base><slug>/pgN/" is the
                # intended shape — TODO confirm against the site's URLs.
                #
                # Also yield a fresh item per URL: re-yielding one mutated
                # item object lets later iterations clobber earlier ones.
                item = MasterRedisItem()
                # Redis key distinguishes this spider's queue from others.
                item['redis_key'] = 'lianjia:ershoufang'
                item['url'] = '%s%s/pg%d/' % (base_url, area_url, page)
                yield item