Beispiel #1
0
    def parse_esfhouse(self, response):
        """Parse one page of second-hand ("esf") house listings.

        Yields one ``EsfHouseItem`` per listing row, then follows the
        "下一页" (next page) link with the same callback.

        Expects ``response.meta['info']`` to be a ``(province, city)``
        tuple supplied by the requesting callback.
        """
        province, city = response.meta.get('info')
        dls = response.xpath('//div[contains(@class, "shop_list")]/dl')
        for dl in dls:
            name = dl.xpath('.//p[@class="add_shop"]/a/@title').get()
            if not name:
                # Rows without a shop title are ads/placeholders; skip them.
                continue
            # BUG FIX: build a fresh item per listing. The original reused a
            # single item for every row, so fields a later row did not set
            # (e.g. floor, year) silently kept the previous listing's values.
            item = EsfHouseItem(province=province, city=city, name=name)
            tel_shop = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
            tel_shop = [re.sub(r'\s', '', t) for t in tel_shop]
            # Classify each text fragment by a characteristic keyword.
            for tel in tel_shop:
                if '厅' in tel:
                    item['rooms'] = tel
                elif '层' in tel:
                    item['floor'] = tel
                elif '㎡' in tel:
                    item['area'] = tel
                elif '向' in tel:
                    item['toward'] = tel
                elif '年' in tel:
                    item['year'] = tel.replace('年建', '')
            item['address'] = dl.xpath(
                './/p[@class="add_shop"]/span/text()').get()
            # The price text is split across nested tags; join all text nodes.
            item['price'] = ''.join(
                dl.xpath('.//span[@class="red"]//text()').getall())
            item['unit'] = dl.xpath('.//dd/span[2]/text()').get()
            origin_url = dl.xpath('.//h4/a/@href').get()
            # urljoin resolves relative detail-page hrefs against the page URL.
            item['origin_url'] = response.urljoin(origin_url)
            item['id'] = 2
            yield item
        # Pagination: the next-page anchor sits two <p> elements before the
        # last one inside div.page_al; follow it only when its text really
        # is "下一页" (next page).
        next_name = response.xpath(
            '//div[@class="page_al"]/p[last()-2]/a/text()').get()
        if next_name == '下一页':
            next_url = response.xpath(
                '//div[@class="page_al"]/p[last()-2]/a/@href').get()
            next_url = response.urljoin(next_url)
            yield scrapy.Request(url=next_url,
                                 callback=self.parse_esfhouse,
                                 meta={"info": (province, city)})
        """
Beispiel #2
0
    def parse_esf(self, response):
        """Parse one page of second-hand ("esf") listings and yield items.

        ``response.meta['info']`` must carry a ``(province, city)`` tuple.
        Follows the first pagination link when present.
        """
        province, city = response.meta.get('info')
        # dl rows flagged dataflag="bgcomare" are comparison widgets, not listings.
        divs = response.xpath(
            '//div[contains(@class,"shop_list")]/dl[not(@dataflag="bgcomare")]'
        )
        for div in divs:
            name = div.xpath('./dd/p[@class="add_shop"]/a/@title').get()
            infos = div.xpath(
                './dd/p[contains(@class,"tel_shop")]/text()').getall()
            infos = [re.sub(r'\s', "", info) for info in infos]
            # Default every field to "未知" (unknown); overwritten when matched.
            rooms, area, floor, toward, year = '未知', '未知', '未知', '未知', '未知'
            for info in infos:
                # BUG FIX: the original wrote `('厅' or '室') in info`, which
                # only ever tests '厅' because `or` returns its first truthy
                # operand. Test each keyword explicitly.
                if '厅' in info or '室' in info:
                    rooms = info
                elif '㎡' in info or '呎' in info:
                    area = info
                elif '层' in info or '独栋' in info or '联排' in info:
                    floor = info
                elif '向' in info:
                    toward = info
                elif '年' in info:
                    year = info.replace('建', '')
            address = div.xpath(
                './dd/p[contains(@class,"add_shop")]/span/text()').get()
            # The total price is split across nested tags; join all text nodes.
            price = "".join(
                div.xpath(
                    './dd[@class="price_right"]/span[1]//text()').getall())
            unit = div.xpath('./dd[@class="price_right"]/span[2]/text()').get()
            url = div.xpath('./dd/h4/a/@href').get()
            # BUG FIX: the original concatenated `response.url + url`, which
            # produces malformed links for relative hrefs and crashed with
            # AttributeError when the href was missing. urljoin handles both
            # relative and absolute hrefs.
            if url:
                url = response.urljoin(url)
            esfItem = EsfHouseItem(province=province,
                                   city=city,
                                   name=name,
                                   area=area,
                                   rooms=rooms,
                                   floor=floor,
                                   toward=toward,
                                   year=year,
                                   address=address,
                                   price=price,
                                   unit=unit,
                                   origin_url=url)
            yield esfItem
        next_url = response.xpath('//div[@class="page_al"]/p[1]/a/@href').get()
        if next_url:
            # Follow pagination with the same (province, city) context.
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={'info': (province, city)})
Beispiel #3
0
    def parse_esf(self, response):
        """Parse a second-hand listing page; yield one EsfHouseItem per row.

        ``response.meta['info']`` carries the ``(province, city)`` tuple.
        Pagination is followed via the anchor whose text is "下一页".
        """
        province, city = response.meta.get("info")
        dls = response.xpath("//div[contains(@class,'shop_list')]/dl")
        for dl in dls:
            name = dl.xpath(".//p[@class='add_shop']/a/@title").get()
            if not name:
                # Rows without a title are ads/placeholders; skip them.
                continue
            item = EsfHouseItem(province=province, city=city, name=name)
            infos = dl.xpath(".//p[@class='tel_shop']//text()").getall()
            # BUG FIX: use a raw string for the regex — "\s" in a plain
            # string literal is an invalid escape sequence (SyntaxWarning
            # on modern Python).
            infos = [re.sub(r"\s", "", info) for info in infos]
            # Classify each fragment by a characteristic keyword; unmatched
            # fragments are intentionally ignored.
            for info in infos:
                if "厅" in info:
                    item['rooms'] = info
                elif '层' in info:
                    item['floor'] = info
                elif '向' in info:
                    item['toward'] = info
                elif '㎡' in info:
                    item['area'] = info
                elif '年建' in info:
                    item['year'] = info.replace('年建', '')
            item['address'] = dl.xpath(
                ".//p[@class='add_shop']/span/text()").get()
            # Price text spans nested tags; join all text nodes.
            item['price'] = "".join(
                dl.xpath(
                    ".//dd[@class='price_right']/span[@class='red']//text()"
                ).getall())
            item['unit'] = dl.xpath(
                ".//dd[@class='price_right']/span[2]/text()").get()
            item['origin_url'] = response.urljoin(
                dl.xpath(".//h4[@class='clearfix']/a/@href").get())
            yield item

        # Locate the pagination anchor by its visible text "下一页".
        next_url = response.xpath(
            "//div[@class='page_al']//a[text()='下一页']/@href").get()

        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={"info": (province, city)})
Beispiel #4
0
 def parse_esf(self, response):
     """Parse a second-hand listing page; yield one EsfHouseItem per row.

     ``response.meta['info']`` carries the ``(province, city)`` tuple.
     """
     province, city = response.meta.get('info')
     dls = response.xpath("//div[contains(@class,'shop_list')]/dl")
     for dl in dls:
         item = EsfHouseItem(province=province, city=city)
         item['name'] = dl.xpath(
             ".//p[@class='add_shop']/a/@title").extract_first()
         item['address'] = dl.xpath(
             ".//p[@class='add_shop']//span/text()").extract_first()
         infos = dl.xpath(".//p[@class='tel_shop']//text()").extract()
         infos = [re.sub(r"\s", '', info) for info in infos]
         # Classify each fragment by a characteristic keyword.
         for info in infos:
             if "厅" in info:
                 item["rooms"] = info
             elif "层" in info:
                 item['floor'] = info
             elif "向" in info:
                 item['toward'] = info
             elif '㎡' in info:
                 item["area"] = info
             elif '建' in info:
                 item['year'] = info
             else:
                 # Anything unmatched is treated as the agent/contact text.
                 item['contact'] = info
         # 总价 (total price): text spans nested tags, join it.
         item['price'] = ''.join(
             dl.xpath(
                 ".//dd[@class='price_right']/span[1]//text()").extract())
         # 单价 (unit price)
         item['unit'] = ''.join(
             dl.xpath(
                 ".//dd[@class='price_right']/span[2]//text()").extract())
         # BUG FIX: resolve the detail-page href against the response URL;
         # the original stored the raw (often relative) href.
         item['origin_url'] = response.urljoin(dl.xpath(
             ".//h4[@class='clearfix']/a/@href").extract_first())
         yield item
     next_url = response.xpath(
         "//div[@class='page_al']/p[1]/a/@href").extract_first()
     # BUG FIX: only follow pagination when a next-page link exists; the
     # original yielded a Request unconditionally and crashed on the last
     # page (response.urljoin(None) raises).
     if next_url:
         yield scrapy.Request(url=response.urljoin(next_url),
                              callback=self.parse_esf,
                              meta={"info": (province, city)})
Beispiel #5
0
    def parse_esf(self, response):
        """Parse a second-hand ("esf") listing page and follow pagination.

        ``response.meta['info']`` carries the ``(province, city)`` tuple.
        """
        province, city = response.meta.get('info')
        dls = response.xpath(
            "//div[contains(@class,'shop_list')]/dl[not(@data-bgcomare)]")
        for dl in dls:
            # BUG FIX: build a fresh item for every listing. The original
            # created one item before the loop, so fields a later row did
            # not set silently kept the previous listing's values.
            item = EsfHouseItem(province=province, city=city)
            item['name'] = dl.xpath(".//p[@class='add_shop']/a/@title").get()
            item['address'] = dl.xpath(
                ".//p[@class='add_shop']/span/text()").get()
            infos = dl.xpath(".//p[@class='tel_shop']/text()").getall()
            infos = [re.sub(r"\s", "", info) for info in infos]

            # Classify each fragment by a characteristic keyword.
            for info in infos:
                if "厅" in info:
                    item['rooms'] = info
                elif "㎡" in info:
                    item['area'] = info
                elif "层" in info:
                    item['floor'] = info
                elif "向" in info:
                    item['towards'] = info
                elif "建" in info:
                    item['year'] = info.replace("年建", "")
            item['price'] = dl.xpath(
                ".//dd[@class='price_right']/span[@class='red']/b/text()"
            ).get()
            unit = dl.xpath(
                ".//dd[@class='price_right']/span[last()]/text()").get()
            # BUG FIX: guard against a missing unit node — calling
            # .replace() on None raised AttributeError.
            item['unit'] = unit.replace("元/㎡", "") if unit else unit
            origin_url = dl.xpath(".//dt/a/@href").get()
            item['origin_url'] = response.urljoin(origin_url)
            yield item

        # BUG FIX: the href lives on the <a> inside the <p>, not on the <p>
        # itself (matches the sibling spiders in this file).
        next_url = response.xpath("//div[@class='page_al']/p[1]/a/@href").get()

        # 如果存在下一页就循环调用自身,继续解析网页
        # (if a next page exists, recurse into this same callback)
        if next_url:
            # BUG FIX: the original requested self.parse_newhouse although
            # the comment says "call itself"; esf pagination must come back
            # to parse_esf.
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_esf,
                                 meta={"info": (province, city)})