Beispiel #1
0
 def parse_newhouse(self, response):
     """Parse a new-house listing page and follow its pagination link.

     Yields one ``NewHouseItem`` per ``<li>`` whose house-type text contains
     "居", followed by a request for the next page when one is linked.
     ``response.meta["info"]`` must hold a ``(province, city)`` pair, which
     is copied onto every item and forwarded to the next page.
     """
     province, city = response.meta.get("info")
     for li in response.xpath("//div[contains(@class, 'nl_con')]/ul/li"):
         name = "".join(li.xpath(".//div[@class='nlcd_name']//text()").getall())
         name = re.sub(r'\s', '', name)
         house_type = "".join(
             li.xpath(".//div[contains(@class,'house_type')]//text()").getall())
         house_type = re.sub(r'\s', "", house_type)
         # Entries whose house-type text has no "居" are not yielded.
         if "居" not in house_type:
             continue
         # The text is split on "-" into rooms and area; a missing separator
         # leaves the area empty instead of raising IndexError.
         parts = house_type.split("-")
         rooms = parts[0]
         area = parts[1] if len(parts) > 1 else ""
         address = li.xpath(".//div[@class='address']/a/@title").get()
         district_text = "".join(
             li.xpath(".//div[@class='address']/a//text()").getall())
         # The district is the text between square brackets; None when the
         # address carries no bracketed part.
         district_match = re.search(r'\[(.+)\].*', district_text)
         district = district_match.group(1) if district_match else None
         sale = li.xpath(".//span[@class='inSale']/text()").get()
         href = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
         # Only prefix the scheme when a link was actually found.
         origin_url = "https:" + href if href else None
         price = "".join(
             li.xpath(".//div[@class='nhouse_price']//text()").getall()).strip()
         price = re.sub(r'\s|广告', "", price)
         yield NewHouseItem(name=name, rooms=rooms, area=area, address=address,
                            district=district, sale=sale, origin_url=origin_url,
                            price=price, province=province, city=city)
     next_url = response.xpath(".//li[@class='fr']/a[@class='next']/@href").get()
     if next_url:
         yield scrapy.Request(url=response.urljoin(next_url),
                              callback=self.parse_newhouse,
                              meta={"info": (province, city)})
Beispiel #2
0
    def parse_newhouse(self, response):
        """Parse a new-house listing page and follow its "next" link.

        ``response.meta['info']`` must hold a ``(province, city)`` pair.
        Yields one ``NewHouseItem`` per list entry that has a name, then a
        request for the next page when the page links to one.
        """
        province, city = response.meta['info']
        for li in response.xpath('//div[@class="nl_con clearfix"]/ul/li'):
            raw_name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
            # Entries without a name link produce no item.
            if not raw_name:
                continue
            name = raw_name.strip()

            rooms_text = li.xpath(
                './/div[@class="house_type clearfix"]/a/text()').getall()
            rooms = ''.join(text for text in rooms_text if text.endswith('居'))

            area_text = ''.join(
                li.xpath('.//div[@class="house_type clearfix"]/text()').getall())
            area = re.sub(r'\s|/|-', '', area_text)

            district_text = ''.join(
                li.xpath('.//div[@class="address"]/a//text()').getall())
            # The district is the bracketed part of the address text; None
            # when there is no bracket rather than an AttributeError.
            district_match = re.search(r'.*\[(.+)\].*', district_text)
            district = district_match.group(1) if district_match else None

            address = li.xpath('.//div[@class="address"]/a/@title').get()
            if address is not None:
                address = address.strip()

            sale = li.xpath('.//div[contains(@class,"fangyuan")]/span/text()').get()

            price_text = ''.join(
                li.xpath('.//div[@class="nhouse_price"]//text()').getall()).strip()
            price = re.sub(r'广告', '', price_text)

            origin_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()

            yield NewHouseItem(name=name, rooms=rooms, area=area,
                               district=district, address=address, sale=sale,
                               price=price, origin_url=origin_url,
                               province=province, city=city)
        next_page = response.xpath('//a[@class="next"]/@href').get()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page),
                                 callback=self.parse_newhouse,
                                 meta={'info': (province, city)})
Beispiel #3
0
    def parse_newhouse(self, response):
        """Parse a new-house listing page and follow its "next" link.

        ``response.meta['info']`` must hold a ``(province, city)`` pair.
        Yields one ``NewHouseItem`` per list entry that has a name, then a
        request for the next page when the page links to one.
        """
        # province is not stored on the item here; it is only forwarded to
        # the next-page request.
        province, city = response.meta.get('info')

        for li in response.xpath('//div[contains(@class,"nl_con")]/ul/li'):
            name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
            # Entries without a name link produce no item.
            if not name:
                continue
            name = re.sub(r'\s', '', name)
            house_types = [
                re.sub(r'\s', '', text)
                for text in li.xpath(
                    './/div[contains(@class,"house_type")]/a/text()').getall()
            ]
            rooms = [text for text in house_types if text.endswith('居')]
            area = ''.join(
                li.xpath('.//div[contains(@class, "house_type")]/text()').getall())
            area = re.sub(r'[\s/-]', '', area)

            district_text = ''.join(
                li.xpath('.//div[@class="address"]/a//text()').getall())
            # Raw string so "\[" is a regex escape, not a (deprecated)
            # string escape; None when the text has no bracketed district.
            district_match = re.search(r'\[(.+)\].*', district_text)
            district = district_match.group(1) if district_match else None
            address = li.xpath('.//div[@class="address"]/a/@title').get()
            sale = li.xpath('.//div[contains(@class, "fangyuan")]/span/text()').get()
            price = ''.join(
                li.xpath('.//div[@class="nhouse_price"]//text()').getall())
            price = re.sub(r'\s|广告', '', price)
            origin_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()

            item = NewHouseItem(city=city, name=name, price=price, rooms=rooms,
                                area=area, address=address, district=district,
                                sale=sale, origin_url=origin_url)
            # NOTE(review): every item gets the same constant id — confirm
            # this is intended.
            item['id'] = 1
            yield item

        next_url = response.xpath('//a[@class="next"]/@href').get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={"info": (province, city)})
Beispiel #4
0
 def parse_newhouse(self, response):
     """Parse a new-house listing page and follow its "next" link.

     For every list entry this yields a request for the entry's own page
     (handled by ``self.get_new_code``) and a ``NewHouseItem`` whose fields
     are filled from the same-named local variables. Finally a request for
     the next listing page is yielded when one is linked.

     ``response.meta['info']`` must hold a ``(province, city)`` pair.

     Raises:
         KeyError: if ``NewHouseItem`` declares a field that has no
             same-named local variable in this method.
     """
     province, city = response.meta.get('info')  # tuple unpacking
     for li in response.xpath('//div[@class="nl_con clearfix"]/ul/li'):
         name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
         if name is not None:
             name = re.sub(r'\s', '', name)
         # contains() matches any div whose class attribute includes
         # "house_type". Whitespace is stripped from every entry.
         house_type_list = [
             re.sub(r'\s', '', text)
             for text in li.xpath(
                 './/div[contains(@class,"house_type")]/a/text()').getall()
         ]
         # Keep only entries ending in "居"; an empty list when none do.
         rooms = [text for text in house_type_list if text.endswith('居')]
         # getall() returns a list, so join it into one string first.
         area = "".join(
             li.xpath(
                 './/div[contains(@class,"house_type")]/text()').getall())
         area = re.sub(r'\s|-|/', '', area)
         address = li.xpath('.//div[@class="address"]/a/@title').get()
         district_text = "".join(
             li.xpath('.//div[@class="address"]/a//text()').getall())
         district_text = re.sub(r'\s', '', district_text)
         # The district is the bracketed part of the address; None if absent.
         district_match = re.search(r"\[(.+)\]", district_text)
         district = district_match.group(1) if district_match else None
         sale = li.xpath(
             './/div[contains(@class,"fangyuan")]/span/text()').get()
         price = "".join(
             li.xpath('.//div[@class="nhouse_price"]//text()').getall())
         price = re.sub(r'\s|广告', '', price)
         detail_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()
         origin_url = response.urljoin(detail_url)
         # origin_url example: https://lefuqiangyuerongwan.fang.com
         # project introduction page example:
         # https://lefuqiangyuerongwan.fang.com/house/2110175680/housedetail.htm
         yield scrapy.Request(url=origin_url,
                              callback=self.get_new_code,
                              meta={'info': (name, origin_url)})
         item = NewHouseItem()
         # Fill every declared field from the local variable of the same
         # name. A plain dictionary lookup replaces eval(), so a field name
         # can never be executed as an expression.
         scope = locals()
         for field in item.fields:
             item[field] = scope[field]
         yield item
     next_url = response.xpath(
         '//div[@class="page"]//a[@class="next"]/@href').get()
     if next_url:
         # urljoin resolves the (possibly relative) href against this page.
         yield scrapy.Request(url=response.urljoin(next_url),
                              callback=self.parse_newhouse,
                              meta={'info': (province, city)})
Beispiel #5
0
    def parse_newhouse(self,response):
        """Parse a new-house listing page and follow its "next" link.

        ``response.meta['info']`` must hold a ``(province, city)`` pair.
        Yields one ``NewHouseItem`` per list entry, then a request for the
        next page when the page links to one.
        """
        province, city = response.meta.get('info')
        for li in response.xpath("//div[contains(@class,'nl_con')]/ul/li"):
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()

            house_type_list = [
                re.sub(r'\s', '', text)
                for text in li.xpath(
                    ".//div[contains(@class,'house_type')]/a/text()").getall()
            ]
            rooms = [text for text in house_type_list if text.endswith('居')]
            area = ''.join(
                li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
            area = re.sub(r'\s|-|/', '', area)
            # The leading "." keeps these lookups relative to the current
            # <li>; a bare "//" searches the whole document and would hand
            # every item the first address found on the page.
            address = li.xpath(".//div[@class='address']/a/@title").get()
            district = ''.join(
                li.xpath(".//div[@class='address']/a//text()").getall())
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()").get()
            price = ''.join(
                li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r'\s|广告', '', price)
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()

            yield NewHouseItem(name=name, rooms=rooms, area=area,
                               address=address, district=district, sale=sale,
                               price=price, origin_url=origin_url,
                               province=province, city=city)

        next_url = response.xpath(
            "//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            # Resolve the href against the current listing URL.
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={'info': (province, city)})
Beispiel #6
0
    def parse_newhouse(self, response):
        """Parse a new-house listing page and follow its "next" link.

        ``response.meta["info"]`` must hold a ``(province, city)`` pair.
        Only ``<li>`` elements without a ``style`` attribute are considered.
        Yields one ``NewHouseItem`` per entry that has a name, then a request
        for the next page when the page links to one.
        """
        province, city = response.meta.get("info")
        lis = response.xpath("//div[contains(@class, 'nl_con')]/ul/li[not(@style)]")
        for li in lis:
            # Property name; entries without one are skipped instead of
            # crashing on None.strip().
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            if name is None:
                continue
            name = name.strip()
            # Room-count labels.
            rooms = li.xpath(".//div[contains(@class, 'house_type')]/a//text()").getall()
            # Area: drop separators and all whitespace ("\s", not "/s").
            area = "".join(
                li.xpath(".//div[contains(@class, 'house_type')]/text()").getall())
            area = re.sub(r"/|-|\s", "", area)
            # Street address.
            address = li.xpath(".//div[@class = 'address']/a/@title").get()
            # District: the bracketed part of the address text, None if absent.
            district_text = "".join(
                li.xpath(".//div[@class = 'address']/a//text()").getall())
            district_match = re.search(r".*\[(.+)\].*", district_text)
            district = district_match.group(1) if district_match else None
            # Sale status.
            sale = li.xpath(".//div[contains(@class, 'fangyuan')]/span/text()").get()
            # Price.
            price = "".join(
                li.xpath(".//div[@class = 'nhouse_price']//text()").getall()).strip()
            # Link to the property's own page.
            origin_url = li.xpath(".//div[@class = 'nlcd_name']/a/@href").get()

            yield NewHouseItem(province=province, city=city, name=name,
                               rooms=rooms, area=area, address=address,
                               district=district, sale=sale, price=price,
                               origin_url=origin_url)

        # Follow pagination.
        next_url = response.xpath("//div[@class = 'page']//a[@class = 'next']/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={"info": (province, city)})
Beispiel #7
0
    def parse_newhouse(self,response):
        """Extract new-house items from a listing page, then queue the next page.

        ``response.meta['info']`` supplies ``(province, city)``; both are
        copied onto every item and handed on to the next-page request.
        """
        province, city = response.meta.get('info')
        for entry in response.xpath("//div[contains(@class,'nl_con')]/ul/li"):
            raw_name = entry.xpath(".//div[@class='nlcd_name']/a/text()").get()
            # List entries without a name link produce no item.
            if raw_name is None:
                continue
            name = raw_name.strip()

            type_labels = entry.xpath(
                ".//div[contains(@class,'house_type')]/a/text()").getall()
            cleaned_labels = [re.sub(r"\s", "", label) for label in type_labels]
            rooms = [label for label in cleaned_labels if label.endswith("居")]

            area_pieces = entry.xpath(
                ".//div[contains(@class,'house_type')]/text()").getall()
            area = re.sub(r"\s|-|/", "", "".join(area_pieces))

            address = entry.xpath(".//div[@class='address']/a/@title").get()

            # The district is whatever sits between the square brackets.
            address_text = "".join(
                entry.xpath(".//div[@class='address']/a//text()").getall())
            district = re.search(r".*\[(.+)\].*", address_text).group(1)

            sale = entry.xpath(
                ".//div[contains(@class,'fangyuan')]/span/text()").get()

            price_pieces = entry.xpath(
                ".//div[@class='nhouse_price']//text()").getall()
            price = re.sub(r"\s|广告", "", "".join(price_pieces))

            origin_url = entry.xpath(".//div[@class='nlcd_name']/a/@href").get()

            yield NewHouseItem(
                name=name,
                rooms=rooms,
                area=area,
                address=address,
                district=district,
                sale=sale,
                price=price,
                origin_url=origin_url,
                province=province,
                city=city,
            )

        next_href = response.xpath(
            "//div[@class='page']//a[@class='next']/@href").get()
        if not next_href:
            return
        yield scrapy.Request(
            url=response.urljoin(next_href),
            callback=self.parse_newhouse,
            meta={"info": (province, city)},
        )
Beispiel #8
0
    def parse_newhouse(self, response):
        """Extract new-house items from a listing page, then queue the next page.

        ``response.meta["info"]`` supplies ``(province, city, cityabbr)``.
        ``cityabbr`` is used to build the host name against which the
        next-page link is resolved.
        """
        province, city, cityabbr = response.meta.get("info")
        print(province, city)
        for entry in response.xpath("//div[contains(@class,'nl_con')]/ul/li"):
            raw_name = entry.xpath(".//div[@class='nlcd_name']/a/text()").get()
            # List entries without a name link produce no item.
            if raw_name is None:
                continue
            name = raw_name.strip()

            type_labels = entry.xpath(
                ".//div[contains(@class,'house_type')]/a/text()").getall()
            cleaned_labels = [re.sub(r"\s", "", label) for label in type_labels]
            # Keep labels ending in either "居" or "上".
            rooms = [
                label for label in cleaned_labels
                if label.endswith(("居", "上"))
            ]

            area_pieces = entry.xpath(
                ".//div[contains(@class,'house_type')]/text()").getall()
            area = re.sub(r'\s|-|/', "", "".join(area_pieces))

            address = entry.xpath(".//div[@class='address']/a/@title").get()

            # The district is whatever sits between the square brackets.
            address_text = "".join(
                entry.xpath(".//div[@class='address']/a//text()").getall())
            district = re.search(r".*\[(.+)\].*", address_text).group(1)

            sale = entry.xpath(
                ".//div[contains(@class,'fangyuan')]/span/text()").get()

            price_pieces = entry.xpath(
                ".//div[@class='nhouse_price']//text()").getall()
            price = re.sub(r"\s", "", "".join(price_pieces))

            href = entry.xpath(".//div[@class='nlcd_name']/a/@href").get()
            origin_url = urljoin("https:", href)

            yield NewHouseItem(province=province, city=city, name=name,
                               rooms=rooms, area=area, address=address,
                               district=district, sale=sale, price=price,
                               origin_url=origin_url)

        next_href = response.xpath(
            "//div[@class='page']//a[@class='next']/@href").get()
        if not next_href:
            return

        # Resolve against the city's new-house host first, then against the
        # current response URL.
        site_root = "https://" + cityabbr + ".newhouse.fang.com"
        absolute_next = urljoin(site_root, next_href)
        yield scrapy.Request(url=response.urljoin(absolute_next),
                             callback=self.parse_newhouse,
                             meta={"info": (province, city, cityabbr)})
Beispiel #9
0
    def parse_newhouse(self, response):
        """Parse a new-house listing page and follow its "next" link.

        ``response.meta['info']`` must hold a ``(province, city)`` pair.
        Only ``<li>`` elements without a ``style`` attribute are considered.
        Yields one ``NewHouseItem`` per entry that has a name, then a request
        for the next page when the page links to one.
        """
        province, city = response.meta.get('info')
        lis = response.xpath(
            "//div[contains(@class,'nl_con')]/ul/li[not(@style)]")
        for li in lis:
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            # Skip entries without a name instead of passing None to re.sub.
            if name is None:
                continue
            name = re.sub(r"\s", '', name)

            house_type_text = li.xpath(
                ".//div[contains(@class,'house_type')]/a/text()").getall()
            rooms = [re.sub(r'\s', '', text) for text in house_type_text]

            area = "".join(
                li.xpath(
                    ".//div[contains(@class,'house_type')]/text()").getall())
            area = re.sub(r"\s|-|/", "", area)

            # Street address.
            address = li.xpath(".//div[@class='address']/a/@title").get()

            # Administrative district: the bracketed part of the span text,
            # None when no bracket is present.
            district_text = "".join(
                li.xpath(".//div[@class='address']/a/span/text()").getall())
            district_match = re.search(r".*\[(.+)\].*", district_text)
            district = district_match.group(1) if district_match else None

            sale = li.xpath(".//div[@class='fangyuan']/span/text()").get()

            price = "".join(
                li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s", "", price)

            href = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            # urljoin resolves relative and scheme-relative ("//host/")
            # links alike. Slicing the first and last character off the href
            # and prefixing "https:" produced "https:/host", which is not a
            # valid absolute URL.
            origin_url = response.urljoin(href) if href else None

            yield NewHouseItem(name=name,
                               rooms=rooms,
                               price=price,
                               area=area,
                               address=address,
                               sale=sale,
                               district=district,
                               origin_url=origin_url,
                               province=province,
                               city=city)

        next_url = response.xpath(
            "//div[@class='page']//a[@class='next']/@href").get()

        # When a next page exists, schedule this same callback for it.
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={"info": (province, city)})
Beispiel #10
0
    def parse_newhouse(self, response):
        """Parse a new-house listing page and follow its "next" link.

        ``response.meta['info']`` must hold a ``(province, city)`` pair.
        Yields one ``NewHouseItem`` per list entry, then a request for the
        next page only when the page actually links to one.
        """
        province, city = response.meta.get('info')
        # Match any div whose class attribute contains "nl_con".
        lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
        for li in lis:
            item = NewHouseItem()
            # Property name: every text node with whitespace removed
            # (kept as a list).
            name = [
                re.sub(r'\s', '', text)
                for text in li.xpath(
                    ".//div[@class='nlcd_name']/a/text()").extract()
            ]
            house_type_list = li.xpath(
                ".//div[contains(@class,'house_type ')]//a/text()").extract()
            # Room counts: the labels ending in "居".
            rooms = [text for text in house_type_list if text.endswith("居")]
            # Floor area.
            area = ''.join(
                li.xpath(
                    ".//div[contains(@class,'house_type')]/text()").getall())
            area = re.sub(r"\s|/|-", '', area)

            # Street address.
            addrees = li.xpath(
                ".//div[@class='address']/a/@title").extract_first()

            # District text nodes with whitespace removed (kept as a list).
            district = [
                re.sub(r'\s', '', text)
                for text in li.xpath(
                    ".//div[@class='address']/a/span/text()").extract()
            ]

            # Price: join the text nodes into one string.
            price = ''.join(
                li.xpath(".//div[@class='nhouse_price']//text()").extract())
            price = re.sub(r'\s|广告', '', price)
            # Sale status. 'fangyuan' must be a quoted string literal; left
            # unquoted, XPath reads it as a child-element name, so the
            # predicate no longer tests the class attribute.
            sale = li.xpath(".//div[contains(@class,'fangyuan')]/span/text()"
                            ).extract_first()
            # Link to the property's own page.
            origin_url = str(
                li.xpath(".//div[@class='nlcd_name']/a/@href").extract_first())
            item["province"] = province
            item["city"] = city
            item["name"] = name
            item["price"] = price
            item["rooms"] = rooms
            item["area"] = area
            # The key spelling is kept as-is: it has to match the field name
            # declared on NewHouseItem, which is not visible here.
            item["addrees"] = addrees
            item["district"] = district
            item["sale"] = sale
            item["origin_url"] = origin_url
            yield item
        next_url = response.xpath(
            "//div[@class='page']//a[@class='next']/@href").extract_first()
        # Without this guard the last page would re-request its own URL.
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={"info": (province, city)})
    def parse_newhouse(self, response):
        """Extract new-house items from a listing page, then queue the next page.

        ``response.meta['info']`` is indexed as ``(province, city)``. Entries
        without a name or without any house-type link are skipped. Every
        item is tagged with ``where="NH"``.
        """
        info = response.meta['info']
        province = info[0]
        city = info[1]
        entries = response.xpath('.//div[@id="newhouse_loupai_list"]/ul/li')
        for entry in entries:
            raw_name = entry.xpath('.//div[@class="nlcd_name"]/a/text()').get()
            if raw_name is None:
                continue
            name = raw_name.strip()

            rooms = entry.xpath(
                ".//div[contains(@class,'house_type')]/a/text()").getall()
            if not rooms:
                continue

            area_pieces = entry.xpath(
                ".//div[contains(@class,'house_type')]/text()").getall()
            area = re.sub(r"\s|/|-", "", "".join(area_pieces))

            address = entry.xpath(".//div[@class='address']/a/@title").get()

            # The district is the (non-greedy) text between square brackets.
            address_text = "".join(
                entry.xpath(".//div[@class='address']/a//text()").getall())
            district = re.search(r".*\[(.*?)\].*", address_text).group(1)

            sale = entry.xpath(
                './/div[contains(@class,"fangyuan")]/span/text()').get()

            price_pieces = entry.xpath(
                ".//div[@class='nhouse_price']//text()").getall()
            prise = re.sub(r"\s|广告", "", "".join(price_pieces))

            href = entry.xpath(".//div[@class='nlcd_name']/a/@href").get()
            origin_url = "http:" + href

            yield NewHouseItem(name=name, rooms=rooms, area=area,
                               address=address, district=district, sale=sale,
                               prise=prise, origin_url=origin_url,
                               province=province, city=city, where="NH")

        next_href = response.xpath('.//a[@class="next"]/@href').get()
        if not next_href:
            return
        next_url = response.urljoin(next_href)
        print(next_url)
        yield scrapy.Request(next_url,
                             callback=self.parse_newhouse,
                             meta={'info': (province, city)})
Beispiel #12
0
 def parse_newhouse(self, response):
     """Parse a new-house listing page and follow its "next" link.

     ``response.meta['info']`` must hold a ``(province, city)`` pair.
     Yields one ``NewHouseItem`` per ``nlc_details`` block that has a name,
     then a request for the next page when the page links to one. Missing
     rooms, area and district are reported as '未知'.
     """
     province, city = response.meta.get('info')
     divs = response.xpath(
         '//div[@class="nhouse_list"]//ul/li//div[@class="nlc_details"]')
     for div in divs:
         name = div.xpath('.//div[@class="nlcd_name"]/a/text()').get()
         # Skip blocks without a name instead of crashing on None.strip().
         if name is None:
             continue
         name = name.strip()
         rooms = [
             text for text in div.xpath(
                 './/div[contains(@class,"house_type")]/a/text()').getall()
             if text.endswith('居')
         ]
         if not rooms:
             rooms = '未知'
         area = "".join(
             div.xpath(
                 './/div[contains(@class,"house_type")]/text()').getall())
         area = re.sub(r"\s|-|\/", "", area)
         if area == '':
             area = '未知'
         # Address with any bracketed prefix removed.
         address = re.sub(
             r'\[.*\]', "", "".join(
                 div.xpath('.//div[@class="address"]/a/@title').getall()))
         district_text = "".join(
             div.xpath('.//div[@class="address"]/a//text()').getall())
         district_matches = re.findall(r".*\[(.+)\].*", district_text)
         district = district_matches[0] if district_matches else '未知'
         # Relative ".//": a bare "//" searches the whole document and
         # returns the first entry's status for every item.
         sale = div.xpath(
             './/div[contains(@class,"fangyuan")]/span/text()').get()
         price = re.sub(
             r"\s|广告", "", "".join(
                 div.xpath(
                     './/div[@class="nhouse_price"]//text()').getall()))
         origin_url = div.xpath('.//div[@class="nlcd_name"]/a/@href').get()
         # Prefix the scheme when the link lacks it; leave a missing link
         # as None.
         if origin_url is not None and not origin_url.startswith('https:'):
             origin_url = 'https:' + origin_url
         yield NewHouseItem(province=province,
                            city=city,
                            name=name,
                            rooms=rooms,
                            area=area,
                            address=address,
                            district=district,
                            sale=sale,
                            price=price,
                            origin_url=origin_url)
     next_url = response.xpath(
         '//div[@class="page"]//a[@class="next"]/@href').get()
     if next_url:
         yield scrapy.Request(url=response.urljoin(next_url),
                              callback=self.parse_newhouse,
                              meta={'info': (province, city)})
Beispiel #13
0
    def parse_newhouse(self, response):
        """Extract new-house items from a listing page, then queue the next page.

        ``response.meta['info']`` supplies ``(province, city)``. Each
        ``nlc_details`` block becomes one ``NewHouseItem`` unless its price
        box contains a second ``<em>``. The next-page link is taken from the
        ``otherpage`` div.
        """
        province, city = response.meta.get('info')
        detail_nodes = response.xpath(
            "//div[contains(@class, 'nl_con')]/ul/li[not(@class)]/div[@class='clearfix']/div[@class='nlc_details']"
        )
        for node in detail_nodes:
            item = NewHouseItem(province=province, city=city)
            # A second <em> in the price box marks an entry to skip
            # (presumably an advertisement).
            second_em = node.xpath(
                ".//div[@class='nhouse_price']/em[2]/text()").get()
            if second_em is not None:
                continue

            raw_name = node.xpath(".//div[@class='nlcd_name']/a/text()").get()
            item['name'] = raw_name.strip()

            room_labels = node.xpath(
                ".//div[contains(@class, 'house_type')]/a/text()").getall()
            item['rooms'] = "/".join(room_labels)

            area_pieces = node.xpath(
                ".//div[contains(@class, 'house_type')]/text()").getall()
            item['area'] = re.sub(r"\s|/|-", "", "".join(area_pieces))

            # The district is whatever sits between the square brackets.
            address_pieces = node.xpath(
                ".//div[@class='address']/a//text()").getall()
            district_match = re.search(r".*\[(.+)\].*", "".join(address_pieces))
            item['district'] = district_match.group(1)

            item['address'] = node.xpath(
                ".//div[@class='address']/a/@title").get()

            href = node.xpath(".//div[@class='nlcd_name']/a/@href").get()
            item['origin_url'] = "http:{}".format(href)

            price_pieces = node.xpath(
                ".//div[@class='nhouse_price']//text()").getall()
            item['price'] = re.sub(r"\s", "", "".join(price_pieces))

            phone_pieces = node.xpath(".//div[@class='tel']/p//text()").getall()
            item['telephone'] = "".join(phone_pieces)

            item['sale'] = node.xpath(
                ".//div[contains(@class, 'fangyuan')]/span/text()").get()

            tag_labels = node.xpath(
                ".//div[contains(@class, 'fangyuan')]/a/text()").getall()
            item['label'] = "/".join(tag_labels)

            yield item

        # Pagination: which anchor is read depends on whether the first
        # span of the pager has class "disable".
        first_span_class = response.xpath(
            "//div[@class='otherpage']/span[1]/@class").get()
        href_query = ("//div[@class='otherpage']/a[1]/@href"
                      if first_span_class == 'disable'
                      else "//div[@class='otherpage']/a[2]/@href")
        next_page_url = response.xpath(href_query).get()

        if next_page_url is None:
            return
        yield scrapy.Request(url=response.urljoin(next_page_url),
                             callback=self.parse_newhouse,
                             meta={"info": (province, city)})
Beispiel #14
0
 def parse_newhouse(self, response):
     """Parse a new-house listing page and follow its "next" link.

     ``response.meta["info"]`` must hold a ``(provinces, city)`` pair.
     Yields a fresh ``NewHouseItem`` per list entry that has a name block,
     then — once per page — a request for the next page when one is linked.
     """
     provinces, city = response.meta.get("info")
     # All listing entries on the page.
     lis = response.xpath('//div[contains(@class,"nl_con")]/ul/li')
     for li in lis:
         # Skip <li> elements without a name block (advertisements).
         if not li.xpath('.//div[@class="nlcd_name"]'):
             continue
         # A new item per entry: reusing one instance would make every
         # yielded item alias the same, later-overwritten object.
         item = NewHouseItem()
         # Property name.
         item["name"] = li.xpath(
             './/div[@class="nlcd_name"]/a/text()').get().strip()
         house_type_text = li.xpath(
             ".//div[contains(@class,'house_type')]/a//text()").getall()
         # Room counts. endswith() takes a tuple of suffixes; the expression
         # ('居' or '以上') evaluates to just '居'.
         item["rooms"] = [
             text for text in house_type_text
             if text.endswith(('居', '以上'))
         ]
         area = "".join(
             li.xpath(
                 './/div[contains(@class,"house_type")]/text()').getall())
         # Floor area.
         item["area"] = re.sub(r"\s|/|-", "", area)
         # Street address.
         item["address"] = li.xpath(
             './/div[@class="address"]/a/@title').get()
         # Administrative district: the bracketed part of the address text,
         # None when there is no bracketed part.
         district = "".join(
             li.xpath('.//div[@class="address"]/a//text()').getall())
         district_match = re.search(r".*\[(.+)\].*", district)
         item["district"] = district_match.group(1) if district_match else None
         # Sale status.
         item["sale"] = li.xpath(
             './/div[contains(@class,"fangyuan")]/span/text()').get()
         # Price: strip whitespace and the "广告" marker (no quote
         # characters inside the pattern, which would never match).
         price = "".join(
             li.xpath(".//div[@class='nhouse_price']//text()").getall())
         item["price"] = re.sub(r'\s|广告', "", price)
         # Link to the property's own page.
         item["origin_url"] = response.urljoin(
             li.xpath('.//div[@class="nlcd_name"]/a/@href').get())
         item["provinces"] = provinces
         item["city"] = city
         yield item
     # Pagination belongs after the loop: one request per page, and it still
     # fires when no entry produced an item.
     next_url = response.xpath(
         "//div[@class='page']//a[@class='next']/@href").get()
     if next_url:
         yield scrapy.Request(url=response.urljoin(next_url),
                              callback=self.parse_newhouse,
                              meta={"info": (provinces, city)})
Beispiel #15
0
 def parse_newhouse(self, response):
     """Parse a new-house listing page and follow its "next" link.

     ``response.meta['info']`` must hold a ``(province, city)`` pair.
     Yields one ``NewHouseItem`` per list entry that has a name, then a
     request for the next page when the page links to one.
     """
     province, city = response.meta.get('info')
     for li in response.xpath('//div[contains(@class,"nl_con")]/ul/li'):
         # Property name; entries without one produce no item.
         name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
         if not name:
             continue
         name = name.strip()
         # Room counts: labels ending in "居", with spaces removed.
         house_type_list = [
             text.replace(" ", "")
             for text in li.xpath(
                 './/div[contains(@class,"house_type")]//a/text()').getall()
         ]
         room = [text for text in house_type_list if text.endswith('居')]
         # Floor area. Raw string so "\s" is a regex escape rather than a
         # (deprecated) string escape.
         area = ''.join(
             li.xpath('.//div[contains(@class,"house_type")]/text()').
             getall())
         area = re.sub(r'\s|-|/', '', area)
         # Street address.
         address = li.xpath('.//div[@class="address"]/a/@title').get()
         # District: the bracketed part of the address text, None if absent.
         district_text = ''.join(
             li.xpath('.//div[@class="address"]/a//text()').getall())
         district_match = re.search(r'.*\[(.+)\].*', district_text)
         district = district_match.group(1) if district_match else None
         # Sale status.
         sale = li.xpath(
             './/div[contains(@class,"fangyuan")]/span/text()').get()
         # Price.
         price = ''.join(
             li.xpath('.//div[@class="nhouse_price"]//text()').getall())
         price = re.sub(r'\s|广告', '', price)
         # Link to the property's own page; only prefixed when present.
         href = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()
         origin_url = 'http:' + href if href else None
         yield NewHouseItem(province=province,
                            city=city,
                            name=name,
                            rooms=room,
                            area=area,
                            address=address,
                            district=district,
                            sale=sale,
                            price=price,
                            origin_url=origin_url)
     next_url = response.xpath('//a[@class="next"]/@href').get()
     if next_url:
         yield scrapy.Request(url=response.urljoin(next_url),
                              callback=self.parse_newhouse,
                              meta={'info': (province, city)})
Beispiel #16
0
    def parse_newhourse(self,response):
        """Parse a new-house listing page and request the following page.

        ``response.meta['info']`` supplies ``(province, city)``. One
        ``NewHouseItem`` is yielded per listing ``<li>`` whose name link
        exists; afterwards a ``scrapy.Request`` for the "下一页" link is
        yielded if the page has one.
        """
        province, city = response.meta.get("info")
        print("newurl response:", response.url)
        for li in response.xpath("//div[contains(@class,'nl_con')]/ul/li"):
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            if name is None:
                continue
            name = name.strip()

            # Room layouts: link texts ending in "居", concatenated.
            house_type_list = [
                re.sub(r"\s", "", text) for text in li.xpath(
                    ".//div[contains(@class,'house_type')]/a/text()").getall()
            ]
            rooms = "".join(
                text for text in house_type_list if text.endswith("居"))

            # Area: bare text nodes with whitespace and separators removed.
            area = "".join(
                li.xpath(".//div[contains(@class,'house_type')]/text()").getall())
            area = re.sub(r"\s|-|/", "", area)

            address = li.xpath(".//div[@class='address']/a/@title").get()

            # District is the bracketed part of the address link text; when
            # nothing matches (no brackets, or an unclosed one) use the city.
            district_text = "".join(
                li.xpath(".//div[@class='address']/a//text()").getall())
            district_match = re.search(r".*\[(.+)\].*", district_text)
            district = district_match.group(1) if district_match else city

            sale = li.xpath(
                ".//div[contains(@class,'fangyuan')]/span/text()").get()
            price = "".join(
                li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s|广告", "", price)

            # A missing href yields None instead of a TypeError.
            href = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            origin_url = "http:" + href if href else None

            yield NewHouseItem(name=name,
                               rooms=rooms,
                               area=area,
                               address=address,
                               district=district,
                               sale=sale,
                               price=price,
                               origin_url=origin_url,
                               province=province,
                               city=city)

        # Next page. The link must be tested BEFORE urljoin: joining None
        # returns the current page URL, which is always truthy.
        next_url = response.xpath("//a[text()='下一页']/@href").get()
        if next_url:
            yield scrapy.Request(
                url=response.urljoin(next_url),
                callback=self.parse_newhourse,
                meta={"info": (province, city)}
            )
Beispiel #17
0
    def parse_newhouse(self, response):
        """Parse a new-house listing page and follow its pagination.

        ``response.meta['info']`` supplies ``(provice, city)``. Yields one
        ``NewHouseItem`` per listing ``<li>`` that has a name, then a
        ``scrapy.Request`` for the next page when a "next" link is present.
        """
        provice, city = response.meta.get('info')
        for li in response.xpath("//div[contains(@class,'nl_con')]/ul/li"):
            name = li.xpath(
                ".//div[contains(@class,'house_value')]//div[@class='nlcd_name']/a/text()"
            ).get()
            if not name:
                continue
            name = re.sub(r"\s", "", name)

            # Room layouts: link texts ending in "居".
            house_type_list = [
                re.sub(r"\s", "", text) for text in li.xpath(
                    ".//div[contains(@class,'house_type')]/a/text()").getall()
            ]
            rooms = [text for text in house_type_list if text.endswith("居")]

            # Area: bare text nodes with whitespace and separators removed.
            area = "".join(
                li.xpath(".//div[contains(@class,'house_type')]/text()").
                getall())
            area = re.sub(r"\s|-|/", "", area)

            # Address. The placeholder "请选择" (with or without surrounding
            # brackets) is removed as a whole phrase; an unescaped
            # "[请选择]" is a character class and would delete those three
            # characters wherever they occur. A missing title stays None.
            address = li.xpath(".//div[@class='address']/a/@title").get()
            if address:
                address = re.sub(r"\[?请选择\]?", "", address)

            sale = li.xpath(
                ".//div[contains(@class,'fangyuan')]/span/text()").get()
            price = "".join(
                li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s|广告", "", price)

            # Detail-page URL, stored as found in the href attribute.
            origin_url = li.xpath(
                ".//div[@class='nlcd_name']/a/@href").get()

            yield NewHouseItem(name=name,
                               rooms=rooms,
                               area=area,
                               address=address,
                               sale=sale,
                               price=price,
                               origin_url=origin_url,
                               provice=provice,
                               city=city)

        # Next page.
        next_url = response.xpath(
            "//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={'info': (provice, city)})
Beispiel #18
0
    def parse_newhouse(self, response):
        """Extract new-house items from a listing page, then paginate.

        The ``(province, city)`` pair comes from ``response.meta['info']``
        and is printed for tracing. Each ``<li>`` with a name link becomes a
        ``NewHouseItem``; the "next" link, if any, is printed and requested
        with this same callback.
        """
        province, city = response.meta.get("info")
        print(province + "   " + city)

        for li in response.xpath("//div[@class='nl_con clearfix']/ul/li"):
            raw_name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            if raw_name is None:
                continue

            # All text under the house-type block, whitespace removed.
            type_pieces = li.xpath(
                ".//div[@class='house_type clearfix']//text()").getall()
            type_text = "".join(
                re.sub(r"\s", "", piece) for piece in type_pieces)

            # "<rooms>-<area>": rooms become a "/"-split list; without a
            # dash the text is kept as-is and the area stays None.
            rooms, area = type_text, None
            if "-" in type_text:
                dash_parts = type_text.split("-")
                area = dash_parts[1]
                rooms = dash_parts[0].split("/")

            # District is the bracketed part of the address link text.
            address_text = "".join(
                li.xpath(".//div[@class='address']/a//text()").getall())
            district_match = re.search(r".*\[(.+)\].*", address_text)
            district = None
            if district_match is not None:
                district = district_match.group(1)

            price_text = "".join(
                li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s|广告", "", price_text.strip())

            yield NewHouseItem(
                name=raw_name.strip(),
                rooms=rooms,
                price=price,
                address=li.xpath(".//div[@class='address']/a/@title").get(),
                district=district,
                sale=li.xpath(
                    ".//div[@class='fangyuan pr']/span/text()").get(),
                origin_url=li.xpath(
                    ".//div[@class='nlcd_name']/a/@href").get(),
                area=area,
                province=province,
                city=city)

        next_url = response.xpath(
            "//div[@class='page']//a[@class='next'][last()]/@href").get()
        print(next_url)
        if not next_url:
            return
        yield scrapy.Request(url=response.urljoin(next_url),
                             callback=self.parse_newhouse,
                             meta={"info": (province, city)})
Beispiel #19
0
    def parse_newhouse(self,response):
        """Parse a new-house listing page and follow its pagination.

        ``response.meta['info']`` supplies ``(province, city)``. Yields one
        ``NewHouseItem`` per listing ``<li>`` that has a name, then a
        ``scrapy.Request`` for the next page when a "next" link is present.
        """
        province, city = response.meta.get('info')
        # Every listing entry on the page.
        for li in response.xpath("//div[contains(@class,'nl_con')]/ul/li"):
            # Development name.
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            if not name:
                continue

            # Room layouts: link texts ending in "居".
            house_type_list = [
                re.sub(r"\s", "", text) for text in li.xpath(
                    ".//div[contains(@class,'house_type clearfix')]/a/text()").getall()
            ]
            rooms = [text for text in house_type_list if text.endswith("居")]

            # Area: bare text nodes joined into one string, separators removed.
            area = "".join(
                li.xpath(".//div[contains(@class,'house_type clearfix')]/text()").getall())
            area = re.sub(r"\s|-|/", "", area)

            address = li.xpath(".//div[@class='address']/a/@title").get()

            # NOTE: the original also computed a district here but never put
            # it on the item; that dead code could raise AttributeError when
            # the regex did not match, so it has been dropped.

            # Sale status and price. The leading "." keeps each query inside
            # this <li>; without it XPath searches the whole document and
            # every item receives page-wide values.
            sale = li.xpath(
                ".//div[contains(@class,'fangyuan')]/span/text()").get()
            price = "".join(
                li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s|广告", "", price)

            # Detail-page URL, stored as found in the href attribute.
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()

            yield NewHouseItem(
                name=name,
                rooms=rooms,
                area=area,
                address=address,
                sale=sale,
                price=price,
                origin_url=origin_url,
                province=province,
                city=city
            )

        # Next page: .get() returns a single href (or None); .getall() would
        # hand a list to urljoin.
        next_url = response.xpath(
            "//div[@class='page']/a[@class='next']/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={'info': (province, city)})
Beispiel #20
0
    def parse_newhouse(self, response):
        """Parse a new-house listing page and follow its pagination.

        ``response.meta['info']`` supplies ``(province, city)``. Yields one
        ``NewHouseItem`` per matching ``<li>`` that has a name, then a
        ``scrapy.Request`` for the next page when a "next" link is present.
        """
        province, city = response.meta.get('info')
        lis = response.xpath(
            '//div[@id="newhouse_loupai_list"]/ul/li[not(@style)]')
        for li in lis:
            name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
            if not name:
                # No name link: skip instead of calling .strip() on None.
                continue
            name = name.strip()

            price = "".join(
                li.xpath('.//div[@class="nhouse_price"]//text()').getall())
            price = re.sub(r"\s|广告", "", price)

            # House-type text is split on "-": first part is the rooms,
            # last part is the area.
            house_type_text = "".join(
                li.xpath(
                    './/div[contains(@class, "house_type")]//text()').getall())
            house_type_list = re.sub(r"\s", "", house_type_text).split('-')
            rooms = house_type_list[0]
            area = house_type_list[-1]

            address = li.xpath('.//div[@class="address"]/a/@title').get()

            # District is the bracketed part of the address link text; None
            # when the text has no brackets.
            district_text = "".join(
                li.xpath('.//div[@class="address"]/a//text()').getall())
            district_match = re.search(r".*\[(.+)\].*", district_text)
            district = district_match.group(1) if district_match else None

            # Query the current <li>, not the whole response, so each item
            # gets its own sale status and detail link.
            sale = li.xpath(
                ".//div[@class='fangyuan pr']/span/text()").get()
            origin_url = li.xpath(
                ".//div[@class='nlcd_name']/a/@href").get()
            if origin_url:
                origin_url = response.urljoin(origin_url)

            yield NewHouseItem(province=province,
                               city=city,
                               name=name,
                               price=price,
                               rooms=rooms,
                               area=area,
                               address=address,
                               district=district,
                               sale=sale,
                               origin_url=origin_url)

        # .get must be CALLED; the bare bound method is always truthy and is
        # not a URL.
        next_url = response.xpath(
            "//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={"info": (province, city)})
Beispiel #21
0
    def parse_newhouse(self, response):
        """Parse a new-house listing page and follow its pagination.

        ``response.meta['info']`` supplies ``(province, city)``. Yields one
        ``NewHouseItem`` per listing ``<li>`` (fields that cannot be found
        are left as ``None`` or empty), then a ``scrapy.Request`` for the
        next page when a "next" link is present.
        """
        province, city = response.meta.get('info')
        lis = response.xpath('//div[contains(@class,"nl_con")]/ul/li')
        for li in lis:
            name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
            if name:
                name = name.strip()

            rooms = li.xpath(
                './/div[contains(@class,"house_type")]//a/text()').getall()

            # Area: bare text nodes with whitespace and separators removed.
            area = ''.join(
                li.xpath(
                    './/div[contains(@class,"house_type")]/text()').getall())
            area = re.sub(r'\s|-|/', '', area)

            address = li.xpath('.//div[@class="address"]/a/@title').get()

            # District is the bracketed part of the address text. It is
            # reset for every entry so a missing value can neither raise
            # NameError nor leak over from the previous <li>.
            district_text = ''.join(
                li.xpath('.//div[@class="address"]//text()').getall())
            district_match = re.search(r'\[(.+)\]', district_text)
            district = district_match.group(1) if district_match else None

            sale = li.xpath(
                './/div[contains(@class,"fangyuan")]/span[1]/text()').get()
            price = ''.join(
                li.xpath('.//div[@class="nhouse_price"]//text()').getall())
            price = re.sub(r'\s|广告', '', price)
            origin_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()

            yield NewHouseItem(province=province,
                               city=city,
                               name=name,
                               rooms=rooms,
                               area=area,
                               address=address,
                               district=district,
                               sale=sale,
                               price=price,
                               origin_url=origin_url)

        next_url = response.xpath(
            '//div[@class="page"]//a[@class="next"]/@href').get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={'info': (province, city)})
Beispiel #22
0
 def parse_new_fang(self, response):
     """Parse a new-house listing page and follow its pagination.

     ``response.meta['city']`` supplies ``(province, city)``. Yields one
     ``NewHouseItem`` per listing ``<li>`` that has a name, then a
     ``scrapy.Request`` for the next page when a "next" link is present.
     """
     province, city = response.meta.get('city')
     for li in response.xpath('//div[contains(@class, "nl_con")]/ul/li'):
         name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
         if not name:
             # No name link: skip instead of calling .strip() on None.
             continue
         name = name.strip()

         # Room layouts: link texts ending in "居".
         house_type_list = [
             re.sub(r'\s', '', text) for text in li.xpath(
                 ".//div[contains(@class,'house_type')]/a/text()").getall()
         ]
         rooms = [text for text in house_type_list if text.endswith('居')]

         # Area: bare text nodes with separators and the unit removed.
         area = ''.join(
             li.xpath(".//div[contains(@class,'house_type')]/text()").
             getall()).strip()
         area = re.sub(r'\s|-|/|平米', '', area)

         address = li.xpath(".//div[@class='address']/a/@title").get()

         # District is the bracketed part of the address link text; None
         # when the text has no brackets.
         district_text = ''.join(
             li.xpath(".//div[@class='address']/a//text()").getall())
         district_match = re.search(r'.*?\[(.*)\].*', district_text)
         district = district_match.group(1) if district_match else None

         sale = li.xpath(
             ".//div[contains(@class, 'fangyuan')]/span/text()").get()
         price = ''.join(
             li.xpath(".//div[@class='nhouse_price']//text()").getall())
         price = re.sub(r'\s|广告', '', price)
         url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()

         yield NewHouseItem(province=province,
                            city=city,
                            name=name,
                            rooms=rooms,
                            area=area,
                            address=address,
                            district=district,
                            sale=sale,
                            price=price,
                            url=url)

     next_url = response.xpath(
         "//div[@class='page']//a[@class='next']/@href").get()
     if next_url:
         yield scrapy.Request(url=response.urljoin(next_url),
                              callback=self.parse_new_fang,
                              meta={'city': (province, city)})
Beispiel #23
0
    def parse_newhouse(self, response):
        """Parse the new-house fields from a listing page.

        ``response.meta['info']`` supplies ``(province, city)``; values put
        in a Request's ``meta`` are available again on the response inside
        the callback. Yields one ``NewHouseItem`` per listing ``<li>``.
        This callback does not request further pages.
        """
        province, city = response.meta.get('info')
        for li in response.xpath('//div[contains(@class,"nl_con")]/ul/li'):
            name = li.xpath(
                ".//div[contains(@class,'house_value')]//div[@class='nlcd_name']/a/text()"
            ).get()
            if name:
                name = re.sub(r"\s", "", name)

            # Room layouts: link texts ending in "居". Whitespace is removed
            # with \s (the original pattern '/s' matched a literal "/s").
            house_type_list = [
                re.sub(r'\s', '', text) for text in li.xpath(
                    './/div[contains(@class,"house_type")]/a/text()').getall()
            ]
            rooms = [text for text in house_type_list if text.endswith('居')]

            # Area: bare text nodes with whitespace and separators removed.
            area = ''.join(
                li.xpath(
                    './/div[contains(@class,"house_type")]/text()').getall())
            area = re.sub(r'\s|-|/', '', area)

            address = li.xpath('.//div[@class="address"]/a/@title').get()
            sale = li.xpath(
                ".//div[contains(@class,'fangyuan')]/span/text()").get()
            price = "".join(
                li.xpath(".//div[@class='nhouse_price']//text()").getall())
            price = re.sub(r"\s|广告", "", price)

            # Detail-page URL, stored as found in the href attribute.
            origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()

            # ``rooms`` is a plain list (it has no .get()), so pass it as-is.
            yield NewHouseItem(name=name,
                               rooms=rooms,
                               area=area,
                               address=address,
                               sale=sale,
                               price=price,
                               origin_url=origin_url,
                               province=province,
                               city=city)
Beispiel #24
0
    def parse_newhouse(self,response):
        """Parse a new-house listing page and follow its pagination.

        ``response.meta['info']`` supplies
        ``(province, city, new_city_url)``. Yields one ``NewHouseItem`` per
        ``<li>`` that has a name (printing the extracted fields), then a
        ``scrapy.Request`` for ``new_city_url + next_href`` when a "next"
        link is present.
        """
        province, city, new_city_url = response.meta.get("info")
        lis = response.xpath("//div[contains(@class,'nl_con')]/ul//li")
        for li in lis:
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            if not name:
                continue
            name = name.strip()

            house_type_list = li.xpath(
                ".//div[contains(@class,'house_type')]/a/text()").getall()
            area = "".join(
                li.xpath(".//div[contains(@class,'house_type')]/text()").getall()).strip()
            area = re.sub(r'\s|/|-', '', area)
            address = li.xpath(".//div[@class='address']/a/@title").get()

            # District: the first "[...]" group in the address title.
            # TypeError covers a missing title (None), AttributeError a
            # title without brackets; anything else should surface.
            try:
                district = re.search(r'.*(\[.*?\]).*', address).group(1)
            except (TypeError, AttributeError) as e:
                print("行政区获取失败",e)
                district = ''

            sale = li.xpath(".//div[@class='fangyuan']/span/text()").get()
            price_number = li.xpath(
                ".//div[@class='nhouse_price']/span/text()").get()
            price_info = li.xpath(
                ".//div[@class='nhouse_price']/em/text()").get()
            print("房子价格信息为",price_number,price_info)
            # Either part may be None, which makes the concatenation fail.
            try:
                price = price_number + price_info
            except TypeError as e:
                print("当前价格不完善",e)
                price = "价格待定"

            # A missing href yields None instead of a TypeError.
            href = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
            origin_url = "https:" + href if href else None

            item = NewHouseItem(province=province, city=city, name=name,
                                price=price, rooms=house_type_list,
                                area=area, address=address,
                                district=district, sale=sale,
                                origin_url=origin_url)
            print("小区名字",name,"小区住宅类型",house_type_list,"房子面积",area,"地址为",address,"行政区为",\
                  district,"是否在售", sale,"房子价格信息为",price)
            yield item

        next_url = response.xpath(
            "//div[@class='page']//a[@class='next']/@href").get()
        if next_url:
            # NOTE(review): plain concatenation assumes new_city_url has no
            # trailing path that next_url repeats; confirm against caller.
            next_url = new_city_url + next_url
            yield scrapy.Request(url=next_url,
                                 callback=self.parse_newhouse,
                                 meta={'info': (province, city, new_city_url)})
Beispiel #25
0
 def parse_newhouse(self,response):
     """Parse a new-house listing page and follow its pagination.

     ``response.meta['info']`` supplies ``(province, city)``. Yields one
     ``NewHouseItem`` per listing ``<li>``; fields that cannot be found are
     ``None``. Afterwards a ``scrapy.Request`` for the next page is yielded
     when a "next" link is present.
     """
     province, city = response.meta.get('info')
     lis = response.xpath("//div[contains(@class,'nl_con')]/ul/li")
     for li in lis:
         name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
         if name is not None:
             name = name.strip()

         # Room layouts: link texts ending in "居".
         house_type = [
             re.sub(r"\s", "", text) for text in li.xpath(
                 ".//div[contains(@class,'house_type')]/a/text()").getall()
         ]
         rooms = [text for text in house_type if text.endswith("居")]

         # Area: all text of the block with whitespace, separators, room
         # counts and the "...起" / "...SOHO" fragments removed.
         area = "".join(
             li.xpath(".//div[contains(@class,'house_type')]//text()").getall())
         area = re.sub(r"\s|-|/|\d+[居]|.*?[\u4E00-\u9FA5]+起|.*?[\u4E00-\u9FA5]+SOHO","",area)

         # Address and district are assigned on every iteration so a missing
         # value can neither raise NameError nor reuse the previous entry's.
         address_text = li.xpath(".//div[@class='address']/a/@title").get()
         address = address_text.strip() if address_text is not None else None

         district_text = "".join(
             li.xpath(".//div[@class='address']/a//text()").getall())
         district_x = re.search(r".*\[(.+)\].*", district_text)
         district = district_x.group(1) if district_x is not None else None

         sale = li.xpath(
             ".//div[contains(@class,'fangyuan')]/span/text()").get()
         price = "".join(
             li.xpath(".//div[@class='nhouse_price']//text()").getall())
         price = re.sub(r"\s|广告", "", price)

         origin_url = li.xpath(".//div[@class='nlcd_name']/a/@href").get()
         if origin_url is not None:
             origin_url = 'https:' + origin_url

         yield NewHouseItem(province=province, city=city, name=name,
                            rooms=rooms, area=area, address=address,
                            district=district, sale=sale, price=price,
                            origin_url=origin_url)

     next_url = response.xpath(
         ".//div[@class='page']//a[@class='next']/@href").get()
     if next_url:
         yield scrapy.Request(url=response.urljoin(next_url),
                              callback=self.parse_newhouse,
                              meta={"info": (province, city)})
Beispiel #26
0
 def parse_new_house(self, response):
     """Parse a new-house listing page and follow its pagination.

     ``response.meta['info']`` supplies ``(province, city)``. Yields one
     ``NewHouseItem`` per ``<li>`` without a class attribute, then a
     ``scrapy.Request`` for the next page when a "next" link is present.
     ``origin_url`` on each item is the URL of the listing page itself.
     """
     province, city = response.meta.get('info')
     lis = response.xpath(
         '//div[@id="newhouse_loupai_list"]/ul/li[not(@class)]')
     for li in lis:
         name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get(
             "").strip()

         # Price = number + unit; either part defaults to "".
         number = li.xpath('.//div[@class="nhouse_price"]/span/text()').get(
             "")
         per = li.xpath('.//div[@class="nhouse_price"]/em/text()').get("")
         price = number + per

         rooms = ",".join(
             li.xpath(
                 './/div[@class="house_type clearfix"]/a/text()').getall())

         # Area: from the first digit through the last "米" in the bare
         # text; "" when nothing matches.
         area_text = ",".join(
             li.xpath(
                 './/div[@class="house_type clearfix"]/text()').getall())
         area_match = re.search(r"\d+.*米", area_text)
         area = area_match.group() if area_match else ""

         # The leading "." keeps the query inside this <li>; without it
         # every item would get the first address on the page.
         address = li.xpath('.//div[@class="address"]/a/@title').get()
         district = ",".join(
             li.xpath(
                 './/div[contains(@class,"fangyuan")]/a//text()').getall())
         sale = li.xpath(
             './/div[contains(@class,"fangyuan")]/span/text()').get()
         origin_url = response.url

         yield NewHouseItem(province=province, city=city, name=name,
                            price=price, rooms=rooms, area=area,
                            address=address, district=district, sale=sale,
                            origin_url=origin_url)

     next_page = response.xpath('//a[@class="next"]/@href').get()
     if next_page:
         yield scrapy.Request(url=response.urljoin(next_page),
                              callback=self.parse_new_house,
                              meta={"info": (province, city)})
Beispiel #27
0
    def parse_newhouse(self, response):
        """Parse the new-house fields from a listing page and fan out pages.

        ``response.meta['info']`` supplies ``(province, city_name)``; values
        put in a Request's ``meta`` are available again on the response
        inside the callback. Yields one ``NewHouseItem`` per non-ad ``<li>``.
        After the items, the page number is read from the "last" link and
        one ``scrapy.Request`` per page ``1..last`` is yielded, with
        ``self.handle_newhouse_err`` as errback.
        """
        province, city_name = response.meta.get('info')
        lis = response.xpath('//div[@class="nl_con clearfix"]/ul/li')
        for li in lis:
            # Ads share the two outer classes with real listings; the only
            # difference is that an ad has an <h3>. Skip ads.
            ad = li.xpath('./div[@class="clearfix"]/h3/text()').extract_first()
            if ad:
                continue

            house_name = li.xpath(
                './/div[@class="house_value clearfix"]//div[@class="nlcd_name"]/a/text()'
            ).extract_first()
            if house_name:
                house_name = re.sub(r"\s", "", house_name)

            # Room layouts, e.g. '3居/4居'.
            rooms = '/'.join(
                li.xpath('.//div[@class="house_type clearfix"]/a/text()').
                extract())

            # Sales phone number.
            phone_num = ''.join(
                li.xpath('.//div[@class="tel"]/p//text()').extract())

            # Area: bare text nodes with whitespace and separators removed.
            area = ''.join(
                li.xpath(
                    './/div[@class="house_type clearfix"]/text()').extract())
            area = re.sub(r'\s|-|/', '', area)

            address = li.xpath(
                './/div[@class="address"]/a/@title').extract_first()

            # Sale status (on sale / upcoming).
            sale = li.xpath(
                ".//div[@class='fangyuan']/span/text()").extract_first()

            # Selling points: non-empty stripped texts, first one dropped.
            # NOTE(review): this XPath is document-wide (no leading "."), so
            # every item gets the same tags; confirm whether the element
            # with this id lives inside each <li> before making it relative.
            tags_list = li.xpath('//div[@id="sjina_C26_07"]//text()').extract()
            tags = list(filter(None, map(lambda x: x.strip(), tags_list)))[1:]
            tags = '/'.join(tags)

            # Price per square metre (occasionally a whole-unit price).
            # These queries are relative to the current <li>; the absolute
            # form read the first matching node on the whole page.
            price = li.xpath(
                ".//div[@class='nhouse_price']/span/text()").extract_first()
            price_unit = li.xpath(
                ".//div[@class='nhouse_price']/em/text()").extract_first()
            nearby = li.xpath('.//div[@class="nhouse_price"]/label[2]/text()'
                              ).extract_first()
            if nearby:
                price = li.xpath(
                    './/div[@class="nhouse_price"]/i/text()').extract_first()
            # Append the unit only when both parts exist, e.g. '40500元/㎡';
            # a missing price stays None instead of raising TypeError.
            if price and price_unit:
                price = price + price_unit

            # Detail-page URL; the href can be missing, so only prefix the
            # scheme when there is something to prefix.
            origin_url = li.xpath(
                ".//div[@class='nlcd_name']/a/@href").extract_first()
            if origin_url:
                origin_url = 'https:' + origin_url

            item = NewHouseItem()
            item['province'] = province
            item['city'] = city_name
            item['house_name'] = house_name
            item['sale'] = sale
            item['phone_num'] = phone_num if phone_num else '暂无电话'
            item['price'] = price
            item['tags'] = tags
            item['rooms'] = rooms
            item['area'] = area
            item['address'] = address
            item['origin_url'] = origin_url

            yield item

        # Pagination runs once per page, after the items (it used to sit
        # inside the loop and re-emitted every page request for every item).
        last_url = response.xpath(
            '//ul[@class="clearfix"]/li[@class="fr"]/a[@class="last"]/@href'
        ).extract_first()  # e.g. '/house/s/b924/'
        # Cities with a single page have no "last" link at all.
        if last_url:
            # The page count is the digits after "b9" in the final path
            # segment; this accepts both 'b924' and 'b1saledate-b924'.
            segments = last_url.split('/')
            page_match = None
            if len(segments) >= 2:
                page_match = re.search(r'b9(\d+)', segments[-2])
            if page_match:
                last_page = int(page_match.group(1))
                for i in range(1, last_page + 1):
                    next_url = urljoin(
                        response.url,
                        '/house/s/b1saledate-b9{page}/'.format(page=str(i)))
                    yield scrapy.Request(url=next_url,
                                         callback=self.parse_newhouse,
                                         meta={
                                             'info': (province, city_name),
                                             'url': next_url
                                         },
                                         errback=self.handle_newhouse_err)
    def parse_new(self, response):
        """Crawl a new-house listing page.

        ``response.meta['info']`` supplies ``(province, city)``. The page is
        parsed with ``etree.HTML``; for every ``<li>`` that has an
        ``nlc_details`` block with a name and link, a dict
        ``{'new': NewHouseItem(...)}`` is yielded. Finally a
        ``scrapy.Request`` for the "next" link is yielded, if present.
        """
        province, city = response.meta.get('info')
        html = etree.HTML(response.text)
        li_list = html.xpath('//div[@id="newhouse_loupai_list"]//li')
        for li in li_list:
            details = li.xpath('.//div[@class="nlc_details"]')
            if not details:
                continue
            detail = details[0]

            # Name and detail link; entries lacking either are skipped
            # rather than failing with IndexError.
            names = detail.xpath(".//div[@class='nlcd_name']/a/text()")
            urls = detail.xpath(".//div[@class='nlcd_name']/a/@href")
            if not names or not urls:
                continue
            name = names[0].strip()
            url = urls[0]

            # Rooms / area: all text of the house-type block, whitespace
            # removed.
            rooms = "".join(
                detail.xpath('.//div[@class="house_type clearfix"]//text()'))
            rooms = "".join(rooms.split())
            # Split into "<rooms>-<area>" only when both keywords appear and
            # a dash separates them; otherwise both fields get the raw text.
            parts = rooms.split("-")
            if "居" in rooms and "平米" in rooms and len(parts) > 1:
                room = parts[0]
                area = parts[1]
            else:
                room = rooms
                area = rooms

            # Price text with all whitespace removed.
            price = "".join(
                detail.xpath('.//div[@class="nhouse_price"]//text()'))
            price = "".join(price.split())

            # Full address (None when the title attribute is missing).
            addresses = detail.xpath(".//div[@class='address']/a/@title")
            address = addresses[0] if addresses else None

            # District: address link text up to the first "]", with the
            # bracket re-appended.
            district = detail.xpath(".//div[@class='address']/a//text()")
            district = "".join(
                "".join(district).split()).split("]")[0] + "]"

            # Status block: first token is the status; the tokens between
            # the first and the last are joined with "/" as the type.
            status_tokens = "".join(
                detail.xpath(".//div[@class='fangyuan']//text()")).split()
            house_kind = "/".join(status_tokens[1:-1])
            status = status_tokens[0] if status_tokens else None

            newhouse_item = {}
            newhouse_item['new'] = NewHouseItem(province=province,
                                                city=city,
                                                name=name,
                                                price=price,
                                                rooms=room,
                                                area=area,
                                                address=address,
                                                district=district,
                                                status=status,
                                                type=house_kind,
                                                url=url)
            yield newhouse_item

        next_links = html.xpath('//div[@class="page"]//a[@class="next"]/@href')
        if next_links:
            # Always reduce the XPath result list to a single string; an
            # href that already contained "http" used to stay a list.
            next_url = next_links[0]
            print("当前url", response.url)
            print("要返回的url:", next_url)
            if next_url.find("http") == -1:
                # Href without "http": prefix the part of the current URL
                # that precedes "/house".
                base_url = response.url.split("/house")[0]
                next_url = base_url + next_url

            print(next_url)
            yield scrapy.Request(url=next_url,
                                 callback=self.parse_new,
                                 meta={'info': (province, city)})
Beispiel #29
0
    def parse_newhouse(self, response):
        """Parse one page of new-house listings and follow the next page.

        Reads the ``(province, city)`` tuple stored under
        ``response.meta['info']``, yields one ``NewHouseItem`` for every
        ``<li>`` that has a non-empty name, and finally yields a request for
        the next page when an anchor whose class contains ``next`` is found
        inside ``div.page``.

        Fields assembled from several text nodes (``price``, ``rooms``,
        ``area``, ``address``) are always strings: a missing node contributes
        an empty string rather than the literal text ``"None"`` or an empty
        list.
        """
        province, city = response.meta.get('info')
        # contains(): match any div whose class attribute includes "nl_con".
        lis = response.xpath("//div[contains(@class,'nl_con')]//ul//li")
        for li in lis:
            name = li.xpath(".//div[@class='nlcd_name']/a/text()").get()
            if name:
                name = name.strip()
            if not name:
                # Entries without a usable name are not emitted at all.
                continue

            # Price is split over <span> (amount) and <em> (unit); either may
            # be absent, so fall back to "" instead of stringifying None.
            price1 = li.xpath(
                ".//div[@class='nhouse_price']/span/text()").get()
            price2 = li.xpath(".//div[@class='nhouse_price']/em/text()").get()
            price = ((price1 or "") + (price2 or "")).strip()

            rooms = "".join(li.xpath(
                ".//div[contains(@class,'house_type')]/a/text()"
            ).getall()).strip()

            # The area is the bare text of the house_type div; drop the
            # whitespace and the "/" and "-" separators around it.
            area = "".join(li.xpath(
                ".//div[contains(@class,'house_type')]/text()").getall())
            area = re.sub(r'\s|/|-', '', area)

            address = "".join(li.xpath(
                ".//div[contains(@class,'address')]/a/text()").getall())
            address = re.sub(r'\s', '', address)

            district = li.xpath(
                ".//div[contains(@class,'address')]/a/span/text()").get()
            if district:
                # Remove whitespace and the surrounding square brackets.
                district = re.sub(r'\s|\[|\]', '', district)

            sale = li.xpath(
                ".//div[contains(@class,'fangyuan')]/span/text()").get()
            if sale:
                sale = sale.strip()

            origin_url = li.xpath(
                ".//div[contains(@class,'nlcd_name')]/a/@href").get()
            if origin_url:
                origin_url = origin_url.strip()

            yield NewHouseItem(province=province,
                               city=city,
                               name=name,
                               price=price,
                               rooms=rooms,
                               area=area,
                               address=address,
                               district=district,
                               sale=sale,
                               origin_url=origin_url)

        next_url = response.xpath(
            "//div[@class='page']//a[contains(@class,'next')]/@href").get()
        if next_url:
            yield scrapy.Request(url=response.urljoin(next_url),
                                 callback=self.parse_newhouse,
                                 meta={"info": (province, city)})
Beispiel #30
0
    def parse_newhouse(self, response):
        """Parse a new-house listing page into items and schedule all pages.

        Reads the ``(province, city_name)`` tuple stored under
        ``response.meta['info']`` and yields one ``NewHouseItem`` per
        ``<li>`` of the listing (entries carrying an ``<h3>`` heading are
        skipped). After the items, the "last page" link is inspected once
        and a request is yielded for every page from 1 up to that number.
        """
        province, city_name = response.meta.get('info')
        lis = response.xpath('//div[@class="nl_con clearfix"]/ul/li')
        for li in lis:
            # Entries with an <h3> under div.clearfix are skipped; going by
            # the variable name these are presumably ad blocks - TODO confirm.
            ad = li.xpath('./div[@class="clearfix"]/h3/text()').extract_first()
            if ad:
                continue

            house_name = li.xpath(
                './/div[@class="house_value clearfix"]'
                '//div[@class="nlcd_name"]/a/text()'
            ).extract_first()
            if house_name:
                house_name = re.sub(r"\s", "", house_name)

            rooms = '/'.join(
                li.xpath(
                    './/div[@class="house_type clearfix"]/a/text()').extract())
            phone_num = ''.join(
                li.xpath('.//div[@class="tel"]/p//text()').extract())
            area = ''.join(
                li.xpath(
                    './/div[@class="house_type clearfix"]/text()').extract())
            # Raw string: '\s' in a plain literal is an invalid escape.
            area = re.sub(r'\s|-|/', '', area)
            address = li.xpath(
                './/div[@class="address"]/a/@title').extract_first()
            sale = li.xpath(
                ".//div[@class='fangyuan']/span/text()").extract_first()

            # NOTE(review): this XPath starts with '//' and is therefore
            # evaluated against the whole document, not this <li>, so every
            # item receives the same tags. Left unchanged because the intended
            # per-listing node cannot be determined here - confirm and fix.
            tags_list = li.xpath('//div[@id="sjina_C26_07"]//text()').extract()
            # Drop blank fragments, then discard the first remaining one.
            tags = list(filter(None, map(lambda x: x.strip(), tags_list)))[1:]
            tags = '/'.join(tags)

            price = li.xpath(
                ".//div[@class='nhouse_price']/span/text()").extract_first()
            price_unit = li.xpath(
                ".//div[@class='nhouse_price']/em/text()").extract_first()
            # Relative ('.//') lookups so that a second <label> in some other
            # listing's price box cannot affect this listing.
            nearby = li.xpath('.//div[@class="nhouse_price"]/label[2]/text()'
                              ).extract_first()
            if nearby:
                # With a second label present the amount is read from <i>.
                price = li.xpath(
                    './/div[@class="nhouse_price"]/i/text()').extract_first()
            # Append the unit only when there is an amount to append it to;
            # a missing amount stays None instead of raising TypeError.
            if price is not None and price_unit:
                price = price + price_unit  # e.g. '40500元/㎡'

            origin_url = li.xpath(
                ".//div[@class='nlcd_name']/a/@href").extract_first()
            # The detail link may be missing; only prefix it when present.
            if origin_url:
                origin_url = 'https:' + origin_url

            item = NewHouseItem()
            item['province'] = province
            item['city'] = city_name
            item['house_name'] = house_name
            item['sale'] = sale
            item['phone_num'] = phone_num if phone_num else '暂无电话'
            item['price'] = price
            item['tags'] = tags
            item['rooms'] = rooms
            item['area'] = area
            item['address'] = address
            item['origin_url'] = origin_url

            yield item

        # Pagination is handled once per response, after all items, rather
        # than once per listing.
        last_url = response.xpath(
            '//ul[@class="clearfix"]/li[@class="fr"]/a[@class="last"]/@href'
        ).extract_first()  # e.g. '/house/s/b924/'

        # The page count is the number following 'b9' in the final path
        # segment; an href of any other shape yields no pagination requests
        # instead of a ValueError.
        page_match = re.search(r'/b9(\d+)/?$', last_url) if last_url else None
        if page_match:
            last_page = int(page_match.group(1))
            # Every page re-yields the full range; this relies on the
            # scheduler's duplicate-request filtering to avoid refetching.
            for page in range(1, last_page + 1):
                yield scrapy.Request(
                    url=urljoin(response.url,
                                '/house/s/b9{page}/'.format(page=page)),
                    callback=self.parse_newhouse,
                    meta={'info': (province, city_name)})