Example #1
0
    def parse_content(self, response):
        """Parse one page of closed-deal listings into LianjiaItem objects.

        Each <li> under ul.listContent is one deal.  Optional fields
        (elevator, build year, subway, school, ...) are filled only when the
        page carries them; ``elevator``/``build_year`` fall back to "无".
        """
        selector = etree.HTML(response.text)
        cj_list = selector.xpath("//ul[@class='listContent']//li")

        for cj in cj_list:
            item = LianjiaItem()
            # region code was stashed in the request meta by the caller
            item['region'] = self.regions.get(response.meta['region'])

            href = cj.xpath('./a/@href')
            if not href:
                # rows without a detail link are not real listings
                continue
            item['href'] = href[0]

            # title text: "<name> <style> <area>" separated by whitespace;
            # guard against rows with fewer tokens (was an IndexError)
            content = cj.xpath('.//div[@class="title"]/a/text()')
            if content:
                parts = content[0].split()
                if len(parts) >= 3:
                    item['name'], item['style'], item['area'] = parts[:3]

            # houseInfo: "orientation | decoration [| elevator]"
            content = cj.xpath('.//div[@class="houseInfo"]/text()')
            if content:
                parts = content[0].split('|')
                if len(parts) >= 2:
                    item['orientation'] = parts[0]
                    item['decoration'] = parts[1]
                    item['elevator'] = parts[2] if len(parts) == 3 else "无"

            # positionInfo: "<floor> [<build year>]"
            content = cj.xpath('.//div[@class="positionInfo"]/text()')
            if content:
                parts = content[0].split()
                if parts:
                    item['floor'] = parts[0]
                    item['build_year'] = parts[1] if len(parts) == 2 else '无'

            content = cj.xpath('.//div[@class="dealDate"]/text()')
            if content:
                item['sign_time'] = content[0]

            content = cj.xpath('.//div[@class="totalPrice"]/span/text()')
            if content:
                item['total_price'] = content[0]

            content = cj.xpath('.//div[@class="unitPrice"]/span/text()')
            if content:
                item['unit_price'] = content[0]

            # deal tag spans: classify each by keyword
            content = cj.xpath('.//span[@class="dealHouseTxt"]/span/text()')
            for tag in content:
                if "房屋满" in tag:    # tax status, e.g. "房屋满五年"
                    item['fangchan_class'] = tag
                elif "号线" in tag:    # subway line nearby
                    item['subway'] = tag
                elif "学" in tag:      # school nearby
                    item['school'] = tag
            yield item
Example #2
0
    def parse(self, response):
        """Parse a second-hand-listing page: yield one item per house, then
        schedule requests for the district pages and their pagination.
        """
        soup = BeautifulSoup(response.text, 'html.parser')
        # currently-selected district tab ("在售"); same for every row,
        # so look it up once instead of per house
        area_tag = soup.find('a', class_='selected', title=re.compile(r'在售'))
        datas = soup.find_all('div', class_="info clear")
        for data in datas:
            # BUG FIX: the original reused one LianjiaItem across the loop,
            # so conditionally-set fields (area, build_time) leaked between
            # houses and every yielded item aliased the same object.
            item = LianjiaItem()
            item['room_url'] = data.find(
                'a',
                href=re.compile(r'https://sz.lianjia.com/ershoufang/'))['href']
            item['room_id'] = re.search(r'\d+', item['room_url']).group()
            # skip house ids we have already recorded
            if item['room_id'] in self.id:
                continue
            self.id.add(item['room_id'])
            if area_tag:
                item['area'] = area_tag.get_text()
            item['room_name'] = data.find('div', class_='title').get_text()
            item['community'] = data.find(
                'div', class_="houseInfo").find('a').get_text()
            item['introduction'] = data.find(
                'div', class_="houseInfo").get_text()
            item['space'] = re.search(r'\d{2,}\.*\d*',
                                      item['introduction']).group()
            # keep only the floor number and the 4-digit build year
            position = data.find('div', class_="positionInfo").get_text()
            item['floor'] = re.search(r'\d{1,2}', position).group()
            if re.search(r'\d{4}', position):
                item['build_time'] = re.search(r'\d{4}', position).group()
            item['positionInfo'] = data.find(
                'div', class_="positionInfo").find('a').get_text()
            # followers, viewing count and publish time
            followinfo = data.find('div', class_="followInfo").get_text()
            item['people_focus'] = re.search(
                r'\d*', re.search(r'\d*人关注', followinfo).group()).group()
            item['look'] = re.search(
                r'\d*', re.search(r'\d*次带看', followinfo).group()).group()
            item['publish_time'] = re.search(
                r'\w*发布', followinfo).group().replace('发布', '')
            item['tag'] = data.find('div', class_='tag').get_text()
            item['price'] = data.find(
                'div', class_="totalPrice").find('span').get_text()
            item['unitprice'] = re.search(
                r'\d+',
                data.find('div', class_="unitPrice").get_text()).group()
            yield item

        # schedule the per-district first-level urls
        first_url = 'https://sz.lianjia.com'
        area_urls = soup.find_all('a',
                                  href=re.compile(r'^/ershoufang/\w+/$'),
                                  title=re.compile(r'在售二手房'))
        for area_url in area_urls:
            yield scrapy.Request(url=first_url + area_url['href'],
                                 callback=self.parse)

        # pagination is only resolved on first-level (district) urls
        if re.search(r'/ershoufang/\w+/$', response.url):
            for url in soup.find_all(
                    'a', href=re.compile(r'^/ershoufang/\w+/p\d/$')):
                match = re.search(r'\(\d+', url.get_text())
                if not match:
                    continue
                # link text carries "(<total>"; 30 listings per page,
                # and the site caps browsing at 100 pages
                total = int(match.group().replace('(', ''))
                pages = total // 30 if total % 30 == 0 else total // 30 + 1
                pages = min(pages, 100)
                one_url = first_url + url['href']
                # urls that already contain '/p<letter>' need the first
                # (non-page) '/p' occurrence restored after expansion
                has_alpha_p = re.search(r'/p[a-z]', one_url)
                for i in range(1, pages + 1):
                    if has_alpha_p:
                        full_url = one_url.replace(
                            '/p', '/pg' + str(i) + 'p').replace(
                                '/pg' + str(i) + 'p', '/p', 1)
                    else:
                        full_url = one_url.replace(
                            '/p', '/pg' + str(i) + 'p')
                    yield scrapy.Request(url=full_url, callback=self.parse)
Example #3
0
    def parse_detail(self, response):
        """Parse a house-detail page into a LianjiaItem.

        Location fields are hard-coded (Beijing / Shunyi crawl); title and
        prices arrive via the request meta; the remaining fields come from
        the detail page, defaulting to the string 'None' when absent.
        """
        item_dict = response.meta
        item = LianjiaItem()
        item['province'] = '北京'
        item['city'] = '北京市'
        item['areas'] = '顺义区'
        item['coordinate_type'] = '百度'
        item['crawl_time'] = '2019-5-30'
        item['title'] = item_dict['title']
        item['prices'] = item_dict['prices']
        item['price'] = item_dict['price']

        def desc_li(i, tail='text()'):
            # first text of the i-th <li> of the house-description list
            return response.xpath(
                '//ul[@class="house_description big lightblack lazyload_ulog"]'
                '/li[%d]/%s' % (i, tail)).extract_first()

        hou = response.xpath(
            '//h3[@class="similar_data"]/div/div[2]/p[2]/text()'
        ).extract_first()  # e.g. 3室2厅
        area = response.xpath(
            '//h3[@class="similar_data"]/div/div[3]/p[2]/text()'
        ).extract_first()
        address = desc_li(12, 'a/text()')
        listing_time = desc_li(2)
        orient = desc_li(3)
        flo = desc_li(4)          # e.g. 底层/4
        fitment = desc_li(7)
        elevator = desc_li(6)
        build_type = desc_li(5)
        build_time = desc_li(8)
        quanshu = desc_li(10)     # ownership (权属)
        coo = response.xpath(
            '//div[@class="sub_mod_box location"]/div/a/@href').extract_first()

        item['hou'] = hou or 'None'
        item['area'] = area or 'None'
        item['address'] = address or 'None'
        item['listing_time'] = listing_time or 'None'
        item['orient'] = orient or 'None'
        item['flo'] = flo or 'None'
        item['fitment'] = fitment or 'None'
        item['elevator'] = elevator or 'None'
        item['build_type'] = build_type or 'None'
        item['build_time'] = build_time or 'None'
        item['quanshu'] = quanshu or 'None'
        if coo:
            # BUG FIX: raw string with escaped dots; the original pattern
            # 'pos=(\d+.\d+,\d+.\d+)' let '.' match any character
            coordinate = re.search(r'pos=(\d+\.\d+,\d+\.\d+)', coo)
            item['coordinate'] = coordinate.group(1) if coordinate else 'None'
        else:
            item['coordinate'] = ''
        # NOTE(review): a blocking sleep inside a Scrapy callback stalls the
        # reactor; DOWNLOAD_DELAY would be preferable.  Kept for behavior.
        time.sleep(0.05)
        yield item
    def parse_body_bj(self, response):

        print response.url
        city_name = self.city_name
        content = response.body
        tree = etree.HTML(content)
        nodes = tree.xpath('//ul[@class="listContent"]/li')
        print "len : ", len(nodes)
        for node in nodes:
            items = LianjiaItem()
            name = node.xpath('.//div[@class="title"]/a/text()')[0]
            items['name'] = name
            try:
                position = node.xpath('.//div[@class="positionInfo"]/a/text()')
                address = position[0] + position[1]
            except:
                address = 'NA'
            print address
            items['location'] = address
            items['city_name'] = city_name
            try:
                text_content = node.xpath(
                    './/div[@class="positionInfo"]/text()')
                # print len(build_date)

                detail = text_content[3].split('/')
                # 除去北京,北京的页面会多一个小区结构
                building_date = detail[1].strip()
                building_type = "NA"
                '''
                for k, i in enumerate(detail):
                    print k, i

                if len(detail) == 4:
                    buiding_type = detail[1].strip() + detail[3].strip()
                    build_date = detail[3].strip()
                elif len(detail) == 3:
                    buiding_type = detail[1].strip()

                    build_date = detail[2].strip()
                '''
            except:
                building_date = '未知年建成'
            items['building_date'] = building_date
            items['building_type'] = building_type
            # details = desc.split()
            price_t = node.xpath('.//div[@class="totalPrice"]/span/text()')[0]

            p = re.findall('\d+', price_t)
            if len(p) != 0:
                price = int(price_t)
            else:
                price = '均价未知'
            print price
            price_detail = {
                'price': price,
                'origin': 'LJ',
                'crawl_date': self.crawl_date
            }
            price_list = []
            price_list.append(price_detail)
            price_dict = {self.price_month: price_list}
            items['price'] = price_dict
            yield items
Example #5
0
class ZufangSpider(scrapy.Spider):
    """Crawl Shenzhen rentals: district -> bizcircle -> pages -> listings."""

    name = 'zufang'
    allowed_domains = ['sz.lianjia.com']
    start_url = 'https://sz.lianjia.com/zufang/'
    page_titles = 30  # listings shown per result page
    # Kept for backward compatibility only; parse_content now builds a
    # fresh item per listing instead of mutating this shared instance.
    Item = LianjiaItem()

    def start_requests(self):
        """Kick off the crawl from the rental landing page."""
        yield scrapy.Request(url=self.start_url, callback=self.parse_district)

    def parse_district(self, response):
        """Parse the Shenzhen district filter bar and follow each link."""
        for dis in response.xpath('//li[contains(@data-id, "2300")]'):
            url = dis.xpath('./a/@href').get()
            district = dis.xpath('./a/text()').get()
            if url:
                yield scrapy.Request(url=response.urljoin(url),
                                     callback=self.parse_bizcircle,
                                     meta={'district': district})

    def parse_bizcircle(self, response):
        """Parse the bizcircle filter row inside one district."""
        district = response.meta['district']
        for biz in response.xpath('//li[@class="filter__item--level3  "]'):
            url = biz.xpath('./a/@href').get()
            bizcircle = biz.xpath('./a/text()').get()
            if url:
                yield scrapy.Request(url=response.urljoin(url),
                                     callback=self.parse_page,
                                     meta={'bizcircle': bizcircle,
                                           'district': district})

    def parse_page(self, response):
        """Derive the page count from the listing total (page = total/30 + 1)
        and request every result page of this bizcircle.
        """
        title_num = response.xpath(
            '//span[@class="content__title--hl"]/text()').get()
        if not title_num:
            # robustness: counter missing (layout change / anti-bot page)
            return
        max_page = int(int(title_num) / self.page_titles + 1)
        bizcircle = response.meta['bizcircle']
        district = response.meta['district']
        for page in range(1, max_page + 1):
            yield scrapy.Request(url=response.url + 'pg{}'.format(page),
                                 callback=self.parse_content,
                                 meta={'bizcircle': bizcircle,
                                       'district': district})

    def parse_content(self, response):
        """Parse one rental result page and yield one item per listing."""
        for content in response.xpath('//div[@class="content__list--item"]'):
            values = {
                'url': response.urljoin(content.xpath(
                    './/p[@class="content__list--item--title twoline"]/a/@href'
                ).get()),
                'title': content.xpath(
                    './/p[@class="content__list--item--title twoline"]/a/text()'
                ).get().strip(),
                'district': response.meta['district'],
                'bizcircle': response.meta['bizcircle'],
                'area': content.xpath(
                    './/p[@class="content__list--item--des"]//text()'
                ).re_first(r'(\d+㎡)'),
                'price': content.xpath(
                    './/span[@class="content__list--item-price"]/em/text()'
                ).get().strip(),
                'apartment': content.xpath(
                    './/p[@class="content__list--item--des"]//text()'
                ).re_first(r'(\d室\d厅\d卫)'),
            }
            company = content.xpath(
                './/p[@class="content__list--item--brand oneline"]//text()'
            ).get()
            values['company'] = company.strip() if company else company
            # BUG FIX: the original mutated one shared class-level Item and
            # yielded it once per field (looked up via eval on field names),
            # emitting N aliased duplicates per listing.  Build a fresh item
            # from the values dict and yield exactly once; no eval.
            item = LianjiaItem()
            for field in item.fields:
                if field in values:
                    item[field] = values[field]
                else:
                    print('Field is not Defined: ' + field)
            yield item
Example #6
0
    def disposeData(self, response):
        """Parse a house-detail page into a LianjiaItem.

        The "basic attributes" list comes in four known layouts (12, 9, 3 or
        15 <li> rows); each layout maps list positions to different item
        fields, and the fields a layout lacks are blanked to "".
        """
        item = LianjiaItem()

        def li_text(i):
            # plain text of the i-th <li> of the first matching content list
            return response.xpath(
                "//div[@class='content']/ul/li[%d]/text()" % i).extract()[0]

        def li_span_text(i):
            # text of the second <span> of the i-th <li> (transaction block)
            return response.xpath(
                "//div[@class='content']/ul/li[%d]/span[2]/text()" % i
            ).extract()[0]

        length = response.xpath(
            "//div[@class='base']/div[@class='content']/ul/li").extract()

        # basic attributes: per-layout field order plus blanked fields
        common12 = ["house_type", "floor", "area", "house_structure",
                    "inside_space", "building_type", "direct",
                    "building_structure", "decorate_situation",
                    "elevator_proportion", "equipped_escalators",
                    "property_term"]
        layouts = {
            12: (common12, ["villa_type"]),
            15: (common12 + ["water_type", "electricity_type", "gas_price"],
                 ["villa_type"]),
            9: (["house_type", "floor", "area", "inside_space", "direct",
                 "building_structure", "decorate_situation", "villa_type",
                 "property_term"],
                ["house_structure", "building_type", "elevator_proportion",
                 "equipped_escalators"]),
            3: (["floor", "area", "direct"],
                ["house_type", "inside_space", "building_structure",
                 "decorate_situation", "villa_type", "property_term",
                 "house_structure", "building_type", "elevator_proportion",
                 "equipped_escalators"]),
        }
        layout = layouts.get(len(length))
        if layout is not None:
            filled, blank = layout
            for pos, field in enumerate(filled, start=1):
                item[field] = li_text(pos)
            for field in blank:
                item[field] = ""

        # transaction attributes
        item["time_tone"] = li_span_text(1)
        item["trading_ownership"] = li_span_text(2)
        item["last_transaction"] = li_span_text(3)
        item["house_usage"] = li_span_text(4)
        item["house_term"] = li_span_text(5)
        item["property_owner"] = li_span_text(6)
        # mortgage info carries layout spaces/newlines; strip them
        item["mortgage_info"] = li_span_text(7).replace(' ', '').replace(
            '\n', '')
        item["house_certificate"] = li_span_text(8)

        # other attributes: price, community, location, build year
        item["total_price"] = response.xpath(
            "//span[@class='total']/text()").extract()[0]
        item["unit_price"] = response.xpath(
            "//span[@class='unitPriceValue']/text()").extract()[0]
        item["housing_name"] = response.xpath(
            "//div[@class='communityName']/a[1]/text()").extract()[0]
        item["county"] = response.xpath(
            "//div[@class='areaName']/span[2]/a[1]/text()").extract()[0]
        item["street"] = response.xpath(
            "//div[@class='areaName']/span[2]/a[2]/text()").extract()[0]
        item["built_year"] = response.xpath(
            "//div[@class='area']/div[2]/text()").extract()[0]
        print("爬取成功")
        yield item
Example #7
0
    def parse_content(self, response):
        """Parse a page of closed-deal listings and yield LianjiaItem objects.

        Not every listing carries every field defined on the item (subway,
        nearby school, ...), so each piece is extracted only when present
        and ``elevator``/``build_year`` fall back to "无".  Items go on to
        the item pipeline.

        (The original docstring was indented at the ``def`` level, which is
        a SyntaxError; it now sits properly inside the function.)
        """
        selector = etree.HTML(response.text)
        cj_list = selector.xpath("//ul[@class='listContent']/li")

        for cj in cj_list:
            item = LianjiaItem()
            # region code comes from the request meta set by the caller
            item['region'] = self.regions.get(response.meta['region'])
            href = cj.xpath('./a/@href')
            if not href:
                # rows without a detail link are not real listings
                continue
            item['href'] = href[0]

            # title text: "<name> <style> <area>" separated by whitespace;
            # guard against rows with fewer tokens (was an IndexError)
            content = cj.xpath('.//div[@class="title"]/a/text()')
            if content:
                parts = content[0].split()
                if len(parts) >= 3:
                    item['name'], item['style'], item['area'] = parts[:3]

            # houseInfo: "orientation | decoration [| elevator]"
            content = cj.xpath('.//div[@class="houseInfo"]/text()')
            if content:
                parts = content[0].split('|')
                if len(parts) >= 2:
                    item['orientation'] = parts[0]
                    item['decoration'] = parts[1]
                    item['elevator'] = parts[2] if len(parts) == 3 else '无'

            # positionInfo: "<floor> [<build year>]"
            content = cj.xpath('.//div[@class="positionInfo"]/text()')
            if content:
                parts = content[0].split()
                if parts:
                    item['floor'] = parts[0]
                    item['build_year'] = parts[1] if len(parts) == 2 else '无'

            content = cj.xpath('.//div[@class="dealDate"]/text()')
            if content:
                item['sign_time'] = content[0]

            content = cj.xpath('.//div[@class="totalPrice"]/span/text()')
            if content:
                item['total_price'] = content[0]

            content = cj.xpath('.//div[@class="unitPrice"]/span/text()')
            if content:
                item['unit_price'] = content[0]

            # deal tag spans: classify each by keyword
            content = cj.xpath('.//span[@class="dealHouseTxt"]/span/text()')
            for tag in content:
                if "房屋满" in tag:    # tax status
                    item['fangchan_class'] = tag
                elif "号线" in tag:    # subway line nearby
                    item['subway'] = tag
                elif "学" in tag:      # school nearby
                    item['school'] = tag

            yield item
Example #8
0
    def parse_item(self, response):
        """Parse a house-detail page into a LianjiaItem.

        Text is pulled with string(.) over the matched nodes; the repeated
        extraction patterns are factored into helpers.  Missing optional
        sections (feature blocks, agent card) now yield '' instead of
        raising IndexError, which the original did on sparse listings.
        """
        def node_texts(xp):
            # string(.) of every node matched by xp, as a list of strings
            return response.xpath(xp).xpath('string(.)').extract()

        def first(seq, default=''):
            # first element, or a default when the page lacks the section
            return seq[0] if seq else default

        def first_token(parts):
            # first whitespace-separated token of the joined text pieces
            return first(''.join(parts).split())

        item = LianjiaItem()
        # photos
        item['house_photo'] = response.xpath(
            '//ul[@class="smallpic"]/li/@data-src').extract()
        # price: slots 2-3 of the content header spans, joined with '/'
        item['money'] = '/'.join(
            node_texts('//div[@class="content"]/div/span')[2:4])
        # price per square meter
        item['one_area'] = first(node_texts(
            '//div[@class="content"]/div/div[@class="text"]/div[1]/span'))
        # rooms / type / area
        item['room'] = ','.join(node_texts(
            '//div[@class="houseInfo"]/div[@class="room"]/div'))
        item['typ'] = ','.join(node_texts(
            '//div[@class="houseInfo"]/div[@class="type"]/div'))
        item['area'] = ','.join(node_texts(
            '//div[@class="houseInfo"]/div[@class="area"]/div'))
        # community name / location / viewing time / lianjia listing id
        item['info_position'] = node_texts(
            '//div[@class="aroundInfo"]/div[1]')
        item['info_region'] = ''.join(
            node_texts('//div[@class="aroundInfo"]/div[2]')).split()
        item['info_look_time'] = node_texts(
            '//div[@class="aroundInfo"]/div[3]')
        item['info_number'] = response.xpath(
            '//div[@class="aroundInfo"]/div[4]//span/text()').extract()
        # basic / transaction attribute lists
        item['basic_attributes'] = node_texts(
            '//div[@class="base"]/div[@class="content"]/ul/li')
        item['transaction_attribute'] = ''.join(node_texts(
            '//div[@class="transaction"]/div[@class="content"]/ul/li'
        )).split()
        # disclaimer
        item['careful'] = response.xpath(
            '//div[@class="introContent"]/div[@class="disclaimer"]/text()'
        ).extract()
        # feature sections: label / surroundings / community intro /
        # decoration / core selling point
        item['housing_label'] = first_token(response.xpath(
            '//div[@class="box-l"]/div[2]/div/div[1]/div[2]/a/text()'
        ).extract())
        item['peripheral_matching'] = first_token(response.xpath(
            '//div[@class="box-l"]/div[2]/div/div[2]/div[2]/text()'
        ).extract())
        item['community_introduction'] = first_token(response.xpath(
            '//div[@class="box-l"]/div[2]/div/div[3]/div[2]/text()'
        ).extract())
        item['decoration_description'] = first_token(response.xpath(
            '//div[@class="box-l"]/div[2]/div/div[4]/div[2]/text()'
        ).extract())
        item['core_selling_point'] = first_token(response.xpath(
            '//div[@class="box-l"]/div[2]/div/div[5]/div[2]/text()'
        ).extract())
        item['matters_needing_attention'] = first(response.xpath(
            '//div[@class="box-l"]/div[2]/div/div[@class="disclaimer"]/text()'
        ).extract())
        # contact (agent) card
        item['contacts_photo'] = first(response.xpath(
            '//div[@class="component-agent-es-pc-6"]/a/img/@src').extract())
        item['contacts_name'] = first(response.xpath(
            '//div[@class="component-agent-es-pc-6"]/div/div/a/text()'
        ).extract())
        item['contacts_score'] = ''.join(response.xpath(
            '//div[@class="component-agent-es-pc-6"]/div/div[2]/span/text()'
        ).extract())
        item['contacts_telephone'] = '转'.join(response.xpath(
            '//div[@class="component-agent-es-pc-6"]/div/div[3]/text()'
        ).extract())
        item['contacts_company'] = first(response.xpath(
            '//div[@class="component-agent-es-pc-6"]/div/div[@class="brokerName"]/div/span/text()'
        ).extract())
        yield item
Example #9
0
    def parse_detail(self, response):
        """Extract every field of a second-hand-house detail page.

        Yields a single ``LianjiaItem`` populated from *response*; each
        field independently falls back to ``None`` when its node is
        missing, so a partially rendered page never raises.
        """

        def first_text(xpath, cut=None):
            # First text node matched by *xpath*, or None when absent.
            # When *cut* is given, keep only the part before it (used to
            # strip trailing units such as "万" or "m²").
            value = response.xpath(xpath).extract_first()
            if value is not None and cut is not None:
                value = value.split(cut)[0]
            return value

        def joined_text(xpath):
            # Concatenation of all text nodes matched by *xpath* with
            # newlines/spaces removed; None when nothing matched.
            parts = response.xpath(xpath).extract()
            if not parts:
                return None
            return ''.join(parts).replace("\n", "").replace(" ", "")

        # Most attributes live in one <ul> as positional <li> entries.
        desc = '//ul[@class="house_description big lightblack"]/li[%d]%s/text()'

        item = LianjiaItem()

        title = first_text('//h3[@class="house_desc lazyload_ulog"]/text()')
        # Title needs extra cleanup: embedded newlines plus edge padding.
        item["house_title"] = (
            title.replace("\n", "").strip() if title is not None else None)
        # NOTE(review): the original guarded this field with '/div/p[2]'
        # but extracted '/div[1]/p[2]' — a mismatch that could raise
        # AttributeError on None; testing extract_first() directly
        # removes that hazard.
        item["house_price"] = first_text(
            '//h3[@class="similar_data"]/div[1]/p[2]/span[1]/text()', "万")
        item["house_single_price"] = first_text(desc % (1, ''), "元/平")
        item["house_room_type"] = first_text(
            '//h3[@class="similar_data"]/div[2]/p[2]/text()')
        item["house_heading"] = first_text(desc % (3, ''))
        item["house_floor"] = first_text(desc % (4, ''))
        item["house_building_type"] = first_text(desc % (5, ''))
        item["house_elevator"] = first_text(desc % (6, ''))
        item["house_decoration"] = first_text(desc % (7, ''))
        item["house_year"] = first_text(desc % (8, ''))
        item["house_use"] = first_text(desc % (9, ''))
        item["house_ownership"] = first_text(desc % (10, ''))
        # Community name sits one level deeper, inside an <a>.
        item["house_commiunity"] = first_text(desc % (12, '/a'))
        item["house_detail_url"] = response.url
        item["house_area"] = first_text(
            '//h3[@class="similar_data"]/div[3]/p[2]/text()', "m²")
        item["house_introduction"] = joined_text(
            '//div[@class="mod_cont fiveline house_intro_mod_cont"]/text()')
        item["house_adders"] = joined_text(
            '//div[@class="sub_mod_box location"]/div/a/div'
            '/div[@class="marker_desc"]/p/text()')
        item["house_date"] = first_text(desc % (2, ''))

        yield item