def parse_content(self, response):
    """Parse one result page of closed deals and yield a LianjiaItem per entry.

    Entries without a detail link are skipped.  Optional fields fall back to
    "无" (elevator, build year) when the page omits them; ownership age,
    subway line and school tags are classified by substring match.
    """
    html = etree.HTML(response.text)
    for deal in html.xpath("//ul[@class='listContent']//li"):
        item = LianjiaItem()
        # Map the region code carried in the request meta to its display name.
        item['region'] = self.regions.get(response.meta['region'])

        links = deal.xpath('./a/@href')
        if not len(links):
            continue
        item['href'] = links[0]

        # Title: "<community> <layout> <area>" separated by whitespace.
        title_texts = deal.xpath('.//div[@class="title"]/a/text()')
        if len(title_texts):
            parts = title_texts[0].split()
            item['name'], item['style'], item['area'] = parts[0], parts[1], parts[2]

        # House info: "orientation | decoration [| elevator]".
        info_texts = deal.xpath('.//div[@class="houseInfo"]/text()')
        if len(info_texts):
            parts = info_texts[0].split('|')
            item['orientation'] = parts[0]
            item['decoration'] = parts[1]
            item['elevator'] = parts[2] if len(parts) == 3 else "无"

        # Position info: floor plus an optional build year.
        pos_texts = deal.xpath('.//div[@class="positionInfo"]/text()')
        if len(pos_texts):
            parts = pos_texts[0].split()
            item['floor'] = parts[0]
            item['build_year'] = parts[1] if len(parts) == 2 else '无'

        # Simple single-value fields: take the first text node when present.
        for field, path in (
                ('sign_time', './/div[@class="dealDate"]/text()'),
                ('total_price', './/div[@class="totalPrice"]/span/text()'),
                ('unit_price', './/div[@class="unitPrice"]/span/text()')):
            texts = deal.xpath(path)
            if len(texts):
                item[field] = texts[0]

        # Deal tags: route each tag to its field by substring.
        for tag in deal.xpath('.//span[@class="dealHouseTxt"]/span/text()'):
            if "房屋满" in tag:
                item['fangchan_class'] = tag
            elif "号线" in tag:
                item['subway'] = tag
            elif "学" in tag:
                item['school'] = tag

        yield item
def parse(self, response):
    """Parse a second-hand-listing page: yield one item per listing, then
    follow district links and pagination links.

    Fixes vs. the original:
    - a single ``LianjiaItem`` was created once and mutated/yielded for every
      listing, so queued items shared state; a fresh item is built per listing;
    - the loop-invariant "currently selected district" lookup ran twice per
      listing and is now hoisted out of the loop;
    - regex patterns are raw strings.
    """
    soup = BeautifulSoup(response.text, 'html.parser')
    # The selected district tab applies to every listing on this page.
    selected_area = soup.find('a', class_='selected', title=re.compile(r'在售'))

    for data in soup.find_all('div', class_="info clear"):
        item = LianjiaItem()
        item['room_url'] = data.find(
            'a', href=re.compile(r'https://sz.lianjia.com/ershoufang/'))['href']
        item['room_id'] = re.search(r'\d+', item['room_url']).group()
        # Skip listings whose id was already seen during this crawl.
        if item['room_id'] in self.id:
            continue
        self.id.add(item['room_id'])
        if selected_area:
            item['area'] = selected_area.get_text()
        item['room_name'] = data.find('div', class_='title').get_text()
        house_info = data.find('div', class_="houseInfo")
        item['community'] = house_info.find('a').get_text()
        item['introduction'] = house_info.get_text()
        item['space'] = re.search(r'\d{2,}\.*\d*', item['introduction']).group()
        # Keep only the floor number and (optional) four-digit build year.
        position = data.find('div', class_="positionInfo").get_text()
        item['floor'] = re.search(r'\d{1,2}', position).group()
        if re.search(r'\d{4}', position):
            item['build_time'] = re.search(r'\d{4}', position).group()
        item['positionInfo'] = data.find(
            'div', class_="positionInfo").find('a').get_text()
        # Followers, viewing count and publish date from the follow line.
        followinfo = data.find('div', class_="followInfo").get_text()
        item['people_focus'] = re.search(
            r'\d*', re.search(r'\d*人关注', followinfo).group()).group()
        item['look'] = re.search(
            r'\d*', re.search(r'\d*次带看', followinfo).group()).group()
        item['publish_time'] = re.search(
            r'\w*发布', followinfo).group().replace('发布', '')
        item['tag'] = data.find('div', class_='tag').get_text()
        item['price'] = data.find(
            'div', class_="totalPrice").find('span').get_text()
        item['unitprice'] = re.search(
            r'\d+', data.find('div', class_="unitPrice").get_text()).group()
        yield item

    # First layer: follow each district's listing index page.
    first_url = 'https://sz.lianjia.com'
    area_urls = soup.find_all('a',
                              href=re.compile(r'^/ershoufang/\w+/$'),
                              title=re.compile(r'在售二手房'))
    for area_url in area_urls:
        yield scrapy.Request(url=first_url + area_url['href'],
                             callback=self.parse)

    # Pagination links are only derived from first-layer (district) pages.
    if re.search(r'/ershoufang/\w+/$', response.url):
        for url in soup.find_all(
                'a', href=re.compile(r'^/ershoufang/\w+/p\d/$')):
            count_match = re.search(r'\(\d+', url.get_text())
            if not count_match:
                continue
            number = int(count_match.group().replace('(', ''))
            # 30 listings per page; the site caps browsing at 100 pages.
            n = number // 30 if number % 30 == 0 else number // 30 + 1
            last_page = n if n <= 100 else 100
            one_url = first_url + url['href']
            # When a path segment also starts with '/p' (e.g. /pingshan/),
            # rewriting '/p' would corrupt it, so the first rewrite is undone.
            has_p_segment = re.search(r'/p[a-z]', one_url)
            for i in range(1, last_page + 1):
                full_url = one_url.replace('/p', '/pg' + str(i) + 'p')
                if has_p_segment:
                    full_url = full_url.replace('/pg' + str(i) + 'p', '/p', 1)
                yield scrapy.Request(url=full_url, callback=self.parse)
def parse_detail(self, response):
    """Parse one Shunyi (Beijing) house detail page into a LianjiaItem.

    Title/price fields are relayed from the listing page through
    ``response.meta``; everything else is read from the detail page, with the
    string 'None' as the fallback when a field is absent.

    Fix vs. the original: regex patterns are raw strings — ``'\\d'`` inside a
    plain string literal is an invalid escape (SyntaxWarning on modern Python).
    """
    item_dict = response.meta
    item = LianjiaItem()
    # Fixed provenance fields for this spider.
    item['province'] = '北京'
    item['city'] = '北京市'
    item['areas'] = '顺义区'
    item['coordinate_type'] = '百度'
    # NOTE(review): crawl date is hard-coded; consider datetime.date.today().
    item['crawl_time'] = '2019-5-30'
    # Carried over from the listing page via meta.
    item['title'] = item_dict['title']
    item['prices'] = item_dict['prices']
    item['price'] = item_dict['price']
    hou = response.xpath(
        '//h3[@class="similar_data"]/div/div[2]/p[2]/text()'
    ).extract_first()  # layout, e.g. 3室2厅
    area = response.xpath(
        '//h3[@class="similar_data"]/div/div[3]/p[2]/text()'
    ).extract_first()
    address = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[12]/a/text()'
    ).extract_first()
    listing_time = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[2]/text()'
    ).extract_first()
    orient = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[3]/text()'
    ).extract_first()
    flo = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[4]/text()'
    ).extract_first()  # floor, e.g. 底层/4
    fitment = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[7]/text()'
    ).extract_first()
    elevator = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[6]/text()'
    ).extract_first()
    build_type = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[5]/text()'
    ).extract_first()
    build_time = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[8]/text()'
    ).extract_first()
    quanshu = response.xpath(
        '//ul[@class="house_description big lightblack lazyload_ulog"]/li[10]/text()'
    ).extract_first()
    # Map link carrying a "pos=<lng>,<lat>" query parameter.
    coo = response.xpath(
        '//div[@class="sub_mod_box location"]/div/a/@href').extract_first()
    item['hou'] = hou if hou else 'None'
    item['area'] = area if area else 'None'
    item['address'] = address if address else 'None'
    item['listing_time'] = listing_time if listing_time else 'None'
    item['orient'] = orient if orient else 'None'
    item['flo'] = flo if flo else 'None'
    item['fitment'] = fitment if fitment else 'None'
    item['elevator'] = elevator if elevator else 'None'
    item['build_type'] = build_type if build_type else 'None'
    item['build_time'] = build_time if build_time else 'None'
    item['quanshu'] = quanshu if quanshu else 'None'
    if coo:
        # NOTE(review): the unescaped dots are kept from the original so
        # coordinates without a decimal part still match — confirm intent.
        coordinate = re.search(r'pos=(\d+.\d+,\d+.\d+)', coo)
        item['coordinate'] = coordinate.group(1) if coordinate else 'None'
    else:
        item['coordinate'] = ''
    # NOTE(review): time.sleep blocks Scrapy's reactor; prefer the
    # DOWNLOAD_DELAY setting.  Kept to preserve the original throttling.
    time.sleep(0.05)
    yield item
def parse_body_bj(self, response):
    """Parse a Beijing listing page (lxml) and yield one priced item per entry.

    Fixes vs. the original:
    - the total price was converted with ``int(price_t)`` on the raw node text
      instead of the digits extracted for exactly that purpose, crashing on
      any non-integer text such as "520.5"; it now uses the extracted digits;
    - ``building_type`` was only assigned inside the ``try`` block, so the
      except path read a stale value from a previous loop iteration (or
      raised NameError on the first); it is now initialised before the block;
    - bare ``except`` narrowed to ``IndexError`` (the only expected failure:
      missing xpath results);
    - print statements use call syntax, valid on both Python 2 and 3.
    """
    print(response.url)
    city_name = self.city_name
    tree = etree.HTML(response.body)
    nodes = tree.xpath('//ul[@class="listContent"]/li')
    print("len : %d" % len(nodes))
    for node in nodes:
        items = LianjiaItem()
        items['name'] = node.xpath('.//div[@class="title"]/a/text()')[0]
        try:
            position = node.xpath('.//div[@class="positionInfo"]/a/text()')
            address = position[0] + position[1]
        except IndexError:
            address = 'NA'
        print(address)
        items['location'] = address
        items['city_name'] = city_name
        building_type = "NA"
        try:
            text_content = node.xpath(
                './/div[@class="positionInfo"]/text()')
            # Beijing pages carry an extra community segment; the build date
            # sits in the second '/'-separated field of the fourth text node.
            detail = text_content[3].split('/')
            building_date = detail[1].strip()
        except IndexError:
            building_date = '未知年建成'
        items['building_date'] = building_date
        items['building_type'] = building_type
        price_t = node.xpath('.//div[@class="totalPrice"]/span/text()')[0]
        digits = re.findall(r'\d+', price_t)
        if len(digits) != 0:
            # Bug fix: convert the extracted digit run, not the raw text.
            price = int(digits[0])
        else:
            price = '均价未知'
        print(price)
        price_detail = {
            'price': price,
            'origin': 'LJ',
            'crawl_date': self.crawl_date
        }
        # Monthly price history: {month: [snapshot, ...]}.
        items['price'] = {self.price_month: [price_detail]}
        yield items
class ZufangSpider(scrapy.Spider):
    """Spider for Shenzhen rental listings: district -> bizcircle -> pages.

    Fixes vs. the original ``parse_content``: the class-level shared ``Item``
    was filled via ``eval(field)`` and yielded once per field — emitting many
    duplicate, partially-filled copies of the same mutable object.  A fresh
    item is now built per listing from an explicit value mapping and yielded
    once.  ``parse_page`` also guards against a missing listing counter.
    """
    name = 'zufang'
    allowed_domains = ['sz.lianjia.com']
    start_url = 'https://sz.lianjia.com/zufang/'
    page_titles = 30  # listings per result page, used to derive page count
    # Retained for backward compatibility with any external references;
    # no longer used as the shared scratch item.
    Item = LianjiaItem()

    def start_requests(self):
        """Kick off the crawl from the rental landing page."""
        yield scrapy.Request(url=self.start_url, callback=self.parse_district)

    def parse_district(self, response):
        """Follow each Shenzhen district filter link."""
        for district_node in response.xpath('//li[contains(@data-id, "2300")]'):
            url = district_node.xpath('./a/@href').get()
            district = district_node.xpath('./a/text()').get()
            if url:
                yield scrapy.Request(url=response.urljoin(url),
                                     callback=self.parse_bizcircle,
                                     meta={'district': district})

    def parse_bizcircle(self, response):
        """Follow each bizcircle link inside a district."""
        district = response.meta['district']
        for biz in response.xpath('//li[@class="filter__item--level3 "]'):
            url = biz.xpath('./a/@href').get()
            bizcircle = biz.xpath('./a/text()').get()
            if url:
                yield scrapy.Request(url=response.urljoin(url),
                                     callback=self.parse_page,
                                     meta={
                                         'bizcircle': bizcircle,
                                         'district': district
                                     })

    def parse_page(self, response):
        """Derive the page count from the listing total and request each page."""
        title_num = response.xpath(
            '//span[@class="content__title--hl"]/text()').get()
        if title_num is None:
            # Robustness: some bizcircle pages carry no total counter.
            return
        max_page = int(int(title_num) / self.page_titles + 1)
        for page in range(1, max_page + 1):
            yield scrapy.Request(url=response.url + 'pg{}'.format(page),
                                 callback=self.parse_content,
                                 meta={
                                     'bizcircle': response.meta['bizcircle'],
                                     'district': response.meta['district']
                                 })

    def parse_content(self, response):
        """Parse one page of rental listings, yielding one item per listing."""
        district = response.meta['district']
        bizcircle = response.meta['bizcircle']
        for content in response.xpath('//div[@class="content__list--item"]'):
            url = response.urljoin(
                content.xpath(
                    './/p[@class="content__list--item--title twoline"]/a/@href'
                ).get())
            title = content.xpath(
                './/p[@class="content__list--item--title twoline"]/a/text()'
            ).get().strip()
            area = content.xpath(
                './/p[@class="content__list--item--des"]//text()').re_first(
                    r'(\d+㎡)')
            price = content.xpath(
                './/span[@class="content__list--item-price"]/em/text()').get(
                ).strip()
            apartment = content.xpath(
                './/p[@class="content__list--item--des"]//text()').re_first(
                    r'(\d室\d厅\d卫)')
            company = content.xpath(
                './/p[@class="content__list--item--brand oneline"]//text()'
            ).get()
            if company:
                company = company.strip()
            # Explicit field mapping replaces the original eval() lookup.
            values = {
                'url': url,
                'title': title,
                'district': district,
                'bizcircle': bizcircle,
                'area': area,
                'price': price,
                'apartment': apartment,
                'company': company,
            }
            item = LianjiaItem()
            for field in item.fields:
                if field in values:
                    item[field] = values[field]
                else:
                    print('Field is not Defined: ' + field)
            yield item
def disposeData(self, response):
    """Parse a listing detail page into a LianjiaItem and yield it.

    The "basic attributes" <ul> appears in several known layouts (12, 9, 3 or
    15 <li> rows); each layout maps row positions to different item fields,
    and the fields a layout does not provide are blanked so the item schema
    stays uniform.  Layouts of any other length leave the basic fields unset,
    exactly as the original branch chain did.  The four near-identical
    copy-paste branches are replaced by one table-driven pass.
    """
    item = LianjiaItem()

    def li_text(index):
        # First text node of the index-th <li> under a div.content <ul>.
        return response.xpath(
            "//div[@class='content']/ul/li[%d]/text()" % index).extract()[0]

    def li_span_text(index):
        # Second <span> of the index-th <li> — the transaction-attribute rows.
        return response.xpath(
            "//div[@class='content']/ul/li[%d]/span[2]/text()" %
            index).extract()[0]

    # Row order for each known "basic attributes" layout, keyed by row count.
    common = [
        "house_type", "floor", "area", "house_structure", "inside_space",
        "building_type", "direct", "building_structure", "decorate_situation",
        "elevator_proportion", "equipped_escalators", "property_term"
    ]
    layouts = {
        12: common,
        15: common + ["water_type", "electricity_type", "gas_price"],
        9: [
            "house_type", "floor", "area", "inside_space", "direct",
            "building_structure", "decorate_situation", "villa_type",
            "property_term"
        ],
        3: ["floor", "area", "direct"],
    }
    # Fields blanked when a layout does not provide them (water/electricity/
    # gas are only ever set by the 15-row layout, matching the original).
    basic_fields = common + ["villa_type"]

    rows = response.xpath(
        "//div[@class='base']/div[@class='content']/ul/li").extract()
    layout = layouts.get(len(rows))
    if layout is not None:
        for position, field in enumerate(layout, start=1):
            item[field] = li_text(position)
        for field in basic_fields:
            if field not in layout:
                item[field] = ""

    # Transaction attributes: fixed row positions in the transaction <ul>.
    trade_fields = ("time_tone", "trading_ownership", "last_transaction",
                    "house_usage", "house_term", "property_owner",
                    "mortgage_info", "house_certificate")
    for position, field in enumerate(trade_fields, start=1):
        value = li_span_text(position)
        if field == "mortgage_info":
            # Mortgage info is wrapped in whitespace/newlines on the page.
            value = value.replace(' ', '').replace('\n', '')
        item[field] = value

    # Price / location attributes.
    item["total_price"] = response.xpath(
        "//span[@class='total']/text()").extract()[0]
    item["unit_price"] = response.xpath(
        "//span[@class='unitPriceValue']/text()").extract()[0]
    item["housing_name"] = response.xpath(
        "//div[@class='communityName']/a[1]/text()").extract()[0]
    item["county"] = response.xpath(
        "//div[@class='areaName']/span[2]/a[1]/text()").extract()[0]
    item["street"] = response.xpath(
        "//div[@class='areaName']/span[2]/a[2]/text()").extract()[0]
    item["built_year"] = response.xpath(
        "//div[@class='area']/div[2]/text()").extract()[0]
    print("爬取成功")
    yield item
def parse_content(self, response):
    '''
    Parse a single result page of closed deals.

    Not every listing carries the full set of fields defined on the item
    (subway line, nearby school, ownership age, elevator, build year, ...),
    so each optional field is extracted only when present on the page;
    elevator and build year fall back to "无" when missing.  The finished
    item is yielded to the item pipeline for further processing.
    '''
    selector = etree.HTML(response.text)
    cj_list = selector.xpath("//ul[@class='listContent']/li")
    for cj in cj_list:
        item = LianjiaItem()
        # Map the region code from the request meta to its display name.
        item['region'] = self.regions.get(response.meta['region'])
        href = cj.xpath('./a/@href')
        if not len(href):
            # Entries without a detail link carry no usable data.
            continue
        item['href'] = href[0]
        # Title text: "<community> <layout> <area>" separated by whitespace.
        # NOTE(review): content[1]/content[2] raise IndexError if the title
        # has fewer than three tokens — presumably it always has three.
        content = cj.xpath('.//div[@class="title"]/a/text()')
        if len(content):
            content = content[0].split()
            item['name'] = content[0]
            item['style'] = content[1]
            item['area'] = content[2]
        # House info: "orientation | decoration [| elevator]".
        content = cj.xpath('.//div[@class="houseInfo"]/text()')
        if len(content):
            content = content[0].split('|')
            item['orientation'] = content[0]
            item['decoration'] = content[1]
            if len(content) == 3:
                item['elevator'] = content[2]
            else:
                item['elevator'] = '无'
        # Position info: floor plus an optional build year.
        content = cj.xpath('.//div[@class="positionInfo"]/text()')
        if len(content):
            content = content[0].split()
            item['floor'] = content[0]
            if len(content) == 2:
                item['build_year'] = content[1]
            else:
                item['build_year'] = '无'
        content = cj.xpath('.//div[@class="dealDate"]/text()')
        if len(content):
            item['sign_time'] = content[0]
        content = cj.xpath('.//div[@class="totalPrice"]/span/text()')
        if len(content):
            item['total_price'] = content[0]
        content = cj.xpath('.//div[@class="unitPrice"]/span/text()')
        if len(content):
            item['unit_price'] = content[0]
        # Deal tags: classify each by substring (str.find returns -1 when
        # the substring is absent, a non-negative index when found).
        content = cj.xpath('.//span[@class="dealHouseTxt"]/span/text()')
        if len(content):
            for i in content:
                if i.find("房屋满") != -1:
                    item['fangchan_class'] = i
                elif i.find("号线") != -1:
                    item['subway'] = i
                elif i.find("学") != -1:
                    item['school'] = i
        yield item
def parse_item(self, response):
    """Scrape one house detail page into a LianjiaItem and yield it.

    NOTE(review): several fields use ``''.join(...).split()[0]`` or
    ``.extract()[0]``, which raise IndexError when the section is absent from
    the page — presumably every detail page carries these sections; confirm
    before reusing this parser on other page variants.
    """
    item = LianjiaItem()
    # Photos
    house_photo = response.xpath(
        '//ul[@class="smallpic"]/li/@data-src').extract()
    # Price: spans 2-3 of the content header, joined as "total/unit"
    mon = response.xpath('//div[@class="content"]/div/span').xpath(
        'string(.)').extract()[2:4]
    money = '/'.join(mon)
    # Price per square metre
    one_area = \
        response.xpath('//div[@class="content"]/div/div[@class="text"]/div[1]/span').xpath('string(.)').extract()[0]
    # Rooms
    rom = response.xpath('//div[@class="houseInfo"]/div[@class="room"]/div'
                         ).xpath('string(.)').extract()
    room = ','.join(rom)
    # Type
    tp = response.xpath('//div[@class="houseInfo"]/div[@class="type"]/div'
                        ).xpath('string(.)').extract()
    typ = ','.join(tp)
    # Area
    ara = response.xpath('//div[@class="houseInfo"]/div[@class="area"]/div'
                         ).xpath('string(.)').extract()
    area = ','.join(ara)
    # Introduction block
    # Community name
    info_position = response.xpath(
        '//div[@class="aroundInfo"]/div[1]').xpath('string(.)').extract()
    # Location
    region = response.xpath('//div[@class="aroundInfo"]/div[2]').xpath(
        'string(.)').extract()
    info_region = ''.join(region).split()
    # Viewing times
    info_look_time = response.xpath(
        '//div[@class="aroundInfo"]/div[3]').xpath('string(.)').extract()
    # Lianjia listing number
    info_number = response.xpath(
        '//div[@class="aroundInfo"]/div[4]//span/text()').extract()
    # Basic attributes
    basic_attributes = response.xpath(
        '//div[@class="base"]/div[@class="content"]/ul/li').xpath(
            'string(.)').extract()
    # Transaction attributes
    tran_attribute = response.xpath(
        '//div[@class="transaction"]/div[@class="content"]/ul/li').xpath(
            'string(.)').extract()
    transaction_attribute = ''.join(tran_attribute).split()
    # Disclaimer text
    careful = response.xpath(
        '//div[@class="introContent"]/div[@class="disclaimer"]/text()'
    ).extract()
    # Features --> listing tags
    hous_label = response.xpath(
        '//div[@class="box-l"]/div[2]/div/div[1]/div[2]/a/text()').extract(
        )
    housing_label = ''.join(hous_label).split()[0]
    # Features --> surrounding facilities
    peri_matching = response.xpath(
        '//div[@class="box-l"]/div[2]/div/div[2]/div[2]/text()').extract()
    peripheral_matching = ''.join(peri_matching).split()[0]
    # Features --> community introduction
    com_introduction = response.xpath(
        '//div[@class="box-l"]/div[2]/div/div[3]/div[2]/text()').extract()
    community_introduction = ''.join(com_introduction).split()[0]
    # Features --> decoration description
    dec_description = response.xpath(
        '//div[@class="box-l"]/div[2]/div/div[4]/div[2]/text()').extract()
    decoration_description = ''.join(dec_description).split()[0]
    # Features --> key selling points
    cor_sell_point = response.xpath(
        '//div[@class="box-l"]/div[2]/div/div[5]/div[2]/text()').extract()
    core_selling_point = ''.join(cor_sell_point).split()[0]
    # Notes / disclaimer paragraph
    matters_needing_attention = \
        response.xpath('//div[@class="box-l"]/div[2]/div/div[@class="disclaimer"]/text()').extract()[0]
    # Agent photo
    contacts_photo = response.xpath(
        '//div[@class="component-agent-es-pc-6"]/a/img/@src').extract()[0]
    # Agent name
    contacts_name = response.xpath(
        '//div[@class="component-agent-es-pc-6"]/div/div/a/text()'
    ).extract()[0]
    # Agent rating
    contacts_sco = response.xpath(
        '//div[@class="component-agent-es-pc-6"]/div/div[2]/span/text()'
    ).extract()
    contacts_score = ''.join(contacts_sco)
    # Agent phone — parts joined with '转' ("ext."), main number + extension
    contacts_tel = response.xpath(
        '//div[@class="component-agent-es-pc-6"]/div/div[3]/text()'
    ).extract()
    contacts_telephone = '转'.join(contacts_tel)
    contacts_company = response.xpath(
        '//div[@class="component-agent-es-pc-6"]/div/div[@class="brokerName"]/div/span/text()'
    ).extract()[0]
    item['house_photo'] = house_photo
    item['money'] = money
    item['one_area'] = one_area
    item['room'] = room
    item['area'] = area
    item['typ'] = typ
    item['careful'] = careful
    item['info_position'] = info_position
    item['info_region'] = info_region
    item['info_look_time'] = info_look_time
    item['info_number'] = info_number
    item['basic_attributes'] = basic_attributes
    item['transaction_attribute'] = transaction_attribute
    item['housing_label'] = housing_label
    item['peripheral_matching'] = peripheral_matching
    item['community_introduction'] = community_introduction
    item['decoration_description'] = decoration_description
    item['core_selling_point'] = core_selling_point
    item['matters_needing_attention'] = matters_needing_attention
    item['contacts_photo'] = contacts_photo
    item['contacts_name'] = contacts_name
    item['contacts_score'] = contacts_score
    item['contacts_telephone'] = contacts_telephone
    item['contacts_company'] = contacts_company
    yield item
def parse_detail(self, response):
    """Extract one second-hand-house detail page into a LianjiaItem.

    Bug fix vs. the original: every field was written as
    ``xpath(A).extract_first().foo() if len(response.xpath(B)) > 0 else None``,
    evaluating each xpath twice — and for ``house_price`` the guard xpath
    ('div/p[2]') differed from the extraction xpath ('div[1]/p[2]'), so a
    page matching the guard but not the extraction crashed with
    ``AttributeError: 'NoneType' object has no attribute 'split'``.  Each
    xpath is now evaluated once and guarded on its own result.
    """
    desc = '//ul[@class="house_description big lightblack"]/li[%d]%s/text()'

    def desc_text(index, sub=''):
        # First text node of row *index* in the description list, or None.
        return response.xpath(desc % (index, sub)).extract_first()

    house_detail_url = response.url

    title = response.xpath(
        '//h3[@class="house_desc lazyload_ulog"]/text()').extract_first()
    house_title = title.replace("\n", "").strip() if title else None

    price = response.xpath(
        '//h3[@class="similar_data"]/div[1]/p[2]/span[1]/text()'
    ).extract_first()
    house_price = price.split("万")[0] if price else None

    single_price = desc_text(1)
    house_single_price = single_price.split(
        "元/平")[0] if single_price else None

    house_room_type = response.xpath(
        '//h3[@class="similar_data"]/div[2]/p[2]/text()').extract_first()

    house_date = desc_text(2)      # listing date
    house_heading = desc_text(3)   # orientation
    house_floor = desc_text(4)
    house_building_type = desc_text(5)
    house_elevator = desc_text(6)
    house_decoration = desc_text(7)
    house_year = desc_text(8)
    house_use = desc_text(9)
    house_ownership = desc_text(10)
    house_commiunity = desc_text(12, '/a')

    area = response.xpath(
        '//h3[@class="similar_data"]/div[3]/p[2]/text()').extract_first()
    house_area = area.split("m²")[0] if area else None

    intro_parts = response.xpath(
        '//div[@class="mod_cont fiveline house_intro_mod_cont"]/text()'
    ).extract()
    house_introduction = ''.join(intro_parts).replace("\n", "").replace(
        " ", "") if intro_parts else None

    adders_parts = response.xpath(
        '//div[@class="sub_mod_box location"]/div/a/div/div[@class="marker_desc"]/p/text()'
    ).extract()
    house_adders = ''.join(adders_parts).replace("\n", "").replace(
        " ", "") if adders_parts else None

    item = LianjiaItem()
    item["house_title"] = house_title
    item["house_price"] = house_price
    item["house_single_price"] = house_single_price
    item["house_room_type"] = house_room_type
    item["house_heading"] = house_heading
    item["house_floor"] = house_floor
    item["house_building_type"] = house_building_type
    item["house_elevator"] = house_elevator
    item["house_decoration"] = house_decoration
    item["house_year"] = house_year
    item["house_use"] = house_use
    item["house_ownership"] = house_ownership
    item["house_commiunity"] = house_commiunity
    item["house_detail_url"] = house_detail_url
    item["house_area"] = house_area
    item["house_introduction"] = house_introduction
    item["house_adders"] = house_adders
    item["house_date"] = house_date
    yield item