Exemple #1
0
    def parse_content(self, response):
        """Parse a listing page and yield one ``LianjiaItem`` per ``<li>`` entry.

        Each ``<li>`` under ``ul.listContent`` is turned into an item. Entries
        without a link are skipped. Every other field is filled only when the
        corresponding node is present and has enough parts, so a single
        malformed listing no longer aborts the whole page with an IndexError.

        Args:
            response: The downloaded page; ``response.meta['region']`` is
                looked up in ``self.regions`` to fill the ``region`` field.

        Yields:
            LianjiaItem: One item per listing that has an ``href``.
        """
        selector = etree.HTML(response.text)

        for cj in selector.xpath("//ul[@class='listContent']/li"):
            item = LianjiaItem()
            item['region'] = self.regions.get(response.meta['region'])
            href = cj.xpath('./a/@href')
            if not href:
                continue
            item['href'] = href[0]

            # Title text is whitespace separated: name, style, area.
            # zip() assigns only the parts that are actually present.
            title = cj.xpath('.//div[@class="title"]/a/text()')
            if title:
                for field, value in zip(('name', 'style', 'area'),
                                        title[0].split()):
                    item[field] = value

            # houseInfo is "orientation|decoration[|elevator]".
            house_info = cj.xpath('.//div[@class="houseInfo"]/text()')
            if house_info:
                parts = house_info[0].split('|')
                item['orientation'] = parts[0]
                if len(parts) > 1:
                    item['decoration'] = parts[1]
                # '无' ("none") marks a listing without elevator information.
                item['elevator'] = parts[2] if len(parts) == 3 else '无'

            # positionInfo is "floor [build_year]".
            position = cj.xpath('.//div[@class="positionInfo"]/text()')
            if position:
                parts = position[0].split()
                if parts:
                    item['floor'] = parts[0]
                    # The fallback previously wrote to the misspelled key
                    # 'build_yaer'; it now fills 'build_year' like the
                    # regular branch.
                    item['build_year'] = parts[1] if len(parts) == 2 else '无'

            deal_date = cj.xpath('.//div[@class="dealDate"]/text()')
            if deal_date:
                item['sign_time'] = deal_date[0]

            total_price = cj.xpath('.//div[@class="totalPrice"]/span/text()')
            if total_price:
                item['total_price'] = total_price[0] + '万'

            unit_price = cj.xpath('.//div[@class="unitPrice"]/span/text()')
            if unit_price:
                item['unit_price'] = unit_price[0] + '元/平'

            # Free-form tags: classify each one by the keyword it contains.
            for tag in cj.xpath('.//div[@class="dealHouseTxt"]/span/text()'):
                if "房屋满" in tag:
                    item['fangchan_class'] = tag
                elif "近地铁" in tag:
                    item['subway'] = tag
                elif "学" in tag:
                    item['school'] = tag
            yield item
Exemple #2
0
    def parse_getitem(self, response, province, city, link_chengjiao, host):
        """Parse a detail page into a flat list wrapped in a ``LianjiaItem``.

        When the page has no ``h1.index_h1`` title, the fallback heading is
        checked for the '人机认证' (human-machine verification) marker. If the
        marker is found, the same URL is re-requested with
        ``meta={"refresh_cache": True}``; otherwise a warning is logged.
        Nothing else is yielded in either case.

        Args:
            response: The downloaded detail page.
            province: Value placed first in the output row.
            city: Value placed second in the output row.
            link_chengjiao: URL of this page; used for logging, for the retry
                request and as the third value of the output row.
            host: Value sent as the ``Host`` header of the retry request.

        Yields:
            scrapy.Request: A retry request when the verification page is hit.
            LianjiaItem: An item whose ``item`` field is the flat list
                ``[province, city, link] + title + price + msg + content +
                records``.
        """
        title = response.xpath("//h1[@class= 'index_h1']/text()").getall()

        if not title:
            n_title = response.xpath(
                "//div[@class='container']/div/h1/text()").get()
            # .get() returns None when the fallback heading is missing too;
            # guard it so the membership test cannot raise TypeError.
            if n_title and '人机认证' in n_title:
                logging.warning("人机认证: No title in refresh cache" +
                                link_chengjiao)
                yield scrapy.Request(
                    link_chengjiao,
                    meta={"refresh_cache": True},
                    dont_filter=True,
                    headers={
                        'Host': host,
                        'Referer': link_chengjiao
                    },
                    callback=self.parse_getitem,
                    cb_kwargs={
                        'province': province,
                        'city': city,
                        'link_chengjiao': link_chengjiao,
                        'host': host
                    })
            else:
                logging.warning("This is a warning: No title" + link_chengjiao)
            return

        price = response.xpath(
            "//span[@class='dealTotalPrice']/i/text()").getall()
        msg = response.xpath("//div[@class='msg']//label/text()").getall()
        content = [
            text.strip() for text in response.xpath(
                "//div[@class = 'content']//li/text()").getall()
        ]

        record_price = response.xpath(
            "//ul[@class = 'record_list']//span[@class = 'record_price']/text()"
        ).getall()
        record_detail = response.xpath(
            "//ul[@class = 'record_list']//p[@class = 'record_detail']/text()"
        ).getall()
        # Pair each price with its detail line; zip() stops at the shorter
        # list instead of raising IndexError when the two lists differ.
        record_list = []
        for record, detail in zip(record_price, record_detail):
            record_list.append(record)
            record_list.extend(detail.split(","))

        # Flatten everything into one list so it is easy to write as a CSV row.
        item = [province, city, link_chengjiao
                ] + title + price + msg + content + record_list

        line = LianjiaItem()
        line['item'] = item
        yield line
 def parse(self, response):
     """Parse a listing page and yield one loaded ``LianjiaItem`` per title.

     All XPath queries are evaluated once for the whole page and then
     indexed per listing. Re-running every query on each iteration made the
     work grow quadratically with the number of listings.

     Several values are picked positionally out of flat text lists (for
     example every fourth ``where`` span), so the method relies on each
     listing contributing a fixed number of text nodes.

     NOTE(review): the ``.encode('utf-8')`` calls and the byte slice on
     ``district`` look like Python 2 string handling -- confirm before
     running under Python 3.

     Args:
         response: The downloaded listing page.

     Yields:
         The item produced by ``ItemLoader.load_item()`` for each listing.
     """
     panels = response.xpath("//div[@class='info-panel']")
     titles = response.xpath(
         "//div[@class='info-panel']/h2/a/text()").extract()
     regions = panels.xpath(".//span[@class='region']/text()").extract()
     zones = panels.xpath(".//span[@class='zone']//text()").extract()
     meters = panels.xpath(".//span[@class='meters']/text()").extract()
     where_spans = panels.xpath(
         ".//div[@class='where']//span/text()").extract()
     con_links = panels.xpath(".//div[@class='con']/a/text()").extract()
     con_texts = panels.xpath(".//div[@class='con']//text()").extract()
     num_texts = panels.xpath(".//span[@class='num']//text()").extract()
     label_blocks = response.xpath("//div[@class='view-label left']")

     for i, title in enumerate(titles):
         l = ItemLoader(item=LianjiaItem(), response=response)
         info = title.encode('utf-8')
         local = regions[i].encode('utf-8')
         house_layout = zones[i].encode('utf-8')
         house_square = meters[i].encode('utf-8')
         # Fourth "where" span of listing i: (i + 1) * 4 - 1.
         house_orientation = where_spans[4 * i + 3].encode('utf-8')
         # Drops the last six bytes of the encoded text -- presumably a
         # fixed two-character CJK suffix; TODO confirm.
         district = con_links[i].encode('utf-8')[:-6]
         # Five "con" text nodes per listing: the third is the floor and
         # the fifth the building year.
         floor = con_texts[5 * i + 2].encode('utf-8')
         building_year = con_texts[5 * i + 4].encode('utf-8')
         # Two "num" text nodes per listing: price first, then views.
         price_month = num_texts[2 * i].encode('utf-8')
         person_views = num_texts[2 * i + 1].encode('utf-8')
         tags = [
             text.encode("utf-8")
             for text in label_blocks[i].xpath(".//span//text()").extract()
         ]
         l.add_value('info', info)
         l.add_value('local', local)
         l.add_value('house_layout', house_layout)
         l.add_value('house_square', house_square)
         l.add_value('house_orientation', house_orientation)
         l.add_value('district', district)
         l.add_value('floor', floor)
         l.add_value('building_year', building_year)
         l.add_value('price_month', price_month)
         l.add_value('person_views', person_views)
         l.add_value('tags', tags)
         # Parenthesised form prints the same thing under Python 2 and is
         # valid syntax under Python 3.
         print(l)
         yield l.load_item()
Exemple #4
0
    def parse_item(self, response):
        """Build a ``LianjiaItem`` from a detail page and yield it.

        Scalar fields come from ``key:'value',`` pairs found in the raw page
        text; layout details come from the ``<li>`` texts under the element
        with id ``introduction``; region and locality come from the first two
        links inside ``span.info``.

        Every lookup takes the first match, so a page missing any of the
        expected pieces raises IndexError.

        Args:
            response: The downloaded detail page.

        Yields:
            LianjiaItem: The populated item.
        """
        def first_match(pattern):
            # First captured group of `pattern` in the raw page text.
            return re.findall(pattern, response.text)[0]

        item = LianjiaItem()
        item['title'] = response.xpath('//h1/@title').extract()[0]

        house_type = first_match(r"houseType:'(.*?)',")
        item['house_type'] = house_type

        position = first_match(r"resblockPosition:'(.*?)',")
        item['position'] = position
        coordinates = position.split(',')
        item['longitude'] = coordinates[0]
        item['latitude'] = coordinates[1]

        item['area'] = first_match(r"area:'(.*?)',")
        item['total_price'] = first_match(r"totalPrice:'(.*?)',")
        item['avg_price'] = first_match(r"price:'(.*?)',")
        item['community'] = first_match(r"resblockName:'(.*?)',")

        detail_rows = response.xpath(
            '//*[@id="introduction"]//ul/li/text()').extract()
        item['layout'] = detail_rows[0]
        item['floor'] = detail_rows[1]

        # '别墅' (villa) pages use different row positions than other types.
        if house_type == '别墅':
            item['direction'] = detail_rows[4]
            item['decorate'] = detail_rows[6]
        else:
            item['design'] = detail_rows[3]
            item['direction'] = detail_rows[6]
            item['decorate'] = detail_rows[8]
            item['lift'] = detail_rows[10]
            item['lift_proportion'] = detail_rows[9]

        info_link = '//span[@class="info"]/a[%d]/text()'
        item['region'] = response.xpath(info_link % 1).extract()[0]
        item['local'] = response.xpath(info_link % 2).extract()[0]

        yield item
Exemple #5
0
 def parseDetail(self, response):
     """Fill a ``LianjiaItem`` from a detail page and yield it.

     Most fields are the first text match of a fixed XPath query, or
     ``None`` when the query matches nothing (``extract_first``).

     Args:
         response: The downloaded detail page.

     Yields:
         LianjiaItem: The populated item.
     """
     # (item field, XPath query) pairs, applied in this order.
     text_fields = (
         ('title', '//div[@class="title"]/h1/text()'),
         ('community', '//div[@class="communityName"]/a[@class="info"]/text()'),
         # layout
         ('model', '//div[@class="room"]/div[@class="mainInfo"]/text()'),
         ('floor', '//div[@class="room"]/div[@class="subInfo"]/text()'),
         ('orientation', '//div[@class="type"]/div[@class="mainInfo"]/text()'),
         ('decorate', '//div[@class="type"]/div[@class="subInfo"]/text()'),
         # area
         ('area', '//div[@class="area"]/div[@class="mainInfo"]/text()'),
         ('buildtime', '//div[@class="area"]/div[@class="subInfo"]/text()'),
         # number of followers
         ('focus_num', '//span[@id="favCount"]/text()'),
         # number of viewers
         ('watch_num', '//span[@id="cartCount"]/text()'),
         # publish time
         ('time', '//div[@class="transaction"]//ul/li[1]/text()'),
         # price
         ('price', '//span[@class="total"]/text()'),
     )

     item = LianjiaItem()
     for field, query in text_fields:
         item[field] = response.xpath(query).extract_first()

     # Link to the detail page itself.
     item['link'] = response.url
     # Coordinates, taken from the text of the 19th <script> element.
     position_script = response.xpath('//script[19]/text()')
     item['Latitude'] = position_script.re_first(r"resblockPosition:'(.*?)'")
     # District.
     item['city'] = response.xpath(
         '//span[@class="info"]/a[1]/text()').extract_first()
     yield item
Exemple #6
0
    def get_info(self, response):
        """Build a ``LianjiaItem`` from a detail page and yield it.

        A fixed set of fields is read directly. The "base" and "transaction"
        sections are then read as label/value lists, and every label is used
        as an item key for the value at the same position.

        Args:
            response: The downloaded detail page.

        Yields:
            LianjiaItem: The populated item.

        Raises:
            IndexError: If one of the fixed fields read with an index
                (community name, unit price, build time) is missing.
        """
        item = LianjiaItem()
        item['链接'] = response.url
        item['小区名称'] = response.xpath(
            '//div[@class="communityName"]//text()').extract()[1]
        item['所在区域'] = response.xpath(
            '//div[@class="areaName"]//span[@class="info"]//a//text()'
        ).extract()
        item['地铁站'] = response.xpath(
            '//div[@class="areaName"]/a[@class="supplement"]//text()').extract(
            )
        item['总价'] = response.xpath('//span[@class="total"]//text()').extract()
        item['单价'] = response.xpath(
            '//span[@class="unitPriceValue"]//text()').extract()[0]
        item['建筑时间'] = response.xpath(
            '//div[@class="area"]//div[@class="subInfo"]//text()').extract(
            )[0].split('/')[0]

        # (label query, value query) for each section of label/value rows.
        sections = (
            ('//div[@class="base"]//li/span/text()',
             '//div[@class="base"]//li/text()'),
            ('//div[@class="transaction"]//li/span[@class="label"]//text()',
             '//div[@class="transaction"]//li//span[2]//text()'),
        )
        for label_query, value_query in sections:
            labels = response.xpath(label_query).extract()
            values = response.xpath(value_query).extract()
            seen = set()
            # zip() pairs labels with values positionally and stops at the
            # shorter list, so labels without a value are skipped.
            for label, value in zip(labels, values):
                # For a repeated label the first occurrence wins.
                if label in seen:
                    continue
                seen.add(label)
                try:
                    item[label] = value
                except KeyError:
                    # Presumably the item type rejects undeclared fields with
                    # KeyError (scrapy.Item does); such labels are skipped on
                    # purpose instead of hiding every possible error.
                    continue

        yield item
Exemple #7
0
    def parse1(self, response):
        """Collect all listings on a page into one dict and yield it.

        Five parallel lists are extracted with the XPath expressions stored in
        ``self.xpath1`` .. ``self.xpath5``. One ``LianjiaItem`` is built per
        entry of the first list, taking the value at the same position from
        each of the other lists.

        Args:
            response: The downloaded page.

        Yields:
            dict: A single mapping of row index to ``LianjiaItem``.

        Raises:
            IndexError: If any of the other lists is shorter than the first.
        """
        sel = Selector(response)

        # (item field, extracted values) in the order the fields are filled.
        columns = (
            ('community_name', sel.xpath(self.xpath1).extract()),
            ('basic_info', sel.xpath(self.xpath2).extract()),
            ('location', sel.xpath(self.xpath3).extract()),
            ('total_price', sel.xpath(self.xpath4).extract()),
            ('per_flat', sel.xpath(self.xpath5).extract()),
        )

        # The first column decides how many rows are built.
        row_count = len(columns[0][1])
        collected = {}
        for row in range(row_count):
            entry = LianjiaItem()
            for field, values in columns:
                entry[field] = values[row]
            collected[row] = entry

        yield collected
Exemple #8
0
    def pageData(self, response):
        """Parse a listing page and yield one ``LianjiaItem`` per listing.

        Listings whose ``class`` attribute is ``list_app_daoliu`` are skipped.
        Listings whose ``houseInfo`` text is missing or has fewer than five
        ``|``-separated fields are skipped as well. Previously such a listing
        raised AttributeError or IndexError and dropped every later listing
        on the page.

        Args:
            response: The downloaded page; ``response.meta["info"]`` supplies
                the ``city`` value of every item.

        Yields:
            LianjiaItem: One item per well-formed listing. ``price`` is
            ``None`` when the unit-price node is missing.
        """
        print("=" * 50)
        city = response.meta.get("info")
        for page_li in response.xpath("//ul[@class='sellListContent']/li"):
            if page_li.xpath("@class").get() == "list_app_daoliu":
                continue

            # str() keeps the existing output format even when the price node
            # is missing (the value then reads "None万").
            money = str(
                page_li.xpath(
                    ".//div[@class='totalPrice']/span/text()").get()) + "万"
            address = page_li.xpath(
                ".//div[@class='positionInfo']/a/text()").get()

            # All house attributes arrive as one "|"-separated string.
            house_text = page_li.xpath(
                ".//div[@class='houseInfo']/text()").get()
            house_data = house_text.split("|") if house_text else []
            if len(house_data) < 5:
                # Not enough fields to read indexes 0, 1, 3 and 4 below.
                continue

            # Layout of the rooms.
            house_pattern = house_data[0]
            # Area size.
            house_size = house_data[1].strip()
            # Decoration level.
            house_degree = house_data[3].strip()
            # Floor.
            house_floor = house_data[4].strip()

            # Unit price with the "单价" label removed.
            price_text = page_li.xpath(
                ".//div[@class='unitPrice']/span/text()").get()
            price = None
            if price_text is not None:
                price = price_text.replace("单价", "")

            # NOTE(review): a sleep inside a Scrapy callback blocks the whole
            # crawler; Scrapy's DOWNLOAD_DELAY setting is the usual way to
            # throttle. Kept so the crawl rate does not change.
            time.sleep(0.5)
            yield LianjiaItem(city=city,
                              money=money,
                              address=address,
                              house_pattern=house_pattern,
                              house_size=house_size,
                              house_degree=house_degree,
                              house_floor=house_floor,
                              price=price)