Code Example #1
    def parse(self, response):
        sel = scrapy.Selector(response=response)
        nodes = sel.xpath('//div[@class="list-content"]/div[@_soj="xqlb"]')
        for node in nodes:
            item = ajk_residential_brief_item()
            eles = node.xpath('./div[@class="li-info"]/h3/a/@href').extract()
            item['residential_id'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath('./div[@class="li-info"]/h3/a/text()').extract()
            item['residential_name'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath(
                './div[@class="li-info"]/address/text()').extract()
            item['address'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath(
                './div[@class="li-info"]/p[@class="date"]/text()').extract()
            item['build_year'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath(
                './div[@class="li-info"]/p[@class="bot-tag"]/a/@href').extract(
                )
            item['lat_lon'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath(
                './div[@class="li-side"]/p/strong/text()').extract()
            item['unit_price'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath(
                './div[@class="li-side"]/p[@class="price-txt price-down"]/text()'
            ).extract()
            item['pct_change_down'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath(
                './div[@class="li-side"]/p[@class="price-txt price-no"]/text()'
            ).extract()
            item['pct_change_no'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath(
                './div[@class="li-side"]/p[@class="price-txt"]/text()'
            ).extract()
            item['pct_change_up'] = format(eles[0]) if len(eles) > 0 else ''
            yield item

        for node in nodes:
            eles = node.xpath('./div[@class="li-info"]/h3/a/@href').extract()
            url = format(eles[0]) if len(eles) > 0 else ''
            if url != '':
                yield scrapy.Request(url,
                                     callback=self.baseinfo_parse,
                                     headers=self.headers)
            else:
                continue
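The ajk_residential_brief_item class filled above is not shown on this page. A minimal sketch of how a matching Scrapy Item could be declared, using exactly the fields the callback assigns (the field names come from the code; the declaration itself is an assumption, not the project's actual items.py):

import scrapy


class ajk_residential_brief_item(scrapy.Item):
    # Fields assigned in the parse() callback of Code Example #1.
    residential_id = scrapy.Field()
    residential_name = scrapy.Field()
    address = scrapy.Field()
    build_year = scrapy.Field()
    lat_lon = scrapy.Field()
    unit_price = scrapy.Field()
    pct_change_down = scrapy.Field()
    pct_change_no = scrapy.Field()
    pct_change_up = scrapy.Field()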
Code Example #2
    def residential_baseinfo_parse(self, response):
        sel = scrapy.Selector(response=response)
        item = ftx_residential_baseinfo_item()

        item['residential_id'] = response.url

        eles = sel.xpath(
            '//div[@class="ceninfo_sq"]/h1/a[@class="tt"]/text()').extract()
        item['residential_name'] = format(eles[0]) if len(eles) > 0 else ''

        eles = sel.xpath(
            '//div[@class="box detaiLtop mt20 clearfix"]/dl[1]/dd/span/text()'
        ).extract()
        item['price'] = format(eles[0]) if len(eles) > 0 else ''

        eles = sel.xpath(
            '//div[@class="box detaiLtop mt20 clearfix"]/dl[2]/dd/span/text()'
        ).extract()
        item['price_mon_change'] = format(eles[0]) if len(eles) > 0 else ''

        eles = sel.xpath(
            '//div[@class="box detaiLtop mt20 clearfix"]/dl[@class="last"]/dd/span/text()'
        ).extract()
        item['price_year_change'] = format(eles[0]) if len(eles) > 0 else ''

        nodes = sel.xpath(
            '//div[@class="inforwrap clearfix"]/dl[@class=" clearfix mr30"]/dd'
        )
        # Map the Chinese labels in <strong> to item fields; the lookup below
        # matches by substring, so labels scraped with trailing punctuation
        # (e.g. a colon) still resolve.
        field_map = {
            '小区地址': 'address',
            '所属区域': 'district',
            '物业类别': 'build_type',
            '建筑年代': 'build_year',
            '开 发 商': 'develop_company',
            '建筑类型': 'build_type',
            '建筑面积': 'build_area',
            '占地面积': 'build_area',
            '房屋总数': 'house_count',
            '楼栋总数': 'build_count',
            '物业公司': 'property_company',
            '绿 化 率': 'green_rate',
            '容 积 率': 'volumetric_rate',
            '物 业 费': 'property_cost'
        }
        for node in nodes:
            strongs = node.xpath('./strong/text()').extract()
            strong = strongs[0] if len(strongs) > 0 else ''
            values = node.xpath('text()').extract()
            value = values[0] if len(values) > 0 else ''
            keys = [key for key in field_map if key in strong]
            for key in keys:
                item[field_map[key]] = value
        yield item
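The label-to-field dispatch above matches keys by substring rather than exact equality, so a scraped label that keeps its trailing colon still resolves (this is exactly what the scratch test in Code Example #6 below probes). A minimal standalone illustration, with the label string assumed:

field_map = {'小区地址': 'address', '所属区域': 'district'}
strong = '小区地址:'  # label as scraped, trailing colon included (assumed example)
matched = [field_map[key] for key in field_map if key in strong]
# matched == ['address']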
Code Example #3
    def parse(self, response):
        sel = scrapy.Selector(response=response)

        nodes = sel.xpath('//ul[@class="listContent"]/li')
        for node in nodes:
            item = Residential_Brief()
            eles = node.xpath('./div[@class="info"]/div[@class="title"]/a/@href').extract()
            item['residential_id'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath('./div[@class="info"]/div[@class="title"]/a/text()').extract()
            item['residential_name'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath('./div[@class="info"]/div[@class="positionInfo"]/a[@class="district"]/text()').extract()
            item['district'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath('./div[@class="info"]/div[@class="positionInfo"]/a[@class="bizcircle"]/text()').extract()
            item['bizcircle'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath('./div[@class="info"]/div[@class="positionInfo"]/text()').extract()
            item['build_year'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath('./div[@class="xiaoquListItemRight"]/div[@class="xiaoquListItemPrice"]/div[@class="totalPrice"]/span/text()').extract()
            item['avg_price'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath('./div[@class="xiaoquListItemRight"]/div[@class="xiaoquListItemPrice"]/div[@class="priceDesc"]/text()').extract()
            item['avg_price_date'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath('./div[@class="xiaoquListItemRight"]/div[@class="xiaoquListItemSellCount"]/a[@class="totalSellCount"]/span/text()').extract()
            item['on_sale_count'] = format(eles[0]) if len(eles) > 0 else ''
            yield item

        for node in nodes:
            eles = node.xpath('./div[@class="info"]/div[@class="title"]/a/@href').extract()
            url = format(eles[0]) if len(eles) > 0 else ''
            if url != '':
                yield scrapy.Request(url, callback=self.resident_detail_parse)
            else:
                continue
Code Example #4
    def residential_trad_parse(self, response):
        sel = scrapy.Selector(response=response)
        nodes = sel.xpath('//div[@class="dealSent sentwrap"]/table/tbody/tr')

        name_eles = sel.xpath(
            '//div[@class="ceninfo_sq"]/h1/a[@class="tt"]/text()').extract()
        name = format(name_eles[0]) if len(name_eles) > 0 else ''
        for node in nodes:
            item = ftx_residential_trade_item()
            item['residential_id'] = response.url
            item['residential_name'] = name
            eles = node.xpath(
                './td[@class="firsttd"]/div[@class="hspro"]/p[1]/b/text()'
            ).extract()
            item['house_type'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath(
                './td[@class="firsttd"]/div[@class="hspro"]/p[2]/text()'
            ).extract()
            item['floor'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath(
                './td[@class="firsttd"]/div[@class="hspro"]/p[3]/text()'
            ).extract()
            item['orientation'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath('./td[2]/text()').extract()
            item['area'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath('./td[3]/b/text()').extract()
            item['trade_date'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath('./td[4]/b/text()').extract()
            item['price'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath('./td[5]/text()').extract()
            item['unit_price'] = format(eles[0]) if len(eles) > 0 else ''
            yield item

        # Only the first trade-history page ends in 'chengjiao/'; read the
        # pager text (e.g. '1/5', the part after '/' is taken as the total)
        # and queue the remaining pages through this same callback.
        if response.url.endswith('chengjiao/'):
            page_str = sel.xpath(
                '//div[@class="detailTitle clearfix"]/div[@class="frpageChange  floatr"]/span[@class=" floatl ml10"]/text()'
            ).extract()
            page = page_str[0].split('/')[1] if len(page_str) > 0 else 1
            for i in range(1, int(page)):
                # builds e.g. '.../chengjiao/' + '-p12-t11/' for i == 2
                url = response.url + '-p1%d-t11/'
                yield scrapy.Request(url % i,
                                     callback=self.residential_trad_parse)
Code Example #5
    def parse(self, response):
        sel = scrapy.Selector(response=response)
        nodes = sel.xpath('//div[@class="houseList"]/div[@class="list rel"]')
        for node in nodes:
            item = ftx_residential_brief_item()
            eles = node.xpath(
                './dl[@class="plotListwrap clearfix"]/dd/p/a[@class="plotTit"]/@href'
            ).extract()
            item['residential_id'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath(
                './dl[@class="plotListwrap clearfix"]/dd/p/a[@class="plotTit"]/text()'
            ).extract()
            item['residential_name'] = format(eles[0]) if len(eles) > 0 else ''

            # Derive build_type from the rating <li> icons: 5 minus 1 per
            # 'no2' element minus 0.5 per 'half' element.
            eles = node.xpath(
                './dl[@class="plotListwrap clearfix"]/dd/p/span[@class="dj"]/li[@class="half"]'
            ).extract()
            els2 = node.xpath(
                './dl[@class="plotListwrap clearfix"]/dd/p/span[@class="dj"]/li[@class="no2"]'
            ).extract()
            item['build_type'] = 5 - len(els2) - 0.5 * len(eles)

            eles = node.xpath(
                './dl[@class="plotListwrap clearfix"]/dd/p[2]/a/text()|./dl[@class="plotListwrap clearfix"]/dd/p[2]/text()'
            ).extract()
            item['address'] = "".join([format(e) for e in eles])

            eles = node.xpath(
                './dl[@class="plotListwrap clearfix"]/dd/ul[@class="sellOrRenthy clearfix"]/li[1]/a/text()'
            ).extract()
            item['on_sale_count'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath(
                './dl[@class="plotListwrap clearfix"]/dd/ul[@class="sellOrRenthy clearfix"]/li[2]/a/text()'
            ).extract()
            item['rental_count'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath(
                './dl[@class="plotListwrap clearfix"]/dd/ul[@class="sellOrRenthy clearfix"]/li[3]/text()'
            ).extract()
            item['build_year'] = format(eles[0]) if len(eles) > 0 else ''

            eles = node.xpath(
                './div[@class="listRiconwrap"]/p[@class="priceAverage"]/span/text()'
            ).extract()
            item['price'] = "".join([format(e) for e in eles])

            eles = node.xpath(
                './div[@class="listRiconwrap"]/p[@class="ratio"]/span/text()'
            ).extract()
            item['price_pct_change'] = format(eles[0]) if len(eles) > 0 else ''
            yield item

        for node in nodes:
            eles = node.xpath(
                './dl[@class="plotListwrap clearfix"]/dd/p/a[@class="plotTit"]/@href'
            ).extract()
            url = format(eles[0]) if len(eles) > 0 else ''
            if url != '':
                url_baseinfo = ''
                url_chengjiao = ''
                if url.endswith('esf/'):
                    url_baseinfo = url.replace('esf', 'xiangqing')
                    url_chengjiao = url.replace('esf', 'chengjiao')
                else:
                    url_baseinfo = url + 'xiangqing/'
                    url_chengjiao = url + 'chengjiao/'
                yield scrapy.Request(url_baseinfo,
                                     callback=self.residential_baseinfo_parse)
                yield scrapy.Request(url_chengjiao,
                                     callback=self.residential_trad_parse)

            else:
                continue
Code Example #6
File: test.py  Project: xiaoxu727/HouseCrawler
t.tt = 'tt'


# Closure: gn captures num from the enclosing mulby scope
def mulby(num):
    def gn(val):
        return num * val

    return gn


zw = mulby(7)
# print(zw(9))

a = ['de ', 'das ']
b = ''.join([format(i) for i in a])
# print(b)

# t = ['a'  for i in a if 'a' in i else '']
# set1 = {x for x in 'hello world' if x not in 'low level'}
set1 = [True for x in a if 'c' in x]
# print('小区地址' in '小区地址:')
# print(set1)

field_map = {'小区地址': 'address', '小所属区域': 'district'}
item = ftx_residential_baseinfo_item()
keys = [x for x in field_map if '小' in x]
# keys =['']
# for key in keys:
#     item[field_map[key]] = 'value'
# print(item)
Code Example #7
    def resident_detail_parse(self, response):
        sel = scrapy.Selector(response=response)
        item = Residential_Detail()
        item['residential_id'] = response.url
        eles = sel.xpath("//div[@class='detailHeader fl']/h1[@class='detailTitle']/text()").extract()
        item['residential_name'] = format(eles[0]) if len(eles) > 0 else ''
        eles = sel.xpath("//div[@class='detailHeader fl']/div[@class='detailDesc']/text()").extract()
        item['address'] = format(eles[0]) if len(eles) > 0 else ''

        eles = sel.xpath("//div[@class='xiaoquOverview']/div[@class='xiaoquDescribe fr']/div[@class='xiaoquInfo']/div[@class='xiaoquInfoItem']/span[@class='xiaoquInfoContent']/text()").extract()
        item['build_year'] = format(eles[0]) if len(eles) > 0 else ''
        item['build_type'] = format(eles[1]) if len(eles) > 1 else ''
        item['property_cost'] = format(eles[2]) if len(eles) > 2 else ''
        item['property_company'] = format(eles[3]) if len(eles) > 3 else ''
        item['develop_company'] = format(eles[4]) if len(eles) > 4 else ''
        item['building_count'] = format(eles[5]) if len(eles) > 5 else ''
        item['house_count'] = format(eles[6]) if len(eles) > 6 else ''
        item['near_shop'] = format(eles[7]) if len(eles) > 7 else ''

        # The coordinates are embedded in inline JS as resblockPosition:'...',
        # so strip the label, the quotes, and the trailing comma the regex keeps.
        eles = sel.re("resblockPosition.+?',")
        item['lat_lon'] = format(eles[0]).replace('resblockPosition:', '').replace("'", "").rstrip(',') if len(eles) > 0 else ''
        yield item
Code Example #8
    def baseinfo_parse(self, response):
        item = ajk_residential_baseinfo_item()
        sel = scrapy.Selector(response=response)

        item['residential_id'] = response.url

        eles = sel.xpath('//div[@class="comm-title"]/h1/text()').extract()
        item['residential_name'] = format(eles[0]) if len(eles) > 0 else ''

        eles = sel.xpath(
            '//div[@class="comm-title"]/h1/span[@class="sub-hd"]/text()'
        ).extract()
        item['address'] = format(eles[0]) if len(eles) > 0 else ''

        eles = sel.xpath(
            '//div[@class="price"]/span[@class="average"]/text()|//div[@class="price"]/span[@class="average no-data"]/text()'
        ).extract()
        item['avge_price'] = format(eles[0]) if len(eles) > 0 else ''

        eles = sel.xpath(
            '//div[@class="price"]/span[@class="status up"]/text()').extract()
        item['pct_change_up'] = format(eles[0]) if len(eles) > 0 else ''

        eles = sel.xpath(
            '//div[@class="price"]/span[@class="status level"]/text()'
        ).extract()
        item['pct_change_level'] = format(eles[0]) if len(eles) > 0 else ''

        eles = sel.xpath(
            '//div[@class="price"]/span[@class="status down"]/text()').extract(
            )
        item['pct_change_down'] = format(eles[0]) if len(eles) > 0 else ''

        eles = sel.xpath(
            '//div[@class="basic-infos-box"]/dl[@class="basic-parms-mod"]/dd/text()'
        ).extract()
        # Note: this mapping is declared but not consulted below; the dd values
        # are assigned to the item by position instead.
        field_map = {
            '物业类型': 'property_type',
            '物业费': 'property_cost',
            '总建面积': 'area',
            '总户数': 'house_count',
            '建造年代': 'build_year',
            '停车位': 'parking_count',
            '容  积  率': 'volumetric_rate',
            '绿化率': 'green_rate',
            '开  发  商': 'develop_company',
            '物业公司': 'property_company'
        }
        item['property_type'] = format(eles[0]) if len(eles) > 0 else ''
        item['property_cost'] = format(eles[1]) if len(eles) > 1 else ''
        item['area'] = format(eles[2]) if len(eles) > 2 else ''
        item['house_count'] = format(eles[3]) if len(eles) > 3 else ''
        item['build_year'] = format(eles[4]) if len(eles) > 4 else ''
        item['parking_count'] = format(eles[5]) if len(eles) > 5 else ''
        item['volumetric_rate'] = format(eles[6]) if len(eles) > 6 else ''
        item['green_rate'] = format(eles[7]) if len(eles) > 7 else ''
        item['develop_company'] = format(eles[8]) if len(eles) > 8 else ''
        item['property_company'] = format(eles[9]) if len(eles) > 9 else ''

        eles = sel.xpath(
            '//div[@class="basic-infos-box"]/div[@class="houses-sets-mod j-house-num"]/a[@class="num ershou-num"]/text()'
        ).extract()
        # Extracted but never attached to the item in this example.
        second_house_count = format(eles[0]) if len(eles) > 0 else ''

        eles = sel.xpath(
            '//div[@class="basic-infos-box"]/div[@class="houses-sets-mod j-house-num"]/a[@data-soj="baseinfozu"]/text()'
        ).extract()
        rental_count = format(eles[0]) if len(eles) > 0 else ''  # likewise never attached to the item
        yield item
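The field_map declared in this last example is never consulted; the dd values are written into the item by position. A hedged sketch of how the same mapping could drive the assignment in the style of Code Example #2, assuming each value <dd> is preceded by a label element (the dt selector below is an assumption and does not appear in the original code):

        # Hypothetical alternative: pair labels with values and route them
        # through field_map instead of indexing eles positionally.
        labels = sel.xpath(
            '//div[@class="basic-infos-box"]/dl[@class="basic-parms-mod"]/dt/text()'
        ).extract()
        values = sel.xpath(
            '//div[@class="basic-infos-box"]/dl[@class="basic-parms-mod"]/dd/text()'
        ).extract()
        for label, value in zip(labels, values):
            for key in field_map:
                if key in label:
                    item[field_map[key]] = format(value)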