def parse(self, response):
    # Parse the Anjuke community list page: one brief item per community card.
    sel = scrapy.Selector(response=response)
    nodes = sel.xpath('//div[@class="list-content"]/div[@_soj="xqlb"]')
    for node in nodes:
        item = ajk_residential_brief_item()
        eles = node.xpath('./div[@class="li-info"]/h3/a/@href').extract()
        item['residential_id'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="li-info"]/h3/a/text()').extract()
        item['residential_name'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="li-info"]/address/text()').extract()
        item['address'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="li-info"]/p[@class="date"]/text()').extract()
        item['build_year'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="li-info"]/p[@class="bot-tag"]/a/@href').extract()
        item['lat_lon'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="li-side"]/p/strong/text()').extract()
        item['unit_price'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="li-side"]/p[@class="price-txt price-down"]/text()').extract()
        item['pct_change_down'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="li-side"]/p[@class="price-txt price-no"]/text()').extract()
        item['pct_change_no'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="li-side"]/p[@class="price-txt"]/text()').extract()
        item['pct_change_up'] = format(eles[0]) if len(eles) > 0 else ''
        yield item
    # Follow each community link to its base-info page.
    for node in nodes:
        eles = node.xpath('./div[@class="li-info"]/h3/a/@href').extract()
        url = format(eles[0]) if len(eles) > 0 else ''
        if url != '':
            yield scrapy.Request(url, callback=self.baseinfo_parse, headers=self.headers)
def residential_baseinfo_parse(self, response):
    # Parse a Fang.com community base-info (xiangqing) page.
    sel = scrapy.Selector(response=response)
    item = ftx_residential_baseinfo_item()
    item['residential_id'] = response.url
    eles = sel.xpath('//div[@class="ceninfo_sq"]/h1/a[@class="tt"]/text()').extract()
    item['residential_name'] = format(eles[0]) if len(eles) > 0 else ''
    eles = sel.xpath('//div[@class="box detaiLtop mt20 clearfix"]/dl[1]/dd/span/text()').extract()
    item['price'] = format(eles[0]) if len(eles) > 0 else ''
    eles = sel.xpath('//div[@class="box detaiLtop mt20 clearfix"]/dl[2]/dd/span/text()').extract()
    item['price_mon_change'] = format(eles[0]) if len(eles) > 0 else ''
    eles = sel.xpath('//div[@class="box detaiLtop mt20 clearfix"]/dl[@class="last"]/dd/span/text()').extract()
    item['price_year_change'] = format(eles[0]) if len(eles) > 0 else ''
    # Each <dd> holds a "<strong>label</strong>value" pair; map page labels to item fields.
    field_map = {
        '小区地址': 'address',
        '所属区域': 'district',
        '物业类别': 'build_type',
        '建筑年代': 'build_year',
        '开 发 商': 'develop_company',
        '建筑类型': 'build_type',
        '建筑面积': 'build_area',
        '占地面积': 'build_area',
        '房屋总数': 'house_count',
        '楼栋总数': 'build_count',
        '物业公司': 'property_company',
        '绿 化 率': 'green_rate',
        '容 积 率': 'volumetric_rate',
        '物 业 费': 'property_cost',
    }
    nodes = sel.xpath('//div[@class="inforwrap clearfix"]/dl[@class=" clearfix mr30"]/dd')
    for node in nodes:
        strongs = node.xpath('./strong/text()').extract()
        strong = strongs[0] if len(strongs) > 0 else ''
        values = node.xpath('text()').extract()
        value = values[0] if len(values) > 0 else ''
        for key in [k for k in field_map if k in strong]:
            item[field_map[key]] = value
    yield item
def parse(self, response):
    sel = scrapy.Selector(response=response)
    nodes = sel.xpath('//ul[@class="listContent"]/li')
    for node in nodes:
        item = Residential_Brief()
        eles = node.xpath('./div[@class="info"]/div[@class="title"]/a/@href').extract()
        item['residential_id'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="info"]/div[@class="title"]/a/text()').extract()
        item['residential_name'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="info"]/div[@class="positionInfo"]/a[@class="district"]/text()').extract()
        item['district'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="info"]/div[@class="positionInfo"]/a[@class="bizcircle"]/text()').extract()
        item['bizcircle'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="info"]/div[@class="positionInfo"]/text()').extract()
        item['build_year'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="xiaoquListItemRight"]/div[@class="xiaoquListItemPrice"]/div[@class="totalPrice"]/span/text()').extract()
        item['avg_price'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="xiaoquListItemRight"]/div[@class="xiaoquListItemPrice"]/div[@class="priceDesc"]/text()').extract()
        item['avg_price_date'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="xiaoquListItemRight"]/div[@class="xiaoquListItemSellCount"]/a[@class="totalSellCount"]/span/text()').extract()
        item['on_sale_count'] = format(eles[0]) if len(eles) > 0 else ''
        yield item
    for node in nodes:
        eles = node.xpath('./div[@class="info"]/div[@class="title"]/a/@href').extract()
        url = format(eles[0]) if len(eles) > 0 else ''
        if url != '':
            yield scrapy.Request(url, callback=self.resident_detail_parse)
def residential_trad_parse(self, response):
    # Parse a Fang.com community transaction (chengjiao) page: one item per deal row.
    sel = scrapy.Selector(response=response)
    nodes = sel.xpath('//div[@class="dealSent sentwrap"]/table/tbody/tr')
    eles = sel.xpath('//div[@class="ceninfo_sq"]/h1/a[@class="tt"]/text()').extract()
    name = eles[0] if len(eles) > 0 else ''
    for node in nodes:
        item = ftx_residential_trade_item()
        item['residential_id'] = response.url
        item['residential_name'] = name
        eles = node.xpath('./td[@class="firsttd"]/div[@class="hspro"]/p[1]/b/text()').extract()
        item['house_type'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./td[@class="firsttd"]/div[@class="hspro"]/p[2]/text()').extract()
        item['floor'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./td[@class="firsttd"]/div[@class="hspro"]/p[3]/text()').extract()
        item['orientation'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./td[2]/text()').extract()
        item['area'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./td[3]/b/text()').extract()
        item['trade_date'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./td[4]/b/text()').extract()
        item['price'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./td[5]/text()').extract()
        item['unit_price'] = format(eles[0]) if len(eles) > 0 else ''
        yield item
    # On the first page only, read the total page count from the "current/total"
    # pager text and request the remaining pages.
    if response.url.endswith('chengjiao/'):
        page_str = sel.xpath('//div[@class="detailTitle clearfix"]/div[@class="frpageChange floatr"]/span[@class=" floatl ml10"]/text()').extract()
        page = page_str[0].split('/')[1] if len(page_str) > 0 else 1
        for i in range(1, int(page)):
            url = response.url + '-p1%d-t11/'
            yield scrapy.Request(url % i, callback=self.residential_trad_parse)
def parse(self, response):
    # Parse the Fang.com community list page: one brief item per listing.
    sel = scrapy.Selector(response=response)
    nodes = sel.xpath('//div[@class="houseList"]/div[@class="list rel"]')
    for node in nodes:
        item = ftx_residential_brief_item()
        eles = node.xpath('./dl[@class="plotListwrap clearfix"]/dd/p/a[@class="plotTit"]/@href').extract()
        item['residential_id'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./dl[@class="plotListwrap clearfix"]/dd/p/a[@class="plotTit"]/text()').extract()
        item['residential_name'] = format(eles[0]) if len(eles) > 0 else ''
        # Derive a score from the star icons: 5 minus one per "no2" icon and 0.5 per "half" icon.
        eles = node.xpath('./dl[@class="plotListwrap clearfix"]/dd/p/span[@class="dj"]/li[@class="half"]').extract()
        els2 = node.xpath('./dl[@class="plotListwrap clearfix"]/dd/p/span[@class="dj"]/li[@class="no2"]').extract()
        item['build_type'] = 5 - len(els2) - 0.5 * len(eles)
        eles = node.xpath('./dl[@class="plotListwrap clearfix"]/dd/p[2]/a/text()|./dl[@class="plotListwrap clearfix"]/dd/p[2]/text()').extract()
        item['address'] = "".join([format(e) for e in eles])
        eles = node.xpath('./dl[@class="plotListwrap clearfix"]/dd/ul[@class="sellOrRenthy clearfix"]/li[1]/a/text()').extract()
        item['on_sale_count'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./dl[@class="plotListwrap clearfix"]/dd/ul[@class="sellOrRenthy clearfix"]/li[2]/a/text()').extract()
        item['rental_count'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./dl[@class="plotListwrap clearfix"]/dd/ul[@class="sellOrRenthy clearfix"]/li[3]/text()').extract()
        item['build_year'] = format(eles[0]) if len(eles) > 0 else ''
        eles = node.xpath('./div[@class="listRiconwrap"]/p[@class="priceAverage"]/span/text()').extract()
        item['price'] = "".join([format(e) for e in eles])
        eles = node.xpath('./div[@class="listRiconwrap"]/p[@class="ratio"]/span/text()').extract()
        item['price_pct_change'] = format(eles[0]) if len(eles) > 0 else ''
        yield item
    # Follow each community to its base-info (xiangqing) and transaction (chengjiao) pages.
    for node in nodes:
        eles = node.xpath('./dl[@class="plotListwrap clearfix"]/dd/p/a[@class="plotTit"]/@href').extract()
        url = format(eles[0]) if len(eles) > 0 else ''
        if url != '':
            if url.endswith('esf/'):
                url_baseinfo = url.replace('esf', 'xiangqing')
                url_chengjiao = url.replace('esf', 'chengjiao')
            else:
                url_baseinfo = url + 'xiangqing/'
                url_chengjiao = url + 'chengjiao/'
            yield scrapy.Request(url_baseinfo, callback=self.residential_baseinfo_parse)
            yield scrapy.Request(url_chengjiao, callback=self.residential_trad_parse)
t.tt = 'tt'

# Closure: mulby(num) returns a function that multiplies its argument by num.
def mulby(num):
    def gn(val):
        return num * val
    return gn

zw = mulby(7)
# print(zw(9))

a = ['de ', 'das ']
b = ''.join([format(i) for i in a])
# print(b)

# t = ['a' for i in a if 'a' in i else '']
# set1 = {x for x in 'hello world' if x not in 'low level'}
set1 = [True for x in a if 'c' in x]
# print('小区地址' in '小区地址:')
# print(set1)

# Scratch test of the substring-based field_map lookup used in the baseinfo parsers.
field_map = {'小区地址': 'address', '小所属区域': 'district'}
item = ftx_residential_baseinfo_item()
keys = [x for x in field_map if '小' in x]
# keys = ['']
# for key in keys:
#     item[field_map[key]] = 'value'
# print(item)
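# A minimal sketch (not project code) of a more forgiving lookup for the field_map
# pattern explored above: page labels sometimes carry extra spaces or a trailing
# colon ('开 发 商', '小区地址:'), so normalizing before matching would avoid keeping
# spaced and unspaced duplicates of the same key. `normalize_label` and
# `lookup_field` are hypothetical helpers introduced only for illustration.
def normalize_label(label):
    # Drop all whitespace (including full-width spaces) and any trailing colons.
    return ''.join(label.split()).rstrip(':：')

def lookup_field(field_map, label):
    normalized = {normalize_label(k): v for k, v in field_map.items()}
    return normalized.get(normalize_label(label))

# print(lookup_field({'开 发 商': 'develop_company'}, '开发商:'))  # -> 'develop_company'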
def resident_detail_parse(self, response):
    # Parse a Lianjia community detail page.
    sel = scrapy.Selector(response=response)
    item = Residential_Detail()
    item['residential_id'] = response.url
    eles = sel.xpath("//div[@class='detailHeader fl']/h1[@class='detailTitle']/text()").extract()
    item['residential_name'] = format(eles[0]) if len(eles) > 0 else ''
    eles = sel.xpath("//div[@class='detailHeader fl']/div[@class='detailDesc']/text()").extract()
    item['address'] = format(eles[0]) if len(eles) > 0 else ''
    # The xiaoquInfo block lists attributes in a fixed order; values are picked by position.
    eles = sel.xpath("//div[@class='xiaoquOverview']/div[@class='xiaoquDescribe fr']/div[@class='xiaoquInfo']/div[@class='xiaoquInfoItem']/span[@class='xiaoquInfoContent']/text()").extract()
    item['build_year'] = format(eles[0]) if len(eles) > 0 else ''
    item['build_type'] = format(eles[1]) if len(eles) > 1 else ''
    item['property_cost'] = format(eles[2]) if len(eles) > 2 else ''
    item['property_company'] = format(eles[3]) if len(eles) > 3 else ''
    item['develop_company'] = format(eles[4]) if len(eles) > 4 else ''
    item['building_count'] = format(eles[5]) if len(eles) > 5 else ''
    item['house_count'] = format(eles[6]) if len(eles) > 6 else ''
    item['near_shop'] = format(eles[7]) if len(eles) > 7 else ''
    # Coordinates are embedded in the page's JavaScript as resblockPosition:'lng,lat'.
    eles = sel.re("resblockPosition.+?',")
    item['lat_lon'] = str.replace(format(eles[0]), 'resblockPosition:', '').replace("'", "") if len(eles) > 0 else ''
    yield item
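# A small standalone check of the lat_lon extraction above, run against a
# hypothetical sample of the embedded script text (the real page content may
# differ). It mirrors the regex and string cleanup used in resident_detail_parse.
import re

sample = "resblockPosition:'116.397,39.909',"
match = re.search(r"resblockPosition.+?',", sample)
if match:
    # The cleanup keeps the trailing comma, just as in the spider code.
    print(match.group(0).replace('resblockPosition:', '').replace("'", ""))  # -> 116.397,39.909,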
def baseinfo_parse(self, response):
    # Parse an Anjuke community detail page.
    item = ajk_residential_baseinfo_item()
    sel = scrapy.Selector(response=response)
    item['residential_id'] = response.url
    eles = sel.xpath('//div[@class="comm-title"]/h1/text()').extract()
    item['residential_name'] = format(eles[0]) if len(eles) > 0 else ''
    eles = sel.xpath('//div[@class="comm-title"]/h1/span[@class="sub-hd"]/text()').extract()
    item['address'] = format(eles[0]) if len(eles) > 0 else ''
    eles = sel.xpath('//div[@class="price"]/span[@class="average"]/text()|//div[@class="price"]/span[@class="average no-data"]/text()').extract()
    item['avge_price'] = format(eles[0]) if len(eles) > 0 else ''
    eles = sel.xpath('//div[@class="price"]/span[@class="status up"]/text()').extract()
    item['pct_change_up'] = format(eles[0]) if len(eles) > 0 else ''
    eles = sel.xpath('//div[@class="price"]/span[@class="status level"]/text()').extract()
    item['pct_change_level'] = format(eles[0]) if len(eles) > 0 else ''
    eles = sel.xpath('//div[@class="price"]/span[@class="status down"]/text()').extract()
    item['pct_change_down'] = format(eles[0]) if len(eles) > 0 else ''
    # Basic parameters appear as <dd> values in a fixed order; field_map documents the
    # labels but the assignment below is positional.
    eles = sel.xpath('//div[@class="basic-infos-box"]/dl[@class="basic-parms-mod"]/dd/text()').extract()
    field_map = {
        '物业类型': 'property_type',
        '物业费': 'property_cost',
        '总建面积': 'area',
        '总户数': 'house_count',
        '建造年代': 'build_year',
        '停车位': 'parking_count',
        '容 积 率': 'volumetric_rate',
        '绿化率': 'green_rate',
        '开 发 商': 'develop_company',
        '物业公司': 'property_company',
    }
    item['property_type'] = format(eles[0]) if len(eles) > 0 else ''
    item['property_cost'] = format(eles[1]) if len(eles) > 1 else ''
    item['area'] = format(eles[2]) if len(eles) > 2 else ''
    item['house_count'] = format(eles[3]) if len(eles) > 3 else ''
    item['build_year'] = format(eles[4]) if len(eles) > 4 else ''
    item['parking_count'] = format(eles[5]) if len(eles) > 5 else ''
    item['volumetric_rate'] = format(eles[6]) if len(eles) > 6 else ''
    item['green_rate'] = format(eles[7]) if len(eles) > 7 else ''
    item['develop_company'] = format(eles[8]) if len(eles) > 8 else ''
    item['property_company'] = format(eles[9]) if len(eles) > 9 else ''
    # Second-hand and rental listing counts are extracted but not yet stored on the item.
    eles = sel.xpath('//div[@class="basic-infos-box"]/div[@class="houses-sets-mod j-house-num"]/a[@class="num ershou-num"]/text()').extract()
    second_house_count = format(eles[0]) if len(eles) > 0 else ''
    eles = sel.xpath('//div[@class="basic-infos-box"]/div[@class="houses-sets-mod j-house-num"]/a[@data-soj="baseinfozu"]/text()').extract()
    rental_count = format(eles[0]) if len(eles) > 0 else ''
    yield item
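# A minimal sketch of how the field_map defined in baseinfo_parse could drive the
# assignment instead of positional indexing. It assumes the basic-parms-mod <dl>
# exposes label/value pairs as alternating <dt>/<dd> elements, which is an
# assumption about the page layout, not something verified here; `fill_from_labels`
# is a hypothetical helper, not part of the spiders.
def fill_from_labels(item, sel, field_map):
    dts = sel.xpath('//dl[@class="basic-parms-mod"]/dt/text()').extract()
    dds = sel.xpath('//dl[@class="basic-parms-mod"]/dd/text()').extract()
    for label, value in zip(dts, dds):
        for key, field in field_map.items():
            # Ignore internal spaces so '容 积 率' and '容积率' match the same label.
            if ''.join(key.split()) in ''.join(label.split()):
                item[field] = value.strip()
    return item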