def housing_handle(self, response):
    """Parse a community (xiaoqu) detail page and yield a merged Item.

    Scrapes the address, unit price and the numbered rows of the
    community-info table, then merges in the partial item carried in
    ``response.meta["item"]``.
    """
    item = response.meta.get("item")
    info = response.xpath("//div[@class='xiaoquInfo']")

    def row_text(position):
        # Content span of the N-th "xiaoquInfoItem" row.
        return info.xpath(
            "./div[@class='xiaoquInfoItem'][%d]"
            "/span[@class='xiaoquInfoContent']/text()" % position
        ).extract_first()

    # Field name -> scraped value, in the original declaration order.
    scraped = {
        "PropertyAddress": response.xpath(
            "//div[@class='detailDesc']/text()").extract_first(),
        "PriceUnit": response.xpath(
            "//span[@class='xiaoquUnitPrice']/text()").extract_first(),
        "BuildedTime": row_text(1),
        "BuildingType": row_text(2),
        "PropertyFee": row_text(3),
        "PropertyCompany": row_text(4),
        "Developers": row_text(5),
        "TotalBuilding": row_text(6),
        "TotalHouseholds": row_text(7),
        # Last row may contain nested markup, so flatten it with string(.).
        "NearbyStores": info.xpath(
            "./div[@class='xiaoquInfoItem'][8]/span[@class='xiaoquInfoContent']"
        ).xpath("string(.)").extract_first(),
    }

    item1 = Item()
    for field_name, value in scraped.items():
        # Fields are declared dynamically on this anonymous Item.
        item1.fields[field_name] = Field()
        item1[field_name] = value
    item1.update(item)
    yield item1
def handle_1(self, response):
    """Reassemble the page's JavaScript-built redirect URL and follow it.

    The page constructs the real destination with a series of
    ``url += '...'`` statements; collect the quoted fragments, join
    them, and schedule the resulting URL with :meth:`parse`.
    """
    item1 = response.meta.get("item1")
    req = response.meta.get("req")
    # Raw string: the original pattern was a plain literal containing
    # "\+", an invalid escape sequence (SyntaxWarning on Python 3.12+).
    fragments = re.findall(r"url \+= '(.*?)'", response.text, re.S)
    # str.join is linear; the original repeated "+=" loop is quadratic.
    detail_url = "".join(fragments)
    item2 = Item()
    item2.fields["NewUrl"] = Field()
    item2["NewUrl"] = detail_url
    item2.update(item1)
    yield scrapy.Request(
        url=detail_url,
        callback=self.parse,
        meta={"req": req, "item2": item2, "last_page": True},
    )
def handle_1(self, response):
    """Parse a community detail page (alternate site layout) and yield a merged Item."""
    item = response.meta.get("item")

    def first(xpath_expr):
        # First text match for the given expression, or None.
        return response.xpath(xpath_expr).extract_first()

    # The detail list mixes the build year and the building type; the
    # entry containing "年" (year) is the build time, anything else is
    # treated as the building type (last occurrence wins, as before).
    built_time = None
    building_type = None
    for entry in response.xpath(
        "//div[@class='xqfangs detail_bor_bottom']/ul[@class='clear']/li/text()"
    ).extract():
        if "年" in entry:
            built_time = entry
        else:
            building_type = entry

    # Field name -> scraped value, in the original declaration order.
    scraped = {
        "PropertyAddress": first("//div[contains(@class,'rent-top')]/a/text()"),
        "PriceUnit": first("//div[contains(@class,'junjia')]/span/text()"),
        "BuildedTime": built_time,
        "BuildingType": building_type,
        "PropertyCompany": first("//ul/li[@class='wuyes']/em/text()"),
        "Developers": first("//ul/li[@class='kaifas']/em/text()"),
        "TotalBuilding": first("//div[@class='xqsaleinfo']/ul/li[1]/span/text()"),
        "TotalHouseholds": first("//div[@class='xqsaleinfo']/ul/li[2]/span/text()"),
        "NearbyStores": first("//div[@class='xqsaleinfo']/ul/li[6]/span/text()"),
        "AroundTraffic": first("//div[@class='xqsaleinfo']/ul/li[5]/span/text()"),
    }

    item1 = Item()
    for field_name, value in scraped.items():
        item1.fields[field_name] = Field()
        item1[field_name] = value
    item1.update(item)
    yield item1
def parse(self, response):
    """Collect result links from a list page and schedule detail requests.

    Each request carries a fresh Item tagged with its zero-based position
    in the list ("Located"), merged with the item from ``response.meta``.
    """
    req = response.meta.get("req")
    item = response.meta.get("item")
    selector = Selector(text=response.text)
    hrefs = selector.xpath("//div[@class='txt-box']/h3/a/@href").extract()
    for position, href in enumerate(hrefs):
        tagged = Item()
        tagged.fields["Located"] = Field()
        tagged["Located"] = position
        tagged.update(item)
        # Links are relative and indirect: prefix the base URL, then
        # resolve through the redirect helper.
        real_url = self.get_real_url_handle(self.base_url + href)
        yield scrapy.Request(
            url=real_url,
            callback=self.handle_1,
            meta={"req": req, "item1": tagged},
        )
def handle_2(self, response):
    """Extract the title and body text of a WeChat article page."""
    item2 = response.meta.get("item2")
    item3 = Item()
    item3.fields["Title"] = Field()
    item3.fields["Content"] = Field()
    selector = Selector(text=response.text)
    # Title comes from the Open Graph meta tag.
    title = selector.xpath(
        '//meta[@property="og:title"]/@content').extract_first()
    # string(.) flattens the article body to plain text. extract_first()
    # returns None when the node is missing — guard explicitly instead of
    # the original broad `except Exception`, which silently masked every
    # error just to cover None.strip().
    content = selector.xpath(
        "//div[@id='js_content']").xpath("string(.)").extract_first()
    if content is not None:
        content = content.strip()
    item3["Title"] = title
    item3["Content"] = content
    item3.update(item2)
    yield item3
def handle_1(self, response):
    """Parse a community detail page and yield it merged with the meta item.

    The unit price is converted from 万元 (10,000-yuan units) to plain
    yuan; whitespace-padded values are normalized before being stored.
    """
    item = response.meta.get("item")
    item1 = Item()
    # Raw string: the original '\s+' literals contained the invalid
    # escape '\s' (SyntaxWarning on Python 3.12+). Compiled once and
    # reused for every cleaned field.
    whitespace = re.compile(r"\s+")

    def put(name, value):
        # Declare the field dynamically, then assign it.
        item1.fields[name] = Field()
        item1[name] = value

    def squash(value):
        # Remove all whitespace. The original also called .strip()
        # afterwards, which is redundant once every whitespace
        # character has been removed.
        return whitespace.sub("", value) if value else value

    put("PropertyCommunity", response.xpath(
        "//div[contains(@class,'detail_houseInfo_box_title')]/h3/text()"
    ).extract_first())

    price_unit = response.xpath(
        "//div[@class='Ap_content']/span/text()").extract_first()
    if price_unit:
        # Listed in 万元 — convert to yuan.
        price_unit = str(round(float(price_unit) * 10000))
    put("PriceUnit", price_unit)

    put("BuildingType", response.xpath(
        "//div[@class='infoF_cts_left_tenement']/span/text()").extract_first())
    put("VolumeRatio", response.xpath(
        "//div[@class='infoF_cts_right_plotRatio']/span/text()").extract_first())
    put("GreeningRatio", response.xpath(
        "//div[@class='infoF_cts_right_greeningate']/span/text()").extract_first())
    put("Developers", response.xpath(
        "//div[@class='infoF_cts_left_cm']/span/text()").extract_first())
    put("AroundSchool", response.xpath(
        "//div[@class='infoF_cts_left_school']/span/text()").extract_first())
    put("AroundTraffic", squash(response.xpath(
        "//div[@class='infoF_cts_left_traffic']/span/text()").extract_first()))
    put("BuildedTime", squash(response.xpath(
        "//div[@class='infoF_cts_left_years']/span/text()").extract_first()))
    put("PropertyCompany", squash(response.xpath(
        "//div[@class='infoF_cts_left_Cp']/span/text()").extract_first()))
    put("PropertyFee", squash(response.xpath(
        "//div[@class='infoF_cts_left_pay']/span/text()").extract_first()))

    item1.update(item)
    yield item1
def housing_handle(self, response):
    """Parse a Lianjia second-hand-house detail page and yield a merged Item.

    Once the crawl is finished (``self.is_finished()``), any URLs the
    pipeline recorded but never scanned are re-scheduled through
    :meth:`house_status_handle` before the item is yielded.
    """
    # Chinese page labels -> English field names. Labels not listed here
    # fall through unchanged via dict.get(label, label) below, matching
    # the original elif-chain behavior for unknown labels.
    base_field_map = {
        "房屋户型": "HouseType",
        "建筑面积": "BuildingSquare",
        "套内面积": "PropertyWithinSquare",
        "房屋朝向": "HouseDirection",
        "装修情况": "FixTypeName",
        "配备电梯": "HasElevator",
        "户型结构": "HouseStructure",
        "建筑类型": "BuildingType",
        "建筑结构": "BuildingStructure",
        "梯户比例": "LadderProtition",
        "产权年限": "PropertyYears",
    }
    trade_field_map = {
        "挂牌时间": "UpShelfDate",
        "交易权属": "TradingOwnerShip",
        "上次交易": "LastTradingTime",
        "房屋年限": "HouseYears",
        "抵押信息": "MortgageInfo",
        "房屋用途": "HouseUse",
        "产权所属": "PropertyBelong",
        "房本备件": "HouseCertificate",
    }

    item = response.meta.get("item")
    item1 = Item()

    def put(name, value):
        # Declare the field dynamically, then assign it.
        item1.fields[name] = Field()
        item1[name] = value

    # Community name.
    community = response.xpath(
        "//div[@class='communityName']/a[contains(@class,'info')]/text()"
    ).extract_first()
    # Address, e.g. "所在区域XX\xa0YY" -> "XX-YY". Explicit None check
    # replaces the original bare `except:` (which only ever fired when
    # extract_first() returned None).
    address = response.xpath("//div[@class='areaName']").xpath(
        "string(.)").extract_first()
    if address:
        address = address.replace("所在区域", "").replace("\xa0", "-")
    else:
        address = None

    total_price = response.xpath(
        "//div[contains(@class,'price')]/span[@class='total']/text()"
    ).extract_first()
    unit_price = response.xpath(
        "//span[@class='unitPriceValue']/text()").extract_first()

    # Floor string shaped like "中楼层/共6层".
    floor_info = response.xpath(
        "//div[@class='room']/div[@class='subInfo']/text()").extract_first()
    total_floor = None
    location_floor = None
    if floor_info:
        total_match = re.search("(共(.*?)层)", floor_info)
        if total_match:
            total_floor = total_match.group(1)
        location_match = re.search("(.*?)/", floor_info)
        if location_match:
            location_floor = location_match.group(1)

    put("TotalFloor", total_floor)
    put("Floor", location_floor)
    put("PropertyCommunity", community)
    put("PropertyAddress", address)
    put("TotalPrice", total_price)
    put("PriceUnit", unit_price)

    # "基本信息" (basic info) table.
    for row in response.xpath(
            "//div[@class='base']/div[@class='content']/ul/li"):
        label = row.xpath("./span/text()").extract_first()
        value = row.xpath("./text()").extract_first()
        if label == "所在楼层":
            continue  # floor already parsed from subInfo above
        put(base_field_map.get(label, label), value)

    # "交易属性" (transaction attributes) table.
    for row in response.xpath(
            "//div[@class='transaction']/div[@class='content']/ul/li"):
        cells = row.xpath("./span/text()").extract()
        if len(cells) < 2:
            # Malformed row: the original indexed cells[0]/cells[1]
            # blindly and could raise IndexError.
            continue
        label, value = cells[0], cells[1]
        if label == "抵押信息":
            value = value.strip()
        put(trade_field_map.get(label, label), value)

    item1.update(item)
    if self.is_finished():
        # Re-schedule every URL the pipeline saw but never scanned.
        pipeline = self.crawler.spider.pipeline
        pending = [u for u in pipeline.url_list
                   if u not in pipeline.scaned_url_list]
        logging.info(pending)
        for housing_url in pending:
            yield scrapy.Request(url=housing_url,
                                 callback=self.house_status_handle,
                                 headers=self.get_headers())
    yield item1
def housing_handle(self, response):
    """Parse a Lianjia rental detail page and yield a merged Item.

    Once the crawl is finished (``self.is_finished()``), any URLs the
    pipeline recorded but never scanned are re-scheduled through
    :meth:`house_status_handle` before the item is yielded.
    """
    # Chinese page labels -> English field names. Labels not listed here
    # fall through unchanged via dict.get(label, label) below, matching
    # the original elif-chain behavior for unknown labels.
    field_map = {
        "车位": "HasParkingPlace",
        "用电": "ElectriciType",
        "采暖": "HasHot",
        "租期": "LeaseTime",
        "看房": "WatchHouse",
        "入住": "TimeToLive",
        "电梯": "HasElevator",
        "用水": "WaterType",
        "燃气": "HasGas",
        "面积": "BuildingSquare",
        "朝向": "HouseDirection",
    }

    item = response.meta.get("item")
    item1 = Item()

    def put(name, value):
        # Declare the field dynamically, then assign it.
        item1.fields[name] = Field()
        item1[name] = value

    # Rental unit price.
    unit_price = response.xpath(
        "//div[@class='content__aside--title']/span/text()").extract_first()
    # Lease type (whole flat vs. shared).
    lease_way = response.xpath(
        "//ul[@class='content__aside__list']/li[1]/text()").extract_first()
    # Room layout, e.g. "2室1厅 60㎡" — keep only the part before the
    # first space. Guarded: the original called .group(1) unconditionally
    # and raised AttributeError when no space was present.
    room_type = response.xpath(
        "//ul[@class='content__aside__list']/li[2]/text()").extract_first()
    if room_type:
        layout = re.search("(.*?) (.*)", room_type)
        if layout:
            room_type = layout.group(1)
    put("PriceUnit", unit_price)
    put("LeaseType", lease_way)
    put("HouseType", room_type)

    # Detail list: only <li> entries containing a "key:value" pair.
    for entry in response.xpath(
            "//div[@class='content__article__info']/ul/li[contains(text(),':')]/text()"):
        text = entry.extract()
        # Split on the first ':' (same as the original pair of regexes).
        label, _, value = text.partition(":")
        if label != "楼层":
            put(field_map.get(label, label), value)
        else:
            # Value shaped like "中楼层/6层": before the slash is the
            # floor position ("Floor"), after it the total floor count
            # ("TotalFloor"). NOTE: the original comments had these two
            # labels swapped; the variable assignments were correct.
            parts = re.search("(.*?)/(.*)", value)
            if parts:
                location_floor, total_floor = parts.group(1), parts.group(2)
            else:
                # No slash: the original crashed with AttributeError here.
                location_floor, total_floor = value, None
            put("TotalFloor", total_floor)
            put("Floor", location_floor)

    item1.update(item)
    if self.is_finished():
        # Re-schedule every URL the pipeline saw but never scanned.
        pipeline = self.crawler.spider.pipeline
        pending = [u for u in pipeline.url_list
                   if u not in pipeline.scaned_url_list]
        logging.info(pending)
        for housing_url in pending:
            yield scrapy.Request(url=housing_url,
                                 callback=self.house_status_handle,
                                 headers=self.get_headers())
    yield item1