def parse_detail(self, response):
    """Build a ``XiezilouItem`` from a detail page and the request meta.

    The floor is read from the first ``<table>`` on the page, the facility
    fields from the ``li-12`` table (two label/value pairs per row), and the
    remaining fields are copied from ``response.meta``.  The item is yielded
    only when its URL is not yet a member of the spider's Redis set.
    """
    meta = response.meta.copy()
    item = XiezilouItem()

    # First table: the row whose first cell reads "楼层" carries the floor.
    for row in response.xpath("//table")[0].xpath("./tr"):
        heading = row.xpath("./td")[0].xpath("./text()").extract_first()
        if heading != "楼层":
            continue
        floor = row.xpath("./td")[1].xpath("./text()").extract_first()
        if floor:
            item["housing_floor"] = floor.strip()

    # Labels found in the first / third cell of each facility row.
    left_fields = {
        "空调:": "air_condition",
        "装修情况:": "housing_decor",
        "公交站点:": "gj_site",
        "物业公司:": "property_company",
        "地铁线:": "subway_line",
        "电梯:": "elevator",
        "标准层高:": "layer_height",
    }
    right_fields = {
        "车位:": "parking_place",
        "交付时间:": "built_in",
        "公交路线:": "gj_line",
        "物业费用:": "property_fee",
        "地铁站点:": "subway_site",
        "土地性质:": "land_property",
    }
    for row in response.xpath("//table[@id='li-12']/tr"):
        cells = row.xpath("./td")
        left_label = cells[0].xpath("./text()").extract_first()
        left_value = cells[1].xpath("./text()").extract_first()
        right_label = cells[2].xpath("./text()").extract_first()
        right_value = cells[3].xpath("./text()").extract_first()
        # Values are stripped only when present; None/"" pass through as-is.
        left_value = left_value.strip() if left_value else left_value
        right_value = right_value.strip() if right_value else right_value
        if left_label in left_fields:
            item[left_fields[left_label]] = left_value
        if right_label in right_fields:
            item[right_fields[right_label]] = right_value

    # (item field, meta key) pairs copied verbatim from the request meta.
    passthrough = (
        ("housing_name", "housing_name"),
        ("housing_price1", "housing_price1"),
        ("housing_area", "housing_area"),
        ("flag", "flag"),
        ("property_type", "property_type"),
        ("publish_time", "publish_time"),
        ("xzl_type", "xzl_type"),
        ("business_circle", "business_circle"),
        ("building_address", "housing_address"),
        ("housing_url", "housing_url"),
        ("city", "city"),
        ("district", "district"),
    )
    for field, meta_key in passthrough:
        item[field] = meta[meta_key]

    seen = self.redis.sismember(A5cbdSpider.name + "xzl_set",
                                meta["housing_url"])
    if not seen:
        yield item
def parse_detail(self, response):
    """Build a ``XiezilouItem`` from a detail page and the request meta.

    Address, flag, type and registration info come from the ``base_info``
    definition list.  Contact details depend on whether the first
    ``info_c`` entry contains "个人": if it does not, agent name, company
    and phone are read directly; if it does, an 11-digit number is pulled
    out of the phone text.  The item is yielded only when its URL is not
    yet a member of the spider's Redis set.
    """
    data = response.meta.copy()
    items = XiezilouItem()

    base_dl = "//div[@class='base_info']/dl[@class='borderb mb10']"
    items["building_address"] = response.xpath(base_dl + "/dt")[2].xpath(
        "./text()").extract_first().strip()
    dd_nodes = response.xpath(base_dl + "/dd")
    items["flag"] = dd_nodes[0].xpath("./text()").extract()[1]
    items["xzl_type"] = dd_nodes[1].xpath(
        "./span/text()").extract()[1].strip("型:")
    items["corp_reged"] = dd_nodes[2].xpath(
        "./span/text()").extract_first().split(":")[1].strip()

    contact_html = response.xpath("//dl[@class='info_c']/dd")[0].extract()
    if "个人" not in contact_html:
        items["agent"] = response.xpath(
            "//span[@id='agentname']/text()").extract_first()
        items["agent_company"] = response.xpath(
            "//dl[@class='info_c']/dd[@class='black']/a/b/text()"
        ).extract_first()
        items["agent_phone"] = response.xpath(
            "//dl[@class='info_c']/dd[@class='gray6']/span/text()"
        ).extract_first()
    else:
        raw_agent_phone = response.xpath(
            "//dl[@class='info_c']/dd[@class='gray6']/span/b/text()"
        ).extract_first()
        # Raw string: "\d" in a normal literal is an invalid escape sequence.
        # A missing node or a text without 11 digits leaves agent_phone
        # unset instead of raising TypeError/AttributeError.
        phone_match = None
        if raw_agent_phone:
            phone_match = re.search(r".+(\d{11})", raw_agent_phone, re.M)
        if phone_match:
            items["agent_phone"] = phone_match.group(1)

    for field in ("city", "district", "housing_url", "housing_name",
                  "housing_area", "publish_time", "housing_price1"):
        items[field] = data[field]

    if not self.redis.sismember(Officese01Spider.name + "_xzl_set",
                                data["housing_url"]):
        yield items
def parse_detail_info_205(self, response):
    """Merge a JSON detail payload into the request meta and build an item.

    The ``shopInfoMap`` sections "办公", "配套", "交通" and "地段" of the
    decoded payload are stored in the meta copy, then every known field
    present in that dict is copied onto a ``XiezilouItem``.

    Fix: ``province`` was previously copied only when ``housing_price2``
    happened to be in the meta (a copy-paste slip), which either dropped the
    province or raised ``KeyError``.  It is now guarded by its own key.
    """
    data = response.meta.copy()
    res = json.loads(response.text)["data"]["data"]["result"]
    zs_key = data["zs_key"]
    items = XiezilouItem()

    shop_info = res["shopInfoMap"]
    data["bangong"] = shop_info["办公"]
    data["zb_suite"] = shop_info["配套"]
    data["traffic"] = shop_info["交通"]
    data["place"] = shop_info["地段"]

    # Copied in this order, each only if present in the meta.
    item_fields = (
        "province", "city", "district", "street", "xzl_type", "flag",
        "housing_url", "publish_time", "housing_name", "housing_price1",
        "housing_price2", "pay_method", "business_circle", "loupan",
        "housing_floor", "building_address", "agent", "agent_phone",
        "agent_company", "property_level", "housing_decor", "property_fee",
        "bangong", "zb_suite", "traffic", "place", "housing_detail_url",
        "phone", "housing_area",
    )
    for field in item_fields:
        if field in data:
            items[field] = data[field]

    # NOTE(review): the item is printed, not yielded, when the sorted-set
    # score equals 1 -- looks like leftover debugging; confirm intent.
    if self.redis.zscore(zs_key, data['housing_url']) == 1:
        print(items)
def parse_detail_info_281(self, response):
    """Merge a JSON detail payload into the request meta and yield an item.

    Selected values of the decoded ``result`` object are stored in the meta
    copy (area and property fee get a unit suffix appended), then every
    known field present in that dict is copied onto a ``XiezilouItem``.
    The item is yielded only when the URL's score in the ``zs_key`` sorted
    set equals 2.

    Fix: ``province`` was previously copied only when ``housing_price2``
    happened to be in the meta (a copy-paste slip), which either dropped the
    province or raised ``KeyError``.  It is now guarded by its own key.
    """
    data = response.meta.copy()
    res = json.loads(response.text)["data"]["data"]["result"]

    # housing_price1 is not taken from the payload; whatever the request
    # meta already carries is kept.
    data["pay_method"] = res["paydetail"]
    data["business_circle"] = res["comarea"]
    data["loupan"] = res["projname"]
    data["housing_area"] = res["allacreage"] + "㎡"
    data["housing_floor"] = res["floor"]
    data["building_address"] = res["address"]
    data["property_level"] = res["propertygrade"]
    data["housing_decor"] = res["fitment"]
    data["property_fee"] = res["wuyefei"] + "元/平米·月"

    zs_key = data["zs_key"]
    items = XiezilouItem()

    # Copied in this order, each only if present in the meta.
    item_fields = (
        "province", "city", "district", "street", "xzl_type", "flag",
        "housing_url", "publish_time", "housing_name", "housing_price1",
        "housing_price2", "pay_method", "business_circle", "loupan",
        "housing_floor", "building_address", "agent", "agent_phone",
        "agent_company", "property_level", "housing_decor", "property_fee",
        "bangong", "zb_suite", "traffic", "place", "housing_detail_url",
        "phone", "housing_area",
    )
    for field in item_fields:
        if field in data:
            items[field] = data[field]

    if self.redis.zscore(zs_key, data['housing_url']) == 2:
        yield items
def parse_detail(self, response):
    """Build a ``XiezilouItem`` from a listing detail page.

    Reads the title, the label/value list under ``fy_info``, the facility
    list, train/plane traffic info, publish time and agent contact from the
    page; location fields, flag and URL come from ``response.meta``.  The
    item is yielded only when the URL's score in the ``zs_key`` sorted set
    equals 2.
    """
    item = XiezilouItem()
    meta = response.meta.copy()

    item["housing_name"] = response.xpath(
        "//h1[@class='tit-name']/span/text()").extract_first()

    # Label text -> item field for the plain label/value rows.
    label_to_field = {
        "类型": "housing_type",
        "日租金": "housing_price1",
        "月租金": "housing_price2",
        "押付": "pay_method",
        "地址": "building_address",
        "面积": "housing_area",
        "起租期": "rent_lease",
        "使用率": "housing_use_rate",
        "工位数": "housing_workplace",
        "物业费": "property_fee",
        "注册": "corp_reged",
        "楼层": "housing_floor",
        "装修": "housing_decor",
        "单价": "housing_price1",
        "售价": "housing_price2",
    }
    for block in response.xpath("//div[@id='fy_info']/ul"):
        for entry in block.xpath("./li"):
            spans = entry.xpath("./span")
            label = spans[0].xpath("./text()").extract_first().strip()
            value = spans[2].xpath("./text()").extract_first().strip()
            if label == "楼盘":
                # Prefer the link text; fall back to the plain span text.
                name = entry.xpath(
                    "./span[@class='desc']/a/text()").extract_first()
                if not name:
                    name = entry.xpath(
                        "./span[@class='desc']/text()").extract_first()
                if name:
                    item["loupan"] = name.strip()
            elif label in label_to_field:
                item[label_to_field[label]] = value

    # Facilities: default to "无" and flip to "有" if central A/C is listed.
    item["central_air_condition"] = "无"
    facilities = []
    for node in response.xpath(
            "//ul[@class='mod-peitao clearfix']/li[@class='']"):
        text = node.xpath("./p/text()").extract_first()
        if text == "中央空调":
            item["central_air_condition"] = "有"
        if text:
            facilities.append(text)
    item["peitao"] = ",".join(facilities)

    train_nodes = response.xpath(
        "//dl[@class='train_box clearfix']/dd[@class='clearfix']/div")
    plane_nodes = response.xpath(
        "//dl[@class='plane_box clearfix']/dd[@class='clearfix']/div")
    trains = []
    for node in train_nodes:
        name = node.xpath("./span")[0].xpath("./text()").extract_first()
        detail = node.xpath("./span")[1].xpath("./text()").extract_first()
        trains.append(name + " " + detail)
    planes = []
    for node in plane_nodes:
        # Plane entries without any <span> are skipped.
        if not node.xpath("./span"):
            continue
        name = node.xpath("./span")[0].xpath("./text()").extract_first()
        detail = node.xpath("./span")[1].xpath("./text()").extract_first()
        planes.append(name + " " + detail)
    item["traffic"] = ",".join(trains) + "; " + ",".join(planes)

    item["publish_time"] = response.xpath(
        "//div[@class='hd-sub']/text()")[1].extract().strip()
    item["agent"] = response.xpath(
        "//div[@class='bro-info clearfix']/h5[@class='name']/text()"
    ).extract_first().strip()
    item["agent_phone"] = response.xpath(
        "//div[@class='broker_tel']/text()").extract_first().strip()
    item["agent_company"] = response.xpath(
        "//p[@class='comp_info']/a/text()").extract_first().strip()

    for field in ("city", "district", "street", "flag", "housing_url"):
        item[field] = meta[field]

    zs_key = meta['zs_key']
    if self.redis.zscore(zs_key, meta['housing_url']) == 2:
        yield item
def parse_detail(self, response):
    """Build a ``XiezilouItem`` from a listing detail page.

    Publish time, title, prices, the label/content list, address, agent
    contact and facility list are read from the page; location fields,
    type, URL and flag come from ``response.meta``.  The item is yielded
    only when the URL's score in the ``zs_key`` sorted set equals 2.
    """
    item = XiezilouItem()
    meta = response.meta.copy()

    item['publish_time'] = response.xpath(
        "//li[@class='date']/text()").extract_first().lstrip("更新于")
    item['housing_name'] = response.xpath(
        "//p[@class='card-title']/i/text()").extract_first().strip()

    price_unit = response.xpath(
        "//div[@class='price-wrap']/text()")[1].extract()
    # Each price span exists with or without the extra "strongbox" class.
    price_node = (response.xpath("//span[@class='price strongbox']")
                  or response.xpath("//span[@class='price']"))
    unit_node = (response.xpath("//span[@class='unit strongbox']")
                 or response.xpath("//span[@class='unit']"))
    item['housing_price1'] = price_node.xpath(
        "./text()").extract_first() + price_unit
    item['housing_price2'] = unit_node.xpath(
        "./text()").extract_first().strip(" |")

    label_to_field = {
        "面积": "housing_area",
        "楼层": "housing_floor",
        "装修": "housing_decor",
        "租期": "rent_lease",
        "区域": "business_circle",
    }
    for entry in response.xpath("//li[@class='item f-fl']"):
        label = "".join(
            entry.xpath("./span[@class='t']/text()").extract()).strip(":")
        content = entry.xpath(
            "./span[@class='content']/text()").extract_first().strip()
        if label in label_to_field:
            item[label_to_field[label]] = content

    if response.xpath("//li[@class='er-item f-fl']/span[@class='t2']"):
        item['building_address'] = response.xpath(
            "//li[@class='er-item f-fl']/span[@class='content']/text()"
        ).extract_first().strip()

    item['agent'] = response.xpath(
        "//div[@class='name']/a[@class='name']/text()").extract_first()
    item['agent_phone'] = response.xpath(
        "//a[@class='phone_num js_person_phone']/text()").extract_first()
    user_other = response.xpath("//div[@class='user_other']")
    if user_other:
        item['agent_company'] = user_other[0].xpath(
            "./span[@class='company']/text()").extract_first()

    facilities = []
    for node in response.xpath(
            "//ul[@class='collocation f-clear']/li[@class='item']"):
        text = node.xpath("./p[@class='text']/text()").extract_first()
        facilities.append(text)
        if text == "中央空调":
            item['central_air_condition'] = "有"
    item['peitao'] = ",".join(facilities)

    for field in ("city", "district", "street", "xzl_type",
                  "housing_url", "flag"):
        item[field] = meta[field]

    zs_key = meta['zs_key']
    if self.redis.zscore(zs_key, meta['housing_url']) == 2:
        yield item
def parse_detail(self, response):
    """Build a ``XiezilouItem`` from a listing detail page.

    Publish date, general info lists, facilities, prices, building name,
    address, property level and agent contact are read from the page;
    location fields, flag, type and URL come from ``response.meta``.  The
    item is yielded only when the URL's score in the ``zs_key`` sorted set
    equals 2.

    Changes over the previous version: the date regex is a raw string
    (``\\d`` in a normal literal is an invalid escape), a missing or
    unmatched update text no longer raises, and unused locals and
    commented-out code were removed.
    """
    items = XiezilouItem()
    data = response.meta.copy()

    house_update = response.xpath(
        "//p[@class='house-update-info']/span[@class='up']")
    if house_update:
        update_text = house_update[0].xpath("./text()").extract_first()
        date_match = None
        if update_text:
            date_match = re.search(r".+(2\d+-\d+-\d+)", update_text)
        if date_match:
            items['publish_time'] = date_match.group(1)

    general_fields = {
        "建筑面积": "housing_area",
        "可注册公司": "corp_reged",
        "起租期": "rent_lease",
        "物业费": "property_fee",
        "所在楼层": "housing_floor",
        "装修情况": "housing_decor",
        "使用率": "housing_use_rate",
        "付款方式": "pay_method",
        "参考容纳工位数": "housing_workplace",
    }
    general_info = (response.xpath("//ul[@class='general-item-left']/li")
                    + response.xpath("//ul[@class='general-item-right']/li"))
    for entry in general_info:
        label_name = entry.xpath(
            "./span[@class='mr_25 c_999']/text()").extract_first().strip(":")
        label_value = entry.xpath(
            "./span[@class='c_000']/text()").extract_first().strip()
        if label_name in general_fields:
            items[general_fields[label_name]] = label_value

    peitao = []
    for entry in response.xpath("//li[@class='peitao-on']"):
        label = entry.xpath("./text()").extract_first()
        peitao.append(label)
        if label == "中央空调":
            items['central_air_condition'] = "有"
    items['peitao'] = ",".join(peitao)

    # The class attribute values with surrounding spaces are matched as-is.
    if response.xpath("//span[@class=' house_basic_title_money_mianyi ']"):
        items['housing_price2'] = "面议"
        items['housing_price1'] = "面议"
    else:
        if response.xpath("//span[@class='house_basic_title_money_num']"):
            items['housing_price1'] = response.xpath(
                "//span[@class='house_basic_title_money_num']/text()"
            ).extract_first() + response.xpath(
                "//span[@class='house_basic_title_money_unit']/text()"
            ).extract_first()
        if response.xpath(
                "//span[@class='house_basic_title_money_num_chushou']"):
            items['housing_price2'] = response.xpath(
                "//span[@class='house_basic_title_money_num_chushou']/text()"
            ).extract_first()
        elif response.xpath(
                "//span[@class=' house_basic_title_money_num_chuzu ']"):
            items['housing_price2'] = response.xpath(
                "//span[@class=' house_basic_title_money_num_chuzu ']/text()"
            ).extract_first() + response.xpath(
                "//span[@class='house_basic_title_money_unit_chuzu']/text()"
            ).extract_first()

    for entry in response.xpath("//ul[@class='house-basic-item3']/li"):
        item_name = entry.xpath(
            "./span[@class='c_999']/text()").extract_first().strip(":")
        if item_name == "楼盘":
            items['loupan'] = entry.xpath(
                "./span[@class='c_000 mr_10']/span[@class='c_000']/text()"
            ).extract_first().strip()
        if item_name == "详细地址":
            address = []
            if entry.xpath("./span[@class='c_000 mr_10']"):
                # Link texts first, then the trailing span text.
                for sec in entry.xpath("./span[@class='c_000 mr_10']/a"):
                    address.append(
                        sec.xpath("./text()").extract_first().strip())
                address.append(
                    entry.xpath("./span[@class='c_000 mr_10']/span/text()"
                                ).extract_first().strip())
            if address:
                items['building_address'] = " ".join(address)

    if data['xzl_type'] == "纯写字楼":
        items['property_level'] = response.xpath(
            "//div[@class='house-basic-item2']/p[@class='item3']/span[@class='sub']/text()"
        ).extract_first()

    items['agent'] = response.xpath(
        "//div[@class='jjr-name f14 c_555']/a[@class='c_000 jjr-name-txt']/text()"
    ).extract_first()
    items['agent_phone'] = response.xpath(
        "//p[@class='phone-num']/text()").extract_first()
    agent_company_raw = response.xpath("//p[@class='jr-item jjr-belong']")
    if agent_company_raw:
        items['agent_company'] = agent_company_raw.xpath(
            "./span[@class='c_000']/text()").extract_first()

    for field in ("province", "city", "district", "street", "flag",
                  "xzl_type", "housing_url"):
        items[field] = data[field]

    zs_key = data['zs_key']
    if self.redis.zscore(zs_key, data['housing_url']) == 2:
        yield items
def parse_detail_info(self, response):
    """Build a ``XiezilouItem`` for one housing unit inside a building.

    Building-level fields are passed through from ``response.meta``;
    feature texts, prices, area/floor/workplace/time and the agent list are
    read from the page.  The item is yielded only when ``building_url`` is
    a field of the ``h_key`` hash and ``housing_url`` is not a field of the
    ``lianjia_xzl_housing_finished_hashtable`` hash.
    """
    items = XiezilouItem()
    meta = response.meta

    # Meta keys copied one-to-one onto the item, in this order.
    inherited_keys = (
        'province', 'city', 'district', 'street', 'flag', 'building_url',
        'building_name', 'area_extent', 'building_description',
        'price_extent', 'buiding_agent', 'buiding_agent_phone',
        'building_address', 'building_height', 'building_elevator',
        'business_circle', 'developer', 'housing_name',
        'housing_description', 'housing_url',
    )
    inherited = {key: meta[key] for key in inherited_keys}
    h_key = meta['h_key']

    # Feature paragraphs: label in <strong>, value in the paragraph text.
    feature_text = {"交通出行": "", "周边配套": "", "其他": ""}
    for feature in response.xpath(
            "//div[@class='detail__feature-ul cf']/p[@class='detail__feature-text']"):
        label = feature.xpath("./strong/text()").extract_first().strip(":")
        value = feature.xpath("./text()").extract_first().strip()
        if label in feature_text:
            feature_text[label] = value

    price_node = response.xpath("//p[@class='detail__price']")
    housing_price1 = (price_node.xpath("./label/span/text()").extract_first()
                      + price_node.xpath("./label/text()").extract_first())
    housing_price2 = response.xpath(
        "//div[@class='detail__priceunit']/text()").extract_first()

    detail_text = {"面积": "", "楼层": "", "工位": "", "时间": ""}
    for entry in response.xpath("//div[@class='detail__info']/p"):
        label = entry.xpath("./span/text()").extract_first().strip(":")
        value = entry.xpath("./text()").extract_first()
        if label in detail_text:
            detail_text[label] = value

    agents = []
    for agent in response.xpath("//div[@class='detail__agent']"):
        name = agent.xpath("./div[@class='detail__agent-info']/div[@class='detail__agent-name']/div[@class='detail__agent-top']/span[@class='detail__agent-name']/text()").extract_first()
        phone = agent.xpath(
            "./p[@class='detail__agent-phone']/text()").extract_first()
        agents.append(name + ": " + phone)

    for key in inherited_keys:
        items[key] = inherited[key]
    # Scraped texts fall back to "" when the page value is missing/empty.
    items['traffic'] = feature_text["交通出行"] or ""
    items['zb_suite'] = feature_text["周边配套"] or ""
    items['other'] = feature_text["其他"] or ""
    items['housing_price1'] = housing_price1
    items['housing_price2'] = housing_price2
    items['housing_area'] = detail_text["面积"] or ""
    items['housing_floor'] = detail_text["楼层"] or ""
    items['housing_workplace'] = detail_text["工位"] or ""
    items['publish_time'] = detail_text["时间"] or ""
    items['housing_agents_info'] = ",".join(agents)

    if (self.redis.hexists(h_key, inherited['building_url'])
            and not self.redis.hexists(
                "lianjia_xzl_housing_finished_hashtable",
                inherited['housing_url'])):
        yield items
def parse_detail(self, response):
    """Build a ``XiezilouItem`` from a building detail page.

    Rent and sale prices plus a set of single-text fields are read from the
    page; location, name, URL and address come from ``response.meta``.  The
    item is yielded only when its URL is not yet a member of the spider's
    Redis set.
    """
    items = XiezilouItem()
    data = response.meta.copy()

    rent_box = "//div[@class='rentBox']"
    sale_box = "//div[@class='saleBox m-l-30']"
    housing_price1 = response.xpath(
        rent_box + "/span[@class='rentNum num']/text()"
    ).extract_first() + response.xpath(
        rent_box + "/span[@class='rentUnit']/text()"
    ).extract_first()

    # A "noSalePrice" span marks listings without a sale price.
    no_sale_marker = response.xpath(
        sale_box + "/span[@class='noSalePrice fl']").extract()
    if no_sale_marker:
        housing_price2 = "暂无售价"
    else:
        housing_price2 = response.xpath(
            sale_box + "/span[@class='saleNum num']/text()"
        ).extract_first() + response.xpath(
            sale_box + "/span[@class='saleUnit']/text()"
        ).extract_first()

    # NOTE(review): this selection is never used; kept so the callback
    # performs exactly the same queries as before.
    building_info = response.xpath(
        "//div[@class='buildMessage container clearfix']")

    basic = "//div[@class='basicMessage container clearfix']/div[@class='messageContent']"
    # Item field -> xpath of the single text node that fills it.
    text_fields = (
        ("housing_source",
         "//div[@class='fangyuanBox box']/span[@class='fysl']/text()"),
        ("housing_area",
         "//div[@class='xsmjBox box']/span[@class='fysl']/text()"),
        ("built_in",
         basic + "/span[@class='yearsText text firstRow']/text()"),
        ("green_rate", basic + "/span[@class='greeningText text']/text()"),
        ("building_area",
         "//span[@class='zongmianjiNum text firstRow']/text()"),
        ("building_arch", "//span[@class='jiegouText text']/text()"),
        ("building_total", "//span[@class='blocks text']/text()"),
        ("elevator_num", "//span[@class='ketiNum text lastRow']/text()"),
        ("lift_num", "//span[@class='huotiNum text firstRow']/text()"),
        ("layer_height", "//span[@class='cenggaoText text']/text()"),
        ("property_company", "//span[@class='wygs text firstRow']/text()"),
        ("property_fee", "//span[@class='wyf text']/text()"),
        ("parking_place", "//span[@class='carport text']/text()"),
        ("heat_supply", "//span[@class='cnfs text lastRow']/text()"),
        ("air_condition", "//span[@class='ktlx text firstRow']/text()"),
        ("power_voltage", "//span[@class='dianyaText text']/text()"),
        ("property_level", "//span[@class='dengjiText text']/text()"),
    )
    scraped = [(field, response.xpath(path).extract_first())
               for field, path in text_fields]

    for field in ("city", "district", "street", "housing_name",
                  "housing_url"):
        items[field] = data[field]
    items["building_address"] = data["housing_address"]
    items["housing_price1"] = housing_price1
    items["housing_price2"] = housing_price2
    for field, value in scraped:
        items[field] = value

    if not self.redis.sismember(KongjianjiaSpider.name + "_xzl_set",
                                data["housing_url"]):
        yield items
def parse_detail(self, response):
    """Build a ``XiezilouItem`` from a listing detail page.

    The page layout depends on the publisher type shown in the header:
    "中介" pages carry a building info list, "非中介" pages carry a listing
    info list; both share the same facility table.  Each list stores labels
    and values in alternating ``<li>`` elements.  Price, city, district and
    URL come from ``response.meta``.  The item is yielded only when its URL
    is not yet a member of the spider's Redis set.

    Changes over the previous version: the facility-table parsing that was
    duplicated in both branches is shared, label-to-field chains are lookup
    tables, and a list with an odd number of ``<li>`` elements no longer
    raises ``IndexError`` (the trailing unpaired label is ignored).
    """
    items = XiezilouItem()
    data = response.meta.copy()

    def iter_pairs(nodes, strip_label):
        """Yield (label, value) texts for consecutive <li> pairs."""
        for label_node, value_node in zip(nodes[0::2], nodes[1::2]):
            label = label_node.xpath("./text()").extract_first()
            if strip_label and label:
                label = label.strip()
            yield label, value_node.xpath("./text()").extract_first()

    def fill(nodes, mapping, strip_label):
        """Copy every value whose label is in *mapping* onto the item."""
        for label, value in iter_pairs(nodes, strip_label):
            field = mapping.get(label)
            if field:
                items[field] = value

    agency_fields = {
        "楼盘名称:": "loupan",
        "楼盘类型:": "xzl_type",
        "城区商圈:": "business_circle",
        "楼盘地址:": "building_address",
        "门店经理:": "agent",
        "门店固话:": "agent_phone",
    }
    owner_fields = {
        "楼盘类型:": "xzl_type",
        "所在楼层:": "housing_floor",
        "招商面积:": "building_area",
        "租售价格:": "housing_price1",
        "付款方式:": "pay_method",
        "装修状况:": "housing_decor",
        "基本租期:": "rent_lease",
        "楼盘地址:": "building_address",
    }
    facility_fields = {
        "物管公司": "property",
        "楼层状况": "housing_floor",
        "总 建 面": "building_area",
        "物 管 费": "property_fee",
        "标准层高": "layer_height",
        "空 调": "central_air_condition",
        "车位数量": "parking_place",
        "标准层建面": "housing_area",
        "电 梯": "elevator",
        "车 位 费": "parking_fee",
        "开间建面": "kaijian_area",
        "员工餐厅": "employee_restaurant",
        "交通站点": "traffic_site",
        "轨道公交": "traffic",
    }

    items["housing_price1"] = data["housing_price1"]
    items["publish_time"] = response.xpath(
        "//span[@class='fr pr20']/font[@class='color8']/text()"
    ).extract_first()
    publish_type = response.xpath(
        "//span[@class='bis_user fl']/font[@class='color2']/text()"
    ).extract_first().strip("[").strip("]")

    if publish_type == "中介":
        items["publish_type"] = publish_type
        items["publisher"] = response.xpath(
            "//span[@class='bis_user fl']/a")[0].xpath(
                "./text()").extract_first()
        base_info = response.xpath(
            "//div[@class='bis_actinfo clearfix']/ul/li")
        fill(base_info, agency_fields, strip_label=True)
    elif publish_type == "非中介":
        items["publish_type"] = publish_type
        items["publisher"] = response.xpath(
            "//span[@class='bis_user fl']/text()").extract_first()
        base_info = response.xpath(
            "//div[@class='box1 bis_info bgcolor1 clearfix']/ul/li")
        fill(base_info, owner_fields, strip_label=True)

    # The facility table is parsed only for the two known publisher types;
    # its labels are compared unstripped.
    if publish_type in ("中介", "非中介"):
        peitao_info = response.xpath("//ul[@class='bis_table clearfix']/li")
        fill(peitao_info, facility_fields, strip_label=False)

    items["city"] = data["city"]
    items["district"] = data["district"]
    items["housing_url"] = data["housing_url"]

    if not self.redis.sismember(O571Spider.name + "_xzl_set",
                                data["housing_url"]):
        yield items
def parse_detail(self, response):
    """Build a ``XiezilouItem`` from a building detail page.

    Publish time, title and headline price are always read from the page.
    When the meta ``xzl_type`` is "写字楼", the per-unit rental rows, the
    address/completion list and the layered facility lists are parsed as
    well.  Location fields, type and URL come from ``response.meta``.  The
    item is yielded only when the URL's score in the ``zs_key`` sorted set
    equals 2.
    """
    item = XiezilouItem()
    meta = response.meta.copy()
    xzl_type = meta["xzl_type"]

    item["publish_time"] = response.xpath(
        "//span[@class='ddz-timestamp']/text()").extract_first().strip()
    item["housing_name"] = response.xpath(
        "//h1[@class='fl']/text()").extract_first().strip()

    price_box = response.xpath("//div[@class='top-price fr']")[0]
    price_num = price_box.xpath(
        "./span[@class='price-num']/text()").extract_first()
    price_unit = price_box.xpath("./text()").extract_first().strip()
    item["housing_price1"] = price_num + price_unit

    listing_rows = response.xpath("//div[@class='fbody']/div/a")
    # NOTE(review): these summaries are built but never stored on the item.
    rentings = []

    layer_fields = {
        "层高": "layer_height",
        "层数": "building_height",
        "物业": "property",
        "物业费": "property_fee",
        "车位": "parking_place",
        "车位月租金": "parking_fee",
        "空调": "air_condition",
        "空调费": "air_condition_fee",
        "空调开放时长": "air_condition_time",
        "电梯": "elevator",
        "网络": "network",
        "入驻企业": "settled_enterprise",
    }

    if xzl_type == "写字楼":
        for row in listing_rows:
            area = row.xpath(
                "./div[@class='tj-pc-listingDetail-house-click f-area f-item']"
            )
            area_num = area.xpath("./font/text()").extract_first()
            area_unit = area.xpath("./span/text()").extract_first()
            hr_area = area_num + area_unit if area_unit else area_num

            price = row.xpath(
                "./div[@class='tj-pc-listingDetail-house-click f-price f-item']/div[@class='tj-pc-listingDetail-house-click unit-show']"
            )
            hr_price = price.xpath(
                "./span[@class='tj-pc-listingDetail-house-click price-num']/text()"
            ).extract_first() + price.xpath(
                "./span[@class='price-unit']/text()").extract_first()
            hr_floor = row.xpath(
                "./div[@class='tj-pc-listingDetail-house-click f-floor f-item']/text()"
            ).extract_first()
            hr_decor = row.xpath(
                "./div[@class='tj-pc-listingDetail-house-click f-decoraion f-item']/text()"
            ).extract_first()
            hr_update = row.xpath(
                "./div[@class='tj-pc-listingDetail-house-click f-update f-item ddz-timestamp']/text()"
            ).extract_first()
            rentings.append(
                "面积: {hr_area}, 单价: {hr_price}, 楼层: {hr_floor}, 装修: {hr_decor}, 更新: {hr_update}"
                .format(hr_area=hr_area, hr_price=hr_price,
                        hr_floor=hr_floor, hr_decor=hr_decor,
                        hr_update=hr_update))

        for entry in response.xpath(
                "//div[@class='clearfix donetime-address']/ul/li"):
            label = entry.xpath(
                "./span[@class='f-title']/text()").extract_first()
            value = entry.xpath(
                "./span[@class='f-con']/text()").extract_first()
            if label == "地理位置":
                # The address text lives in a link inside the value span.
                item["building_address"] = entry.xpath(
                    "./span[@class='f-con']/a/text()").extract_first()
            elif label == "竣工时间":
                item["built_in"] = value

        for layer in response.xpath("//div[@class='clearfix ul-layer']"):
            for entry in layer.xpath("./ul/li"):
                label = entry.xpath(
                    "./span[@class='f-title']/text()").extract_first()
                value = entry.xpath(
                    "./span[@class='f-con']/text()").extract_first()
                if label in layer_fields:
                    item[layer_fields[label]] = value

    for field in ("city", "district", "street", "xzl_type", "housing_url"):
        item[field] = meta[field]

    zs_key = meta["zs_key"]
    if self.redis.zscore(zs_key, meta['housing_url']) == 2:
        yield item