def get_build_info(self, build_info_list, co_id, comm_html, url):
    """Insert one Building record for each row of ``build_info_list``.

    Each row supplies the building name, total units, area and price by
    position.  The building link and id are pulled out of ``comm_html``,
    the record is inserted and ``self.get_build_detail`` is called with
    that link.  A failure while handling a row is printed and the loop
    continues with the next row.

    Args:
        build_info_list: Iterable of indexable rows; positions 1, 3, 5 and
            9 are read.
        co_id: Community id, stored on every building and forwarded to
            ``get_build_detail``.
        comm_html: Markup searched for the building-table link.
        url: Used only in the printed error message.
    """
    for row in build_info_list:
        try:
            record = Building(2)

            name = row[1]
            record.bu_name = name  # building name
            record.bu_num = name.split('#')[0]  # building number: text before '#'
            record.bu_all_house = row[3]  # total number of units
            record.bu_build_size = row[5]  # area
            record.bu_price = row[9]  # price
            record.co_id = co_id  # community id

            # NOTE(review): none of these searches depends on ``row``, so
            # every row receives the first link found in ``comm_html`` --
            # confirm this is intended.
            table_html = re.search(r'楼盘表(.*?)个楼栋信息', comm_html).group(1)
            detail_url = re.search(r'<ahref="(.*?)">查看信息<', table_html).group(1)
            record.bu_id = re.search('buildingId=(.*?)$', detail_url).group(1)  # building id

            record.insert_db()
            self.get_build_detail(detail_url, co_id)
        except Exception as err:
            print('楼栋错误,co_index={},url={}'.format(co_index, url), err)
def get_build_info(self, url, co_id):
    """Fetch a building page and store the building plus its houses.

    The page at ``url`` is downloaded and parsed; the building fields are
    read by element id and inserted as a ``Building``.  The markup after
    ``<tr id="row3">`` is then scanned cell by cell, and every cell that
    contains ``<br>`` yields ``House`` records.  Errors are printed rather
    than raised: a failure on one house skips only that house, any other
    failure abandons the rest of the page.

    Args:
        url: Building page address; the digits after the final ``?`` are
            used as the building id.
        co_id: Community id stored on the building and on every house.
    """
    try:
        building = Building(co_index)
        response = requests.get(url)
        html = response.text
        tree = etree.HTML(html)

        co_name = tree.xpath('//*[@id="PageB_Location"]/text()')[0]  # community name
        print(co_name)
        bu_name = tree.xpath('//*[@id="ItemName"]/text()')[0]  # building name
        bu_num = tree.xpath('//*[@id="PageB_HouseNo"]/text()')[0]  # building number
        bu_all_house = tree.xpath('//*[@id="lb_countbulidtaoshu"]/text()')[0]  # total units
        # Floor and price go through self.is_none instead of being indexed
        # directly (presumably to tolerate an empty xpath result -- confirm).
        bu_floor = self.is_none(tree.xpath('//*[@id="cell3-1"]/text()'))  # floors
        bu_build_size = tree.xpath('//*[@id="lb_countbulidarea"]/text()')[0]  # building area
        bu_live_size = tree.xpath('//*[@id="lb_buildarea"]/text()')[0]  # residential area
        bu_price = self.is_none(tree.xpath('//*[@id="lb_buildavg"]/text()'))  # residential price
        # Raw string: '\?' and '\d' are invalid escapes in a normal literal.
        bu_id = re.search(r'\?(\d+)$', url).group(1)  # building id

        building.co_id = co_id
        building.bu_name = bu_name
        building.bu_num = bu_num
        building.bu_all_house = bu_all_house
        building.bu_floor = bu_floor
        building.bu_build_size = bu_build_size
        building.bu_live_size = bu_live_size
        building.bu_price = bu_price
        building.bu_id = bu_id
        building.insert_db()

        house_info_html = re.findall('<tr id="row3">(.*)$', html, re.S | re.M)[0]
        for cell in re.findall('(<td.*?>.*?</td>)', house_info_html, re.S | re.M):
            if '<br>' not in cell:
                continue
            ho_name_list = re.findall('<td.*?>(.*?)<br>', cell, re.S | re.M)
            ho_true_size_list = re.findall('<td.*?>.*?<br>(.*?)<br>', cell, re.S | re.M)
            ho_type = re.findall('<td.*?>.*?<br>.*?<br>(.*?)<br>', cell, re.S | re.M)[0]
            for idx, raw_name in enumerate(ho_name_list):
                try:
                    # Strip an opening <font ...> tag from the name if present.
                    if 'font' in raw_name:
                        ho_name = re.sub('<font.*?>', '', raw_name)
                    else:
                        ho_name = raw_name
                    house = House(8)
                    house.ho_name = ho_name
                    house.ho_true_size = ho_true_size_list[idx]
                    house.co_id = co_id
                    house.bu_id = bu_id
                    house.ho_type = ho_type
                    house.insert_db()
                except Exception as e:
                    print(e)
    except Exception as e:
        # Exception, not BaseException, so that KeyboardInterrupt and
        # SystemExit still stop the crawler.
        print(e)
def get_comm_detail(self, href, comm):
    """Fill ``comm`` from its detail page and store it with its buildings.

    The page at ``self.URL_FRONT + href`` is fetched and decoded as GBK.
    Community fields are extracted with ``self.regex_common`` and written
    onto ``comm``.  For every building link found on the page a
    ``Building`` is populated (partly from ``self.get_build_detail``) and
    inserted; a failure for one building is printed and the loop goes on.
    ``comm`` is inserted after the loop.

    Args:
        href: Path appended to ``self.URL_FRONT`` to form the page URL.
        comm: Community record object; mutated in place and inserted.

    Returns:
        None.  NOTE(review): when the page has no building links the
        function returns before ``comm.insert_db()`` -- confirm intended.
    """
    comm_detail_url = self.URL_FRONT + href
    response = requests.get(url=comm_detail_url, headers=self.headers)
    co_id = int(response.url.split('=')[1])  # community id, from the final URL
    html = response.content.decode('gbk')

    # Every field uses the same pattern: the label followed by the content
    # of the next <td> cell.
    cell_after_label = r'.*?<td.*?>(.*?)</td>'
    labelled_fields = (
        ('co_name', r'项目名称'),  # community name
        ('co_owner', r'房屋所有权证号'),
        ('co_use', r'用 途'),
        ('co_develops', r'开 发 商'),
        ('co_address', r'项目位置'),
        ('co_pre_sale', r'预售证号'),
        ('co_land_use', r'土地使用权证'),
        ('co_land_type', r'土地权证类型'),
        ('co_handed_time', r'终止日期'),
        ('co_plan_pro', r'规划许可证'),
        ('co_work_pro', r'施工许可证'),
        ('co_type', r'项目类型'),  # community type
        ('co_size', r'批准面积'),  # land area
    )
    # Extract everything first so comm is untouched if an extraction raises.
    extracted = [
        (attr, self.regex_common(label + cell_after_label, html))
        for attr, label in labelled_fields
    ]
    comm.co_id = co_id
    for attr, value in extracted:
        setattr(comm, attr, value)

    # Collect the building detail links.
    build_url_list = re.findall(r"<td><a href='(.*?)'", html, re.M | re.S)
    if not build_url_list:
        return

    for build_url in build_url_list:
        try:
            building = Building(self.CO_INDEX)
            # NOTE(review): these searches run over the whole page, so every
            # building gets the same id, unit count and price -- confirm.
            build_id = re.search(r'<td>(\d{2,6})</td>', html, re.M | re.S).group(1)  # building id
            bu_all_house = re.search(r'<td>(\d{1,3})</td>', html, re.M | re.S).group(1)  # total units
            bu_price_demo = re.findall(r'<td>[\.\d]+</td>', html, re.M | re.S)[4]
            bu_price = re.search(r'\d+', bu_price_demo).group()

            data_dict = self.get_build_detail(build_url)
            bu_num = data_dict['bu_num']  # building number
            bu_build_size = data_dict['bu_build_size']  # building area
            co_address = data_dict['co_address']  # community address
            co_build_end_time = data_dict['co_build_end_time']  # completion time
            co_build_type = data_dict['co_build_type']  # build type
            if not co_build_end_time:
                building.co_is_build = '1'

            # The community fields are overwritten on every iteration, so the
            # last building that succeeds wins.
            comm.co_address = co_address
            comm.co_build_end_time = co_build_end_time
            comm.bu_build_size = bu_build_size
            comm.co_build_type = co_build_type

            # Building record.
            building.bu_num = bu_num
            building.bu_build_size = bu_build_size
            building.bu_all_house = bu_all_house
            building.bu_id = build_id
            building.co_id = co_id
            building.bu_price = bu_price
            building.insert_db()
        except Exception as e:
            build_detail_url = self.URL_FRONT + build_url
            # Include the exception so the cause of the failure is visible.
            print('楼栋错误:', build_detail_url, e)

    comm.insert_db()