def get_build_info(self, build_lis, co_id):
    """Download every building page of a community and store its details.

    For each relative link in ``build_lis`` the page is fetched, the
    building fields are pulled out with regular expressions, the record is
    written through ``insert_db()`` and the house crawl is started with
    ``self.get_house_info``.

    :param build_lis: iterable of relative building links; each one must
        contain a ``Bid=<digits>`` part, which becomes the building id.
    :param co_id: community id stored on every building and forwarded to
        ``get_house_info``.
    :raises AttributeError: when an expected field is missing from a page
        (``re.search`` returned ``None``); parsing errors are not caught.
    """
    for build_link in build_lis:
        build_url = "http://xx.yyfdcw.com" + build_link
        try:
            build_res = requests.get(build_url, headers=self.headers)
        except Exception as e:
            # A failed download only skips this building.
            print("co_index={},楼栋信息错误".format(co_index), e)
            continue
        con = build_res.text

        def field(pattern):
            # First capture group of ``pattern`` in the downloaded page.
            return re.search(pattern, con).group(1)

        bu = Building(co_index)
        bu.co_id = co_id
        # Raw string: ``\d`` in a normal literal is an invalid escape.
        bu.bu_id = re.search(r'Bid=(\d+)', build_link).group(1)
        bu.bu_num = field('名称.*?">(.*?)</spa')
        bu.bu_pre_sale = field("编.*?red'>(.*?)</a")
        bu.bu_pre_sale_date = field('颁发日期.*?Date">(.*?)</span')
        bu.bo_build_start_time = field('开工日期.*?">(.*?)</span')
        bu.bo_build_end_time = field('竣工日期.*?">(.*?)</span')
        bu.bo_develops = field('单位.*?">(.*?)</span')
        bu.bu_floor = field('层数.*?">(.*?)</span')
        bu.bu_live_size = field('住宅面积.*?">(.*?)</span')
        bu.size = field('总面积.*?">(.*?)</span')
        bu.insert_db()
        # Named ``survey_id`` rather than ``id`` so the builtin is not shadowed.
        survey_id = field('测量号.*?">(.*?)</span')
        self.get_house_info(co_id, bu.bu_id, survey_id)
def get_build_info(self, build_id_list, co_id):
    """Query each building by id, store it and hand the page to the house parser.

    One POST is sent per building id through the session ``self.s``; the
    returned markup is parsed with regular expressions, the record is
    written with ``insert_db()`` and the same markup is passed to
    ``self.get_house_info``.

    :param build_id_list: iterable of building ids (sent as the ``pk`` form
        field after ``str()`` conversion).
    :param co_id: community id stored on every building and forwarded to
        ``get_house_info``.
    :raises AttributeError: when an expected field is missing from a
        response (``re.search`` returned ``None``).
    """
    # NOTE(review): a single Building instance is reused for every id, as in
    # the original; every field set below is overwritten on each iteration.
    bu = Building(co_index)
    query_url = 'http://hkrealestate.haikou.gov.cn/wp_myself/housequery/projectBuildHouseAction.php'
    # Loop-invariant request headers, built once.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
        'Referer': 'http://hkrealestate.haikou.gov.cn/wp_myself/housequery/projectBuildingList.php'
    }
    flags = re.S | re.M
    for build_id in build_id_list:
        formdata = {
            "action": "qeurySingleBuilding",
            'pk': str(build_id),
        }
        try:
            build_info = self.s.post(query_url, data=formdata, headers=header)
        except Exception as e:
            print("co_idnex={},楼栋错误".format(co_index), e)
            # Skip this building: without a response there is nothing to
            # parse. Falling through would either raise NameError or store
            # the previous building's data under this id.
            continue
        build_con = build_info.text
        bu.bu_id = build_id
        bu.co_id = co_id
        bu.bu_num = re.search('幢名称.*?<td>(.*?)<', build_con, flags).group(1)
        bu.bu_floor = re.search('总层数.*?<td>(.*?)<', build_con, flags).group(1)
        bu.bu_build_size = re.search('>建筑面积.*?<td>(.*?)<', build_con, flags).group(1)
        bu.bo_develops = re.search('房地产企业.*?">(.*?)</td', build_con, flags).group(1)
        bu.insert_db()
        self.get_house_info(build_con, co_id, build_id)
def parse(self, res):
    """Walk the listing page, store each building and crawl its houses.

    ``res`` is the listing response; its body is decoded as GBK. For every
    ``div.listCon`` entry the building page is downloaded and parsed, the
    building id is read from the project's building-list page, the record is
    written with ``insert_db()`` and ``self.house_parse`` is called with the
    building id. A failure while fetching or parsing one building is logged
    and that entry is skipped.

    :param res: response object of the listing page (``.content`` is used).
    """
    flags = re.S | re.M

    def grab(pattern, text):
        # First capture group of ``pattern`` in ``text``.
        return re.search(pattern, text, flags).group(1)

    page = etree.HTML(res.content.decode('gbk'))
    for entry in page.xpath("//div[@class='listCon']"):
        # These lookups sit outside the try block: a malformed entry raises.
        href = entry.xpath("./a[@class='listCon2']/@href")[0]
        title = entry.xpath("./a[@class='listCon1']/@title")[0]
        url = "http://www.hyfc365.com" + href
        try:
            bu_res = requests.get(url, headers=self.headers)
            content = bu_res.content.decode('gbk')
            bu = Building(co_index)
            bu.bu_num = title
            project_id = re.search('ID=(.*)', href).group(1)
            bu.bu_pre_sale = grab('预售证名称.*?NAME">(.*?)</span', content)
            bu.bu_pre_sale_date = grab('申领时间.*?">(.*?)</span', content)
            bu.bo_develops = grab('申领单位.*?">(.*?)</span', content)
            bu.bu_build_size = grab('"SALE_HOUSE_AREA">(.*?)<', content)
            bu.bu_all_house = grab('"SALE_HOUSE_COUNT">(.*?)<', content)
            detail_url = 'http://www.hyfc365.com/RealEstate/Project/BuildingList.aspx?ID=' + project_id
            # The building id only appears on the project's building list.
            detail_res = requests.get(detail_url)
            bu_id = re.search("BUILDING_ID=(.*?)'", detail_res.text).group(1)
            bu.bu_id = bu_id
            bu.insert_db()
        except Exception as e:
            log.error("{}楼栋页面解析失败{}".format(url, e))
        else:
            # Only reached when the building was stored successfully.
            self.house_parse(bu_id)
def comm(self, id):
    """Store one building and all of its rooms from the JSON API.

    Three endpoints under ``self.start_url`` are queried with ``id``: the
    home-page building info, the detailed building info and the house list.
    The building record is written first; afterwards every room in the
    house list is fetched individually and written with ``insert_db()``.
    A failure on a single room is printed and does not stop the crawl.

    :param id: block / panel number of the building; also stored as
        ``bu_id`` on the building and on every house.
    """
    bu = Building(co_index)
    house_url = self.start_url + "/api/buildInfos/getHouseInfosByPannelNumber?pannelNumber=" + str(id)
    comm_url = self.start_url + "/api/buildInfos/getHomePageBuildingInfo?blockNumber=" + str(id)
    comm_detail_url = self.start_url + "/api/buildInfos/getDetailsBuildingInfo?blockNumber=" + str(id)

    # Fetch everything before decoding, in the same order as before.
    summary_res = requests.get(comm_url)
    detail_res = requests.get(comm_detail_url)
    houses_res = requests.get(house_url)
    summary = json.loads(summary_res.text)
    detail = json.loads(detail_res.text)
    houses = json.loads(houses_res.text)

    bu.bu_id = id
    bu.bu_num = summary["data"]["nameBuildings"]
    bu.area = detail['data']['houseingArea']
    bu.bu_address = summary["data"]["houseaddress"]
    bu.bu_pre_sale = detail["data"]["yszh"]
    bu.bu_type = summary["data"]["propertycategory"]
    bu.bo_develops = summary["data"]["companyName"]
    bu.insert_db()

    for group in houses["data"]:
        # One House object per group, reused for each room in that group.
        ho = House(co_index)
        # An empty group simply yields no iterations.
        for room in group["data"]:
            try:
                room_id = room["houseNumber"]
                room_url = self.start_url + "/api/buildInfos/getHouseInfoByHouseNumber?houseNumber=" + str(room_id)
                room_res = requests.get(room_url, headers=self.headers)
                room_info = json.loads(room_res.text)
                ho.bu_id = id
                # NOTE: ho_num is left unset; only the fields below are stored.
                ho.ho_name = room_info["data"]["houseNo"]
                ho.ho_build_size = room_info["data"]["buildArea"]
                ho.ho_true_size = room_info["data"]["jacketArea"]
                ho.ho_share_size = room_info["data"]["apportionedArea"]
                ho.ho_floor = room_info["data"]["nominalLevel"]
                ho.insert_db()
            except Exception as e:
                print(e)
def get_build_info(self, build_url_list, co_id):
    """Fetch and store each building page, then crawl the houses it links to.

    Every entry of ``build_url_list`` is appended to the site root and
    downloaded. The building fields are extracted with regular expressions,
    the record is written with ``insert_db()`` and all house links found on
    the page are passed to ``self.get_house_info``. Any error for one
    building (download, parsing, house crawl) is printed and the loop moves
    on to the next entry.

    :param build_url_list: iterable of relative building URLs; each must
        contain ``buildingInfoID=<id>&``.
    :param co_id: community id stored on every building and forwarded to
        ``get_house_info``.
    """
    flags = re.S | re.M
    for rel_url in build_url_list:
        build_url = 'http://www.fjlyfdc.com.cn/' + rel_url
        try:
            build = Building(co_index)
            response = requests.get(build_url, headers=self.headers)
            html = response.text

            def cell(label):
                # Text of the first <td> that follows ``label`` in the page.
                return re.search(label + '.*?<td.*?>(.*?)<', html, flags).group(1)

            build.bu_id = re.search('buildingInfoID=(.*?)&', build_url).group(1)
            build.co_id = co_id
            build.bo_develops = cell('开发商:')
            build.co_name = cell('项目名称:')
            build.bu_address = cell('坐落位置:')
            build.bu_num = cell('幢号:')
            build.co_build_structural = cell('建筑结构:')
            build.bu_type = cell('设计用途:')
            build.bu_floor = cell('总 层 数:')
            build.co_all_size = cell('总面积:')
            build.bo_build_start_time = cell('开工日期:')
            build.insert_db()
            # Raw string: ``\?`` in a normal literal is an invalid escape.
            house_url_list = re.findall(
                r'href="(/House/HouseInfo\?HouseCenterID=.*?)"', html, flags)
            self.get_house_info(house_url_list, build.bu_id, co_id)
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
def get_build_info(self, bu_pre_sale, bo_develops, bu_co_name, bu_con):
    """Parse one building's markup and write the record to the database.

    :param bu_pre_sale: pre-sale value, stored unchanged on the building.
    :param bo_develops: developer value, stored unchanged on the building.
    :param bu_co_name: community name, stored as ``co_name``.
    :param bu_con: markup fragment describing the building; all other
        fields are extracted from it with regular expressions.
    :raises AttributeError: when an expected field is missing from
        ``bu_con`` (``re.search`` returned ``None``).
    """
    flags = re.S | re.M

    def field(pattern):
        # First capture group of ``pattern`` in the building markup.
        return re.search(pattern, bu_con, flags).group(1)

    build = Building(co_index)
    # Raw strings throughout: ``\d`` in a normal literal is an invalid escape.
    build.bu_id = field(r'编号.*?>(\d+)<')
    build.bu_num = field(r'幢号.*?>(\d+)<')
    build.bu_floor = field(r'总层数.*?>(\d+)<')
    # NOTE(review): the "." between the digit groups is unescaped and so
    # matches any character; kept as-is to preserve what currently matches.
    build.bu_build_size = field(r'预售建筑面积.*?>(\d+.\d+)<')
    build.bu_address = field(r'楼房坐落.*?;">(.*?)</span')
    build.bu_live_size = field(r'住宅建筑面积.*?>(\d+.\d+)<')
    build.bu_not_live_size = field(r'非住宅建筑面积.*?;">(.*?)</span')
    build.bo_build_start_time = field(r'开工日期.*?;">(.*?)</span')
    build.bu_all_house = field(r'总套数.*?>(\d+)<')
    build.bu_pre_sale = bu_pre_sale
    build.bo_develops = bo_develops
    build.co_name = bu_co_name
    build.insert_db()
def get_build_detail(self, all_building_url_list):
    """Store every building listed on the given pages and collect house URLs.

    Each URL in ``all_building_url_list`` is downloaded. Values shared by
    all buildings on the page (developer, sale area, pre-sale certificate,
    floors, house count, usage, project name) are read once; then one
    ``Building`` is stored per row of the ``houseTable_1`` table. Errors on
    a page or on a single row are printed and skipped.

    :param all_building_url_list: iterable of absolute page URLs.
    :return: list of absolute ``roomTable.aspx`` URLs for the buildings
        that were stored; URLs from later pages come first.
    """
    flags = re.S | re.M
    house_url_list = []
    for page_url in all_building_url_list:
        try:
            response = requests.get(page_url, headers=self.headers)
            html = response.text
            tree = etree.HTML(html)
            # Developer
            bo_develops = tree.xpath('//*[@id="content_1"]/div[3]/text()[2]')[0]
            # Sale area: stays an empty list when the cell is missing.
            bu_build_size = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[6]/a/text()')
            if bu_build_size:
                bu_build_size = bu_build_size[0]
            # Pre-sale certificate: same fallback as the sale area.
            bu_pre_sale = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[1]/a/text()')
            if bu_pre_sale:
                bu_pre_sale = bu_pre_sale[0]
            # Total floors
            bu_floor = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[3]/a/text()')[0]
            # Total number of houses
            bu_all_house = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[4]/a/text()')[0]
            # Usage of the building
            bu_type = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[5]/a/text()')[0]
            build_html = re.search('houseTable_1.*?当前共有', html, flags).group()
            build_detail_html = re.findall(
                'class.*?</a></td>.*?</a></td>.*?</a></td>', build_html, flags)
            bu_num = re.findall('项目名称:</b>(.*?)</div>', html, flags)[0].strip()
            url_list = []
            for row in build_detail_html:
                try:
                    build = Building(co_index)
                    # Raw strings: ``\?`` in a normal literal is an invalid escape.
                    build.bu_id = re.search(
                        r"href='roomTable.aspx\?id=(.*?)&", row, flags).group(1)
                    build.bu_address = re.search(
                        "_blank.*?_blank'>(.*?)</a></td><td>", row, flags).group(1).strip()
                    build.bo_develops = bo_develops
                    build.bu_build_size = bu_build_size
                    build.bu_pre_sale = bu_pre_sale
                    build.bu_num = bu_num
                    build.bu_floor = bu_floor
                    build.bu_all_house = bu_all_house
                    build.bu_type = bu_type
                    # Every area name is checked, so the last match wins.
                    for area_name in self.area_list:
                        if area_name in build.bu_address:
                            build.area = area_name
                    build.insert_db()
                    house_url = re.search(
                        r"(roomTable.aspx\?id=.*?&vc=.*?)'", row, flags).group(1)
                    url_list.append(
                        'http://dgfc.dg.gov.cn/dgwebsite_v2/Vendition/' + house_url)
                except Exception as e:
                    print('楼栋错误,co_index={},url={}'.format(co_index, page_url), e)
            # URLs from this page are placed in front of earlier pages' URLs.
            house_url_list = url_list + house_url_list
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, page_url), e)
    return house_url_list