def build_info(self, co_id, temp_url_list):
    """Scrape the building table of every listing page and store each row.

    For each relative URL in ``temp_url_list`` the page is fetched, decoded
    as GBK, and every ``<tr class="indextabletxt">`` row is turned into a
    ``Building`` on which ``insert_db()`` is called.  After a page has been
    processed without error, ``self.house_info`` is called with the link and
    id of the last row parsed on that page.

    Any exception raised while fetching or parsing a page is printed and the
    page is skipped.

    Args:
        co_id: Identifier copied onto every ``Building`` and forwarded to
            ``self.house_info``.
        temp_url_list: Iterable of URL paths relative to
            ``http://222.77.178.63:7002/``.
    """
    for temp_url in temp_url_list:
        # Reset per page: a page without rows must not reuse the values left
        # over from the previous page (or hit a NameError on the first page).
        bu = None
        ho_url = None
        try:
            build_url = "http://222.77.178.63:7002/" + temp_url
            res = requests.get(build_url, headers=self.headers)
            html = etree.HTML(res.content.decode('gbk'))
            for build_info in html.xpath("//tr[@class='indextabletxt']"):
                bu = Building(co_index)
                ho_url = build_info.xpath("./td/a/@href")[0]
                bu.co_id = co_id
                bu.bu_id = re.search('Param=(.*)', ho_url).group(1)
                bu.bu_num = build_info.xpath("./td/a/text()")[0]
                bu.bu_all_house = build_info.xpath("./td[2]/text()")[0]
                # The 3rd and 5th cells may be empty; store None instead of
                # relying on a bare ``except`` to catch the IndexError.
                all_size = build_info.xpath("./td[3]/text()")
                bu.bu_all_size = all_size[0] if all_size else None
                live_size = build_info.xpath("./td[5]/text()")
                bu.bu_live_size = live_size[0] if live_size else None
                bu.insert_db()
        except Exception as e:
            print('楼栋信息错误{}'.format(e))
            continue
        if bu is None:
            # No building rows on this page, so there is nothing to crawl.
            continue
        # NOTE(review): only the last row of the page is forwarded here;
        # confirm whether house_info should run for every row instead.
        self.house_info(ho_url, co_id, bu.bu_id)
def get_build_info(self, build_lis, co_id):
    """Fetch each building detail page, store it and crawl its houses.

    For every relative URL in ``build_lis`` the page is downloaded, the
    building attributes are extracted from the HTML with regular
    expressions, ``insert_db()`` is called on the resulting ``Building``,
    and ``self.get_house_info`` is invoked with the page's survey number.

    A failed HTTP request is printed and the URL is skipped.

    Args:
        build_lis: Iterable of URL paths relative to
            ``http://xx.yyfdcw.com``; each must contain ``Bid=<digits>``.
        co_id: Identifier copied onto every ``Building`` and forwarded to
            ``self.get_house_info``.

    Raises:
        AttributeError: If an expected field is missing from a page
            (``re.search`` returns ``None``).
    """
    for build_ in build_lis:
        build_url = "http://xx.yyfdcw.com" + build_
        try:
            build_res = requests.get(build_url, headers=self.headers)
        except Exception as e:
            print("co_index={},楼栋信息错误".format(co_index), e)
            continue
        con = build_res.text
        bu = Building(co_index)
        bu.co_id = co_id
        # Raw string: '\d' in a plain literal is an invalid escape sequence.
        bu.bu_id = re.search(r'Bid=(\d+)', build_).group(1)
        bu.bu_num = re.search('名称.*?">(.*?)</spa', con).group(1)
        bu.bu_pre_sale = re.search("编.*?red'>(.*?)</a", con).group(1)
        bu.bu_pre_sale_date = re.search('颁发日期.*?Date">(.*?)</span', con).group(1)
        bu.bo_build_start_time = re.search('开工日期.*?">(.*?)</span', con).group(1)
        bu.bo_build_end_time = re.search('竣工日期.*?">(.*?)</span', con).group(1)
        bu.bo_develops = re.search('单位.*?">(.*?)</span', con).group(1)
        bu.bu_floor = re.search('层数.*?">(.*?)</span', con).group(1)
        bu.bu_live_size = re.search('住宅面积.*?">(.*?)</span', con).group(1)
        # NOTE(review): ``size`` lacks the ``bu_`` prefix used by the other
        # fields; confirm this is the attribute Building expects.
        bu.size = re.search('总面积.*?">(.*?)</span', con).group(1)
        bu.insert_db()
        # Renamed from ``id`` so the builtin is not shadowed.
        survey_no = re.search('测量号.*?">(.*?)</span', con).group(1)
        self.get_house_info(co_id, bu.bu_id, survey_no)
def get_build_info(self, url, co_id):
    """Scrape one building page, store the building and its houses.

    The page at ``url`` is downloaded, the building attributes are read via
    XPath, and ``insert_db()`` is called on the resulting ``Building``.  The
    HTML after ``<tr id="row3">`` is then split into ``<td>`` cells; every
    cell containing ``<br>`` yields ``House`` records that are inserted one
    by one.

    Errors are printed rather than raised: a failure while building one
    ``House`` skips only that house, any other failure aborts the page.
    ``KeyboardInterrupt`` and ``SystemExit`` are allowed to propagate.

    Args:
        url: Absolute page URL; it must end in ``?<digits>``, which becomes
            the building id.
        co_id: Identifier copied onto the ``Building`` and every ``House``.
    """
    try:
        building = Building(co_index)
        response = requests.get(url)
        html = response.text
        tree = etree.HTML(html)
        co_name = tree.xpath('//*[@id="PageB_Location"]/text()')[0]  # community name
        print(co_name)
        bu_name = tree.xpath('//*[@id="ItemName"]/text()')[0]  # building name
        bu_num = tree.xpath('//*[@id="PageB_HouseNo"]/text()')[0]  # building number
        bu_all_house = tree.xpath('//*[@id="lb_countbulidtaoshu"]/text()')[0]  # total units
        bu_floor = tree.xpath('//*[@id="cell3-1"]/text()')
        bu_floor = self.is_none(bu_floor)  # floors
        bu_build_size = tree.xpath('//*[@id="lb_countbulidarea"]/text()')[0]  # building area
        bu_live_size = tree.xpath('//*[@id="lb_buildarea"]/text()')[0]  # residential area
        bu_price = tree.xpath('//*[@id="lb_buildavg"]/text()')
        bu_price = self.is_none(bu_price)  # residential price
        # Raw string: '\?' and '\d' are invalid escapes in a plain literal.
        bu_id = re.search(r'\?(\d+)$', url).group(1)  # building id

        building.co_id = co_id
        building.bu_name = bu_name
        building.bu_num = bu_num
        building.bu_all_house = bu_all_house
        building.bu_floor = bu_floor
        building.bu_build_size = bu_build_size
        building.bu_live_size = bu_live_size
        building.bu_price = bu_price
        building.bu_id = bu_id
        building.insert_db()

        house_info_html = re.findall('<tr id="row3">(.*)$', html, re.S | re.M)[0]
        for cell in re.findall('(<td.*?>.*?</td>)', house_info_html, re.S | re.M):
            if '<br>' not in cell:
                continue
            ho_name_list = re.findall('<td.*?>(.*?)<br>', cell, re.S | re.M)
            ho_true_size_list = re.findall('<td.*?>.*?<br>(.*?)<br>', cell, re.S | re.M)
            ho_type = re.findall('<td.*?>.*?<br>.*?<br>(.*?)<br>', cell, re.S | re.M)[0]
            # A distinct index name keeps the outer loop variable intact.
            for idx, raw_name in enumerate(ho_name_list):
                try:
                    if 'font' in raw_name:
                        ho_name = re.sub('<font.*?>', '', raw_name)
                    else:
                        ho_name = raw_name
                    # NOTE(review): hard-coded 8 while Building uses
                    # co_index; confirm both refer to the same source id.
                    house = House(8)
                    house.ho_name = ho_name
                    # Indexed inside the try so a missing size is reported
                    # per house instead of aborting the whole page.
                    house.ho_true_size = ho_true_size_list[idx]
                    house.co_id = co_id
                    house.bu_id = bu_id
                    house.ho_type = ho_type
                    house.insert_db()
                except Exception as e:
                    print(e)
    except Exception as e:
        # Was ``BaseException``, which also swallowed Ctrl-C and SystemExit.
        print(e)
def get_build_info(self, build_url_list, co_name):
    """Crawl building list pages, store each building and crawl its houses.

    For every relative URL in ``build_url_list`` the list page is fetched and
    scanned for ``Pub_dtxx.aspx?ProjectBuildingID=...`` links.  Each linked
    detail page is fetched, its fields are copied onto the ``Building``,
    ``insert_db()`` is called, and the house links found on the page are
    passed to ``self.get_house_info``.

    Exceptions are printed, not raised: a failure on a detail page skips that
    page, a failure on a list page skips that list page.

    Args:
        build_url_list: Iterable of URL paths relative to
            ``http://www.sxczfdc.com/pubinfo/``.
        co_name: Community name stored on each ``Building`` and forwarded to
            ``self.get_house_info``.
    """
    for list_path in build_url_list:
        try:
            # NOTE(review): one Building instance is reused for every detail
            # page found on this list page; confirm insert_db() copes with
            # repeated calls on the same object.
            build = Building(co_index)
            build.co_name = co_name
            build_url = 'http://www.sxczfdc.com/pubinfo/' + list_path
            response = requests.get(build_url, headers=self.headers)
            html = response.text
            # Raw string: '\?' is an invalid escape in a plain literal.
            detail_paths = re.findall(
                r'(Pub_dtxx.aspx\?ProjectBuildingID=.*?)"', html, re.S | re.M)
            for detail_path in detail_paths:
                try:
                    build_url_detail = 'http://www.sxczfdc.com/pubinfo/' + detail_path
                    result = requests.get(build_url_detail, headers=self.headers)
                    content = result.text
                    build.bu_num = re.findall(
                        'BuildingInfo1_lblBuildingName">(.*?)<', content, re.S | re.M)[0]
                    build.bu_all_house = re.findall(
                        'BuildingInfo1_lblZts">(.*?)<', content, re.S | re.M)[0]
                    build.bu_floor = re.findall(
                        'BuildingInfo1_lblZcs">(.*?)<', content, re.S | re.M)[0]
                    build.bu_build_size = re.findall(
                        'BuildingInfo1_lblJzmj">(.*?)<', content, re.S | re.M)[0]
                    build.bu_live_size = re.findall(
                        'BuildingInfo1_lblZzmj">(.*?)<', content, re.S | re.M)[0]
                    build.bu_pre_sale = re.findall(
                        'BuildingInfo1_lblYsxkzh">(.*?)<', content, re.S | re.M)[0]
                    build.bu_pre_sale_date = re.findall(
                        'BuildingInfo1_lblYsxkzfzrq">(.*?)<', content, re.S | re.M)[0]
                    build.insert_db()
                    # Raw string: '\(' and '\)' are invalid escapes otherwise.
                    house_url_list = re.findall(
                        r"onClick=.getMoreHouseInfo\('(.*?)'\)", content, re.S | re.M)
                    self.get_house_info(house_url_list, co_name, build.bu_num)
                except Exception as e:
                    print(e)
        except Exception as e:
            print(e)
def get_build_info(self, presell_url_list, co_id):
    """Crawl pre-sale pages, store every linked building and crawl its houses.

    Each pre-sale page (``self.url`` + relative path) is fetched and scanned
    for building links.  Every building page is fetched, its fields are
    extracted with regular expressions, ``insert_db()`` is called on the
    resulting ``Building``, and the house identifiers found on the page are
    passed to ``self.get_house_info``.

    A failure while fetching or parsing a building page is printed and that
    building is skipped.  The pre-sale page request itself is not guarded.

    Args:
        presell_url_list: Iterable of URL paths relative to ``self.url``.
        co_id: Identifier copied onto every ``Building`` and forwarded to
            ``self.get_house_info``.
    """
    flags = re.S | re.M
    # Raw literal plus a separate '"' keeps the pattern byte-identical to the
    # original while avoiding the invalid '\(' and '\)' escape sequences.
    house_pattern = r"getMoreHouseInfo\('(.*?)'\)" + '"'

    for presell_url in presell_url_list:
        pre_url = self.url + presell_url
        res = requests.get(pre_url, headers=self.headers)
        build_url_list = re.findall('【<a href="(.*?)" target="_self"', res.text, flags)
        for build_url in build_url_list:
            build_info_url = self.url + build_url
            try:
                build_res = requests.get(build_info_url, headers=self.headers)
                con = build_res.text
                bu = Building(co_index)
                bu.co_id = co_id
                bu.bu_id = re.search(r'ID=(\d+)', build_url).group(1)
                bu.bu_num = re.search('栋.*?号.*?BuildingName">(.*?)</span', con, flags).group(1)
                bu.bu_floor = re.search(r'总 层 数.*?(\d+)</span', con, flags).group(1)
                bu.bu_build_size = re.search('建筑面积.*?Jzmj">(.*?)</span', con, flags).group(1)
                bu.bu_live_size = re.search('住宅面积.*?Zzmj">(.*?)</span', con, flags).group(1)
                bu.bu_not_live_size = re.search(
                    '非住宅面积.*?Fzzmj">(.*?)</span', con, flags).group(1)
                bu.bu_pre_sale = re.search('预售许可证.*?xkzh">(.*?)</span', con, flags).group(1)
                bu.bu_pre_sale_date = re.search('发证日期.*?fzrq">(.*?)</span', con, flags).group(1)
                bu.bu_type = re.search('项目类型.*?Type">(.*?)</span', con, flags).group(1)
                bu.insert_db()
            except Exception as e:
                print("co_index={},楼栋信息错误".format(co_index), e)
                continue
            house_detail_list = re.findall(house_pattern, con, flags)
            self.get_house_info(co_id, bu.bu_id, house_detail_list)
def bu_parse(self, co_id, bulist):
    """Fetch each building page, store the building and parse its houses.

    For every relative URL in ``bulist`` the page is downloaded, the building
    fields are extracted with regular expressions, ``insert_db()`` is called
    on the resulting ``Building``, and the ``<a>`` elements found under
    ``<td style=...>`` cells are handed to ``self.ho_parse``.

    Args:
        co_id: Identifier copied onto every ``Building`` and forwarded to
            ``self.ho_parse``.
        bulist: Iterable of URL paths relative to ``http://110.89.45.7:8082``;
            each must contain ``buildingInfoID=<value>&``.

    Raises:
        AttributeError: If an expected field is missing (``re.search``
            returns ``None``).  Request errors also propagate.
    """
    base_url = "http://110.89.45.7:8082"
    page_flags = re.S | re.M

    def first_group(pattern, text):
        # Every page field is the first capture group of a multi-line search.
        return re.search(pattern, text, page_flags).group(1)

    for rel_path in bulist:
        response = requests.get(base_url + rel_path, headers=self.headers)
        page = response.text

        building = Building(co_index)
        building.co_id = co_id
        # The id comes from the link itself, not from the page body.
        building.bu_id = re.search('buildingInfoID=(.*?)&', rel_path).group(1)
        building.bu_num = first_group('幢号.*?">(.*?)</', page)
        building.bu_floor = first_group('总 层 数.*?">(.*?)</', page)
        building.bu_live_size = first_group('批准销售.*?">.*?</td.*?">(.*?)</td', page)
        building.bu_all_size = first_group('总面积.*?">(.*?)</', page)
        building.bu_type = first_group('设计用途.*?">(.*?)</', page)
        building.insert_db()

        house_links = etree.HTML(page).xpath("//td[@style]/a")
        self.ho_parse(co_id, building.bu_id, house_links)
def get_build_info(self, bu_pre_sale, bo_develops, bu_co_name, bu_con):
    """Parse one building's HTML fragment and store it.

    The building fields are extracted from ``bu_con`` with regular
    expressions, the three caller-supplied values are copied onto the
    ``Building``, and ``insert_db()`` is called.

    Args:
        bu_pre_sale: Value stored as ``Building.bu_pre_sale``.
        bo_develops: Value stored as ``Building.bo_develops``.
        bu_co_name: Value stored as ``Building.co_name``.
        bu_con: HTML text describing the building.

    Raises:
        AttributeError: If an expected field is missing from ``bu_con``
            (``re.search`` returns ``None``).
    """
    flags = re.S | re.M
    build = Building(co_index)
    # Raw strings throughout: '\d' in a plain literal is an invalid escape
    # sequence.  The pattern values themselves are unchanged.
    build.bu_id = re.search(r'编号.*?>(\d+)<', bu_con, flags).group(1)
    build.bu_num = re.search(r'幢号.*?>(\d+)<', bu_con, flags).group(1)
    build.bu_floor = re.search(r'总层数.*?>(\d+)<', bu_con, flags).group(1)
    # NOTE(review): the '.' between the digit groups is unescaped, so it
    # matches any character, not only a decimal point; kept as-is.
    build.bu_build_size = re.search(r'预售建筑面积.*?>(\d+.\d+)<', bu_con, flags).group(1)
    build.bu_address = re.search('楼房坐落.*?;">(.*?)</span', bu_con, flags).group(1)
    # NOTE(review): this label is a substring of the "非住宅建筑面积" label
    # used below; confirm the page lists the residential field first.
    build.bu_live_size = re.search(r'住宅建筑面积.*?>(\d+.\d+)<', bu_con, flags).group(1)
    build.bu_not_live_size = re.search('非住宅建筑面积.*?;">(.*?)</span', bu_con, flags).group(1)
    build.bo_build_start_time = re.search('开工日期.*?;">(.*?)</span', bu_con, flags).group(1)
    build.bu_all_house = re.search(r'总套数.*?>(\d+)<', bu_con, flags).group(1)
    build.bu_pre_sale = bu_pre_sale
    build.bo_develops = bo_develops
    build.co_name = bu_co_name
    build.insert_db()