def bu_parse(self, bu_url, co_id, co_url):
    """Fetch one community's building-list page and insert a Building per row.

    :param bu_url: relative path of the building-list page
    :param co_id: community id copied onto every Building record
    :param co_url: community detail page url, sent as the Referer header
    """
    build_url = "http://61.143.241.154/" + bu_url
    # FIX: the original declared ``global headers`` and mutated module state on
    # every call; the dict is only used for this one request, so keep it local.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
        'Referer': co_url
    }
    bu_res = requests.get(build_url, headers=headers)
    bu_con = bu_res.content.decode('gbk')  # site serves GBK-encoded pages
    # Presale licence number and validity date are page-level values shared by
    # every building row on this page.
    bu_pre_sale = re.search('预售许可证编号.*?blank">(.*?)</a', bu_con, re.S | re.M).group(1)
    bu_pre_sale_date = re.search('预售证有效日期.*?">(.*?)</td', bu_con, re.S | re.M).group(1)
    bu_html = etree.HTML(bu_con)
    bu_list = bu_html.xpath("//table[@id='donglist']//tr")
    for bo in bu_list:
        bu = Building(co_index)
        bu.co_id = co_id
        bo_url = bo.xpath("./td/a/@href")[0]
        bu.bu_id = re.search('dbh=(.*?)&', bo_url).group(1)
        bu.bu_num = bo.xpath("./td[3]/text()")[0]
        bu.bu_floor = bo.xpath("./td[4]/text()")[0]
        bu.bu_pre_sale = bu_pre_sale
        bu.bu_pre_sale_date = bu_pre_sale_date
        bu.insert_db()
        self.house_parse(bo_url, co_id, bu.bu_id)
def parse(self, res):
    """Parse a GBK building-listing response and insert one Building per entry.

    For every listing item: fetch its detail page, extract presale fields,
    resolve the building id from the project's building-list page, persist the
    record, then delegate house parsing. One bad entry is logged and skipped.

    :param res: response object for the listing page (GBK encoded)
    """
    html = etree.HTML(res.content.decode('gbk'))
    bu_list = html.xpath("//div[@class='listCon']")
    for i in bu_list:
        temp = i.xpath("./a[@class='listCon2']/@href")[0]
        name = i.xpath("./a[@class='listCon1']/@title")[0]
        url = "http://www.hyfc365.com" + temp
        try:
            bu_res = requests.get(url, headers=self.headers)
            content = bu_res.content.decode('gbk')
            bu = Building(co_index)
            bu.bu_num = name
            project_id = re.search('ID=(.*)', temp).group(1)
            bu.bu_pre_sale = re.search('预售证名称.*?NAME">(.*?)</span', content, re.S | re.M).group(1)
            bu.bu_pre_sale_date = re.search('申领时间.*?">(.*?)</span', content, re.S | re.M).group(1)
            bu.bo_develops = re.search('申领单位.*?">(.*?)</span', content, re.S | re.M).group(1)
            bu.bu_build_size = re.search('"SALE_HOUSE_AREA">(.*?)<', content, re.S | re.M).group(1)
            bu.bu_all_house = re.search('"SALE_HOUSE_COUNT">(.*?)<', content, re.S | re.M).group(1)
            detail_url = 'http://www.hyfc365.com/RealEstate/Project/BuildingList.aspx?ID=' + project_id
            # FIX: this request previously went out without the crawler's
            # headers while every other request in the method sends them.
            detail_res = requests.get(detail_url, headers=self.headers)
            bu_id = re.search("BUILDING_ID=(.*?)'", detail_res.text).group(1)
            bu.bu_id = bu_id
            bu.insert_db()
        except Exception as e:
            log.error("{}楼栋页面解析失败{}".format(url, e))
            continue
        # Only reached when the try block succeeded, so ``bu_id`` is bound.
        self.house_parse(bu_id)
def get_build_info(self, build_lis, co_id):
    """Fetch each building page, insert a Building, then crawl its houses.

    :param build_lis: iterable of relative building-page paths
    :param co_id: community id copied onto each Building
    """
    for build_ in build_lis:
        build_url = "http://xx.yyfdcw.com" + build_
        try:
            build_res = requests.get(build_url, headers=self.headers)
        except Exception as e:
            print("co_index={},楼栋信息错误".format(co_index), e)
            continue
        # FIX: parsing was previously unguarded — a single page whose markup
        # misses one field raised AttributeError on ``.group(1)`` and aborted
        # the whole loop; now it is logged and the loop moves on (matching the
        # error-handling style of the sibling crawlers in this file).
        try:
            con = build_res.text
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_id = re.search(r'Bid=(\d+)', build_).group(1)
            bu.bu_num = re.search('名称.*?">(.*?)</spa', con).group(1)
            bu.bu_pre_sale = re.search("编.*?red'>(.*?)</a", con).group(1)
            bu.bu_pre_sale_date = re.search('颁发日期.*?Date">(.*?)</span', con).group(1)
            bu.bo_build_start_time = re.search('开工日期.*?">(.*?)</span', con).group(1)
            bu.bo_build_end_time = re.search('竣工日期.*?">(.*?)</span', con).group(1)
            bu.bo_develops = re.search('单位.*?">(.*?)</span', con).group(1)
            bu.bu_floor = re.search('层数.*?">(.*?)</span', con).group(1)
            bu.bu_live_size = re.search('住宅面积.*?">(.*?)</span', con).group(1)
            bu.size = re.search('总面积.*?">(.*?)</span', con).group(1)
            bu.insert_db()
            # FIX: this local was named ``id``, shadowing the builtin.
            survey_id = re.search('测量号.*?">(.*?)</span', con).group(1)
        except Exception as e:
            print("co_index={},楼栋信息错误".format(co_index), e)
            continue
        self.get_house_info(co_id, bu.bu_id, survey_id)
def build_info(self, bu_list, co_id):
    """For each table row, fetch its building page and insert every 栋 entry.

    :param bu_list: iterable of lxml row elements containing a building link
    :param co_id: community id copied onto each Building
    """
    # FIX: the loop variable was named ``bu`` and then overwritten by
    # ``bu = Building(...)`` inside the inner loop — renamed to ``row`` so the
    # two no longer shadow each other.
    for row in bu_list:
        bu_url = row.xpath("./td[4]/a/@href")[0]
        build_url = self.start_url + '/' + bu_url
        bu_res = requests.get(build_url, headers=self.headers)
        bu_res.encoding = 'gbk'  # force correct decode of the GBK page
        con = bu_res.text
        # Licence fields are page-level and shared by every building row below.
        bu_pre_sale = re.search('预售许可证编号.*?blank">(.*?)</a', con, re.S | re.M).group(1)
        bu_pre_sale_date = re.search('预售证有效日期.*?">(.*?)</td', con, re.S | re.M).group(1)
        bu_html = etree.HTML(con)
        donglist = bu_html.xpath("//table[@id='donglist']/tr")
        for dong in donglist:
            dong_url = dong.xpath("./td/a/@href")[0]
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_id = re.search('ID={(.*?)}', dong_url).group(1)
            bu.bu_num = dong.xpath("./td[3]/text()")[0]
            bu.bu_floor = dong.xpath("./td[4]/text()")[0]
            bu.bu_pre_sale = bu_pre_sale
            bu.bu_pre_sale_date = bu_pre_sale_date
            bu.insert_db()
            self.house_info(co_id, bu.bu_id, dong_url)
def get_build_info(self, comm_url_list):
    """For each community url, store its Building, then every house row.

    The sid/pid pair is pulled out of the community url, the building page is
    parsed into one Building, and the house XML endpoint is then parsed into
    House rows. A failed building is logged and skipped entirely.
    """
    for i in comm_url_list:
        try:
            sid = re.findall(r'\+(\d+)\+', i)[0]
            pid = re.findall(r'\+(\d+)\+', i)[1]
            build_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/bldg_query.aspx?pid=' + pid + '&sid=' + sid
            response = requests.get(build_url)
            html = response.text
            build = Building(co_index)
            build.bu_id = pid
            build.bu_num = re.search('楼栋座落.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            build.bu_address = re.search('楼栋座落.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            build.bu_pre_sale = re.search('预售证号.*?">(.*?) ', html, re.S | re.M).group(1)
            build.bu_pre_sale_date = re.search('时间.*?">(.*?) ', html, re.S | re.M).group(1)
            build.bu_all_house = re.search('dM.*?">(.*?) ', html, re.S | re.M).group(1)
            build.insert_db()
        except Exception as e:
            print('co_index={}, 楼栋错误,url={}'.format(co_index, build_url), e)
            # FIX: the original fell through into the house section here, so an
            # early failure left ``sid``/``build`` unbound and raised NameError.
            continue
        # FIX: the query string previously read 'WWW_LPB_001¶ms=' — HTML-entity
        # mojibake ('&para' rendered as '¶') of the intended '&params='.
        house_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/proxp.aspx?key=WWW_LPB_001&params=' + sid
        result = requests.get(house_url)
        html_ = result.text
        for house_info in re.findall('<Result.*?</Result>', html_, re.S | re.M):
            try:
                house = House(co_index)
                house.bu_id = build.bu_id
                house.bu_num = build.bu_num
                house.ho_name = re.search('<ONAME>(.*?)</ONAME>', house_info, re.S | re.M).group(1)
                house.ho_num = re.search('<OSEQ>(.*?)</OSEQ>', house_info, re.S | re.M).group(1)
                house.ho_build_size = re.search('<BAREA>(.*?)</BAREA>', house_info, re.S | re.M).group(1)
                house.ho_floor = re.search('<FORC>(.*?)</FORC>', house_info, re.S | re.M).group(1)
                house.ho_true_size = re.search('<PAREA>(.*?)</PAREA>', house_info, re.S | re.M).group(1)
                house.insert_db()
            except Exception as e:
                print('co_index={}, 房号错误'.format(co_index), e)
def get_build_info(self, build_url_list, co_id):
    """Fetch each project page, then every building-detail page inside it.

    :param build_url_list: iterable of relative project-page paths
    :param co_id: (kept for interface compatibility) community name/id
    """
    for i in build_url_list:
        try:
            build_url = 'http://www.sxczfdc.com/pubinfo/' + i
            response = requests.get(build_url, headers=self.headers)
            html = response.text
            for k in re.findall(r'(Pub_dtxx.aspx\?ProjectBuildingID=.*?)"', html, re.S | re.M):
                try:
                    # FIX: a single Building instance used to be created once
                    # per project and mutated across every detail page, so a
                    # partial parse failure left stale fields from the previous
                    # building; create a fresh record per detail page instead.
                    build = Building(co_index)
                    build.co_name = co_id
                    build_url_detail = 'http://www.sxczfdc.com/pubinfo/' + k
                    result = requests.get(build_url_detail, headers=self.headers)
                    content = result.text
                    build.bu_num = re.findall(
                        'BuildingInfo1_lblBuildingName">(.*?)<', content, re.S | re.M)[0]
                    build.bu_all_house = re.findall(
                        'BuildingInfo1_lblZts">(.*?)<', content, re.S | re.M)[0]
                    build.bu_floor = re.findall(
                        'BuildingInfo1_lblZcs">(.*?)<', content, re.S | re.M)[0]
                    build.bu_build_size = re.findall(
                        'BuildingInfo1_lblJzmj">(.*?)<', content, re.S | re.M)[0]
                    build.bu_live_size = re.findall(
                        'BuildingInfo1_lblZzmj">(.*?)<', content, re.S | re.M)[0]
                    build.bu_pre_sale = re.findall(
                        'BuildingInfo1_lblYsxkzh">(.*?)<', content, re.S | re.M)[0]
                    build.bu_pre_sale_date = re.findall(
                        'BuildingInfo1_lblYsxkzfzrq">(.*?)<', content, re.S | re.M)[0]
                    build.insert_db()
                    house_url_list = re.findall(
                        "onClick=.getMoreHouseInfo\('(.*?)'\)", content, re.S | re.M)
                    self.get_house_info(house_url_list, co_id, build.bu_num)
                except Exception as e:
                    print(e)
        except Exception as e:
            print(e)
def build_parse(self, co_id):
    """Fetch the project's frame page, then each building-detail page.

    Inserts one Building per detail page and delegates house parsing via the
    index-aligned house-list links.

    :param co_id: project/community id used in the query string
    """
    url = "http://spf.tlfdc.cn/prjleft.aspx?projectid=" + str(co_id)
    res = requests.get(url, headers=self.headers)
    con_html = etree.HTML(res.text)
    build_url_list = con_html.xpath("//td[@colspan='2']/a/@href")[4:-1]
    # House-list cells, aligned by index with build_url_list.
    a = con_html.xpath("//td[@width='54%']")
    for index in range(0, len(build_url_list)):
        try:
            build_info_url = "http://spf.tlfdc.cn/" + build_url_list[index]
            res = requests.get(build_info_url, headers=self.headers)
            con = res.text
            # FIX: the Building was created once before the loop, so every
            # iteration mutated and re-inserted the same object, and a failed
            # iteration could leak stale fields into the next insert; create a
            # fresh record per building instead.
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_pre_sale_date = re.search('发证日期.*?Date">(.*?)<', con, re.S | re.M).group(1)
            bu.bu_num = re.search('幢.*?did">(.*?)<', con, re.S | re.M).group(1)
            bu.bu_pre_sale = re.search('编号.*?no">(.*?)<', con, re.S | re.M).group(1)
            bu.bu_address = re.search('位置.*?ss">(.*?)<', con, re.S | re.M).group(1)
            bu.bu_build_size = re.search('面积.*?Area">(.*?)<', con, re.S | re.M).group(1)
            bu.bu_type = re.search('性质.*?type">(.*?)<', con, re.S | re.M).group(1)
            bu.bu_all_house = re.search('套数.*?number">(.*?)<', con, re.S | re.M).group(1)
            bu.bu_id = re.search(r'id=(\d+)', build_url_list[index]).group(1)
            bu.insert_db()
        except Exception as e:
            print(
                '楼栋错误,co_index={},url={}'.format(co_index, build_info_url), e)
            continue
        try:
            house_url = a[index].xpath("./a/@href")[0]
            self.house_parse(house_url, co_id, bu.bu_id)
        except Exception as e:
            continue
def get_build_info(self, build_url_list, co_id):
    """Parse each project page's building table and insert the Buildings.

    :param build_url_list: iterable of relative project paths (HTML-escaped
        ampersands are un-escaped before the request)
    :param co_id: community id copied onto each Building
    """
    for i in build_url_list:
        build_url = 'http://gold.ncfdc.com.cn/' + i.replace('amp;', '')
        res = requests.get(build_url)
        co_name = re.search('ctl15_proname">(.*?)<', res.text, re.S | re.M).group(1)
        # FIX: this local was named ``str``, shadowing the builtin.
        table_html = re.search('项目楼栋列表.*?ctl17_fLinks_pDataShow', res.text, re.S | re.M).group()
        for info in re.findall('<tr>.*?</tr>', table_html, re.S | re.M):
            if 'href' not in info:
                continue  # header/spacer rows carry no building link
            try:
                build = Building(co_index)
                build.co_name = co_name
                build.bu_num = re.search(
                    '<tr>.*?<td>.*?<a href=.*?>(.*?)<', info, re.S | re.M).group(1)
                build.bu_pre_sale = re.search(
                    'onclick="BinSHouseInfo.*?>(.*?)<', info, re.S | re.M).group(1)
                build.bu_pre_sale_date = re.search(
                    'onclick="BinSHouseInfo.*?<td>(.*?)<', info, re.S | re.M).group(1)
                build.bu_all_house = re.search('color:#ec5f00;">(.*?)<', info,
                                               re.S | re.M).group(1)
                build.bu_id = re.search("DisplayB_ld&hrefID=(.*?)'", info,
                                        re.S | re.M).group(1)
                build.co_id = co_id
                build.insert_db()
            except Exception as e:
                print(
                    '楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
        house_url_list = re.findall(
            "</span>.*?</td><td>.*?<a href='(.*?xs.*?)' target=\"_blank\">.*?查看",
            res.text, re.S | re.M)
        self.get_house_info(house_url_list)
def build_parse(self, co_id):
    """POST the building-table endpoint for one community and store each row.

    Rows missing the presale fields are still inserted (with whatever parsed),
    and every stored building is handed on to house parsing.

    :param co_id: community id, sent as ``itemRecord`` and copied onto rows
    """
    table_res = requests.post(
        "http://www.zyfgj.org/spf/GetBTable.ashx",
        data={"itemRecord": co_id, "houseCode": 0},
        headers=self.headers,
    )
    page = table_res.content.decode()
    for row in re.findall('<tr id.*?</tr>', page):
        building = Building(co_index)
        building.co_id = co_id
        # The building id is the (quoted) second argument of GetData(...).
        raw_id = re.search(r'GetData.*?,(.*?)\)', row).group(1)
        building.bu_id = raw_id.strip("'")
        try:
            building.bu_num = re.search('预售证时间:.*?<td>(.*?)</td', row).group(1)
            building.bu_pre_sale = re.search('预售证号:(.*?)</td', row).group(1)
            building.bu_pre_sale_date = re.search('预售证时间:(.*?)</td', row).group(1)
            building.bu_all_house = re.search(r'预售证号:.*?<td>(\d+)</td', row).group(1)
        except Exception as e:
            # Best-effort: a row without licence info is still worth keeping.
            log.error("{}楼栋无预售号等信息{}".format(row, e))
        building.insert_db()
        self.house_parse(co_id, building.bu_id)
def get_build_info(self, presell_url_list, co_id):
    """Walk each presell page, fetch every linked building page, and store a
    Building per page; successful buildings then get their houses crawled.

    :param presell_url_list: relative presell-page paths under ``self.url``
    :param co_id: community id copied onto each Building
    """
    for presell_url in presell_url_list:
        listing_res = requests.get(self.url + presell_url, headers=self.headers)
        building_links = re.findall('【<a href="(.*?)" target="_self"',
                                    listing_res.text, re.S | re.M)
        for build_url in building_links:
            build_info_url = self.url + build_url
            try:
                con = requests.get(build_info_url, headers=self.headers).text
                bu = Building(co_index)
                bu.co_id = co_id
                bu.bu_id = re.search(r'ID=(\d+)', build_url).group(1)
                bu.bu_num = re.search('栋.*?号.*?BuildingName">(.*?)</span',
                                      con, re.S | re.M).group(1)
                bu.bu_floor = re.search(r'总 层 数.*?(\d+)</span',
                                        con, re.S | re.M).group(1)
                bu.bu_build_size = re.search('建筑面积.*?Jzmj">(.*?)</span',
                                             con, re.S | re.M).group(1)
                bu.bu_live_size = re.search('住宅面积.*?Zzmj">(.*?)</span',
                                            con, re.S | re.M).group(1)
                bu.bu_not_live_size = re.search('非住宅面积.*?Fzzmj">(.*?)</span',
                                                con, re.S | re.M).group(1)
                bu.bu_pre_sale = re.search('预售许可证.*?xkzh">(.*?)</span',
                                           con, re.S | re.M).group(1)
                bu.bu_pre_sale_date = re.search('发证日期.*?fzrq">(.*?)</span',
                                                con, re.S | re.M).group(1)
                bu.bu_type = re.search('项目类型.*?Type">(.*?)</span',
                                       con, re.S | re.M).group(1)
                bu.insert_db()
            except Exception as e:
                print("co_index={},楼栋信息错误".format(co_index), e)
                continue
            # Only reached on success, so ``con`` and ``bu`` are bound.
            house_detail_list = re.findall("getMoreHouseInfo\('(.*?)'\)\"",
                                           con, re.S | re.M)
            self.get_house_info(co_id, bu.bu_id, house_detail_list)
def get_comm_info(self, comm_info):
    """Parse one community listing row, store the Comm, then walk its buildings.

    Buildings come in two flavours distinguished by the link's query string:
    pages with a numeric ``ID`` are current-sale (现售) buildings, anything
    else is a presale (预售) page whose table rows are individual buildings.

    :param comm_info: raw HTML fragment of one community table row
    """
    co = Comm(co_index)
    co.co_name = re.search('_blank">(.*?)</a', comm_info).group(1)
    try:
        co.co_address = re.findall('px">(.*?)</td', comm_info)[1]
    except:
        co.co_address = None
    co.area = re.search('center">(.*?)</td>', comm_info).group(1)
    co_detail_url = re.search("href='(.*?)'", comm_info).group(1)
    co_url = "http://www.qyfgj.cn/newys/" + co_detail_url
    try:
        res = requests.get(co_url, headers=self.headers)
    except Exception as e:
        print("co_index={}小区未请求到".format(co_index), e)
        # FIX: the original fell through and hit NameError on ``res`` below.
        return
    con = res.content.decode('gbk')
    try:
        co.co_develops = re.search('开发商名称.*?px;">(.*?)</a', con, re.S | re.M).group(1)
        co.co_all_house = re.search('总套数.*?">(\d+) ', con, re.S | re.M).group(1)
        co.co_all_size = re.search('总面积.*?">(\d+.\d+) m', con, re.S | re.M).group(1)
    except:
        print("小区无开发商等信息")
    co.insert_db()
    try:
        build = re.findall('<tr bgcolor="white">(.*?)</tr>', con, re.S | re.M)
    except:
        print("小区没有楼栋信息")
        # FIX: without this, ``build`` would be unbound in the loop below.
        return
    build_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
        'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
        'Referer': co_url
    }
    for build_info in build:
        if "进入" in build_info:
            build_url = re.search('href="(.*?)"><font', build_info).group(1)
            build_url = "http://www.qyfgj.cn/newys/" + build_url
            ho_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
                'Referer': build_url
            }
            build_res = requests.get(build_url, headers=build_headers)
            build_con = build_res.content.decode('gbk')
            if re.search('ID=(\d+)', build_url):
                # Current sale (现售): building keyed directly by numeric ID.
                bu = Building(co_index)
                bu_id = re.search('ID=(\d+)', build_url).group(1)
                bu.bu_id = bu_id
                bu.co_name = co.co_name
                bu.insert_db()
                self.get_house_info(headers=ho_headers, bu_id=bu_id, url=build_url)
            else:
                # Presale (预售): one table row per building on the page.
                bu = Building(co_index)
                bu.co_name = co.co_name
                bu.bu_type = re.search('用途.*?">(.*?)</td>', build_con, re.S | re.M).group(1)
                bu.bu_pre_sale = re.search('许可证编号.*?_blank">(.*?)</a>', build_con, re.S | re.M).group(1)
                bu.bu_pre_sale_date = re.search('有效日期.*?">(.*?)</td>', build_con, re.S | re.M).group(1)
                bu.bu_address = re.search('项目座落.*?">(.*?)</td>', build_con, re.S | re.M).group(1)
                ret = re.findall('<tr onmouseover(.*?)</tr', build_con, re.S | re.M)
                for i in ret:
                    house_url = re.search('href="(.*?)"', i).group(1)
                    house_url = "http://www.qyfgj.cn/newys/" + house_url
                    bu.bu_id = re.search('dbh=(.*?)&', i).group(1)
                    bu.bu_num = re.search('<td width="89.*?">(.*?)</', i).group(1)
                    bu.bu_floor = re.search('<td width="84.*?">(\d+)</td', i).group(1)
                    bu.insert_db()
                    ho_res = requests.get(house_url, headers=ho_headers)
                    ho_con = ho_res.content.decode('gbk')
                    new_headers = {
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                        'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
                        'Referer': house_url
                    }
                    self.get_house_info(ho_con=ho_con, headers=new_headers, bu_id=bu.bu_id)
        else:
            print("楼栋无链接地址")
def start_crawler(self):
    """Crawl the paginated presell listing (pages 1-20), then the buildings
    and houses of every community record found on each page."""
    list_url = 'http://zzx.zzfc.com/ajaxpro/xy_ysxk_more,App_Web_mjeeodb-.ashx'
    for page_no in range(1, 21):
        payload = '{{"pageNo":{},"pageSize":30,"rowcount":589}}'.format(page_no)
        try:
            listing = requests.post(list_url, data=payload,
                                    headers=self.headers).content.decode()
        except Exception as e:
            log.error('楼栋请求失败{}'.format(e))
            continue
        for record in re.findall(r'\[\d+,.*?\d+\]', listing):
            try:
                sid = re.search(r'\[(\d+),', record).group(1)
                pid = re.search(r'",(\d+),', record).group(1)
                building_page = requests.get(
                    'http://zzx.zzfc.com/xy_bldg.aspx?pid=' + pid + '&sid=' + sid,
                    headers=self.headers).content.decode()
                building = Building(co_index)
                building.bu_id = sid
                building.bu_address = re.search('楼栋座落.*?">(.*?) ',
                                                building_page, re.S | re.M).group(1)
                building.bu_pre_sale = re.search('预售证号.*?">(.*?) ',
                                                 building_page, re.S | re.M).group(1)
                building.bu_pre_sale_date = re.search('预售日期.*?">(.*?) ',
                                                      building_page, re.S | re.M).group(1)
                building.bu_all_house = re.search('套数.*?">(.*?) ',
                                                  building_page, re.S | re.M).group(1)
                building.insert_db()
            except Exception as e:
                log.error("{}楼栋解析失败{}".format(record, e))
                continue
            # House list comes from an AjaxPro endpoint keyed by sid.
            house_payload = '{{"m_key":"WWW_LPB_001","m_param":"{}"}}'.format(sid)
            ajax_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                'X-AjaxPro-Method': 'GETLPBDS'
            }
            try:
                house_page = requests.post(
                    'http://zzx.zzfc.com/ajaxpro/xy_housetag,App_Web_xg4ulr9n.ashx',
                    data=house_payload, headers=ajax_headers).content.decode()
            except Exception as e:
                log.error("房屋请求失败{}".format(e))
                continue
            for house_row in re.findall(r'\["\d+.*?\d+\]', house_page):
                try:
                    house = House(co_index)
                    house.bu_id = sid
                    fields = house_row.split(",")
                    house.ho_name = fields[4]
                    house.ho_floor = re.search('(\d+)层', house_row).group(1)
                    house.ho_build_size = fields[-3]
                    house.ho_true_size = fields[-2]
                    house.insert_db()
                except Exception as e:
                    log.error("{}房屋解析错误{}".format(house_row, e))
                    continue
def start_crawler(self):
    """Crawl the licence listing, then every community, building, and house.

    The listing table yields index-aligned column lists; each community detail
    page yields a building list; each building page yields house links.
    Failures at any level are printed and the loop continues.
    """
    # NOTE(review): ``url`` is not defined in this method — it presumably comes
    # from module/class scope; confirm before refactoring further.
    response = requests.get(url)
    html = response.text
    tree = etree.HTML(html)
    comm_list = tree.xpath('//tr[@class="Row"]/td[1]/text()')
    co_develops_list = tree.xpath('//tr[@class="Row"]/td[3]/text()')
    co_address_list = tree.xpath('//tr[@class="Row"]/td[8]/text()')
    co_open_time_list = tree.xpath('//tr[@class="Row"]/td[9]/text()')
    co_pre_sale_list = tree.xpath('//tr[@class="Row"]/td[5]/text()')
    co_all_house_list = tree.xpath('//tr[@class="Row"]/td[11]/text()')
    co_build_size_list = tree.xpath('//tr[@class="Row"]/td[10]/text()')
    co_name_list = tree.xpath('//tr[@class="Row"]/td[4]/text()')
    for co in range(0, len(comm_list)):
        try:
            comm = Comm(co_index)
            comm_url = ('http://www.jyfg.cn/HouseWebSetup/PublicReport/'
                        'PreSellLicenceDetailInfo.aspx?PreSellLicenceSN='
                        + comm_list[co])
            result = requests.get(comm_url)
            html_build = result.text
            # FIX: this used to rebind ``tree``, clobbering the listing tree.
            detail_tree = etree.HTML(html_build)
            build_list = detail_tree.xpath('//tr[@class="Row"]/td[1]/text()')
            area = detail_tree.xpath('//*[@id="LabSCFW"]/text()')[0]
            comm.co_id = comm_list[co]
            comm.area = area
            # FIX: ``co_develops`` was assigned twice in the original.
            comm.co_develops = co_develops_list[co]
            comm.co_address = co_address_list[co]
            comm.co_open_time = co_open_time_list[co]
            comm.co_pre_sale = co_pre_sale_list[co]
            comm.co_all_house = co_all_house_list[co]
            comm.co_build_size = co_build_size_list[co]
            comm.co_name = co_name_list[co]
            comm.insert_db()
            for bu in range(0, len(build_list)):
                try:
                    build_url = ('http://www.jyfg.cn/HouseWebSetup/PublicReport/'
                                 'PubRptHouseList.aspx?BuildingSN='
                                 + build_list[bu])
                    res = requests.get(build_url, headers=self.headers)
                    con = res.content.decode('gbk')
                    building = Building(co_index)
                    building.co_id = comm.co_id
                    building.bu_id = build_list[bu]
                    building.bu_num = re.search(
                        '栋号.*?<span.*?">(.*?)</span', con, re.S | re.M).group(1)
                    building.bu_build_size = re.search(
                        '总建筑面积.*?<span.*?">(.*?)</span', con, re.S | re.M).group(1)
                    building.bu_floor = re.search(
                        '层数.*?<span.*?">(.*?)</span', con, re.S | re.M).group(1)
                    building.bu_all_house = re.search(
                        '预售套数.*?<span.*?">(.*?)</span', con, re.S | re.M).group(1)
                    building.bu_pre_sale_date = re.search(
                        '有效期.*?<span.*?">(.*?)</span', con, re.S | re.M).group(1)
                    building.bu_type = re.search(
                        '土地用途.*?<span.*?">(.*?)</span', con, re.S | re.M).group(1)
                    building.bu_pre_sale = re.search(
                        '许可证编号.*?<span.*?">(.*?)</span', con, re.S | re.M).group(1)
                    building.insert_db()
                    house_list = re.findall('房号:<a href="(.*?)"', con)
                    for ho in house_list:
                        try:
                            house = House(co_index)
                            house_url = 'http://www.jyfg.cn/HouseWebSetup/PublicReport/' + ho
                            respon = requests.get(house_url)
                            html = respon.text
                            house.co_id = comm.co_id
                            house.bu_id = building.bu_id
                            house.ho_name = re.search(
                                '房号:.*?<span.*?>(.*?)<', html, re.M | re.S).group(1)
                            house.ho_build_size = re.search(
                                '预测建筑面积:.*?<span.*?>(.*?)<', html, re.M | re.S).group(1)
                            house.ho_true_size = re.search(
                                '预测套内面积:.*?<span.*?>(.*?)<', html, re.M | re.S).group(1)
                            house.ho_share_size = re.search(
                                '预测分摊面积:.*?<span.*?>(.*?)<', html, re.M | re.S).group(1)
                            house.ho_type = re.search(
                                '房屋用途:.*?<span.*?>(.*?)<', html, re.M | re.S).group(1)
                            house.ho_room_type = re.search(
                                '户型结构:.*?<span.*?>(.*?)<', html, re.M | re.S).group(1)
                            house.insert_db()
                        except Exception as e:
                            print("co_index={},房屋{}信息提取失败".format(
                                co_index, house_url))
                            print(e)
                            continue
                except Exception as e:
                    print(e)
                    # FIX: message typo 'co_idnex' corrected to 'co_index'.
                    print('co_index={},楼栋{}提取失败'.format(
                        co_index, build_url))
                    continue
        except Exception as e:
            print('co_index={},小区{}提取失败'.format(co_index, comm_url))
            print(e)
            continue