def get_build_info(self, url, co_id):
    """Scrape one building page and store the building plus its houses.

    The page at ``url`` is fetched, building fields are read by element id,
    and the building is written with ``insert_db()``.  The table following
    ``<tr id="row3">`` is then split into ``<td>`` cells; each cell that
    contains ``<br>`` yields one house record per captured name.

    :param url: building page URL; it must end in ``?<digits>``, which is
        used as the building id.
    :param co_id: community id copied onto the building and every house.

    Failures are printed, not raised.  ``KeyboardInterrupt`` and
    ``SystemExit`` are no longer swallowed (the original caught
    ``BaseException``).
    """
    try:
        building = Building(co_index)
        response = requests.get(url)
        html = response.text
        tree = etree.HTML(html)

        co_name = tree.xpath('//*[@id="PageB_Location"]/text()')[0]  # community name
        print(co_name)
        bu_name = tree.xpath('//*[@id="ItemName"]/text()')[0]  # building name
        bu_num = tree.xpath('//*[@id="PageB_HouseNo"]/text()')[0]  # building number
        bu_all_house = tree.xpath('//*[@id="lb_countbulidtaoshu"]/text()')[0]  # total units
        # NOTE(review): is_none() is defined elsewhere; presumably it maps an
        # empty xpath result to a placeholder value -- confirm.
        bu_floor = self.is_none(tree.xpath('//*[@id="cell3-1"]/text()'))  # floors
        bu_build_size = tree.xpath('//*[@id="lb_countbulidarea"]/text()')[0]  # built area
        bu_live_size = tree.xpath('//*[@id="lb_buildarea"]/text()')[0]  # residential area
        bu_price = self.is_none(tree.xpath('//*[@id="lb_buildavg"]/text()'))  # residential price
        bu_id = re.search(r'\?(\d+)$', url).group(1)  # building id

        building.co_id = co_id
        building.bu_name = bu_name
        building.bu_num = bu_num
        building.bu_all_house = bu_all_house
        building.bu_floor = bu_floor
        building.bu_build_size = bu_build_size
        building.bu_live_size = bu_live_size
        building.bu_price = bu_price
        building.bu_id = bu_id
        building.insert_db()

        house_info_html = re.findall('<tr id="row3">(.*)$', html, re.S | re.M)[0]
        for cell in re.findall('(<td.*?>.*?</td>)', house_info_html, re.S | re.M):
            # Cells without <br> carry no house data.
            if '<br>' not in cell:
                continue
            ho_name_list = re.findall('<td.*?>(.*?)<br>', cell, re.S | re.M)
            ho_true_size_list = re.findall('<td.*?>.*?<br>(.*?)<br>', cell, re.S | re.M)
            ho_type = re.findall('<td.*?>.*?<br>.*?<br>(.*?)<br>', cell, re.S | re.M)[0]
            for idx, raw_name in enumerate(ho_name_list):
                try:
                    # Strip the opening <font ...> tag some names are wrapped in.
                    if 'font' in raw_name:
                        ho_name = re.sub('<font.*?>', '', raw_name)
                    else:
                        ho_name = raw_name
                    house = House(8)
                    house.ho_name = ho_name
                    house.ho_true_size = ho_true_size_list[idx]
                    house.co_id = co_id
                    house.bu_id = bu_id
                    house.ho_type = ho_type
                    house.insert_db()
                except Exception as e:
                    print(e)
    except Exception as e:
        print(e)
def get_house_info(self, ho_con=None, headers=None, bu_id=None, url=None):
    """Follow every house link on a building page and store each house.

    :param ho_con: already-downloaded page markup; when ``None`` the page is
        fetched from ``url`` and decoded as GBK.
    :param headers: HTTP headers passed to every request.
    :param bu_id: building id copied onto each house.
    :param url: page URL, only used when ``ho_con`` is ``None``.

    No exception is handled here: a failed request or a detail page missing
    one of the expected fields aborts the remaining houses.
    """
    if ho_con is None:
        res = requests.get(url, headers=headers)
        con = res.content.decode('gbk')
        html = etree.HTML(con)
    else:
        html = etree.HTML(ho_con)

    ho_url_list = html.xpath("//td[@width='120']/a/@href")
    for ho_url in ho_url_list:
        ho_detail = 'http://www.qyfgj.cn/newys/' + ho_url
        res = requests.get(ho_detail, headers=headers)
        con = res.content.decode('gbk')

        ho = House(co_index)
        ho.bu_id = bu_id
        ho.ho_num = re.search('房屋号.*?">(.*?)</td', con, re.S | re.M).group(1)
        ho.ho_build_size = re.search('建筑面积.*?">(.*?)m', con, re.S | re.M).group(1)
        ho.ho_true_size = re.search('套内面积.*?">(.*?)m', con, re.S | re.M).group(1)
        ho.ho_type = re.search('房屋用途.*?">(.*?)</td', con, re.S | re.M).group(1)
        ho.insert_db()
def house_crawler(self, house_url, bu_num, co_id, bu_id):
    """Fetch a house table page and store one record per floor and house entry.

    :param house_url: path appended to ``self.url``.
    :param bu_num: building number given to the ``House`` constructor.
    :param co_id: community id given to the ``House`` constructor.
    :param bu_id: building id given to the ``House`` constructor.

    A single ``House`` object is reused; its fields are overwritten before
    each ``insert_db()`` call.
    """
    ho = House(co_index, bu_num=bu_num, co_id=co_id, bu_id=bu_id)
    url = self.url + house_url
    con = requests.get(url, headers=self.headers)
    tr = con.text

    ho_name = re.findall('室号:(.*?)户', tr, re.S | re.M)  # room label, e.g. unit 3 room 403
    ho_floor = re.findall(r'(\d+)层', tr)  # floor numbers
    ho_type = re.findall('房屋属性:(.*?)"', tr, re.S | re.M)  # house category
    ho_room_type = re.findall('户型:(.*?)所', tr, re.S | re.M)  # layout
    ho_build_size = re.findall('建筑面积:(.*?)房', tr, re.S | re.M)  # built area

    for floor in ho_floor:
        try:
            ho.ho_floor = floor
            # The original looped over range(1, len(ho_name) + 1) and relied on
            # a bare `except:` to absorb the guaranteed IndexError at the end.
            # The upper bound is fixed here; the set of inserted rows is the same.
            # NOTE(review): index 0 is still skipped, as before -- confirm
            # whether the first match really should be dropped.
            for index in range(1, len(ho_name)):
                ho.ho_name = ho_name[index]
                ho.ho_type = ho_type[index]
                ho.ho_room_type = ho_room_type[index]
                ho.ho_build_size = ho_build_size[index]
                ho.insert_db()
        except Exception as e:
            # Mismatched list lengths or DB errors end this floor only.
            print(e)
            continue
def get_house_detail(self, house_url_list):
    """Download each building page and store the rooms described in it.

    :param house_url_list: iterable of absolute page URLs.

    Room names come from the ``aspx?Room=<id>`` links on the page and are
    joined to the ``<Room><Cell RoomID=...>`` entries by room id.  A room that
    cannot be stored is reported with ``print`` and skipped; a page that lacks
    the building or project name raises.
    """
    for i in house_url_list:
        res = requests.get(i)
        html = res.content.decode('gbk')
        bu_name = re.search('楼号:.*?HouseNum">(.*?)</span>', html, re.S | re.M).group(1)
        co_name = re.search('项目名称.*?PrjName">(.*?)</span>', html, re.S | re.M).group(1)

        # (room id, room label) pairs; a later duplicate id overwrites an earlier one.
        ho_id = re.findall(r"aspx\?Room=(.*?)'.*?<b>(.*?)</b>", html, re.S | re.M)
        ho_id_dict = dict(ho_id)

        house_info = re.findall(
            "<Room><Cell RoomID='(.*?)'.*?BArea='(.*?)'.*?HouseUse='(.*?)'.*?</Room>",
            html, re.S | re.M)
        for room_id, true_size, ho_type in house_info:
            try:
                h = House(self.co_index)
                h.ho_name = ho_id_dict[room_id]
                h.ho_true_size = true_size
                h.ho_type = ho_type
                h.co_name = co_name
                h.bu_num = bu_name
                h.insert_db()
            except Exception as e:
                # Report with the same co_index attribute the record was built with.
                print('房屋错误,co_index={},url={}'.format(self.co_index, i), e)
                continue
def get_house_info(self, build_url_list):
    """Fetch the house table of each building URL and store its houses.

    :param build_url_list: iterable of URLs carrying a ``qrykey=...&``
        query parameter; the key is used as the building id.

    Each ``房号:...`` fragment on the table page becomes one house record.
    A fragment that fails to parse or store is printed and skipped.
    """
    for build_url in build_url_list:
        qrykey = re.search('qrykey=(.*?)&', build_url).group(1)
        house_url = 'http://old.newhouse.cnnbfdc.com/GetHouseTable.aspx?qrykey=' + qrykey
        html = requests.get(house_url, headers=self.headers).text

        for info in re.findall('(房号:.*?")', html, re.S | re.M):
            try:
                house = House(co_index)
                house.ho_name = re.search('房号:(.*?)&', info, re.S | re.M).group(1)
                house.ho_build_size = re.search('建筑面积:(.*?)&', info, re.S | re.M).group(1)
                house.ho_share_size = re.search('分摊面积:(.*?)&', info, re.S | re.M).group(1)
                house.info = info
                house.bu_id = qrykey
                house.insert_db()
            except Exception as e:
                print(
                    'co_index={},房号错误,url ={} '.format(
                        co_index, house_url), e)
def get_house_info(self, house_url_list):
    """Fetch the floor view of each building and store its houses.

    :param house_url_list: iterable of URLs containing ``yszh=...&`` and
        ending in ``dongID=...``.

    Blocks inside the ``HouseFloorView`` div that contain ``层`` or ``单元``
    are skipped; every other block becomes one house record.  Any failure
    for a URL is printed and the next URL is processed.
    """
    for i in house_url_list:
        # Fallback for the error message: the original referenced house_url in
        # the handler even when the failure happened before it was assigned.
        house_url = i
        try:
            dong_ID = re.search('dongID=(.*?)$', i).group(1)
            yszh = re.search('yszh=(.*?)&', i).group(1)
            house_url = 'http://www.gzbjfc.com/Controls/HouseControls/FloorView.aspx?dongID=' + dong_ID + '&qu=%E6%AF%95%E8%8A%82&yszh=' + yszh + '&zhlx=xs&danyuan=all'
            response = requests.get(house_url, headers=self.headers)
            html = response.text
            bu_id = re.findall('dongID=(.*?)&', html, re.S | re.M)[0]
            info_str = re.search('<div class="HouseFloorView".*', html, re.S | re.M).group()
            for k in re.findall('<div class.*?</table></div>', info_str, re.S | re.M):
                house = House(co_index)
                # Floor and unit header blocks are not houses.
                if '层' in k:
                    continue
                if '单元' in k:
                    continue
                print(k)
                house.info = k
                house.ho_name = re.search('span.*?>(.*?)</span>', k, re.S | re.M).group(1)
                # Captures the line that follows the line containing "title".
                house.ho_true_size = re.search('title.*\n(.*?)\n', k).group(1)
                house.bu_id = bu_id
                house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_house_info(self, form_data_list):
    """POST each form payload to the building-table endpoint and store houses.

    :param form_data_list: iterable of form payloads passed as ``data=``.

    The building id is the last ``GetData('..','<id>')`` argument found
    before ``overflow-x:auto;`` in the response.  Each ``<td width='95'>``
    cell becomes one house.  Errors are printed per house or per payload.
    """
    for data in form_data_list:
        house_url = 'http://www.gafdc.cn/newhouse/GetBuildTableByAjax.ashx'
        try:
            response = requests.post(url=house_url, data=data, headers=self.headers)
            html = response.text
            ho_info_html = re.findall("<td width='95'.*?</td>", html, re.S | re.M)
            bu_id_html = re.search("^.*?overflow-x:auto;", html, re.S | re.M).group()
            bu_id = re.findall(r"GetData\('.*?','(.*?)'\)", bu_id_html, re.S | re.M)[-1]
            for cell in ho_info_html:
                try:
                    h = House(co_index)
                    h.bu_id = bu_id
                    h.ho_name = re.search('<td.*?>(.*?)<', cell, re.S | re.M).group(1)
                    h.ho_type = re.search('物业类别:(.*?) ', cell, re.S | re.M).group(1)
                    # Read the area from this cell.  The original searched the
                    # whole page, so every house got the first area on the page.
                    h.ho_build_size = re.search('建筑面积:(.*?) ', cell, re.S | re.M).group(1)
                    h.insert_db()
                except Exception as e:
                    print(
                        '房屋报错,co_index={},url={}'.format(
                            co_index, house_url), e)
        except Exception as e:
            print('房屋报错,co_index={},url={}'.format(co_index, house_url), e)
def get_house_info(self, house_url_list):
    """Fetch each building page and store the houses listed on it.

    :param house_url_list: iterable of paths appended to
        ``http://www.jmfc.com.cn``; each must contain ``lzbm=...&``, which is
        used as the building id.

    Names, inner areas and types are captured as three parallel lists and
    combined by position.  Errors are printed per house or per page.
    """
    for path in house_url_list:
        try:
            build_url = 'http://www.jmfc.com.cn' + path
            html = requests.get(build_url, headers=self.headers).text
            bu_id = re.search('lzbm=(.*?)&', build_url).group(1)
            names = re.findall('width="35%".*?房号:.*?<TD.*?>(.*?)<', html, re.S | re.M)
            true_sizes = re.findall(
                'width="35%".*?房号:.*?<TD.*?<TD.*?<TD.*?>(.*?)<', html, re.S | re.M)
            types = re.findall(
                'width="35%".*?房号:.*?<font.*?<TD.*?<TD.*?>(.*?)<', html, re.S | re.M)
            for pos, name in enumerate(names):
                try:
                    house = House(co_index)
                    house.ho_name = name.strip()
                    # Indexed, not zipped: a shorter list raises IndexError,
                    # which is printed below for that house.
                    house.ho_true_size = true_sizes[pos].strip()
                    house.ho_type = types[pos].strip()
                    house.bu_id = bu_id
                    house.insert_db()
                except Exception as e:
                    print(e)
        except Exception as e:
            print(e)
def ho_info(self, ho_list, co_id, bu_id):
    """Store one house per element, reading its areas from the ``title`` attribute.

    :param ho_list: iterable of elements that support ``xpath`` and expose a
        text node and a ``title`` attribute.
    :param co_id: community id copied onto each house.
    :param bu_id: building id copied onto each house.

    An element that fails to parse or store is printed and skipped.
    """
    for node in ho_list:
        try:
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = node.xpath("./text()")[0]
            title = node.xpath("./@title")[0]
            # Built and inner areas end at a newline; shared area runs to end of line.
            ho.ho_build_size = re.search('建筑面积:(.*?)\n', title).group(1)
            ho.ho_share_size = re.search('分摊面积:(.*)', title).group(1)
            ho.ho_true_size = re.search('套内面积:(.*?)\n', title).group(1)
            ho.insert_db()
        except Exception as e:
            print("房屋信息错误{}".format(e))
def comm(self, id):
    """Load one building and its houses from the JSON API and store them.

    :param id: block/panel number; used in every API URL and stored as
        ``bu_id`` on the building and on each house.

    Three endpoints under ``self.start_url`` are queried (home-page info,
    detail info, house list).  The building is inserted first; then each
    house number in each non-empty group is looked up and inserted.
    Per-house failures are printed and skipped.
    """
    bu = Building(co_index)
    base = self.start_url + "/api/buildInfos/"
    house_url = base + "getHouseInfosByPannelNumber?pannelNumber=" + str(id)
    comm_url = base + "getHomePageBuildingInfo?blockNumber=" + str(id)
    comm_detail_url = base + "getDetailsBuildingInfo?blockNumber=" + str(id)

    comm_res = requests.get(comm_url)
    comm_detail_res = requests.get(comm_detail_url)
    house_res = requests.get(house_url)

    comm_dict = json.loads(comm_res.text)
    comm_detail_dict = json.loads(comm_detail_res.text)
    house_dict = json.loads(house_res.text)

    bu.bu_id = id
    bu.bu_num = comm_dict["data"]["nameBuildings"]
    bu.area = comm_detail_dict['data']['houseingArea']
    bu.bu_address = comm_dict["data"]["houseaddress"]
    bu.bu_pre_sale = comm_detail_dict["data"]["yszh"]
    bu.bu_type = comm_dict["data"]["propertycategory"]
    bu.bo_develops = comm_dict["data"]["companyName"]
    bu.insert_db()

    for group in house_dict["data"]:
        # One House object per group; its fields are overwritten for each room.
        ho = House(co_index)
        rooms = group["data"]
        if len(rooms) == 0:
            continue
        for entry in rooms:
            try:
                room_id = entry["houseNumber"]
                room_url = base + "getHouseInfoByHouseNumber?houseNumber=" + str(room_id)
                res = requests.get(room_url, headers=self.headers)
                room = json.loads(res.text)
                ho.bu_id = id
                fields = room["data"]
                ho.ho_name = fields["houseNo"]
                ho.ho_build_size = fields["buildArea"]
                ho.ho_true_size = fields["jacketArea"]
                ho.ho_share_size = fields["apportionedArea"]
                ho.ho_floor = fields["nominalLevel"]
                ho.insert_db()
            except Exception as e:
                print(e)
def get_house_info(self, zu_house_url, bu_num, co_id):
    """Fetch a building page and hand its postback targets to ``get_house_detail``.

    :param zu_house_url: page URL, fetched through the session ``self.s``.
    :param bu_num: building number stored on the house template.
    :param co_id: community id stored on the house template.

    Both arguments of every ``OnClick=...__doPostBack('<target>','<code>')``
    call are collected.  Any failure is printed.
    """
    try:
        house = House(co_index)
        house.bu_num = bu_num
        house.co_id = co_id
        result = self.s.get(zu_house_url, headers=self.headers).text
        house.info = re.search('ItemName.*?>(.*?)<', result).group(1).strip()
        # Raw strings: "\(" and "\)" are invalid escapes in normal literals.
        ho_code_list = re.findall(r"OnClick=.__doPostBack\(.*?,'(.*?)'\)", result, re.S | re.M)
        ho_msg_list = re.findall(r"OnClick=.__doPostBack\('(.*?)'", result, re.S | re.M)
        self.get_house_detail(zu_house_url, ho_msg_list, ho_code_list, house)
    except Exception as e:
        print(e)
def get_data_obj(self, analyzer, co_index):
    """Return a fresh record object for the given analyzer kind.

    :param analyzer: ``'comm'``, ``'build'`` or ``'house'``.
    :param co_index: value passed to the record constructor.
    :returns: a ``Comm``, ``Building`` or ``House`` instance, or ``None``
        for any other ``analyzer`` value.
    """
    if analyzer == 'comm':
        return Comm(co_index)
    if analyzer == 'build':
        return Building(co_index)
    if analyzer == 'house':
        return House(co_index)
    return None
def get_build_info(self, co_id, co_name):
    """Walk project -> presale -> building pages and store buildings and houses.

    :param co_id: project id (string); used in the listing URL and stored
        on every building.
    :param co_name: project name (string) appended to the listing URL.

    The first row of each listing table is skipped.  Building ids come from
    the module-level counter ``building_id``, which is incremented for every
    building row, including rows whose page later fails to load.  Each house
    link is handed to ``self.get_house_info``.  Failures are printed and the
    affected row is skipped.
    """
    global building_id

    base = 'http://www.czhome.com.cn/'
    listing_url = base + 'Presell.asp?projectID=' + co_id + '&projectname=' + co_name
    response = requests.get(listing_url, headers=self.headers)
    listing_tree = etree.HTML(response.content.decode('gbk'))

    for presale_row in listing_tree.xpath('//tr[@class="indextabletxt"]')[1:]:
        presale_url = base + presale_row.xpath('td[2]/a/@href')[0]
        result = requests.get(presale_url, headers=self.headers)
        # `!=` instead of `is not`: identity comparison against an int
        # literal only works through interpreter small-int caching.
        if result.status_code != 200:
            print("co_index={},预售url:{}连接失败".format(co_index, presale_url))
            continue
        presale_tree = etree.HTML(result.content.decode('gbk'))

        building_rows = presale_tree.xpath(
            '/html/body/table/tr/td/table/tr/td/table/tr')[1:]
        for building_row in building_rows:
            try:
                building = Building(7)
                building_id += 1
                bu_all_house = building_row.xpath('td[7]/text()')[0]  # total units
                building_url = base + building_row.xpath('td[1]/a/@href')[0]
                response = requests.get(building_url, headers=self.headers)
                if response.status_code != 200:
                    print("co_index={},楼栋url:{}连接失败".format(co_index, building_url))
                    continue
                html = response.content.decode('gbk')
                building_tree = etree.HTML(html)

                # The last <u> text in the first column is stored as the floor value.
                bu_floor = building_tree.xpath(
                    '//*[@id="Table4"]/tr[2]/td/table[3]/tr/td[1]/u/text()'
                )[-1]
                house_url_list = building_tree.xpath(
                    '//*[@id="Table4"]/tr[2]/td/table[3]/tr/td/a/@href')
                bu_address = re.search(
                    '<center><font color=.*? (.*?)<', html, re.S | re.M).group(1)

                building.bu_all_house = bu_all_house
                building.bu_address = bu_address
                building.bu_floor = bu_floor
                building.bu_id = building_id
                building.co_id = co_id
                building.insert_db()

                for house_href in house_url_list:
                    try:
                        house = House(7)
                        house_url = base + house_href
                        self.get_house_info(house_url, house, co_id, building_id, building)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print(e)
def get_build_detail(self, building_url, building, co_id):
    """Fill ``building`` from its detail page and store every house linked there.

    :param building_url: detail page URL; the value of its first query
        parameter is used as the building id.
    :param building: record object whose attributes are populated.
    :param co_id: community id stored on the building and passed on for houses.
    :returns: the populated ``building``; ``None`` (implicitly) when any step
        raised, in which case the error is printed.

    This method does not call ``insert_db()`` on the building itself.
    """
    try:
        res = requests.get(url=building_url, headers=self.headers)
        html = res.content.decode('gb2312', 'ignore')
        # Strip layout whitespace so the patterns below match tag-to-tag.
        # NOTE(review): ASCII spaces are removed here, so any pattern below
        # that contains a plain ASCII space could never match -- confirm the
        # whitespace characters in those patterns against the live page.
        for ch in ('\n', '\r', '\t', ' '):
            html = html.replace(ch, '')

        def grab(pattern):
            # First capture group of `pattern` in the stripped page.
            return re.search(pattern, html).group(1)

        bu_id = building_url.split('=')[1].split('&')[0]  # building id
        bu_name = grab(
            r'项目名称:</td><tdwidth="1"rowspan="6"background="images/trbg3.gif"></td><tdwidth="200"align="left"class="padingleft3px">(.*)?</td><tdwidth="1"rowspan="6"align="right"bgcolor="#CECFCE"></td><tdwidth="2"rowspan="6"align="right"bgcolor="#FFFFFF">')  # building name
        bu_num = grab(
            r'号:</td><tdwidth="1"rowspan="6"background="images/trbg3.gif"></td><tdalign="left"class="padingleft3px">(.*)?</td></tr><tr><tdheight="25"align="right">总 套 数')  # building number
        bu_all_house = grab(
            r'总 套 数:</td><tdalign="left"class="padingleft3px">(.*?)</td><tdalign="right">可售套数')  # total units
        bu_floor = grab(
            r'总层数:</td><tdalign="left"class="padingleft3px">(.*)?</td><tdalign="right">项目类型')  # total floors
        bu_build_size = grab(
            r'建筑面积:</td><tdalign="left"class="padingleft3px"><FONTcolor=#ff0000>(.*)?M²</FONT></td><tdalign="right">住宅面积')  # built area
        bu_live_size = grab(
            r'住宅面积:</td><tdalign="left"class="padingleft3px">(.*)?M²</td></tr><tr><tdheight="25"align="right">幢套内建筑面积')  # residential area
        bu_not_live_size = grab(
            r'非住宅面积:</td><tdalign="left"class="padingleft3px">(.*)?M²</td></tr><tr><tdheight="25"align="right">预')  # non-residential area
        bu_price = grab(
            r'拟销住宅价格:</td><tdbackground="images/trbg3.gif"></td><tdalign="left"class="padingleft3px">(.*)?</td><tdalign="right"bgcolor="#CECFCE"></td><tdalign="right"bgcolor="#FFFFFF"></td><tdalign="right"bgcolor="#CECFCE"></td><tdalign="right">拟销商业门面价格').split('元')[0]  # residential price
        bu_type = re.search('项目类型:</td>.*?ft3px">(.*?)</td>', html, re.S | re.M).group(1)

        building.co_id = co_id
        building.bu_id = bu_id
        building.bu_name = bu_name
        building.bu_num = bu_num
        building.bu_all_house = bu_all_house
        building.bu_floor = bu_floor
        building.bu_build_size = bu_build_size
        building.bu_live_size = bu_live_size
        building.bu_not_live_size = bu_not_live_size
        building.bu_price = bu_price
        building.bu_type = bu_type

        # House links are the arguments of the window.open('...') calls.
        for link in re.findall(r"window.open\('(.+?)'\)", html):
            house_url = 'http://www.bsfcj.com/PubInfo/' + link
            house = House(1)
            house_obj = self.get_house_detail(house_url, house, co_id, bu_id)
            house_obj.insert_db()
        return building
    except Exception as e:
        print(
            '楼栋解析或者请求的过程中出现错误,co_index={},url={}'.format(
                self.co_index, building_url), e)
def ho_parse(self, co_id, bu_id, res):
    """Store one house for every ``<td title=...>`` cell in the response.

    :param co_id: community id copied onto each house.
    :param bu_id: building id copied onto each house.
    :param res: response object whose ``.text`` holds the page markup.

    The house is inserted even when the area fields cannot be read from the
    ``title`` attribute; that case is only logged.
    """
    page = etree.HTML(res.text)
    for cell in page.xpath("//td[@title]"):
        title = cell.xpath("./@title")[0]
        name = cell.xpath(".//span/text()")[0]

        ho = House(co_index)
        ho.co_id = co_id
        ho.bu_id = bu_id
        ho.ho_name = name
        try:
            ho.ho_build_size = re.search('建筑面积:(.*?)\r', title).group(1)
            ho.ho_true_size = re.search('套内面积:(.*?)\r', title).group(1)
            ho.ho_share_size = re.search('分摊面积:(.*)', title).group(1)
        except Exception:
            log.info("房间无面积")
        ho.insert_db()
def house_info(self, co_id, bu_id, bu_url):
    """Fetch a building page and store a house for every ``<a wf=...>`` link.

    :param co_id: community id copied onto each house.
    :param bu_id: building id copied onto each house.
    :param bu_url: path appended to ``http://www.bdfdc.net``.

    Sleeps five seconds after the request.  No exception is handled: a link
    missing the area or usage text aborts the remaining houses.
    """
    res = requests.get('http://www.bdfdc.net' + bu_url, headers=self.headers)
    time.sleep(5)
    page = etree.HTML(res.text)
    for link in page.xpath("//a[@wf]"):
        ho = House(co_index)
        detail = link.xpath("./@wf")[0]
        ho.ho_name = link.xpath("./text()")[0]
        ho.bu_id = bu_id
        ho.co_id = co_id
        ho.ho_build_size = re.search('建筑面积:(.*?)m', detail).group(1)
        ho.ho_type = re.search('用途:(.*?)<br', detail).group(1)
        ho.insert_db()
def get_house_info(self, con, co_id, build_id):
    """Parse the embedded house table out of a page and store its houses.

    :param con: page markup; must contain ``houseTableData`` followed by
        ``特别申明``, otherwise ``AttributeError`` is raised.
    :param co_id: community id copied onto each house.
    :param build_id: building id copied onto each house.

    Each ``<div style...</div>`` fragment in that region is one house.  A
    fragment that fails is reported with a generic message and skipped.
    """
    table_region = re.search('houseTableData.*?特别申明', con, re.S | re.M).group()
    flags = re.S | re.M
    for fragment in re.findall('<div style.*?</div>', table_region, flags):
        try:
            ho = House(co_index)
            ho.ho_name = re.search("'HC_HOUSENUMB':'(.*?)',", fragment, flags).group(1)
            ho.ho_room_type = re.search("'HC_HOUSETYPE':'(.*?)',", fragment, flags).group(1)
            ho.ho_build_size = re.search("'HC_STCTAREA':'(.*?)',", fragment, flags).group(1)
            ho.bu_id = build_id
            ho.co_id = co_id
            ho.insert_db()
        except Exception:
            print('house error, co_index={}'.format(co_index))
def get_house_info(self, house_url, ho_name, bu_id, co_id):
    """Store one house, enriching it from its detail page when a link exists.

    :param house_url: path appended to ``http://www.bjjs.gov.cn``; when the
        resulting URL contains ``#`` no detail page is fetched.
    :param ho_name: house name stored on the record.
    :param bu_id: building id stored on the record.
    :param co_id: community id stored on the record.
    """
    record = House(co_index)
    detail_url = 'http://www.bjjs.gov.cn' + house_url
    has_detail_page = '#' not in detail_url
    if has_detail_page:
        # get_house_detail returns the record that is then stored.
        record = self.get_house_detail(detail_url, record)
    record.ho_name = ho_name
    record.bu_id = bu_id
    record.co_id = co_id
    record.insert_db()
def house_parse(self, co_id, bu_id, bu_con):
    """Store one house per ``<a style...)>name</a`` link found in the markup.

    :param co_id: community id copied onto each house.
    :param bu_id: building id copied onto each house.
    :param bu_con: building page markup (string).

    No DOTALL flag is used, so a link must sit on a single line to match.
    """
    # Raw string: "\)" is an invalid escape in a normal string literal.
    name_list = re.findall(r'<a style.*?\)>(.*?)</a', bu_con)
    for name in name_list:
        ho = House(co_index)
        ho.co_id = co_id
        ho.bu_id = bu_id
        ho.ho_name = name
        ho.insert_db()
def get_house_info(self, bu_id, co_id):
    """Fetch the room list of a building and store every room.

    :param bu_id: building id (string); sent as ``lzbh`` and stored on houses.
    :param co_id: community id (string); sent as ``buildingid`` and stored
        on houses.

    The request goes through ``self.request_proxy`` and the body is decoded
    as GBK.  Each ``<span>`` between the two list markers is one room; its
    ``title`` attribute is split at the first comma into layout and area.
    A span that fails is printed and skipped.
    """
    house_url = 'http://b.fang99.com/buildinglistselect.aspx?buildingid=' + co_id + '&xmbh=&lzbh=' + bu_id
    response = self.request_proxy(house_url, headers=self.headers)
    html = response.content.decode('gbk')
    flags = re.S | re.M
    house_html = re.search('rpt_ewlpblc_fjlistdiv_0.*?erp_con_2', html, flags).group()
    for span in re.findall('<span.*?</span>', house_html, flags):
        try:
            house = House(co_index)
            house.ho_room_type = re.search('title="(.*?),', span, flags).group(1)
            house.ho_build_size = re.search('title=".*?,(.*?)"', span, flags).group(1)
            # The name is the link text when the span wraps a link.
            name_pattern = '<a.*?>(.*?)<' if '<a' in span else '<span.*?>(.*?)<'
            house.ho_name = re.search(name_pattern, span, flags).group(1)
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_house_info(self, build_num, sid):
    """Queue a regex-driven extraction of houses for one building.

    :param build_num: building id (string) placed in the URL.
    :param sid: ``sid`` query value (string).

    Here the ``House`` attributes hold regex rules, not data: the object's
    ``to_dict()`` is given to ``ProducerListUrl`` as
    ``analyzer_rules_dict`` and ``get_details()`` is then called on it.
    Any failure is printed.
    """
    try:
        house_url = 'http://www.tmsf.com/newhouse/NewPropertyHz_showbox.jspx?buildingid=' + build_num + '&sid=' + sid
        house = House(co_index)
        # NOTE(review): `co_build_size` is set on a House; every sibling uses
        # `ho_build_size` -- possibly a typo, left unchanged here.
        rules = {
            'bu_id': 'buildingid":(.*?),',
            'co_build_size': 'builtuparea":(.*?),',
            'ho_price': 'declarationofroughprice":(.*?),',
            'ho_name': 'houseno":(.*?),',
            'ho_true_size': 'setinsidefloorarea":(.*?),',
            'ho_share_size': 'poolconstructionarea":(.*?),',
            'ho_type': 'houseusage":(.*?),',
        }
        for field, rule in rules.items():
            setattr(house, field, rule)
        producer = ProducerListUrl(page_url=house_url,
                                   request_type='get', encode='utf-8',
                                   analyzer_rules_dict=house.to_dict(),
                                   analyzer_type='regex',
                                   headers=self.headers)
        producer.get_details()
    except Exception as e:
        print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_house_info(self, bu_id):
    """Request the building table for ``bu_id`` and store each house in it.

    :param bu_id: building id (string); sent as the first ``<item>`` of the
        XML payload and stored on each house.

    Every ``title='...'`` attribute in the response is one house.  No
    exception is handled: a title missing any expected field aborts the
    remaining houses.
    """
    house_url = 'http://www.ytfcjy.com/Common/Agents/ExeFunCommon.aspx'
    payload = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\"?>\r\n<param funname=\"SouthDigital.Wsba.CBuildTableEx.GetBuildHTMLEx\">\r\n<item>" + \
              bu_id + "</item>\r\n<item>1</item>\r\n<item>1</item>\r\n<item>80</item>\r\n<item>720</item>\r\n<item>g_oBuildTable</item>\r\n<item> 1=1</item>\r\n</param>\r\n"
    headers = {'Content-Type': "text/xml"}
    response = requests.request("POST", house_url, data=payload, headers=headers)
    flags = re.S | re.M
    for title in re.findall("title='(.*?)'", response.text, flags):
        house = House(co_index)
        house.ho_name = re.search('房号:(.*?)单元', title, flags).group(1)
        house.ho_build_size = re.search('总面积:(.*?) 平方米', title, flags).group(1)
        house.ho_type = re.search('用途:(.*?)户', title, flags).group(1)
        house.ho_price = re.search('价格:(.*?) 元', title, flags).group(1)
        house.bu_id = bu_id
        house.info = title
        house.insert_db()
def get_house_info(self, house_url_list, bu_id):
    """Fetch every house detail page and store the house.

    :param house_url_list: iterable of paths appended to
        ``http://www.lpsfdc.cn/Templets/LPS/aspx/``.
    :param bu_id: building id copied onto each house.

    Fields are read from the text after the ``ROOM_*`` element ids.  A page
    that fails is printed and skipped.
    """
    def first(pattern, text):
        # First match of `pattern`; IndexError when there is none.
        return re.findall(pattern, text, re.S | re.M)[0]

    for path in house_url_list:
        try:
            house = House(co_index)
            house_url = 'http://www.lpsfdc.cn/Templets/LPS/aspx/' + path
            html = requests.get(house_url).text
            ho_name = first('ROOM_ROOMNO">(.*?)<', html)
            ho_type = first('ROOM_FWLX">(.*?)<', html)
            ho_build_size = first('ROOM_YCJZMJ">(.*?)<', html)
            ho_true_size = first('ROOM_YCTNMJ">(.*?)<', html)
            ho_share_size = first('ROOM_YCFTMJ">(.*?)<', html)
            house.ho_name = ho_name
            house.ho_type = ho_type
            house.ho_build_size = ho_build_size
            house.ho_true_size = ho_true_size
            house.ho_share_size = ho_share_size
            house.bu_id = bu_id
            house.insert_db()
        except Exception as e:
            print(e)
def get_house_info(self, house_url, co_id, bu_id):
    """Fetch a building page and store each house table found on it.

    :param house_url: path appended to ``http://58.51.240.121:8503/``.
    :param co_id: community id copied onto each house.
    :param bu_id: building id copied onto each house.

    The page is split on ``getMoreHouseInfo ... </table>`` and the first
    fragment is discarded.  The first failure is printed and ends processing
    of the whole page.
    """
    house_url_ = 'http://58.51.240.121:8503/' + house_url
    flags = re.S | re.M
    try:
        html = requests.get(house_url_, headers=self.headers).text
        fragments = re.findall('getMoreHouseInfo.*?</table>', html, flags)
        for fragment in fragments[1:]:
            house = House(co_index)
            house.co_id = co_id
            house.bu_id = bu_id
            house.ho_name = re.search('>(.*?)<', fragment, flags).group(1)
            house.ho_type = re.search('性质 (.*?)<', fragment, flags).group(1)
            house.ho_build_size = re.search('面积 (.*?)<', fragment, flags).group(1)
            house.co_build_structural = re.search('结构 (.*?)<', fragment, flags).group(1)
            house.insert_db()
    except Exception as e:
        print('请求错误,co_index={},url={}'.format(co_index, house_url_), e)
def get_house_info(self, house_url_list, bu_id):
    """Fetch each house page through the session and store the house.

    :param house_url_list: iterable of paths appended to
        ``http://183.63.60.194:8808/public/web/``.
    :param bu_id: building id copied onto each house.

    Sleeps one second before every request.  A non-200 response is reported
    and skipped.  Parsing is not guarded: a page missing an expected field
    raises and aborts the remaining houses.
    """
    for i in house_url_list:
        house = House(co_index)
        house_url = 'http://183.63.60.194:8808/public/web/' + i
        time.sleep(1)
        response = self.s.get(house_url, headers=self.headers)
        # `!=` instead of `is not`: identity comparison against an int
        # literal only works through interpreter small-int caching.
        if response.status_code != 200:
            print('房号错误,co_index={},url={}'.format(co_index, house_url))
            continue
        html = response.text
        house.bu_id = bu_id
        house.ho_name = re.search('HouseNO.*?>(.*?)<', html, re.S | re.M).group(1)
        house.ho_true_size = re.search('HouseArea.*?>(.*?)<', html, re.S | re.M).group(1)
        house.ho_build_size = re.search('SumBuildArea1.*?>(.*?)<', html, re.S | re.M).group(1)
        house.ho_type = re.search('HouseUse.*?>(.*?)<', html, re.S | re.M).group(1)
        house.orientation = re.search('CHX.*?>(.*?)<', html, re.S | re.M).group(1)
        house.insert_db()
def get_house_info(self, code, co_name):
    """Request a building table and store each house described in it.

    :param code: indexable pair; ``code[0]`` is sent as the first ``<item>``
        of the XML payload and ``code[1]`` is stored as the building number.
    :param co_name: community name stored on each house.

    Every ``title='...'`` attribute in the response is one house, with
    fields terminated by ``\\r\\n``.  A title that fails is printed and skipped.
    """
    house_url = 'http://house.bffdc.gov.cn/Common/Agents/ExeFunCommon.aspx?'
    payload = "<?xml version=\"1.0\" encoding=\"utf-8\" standalone=\"yes\"?>\r\n<param funname=\"SouthDigital.Wsba.CBuildTableEx.GetBuildHTMLEx\">\r\n<item>" + \
              code[0] + "</item>\r\n<item>1</item>\r\n<item>1</item>\r\n<item>55</item>\r\n<item>840</item>\r\n<item>g_oBuildTable</item>\r\n<item>false</item>\r\n<item> 1=1</item>\r\n</param>\r\n"
    headers = {'Content-Type': "text/xml"}
    html = requests.post(url=house_url, data=payload, headers=headers).text
    for title in re.findall("title='(.*?)'", html, re.S | re.M):
        try:
            house = House(co_index)
            house.bu_num = code[1]
            house.ho_name = re.search('房号:(.*?)\r\n', title).group(1)
            house.ho_type = re.search('用途:(.*?)\r\n', title).group(1)
            house.ho_room_type = re.search('户型:(.*?)\r\n', title).group(1)
            house.ho_build_size = re.search('总面积:(.*?)\r\n', title).group(1)
            house.co_name = co_name
            house.insert_db()
        except Exception as e:
            print(e)
def ho_info(self, house_url, co_id, bu_id):
    """Fetch a building page and store a house for each ``<td unitname=...>`` cell.

    :param house_url: absolute page URL.
    :param co_id: community id copied onto each house.
    :param bu_id: building id copied onto each house.

    A cell that fails is logged and skipped.
    """
    res = requests.get(house_url, headers=self.headers)
    html = etree.HTML(res.text)
    ho_info_list = html.xpath("//tbody//td[@unitname]")
    for ho_info in ho_info_list:
        try:
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = ho_info.xpath("./text()")[0]
            ho.insert_db()
        except Exception as e:
            # The original passed `e` as an extra positional argument with no
            # placeholder in the message, so the exception text was not
            # rendered.  It is now formatted into the message itself.
            log.error("小区房屋信息提取失败{}".format(e))
def get_house_detail(self, house_url_list, bu_id, co_id):
    """Fetch each house detail page and store the house.

    :param house_url_list: iterable of house ids (strings) appended to the
        detail URL.
    :param bu_id: building id copied onto each house.
    :param co_id: community id copied onto each house.

    Name, layout and built area are read from ``Label1``..``Label3``.  A
    house that fails is printed and skipped.
    """
    flags = re.S | re.M
    for house_id in house_url_list:
        try:
            house = House(co_index)
            house_detail_url = 'http://222.184.103.50:7700/WW/housedetail.aspx?houseID=' + house_id
            html = requests.get(house_detail_url, headers=self.headers).text
            house.ho_name = re.search('id="Label1">(.*?)<', html, flags).group(1)
            house.ho_room_type = re.search('id="Label2">(.*?)<', html, flags).group(1)
            house.ho_build_size = re.search('id="Label3">(.*?)<', html, flags).group(1)
            house.co_id = co_id
            house.bu_id = bu_id
            house.insert_db()
        except Exception as e:
            print(
                '请求错误,co_index={},url={}'.format(co_index, house_detail_url), e)
def get_house_detail(self, house_url_list):
    """Fetch each room-table page and store every room cell on it.

    :param house_url_list: iterable of absolute page URLs (printed on entry).

    The building id is taken from the ``roomTable.aspx?id=...&`` link in the
    page.  Each ``<td class=... title ...>`` cell inside the room table is
    one house.  The first failure on a page is printed with that page's URL
    and ends processing of that page.
    """
    print(house_url_list)
    for page_url in house_url_list:
        try:
            response = requests.get(page_url, headers=self.headers)
            html = response.text
            house_html = re.search('id=.roomTable.*?id="remarkDiv"', html, re.S | re.M).group()
            house_info_list = re.findall('<td class=.*?title.*?</td>', house_html, re.S | re.M)
            bu_id = re.search(r'roomTable.aspx\?id=(.*?)&', html, re.S | re.M).group(1)
            # Separate loop variable: the original reused `i`, so the error
            # message below printed an HTML cell in place of the URL.
            for cell in house_info_list:
                house = House(co_index)
                house.bu_id = bu_id
                house.ho_build_size = re.search('建筑面积:(.*?) ', cell, re.S | re.M).group(1)
                house.info = re.search("(建筑面积:.*?)'>", cell, re.S | re.M).group(1)
                house.ho_name = re.search("<td.*?>(.*?)</td>", cell, re.S | re.M).group(1)
                # When the cell content contains "id" it is treated as a link
                # and the name is taken from the <a> text.
                if 'id' in house.ho_name:
                    house.ho_name = re.search('<a.*?>(.*?)</a>', house.ho_name, re.S | re.M).group(1)
                house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, page_url), e)
    print('房号放入完成')
def get_house_info(self, house_url, bu_id):
    """Fetch a building page and store one house per clickable table row.

    :param house_url: path appended to ``http://thfdc.net/``.
    :param bu_id: building id copied onto each house.

    Fields are taken by column position inside each ``<tr onClick=...>`` row
    (2nd, 3rd, 5th and 7th ``<td>``).  A row that fails is printed and skipped.
    """
    url = 'http://thfdc.net/' + house_url
    response = requests.get(url, headers=self.headers)
    html = response.text
    house_info_list = re.findall('<tr onClick=.*?</tr>', html, re.S | re.M)
    for row in house_info_list:
        try:
            house = House(co_index)
            house.ho_name = re.search('<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
            house.area = re.search('<td.*?<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
            house.ho_build_size = re.search(
                '<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
            house.ho_type = re.search(
                '<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
            house.bu_id = bu_id
            house.insert_db()
        except Exception as e:
            # The original called .format() with no arguments, which raised
            # IndexError inside this handler and hid the real error.
            print('房号错误,co_index={},url={}'.format(co_index, url), e)