def get_house_detail(self, house_url_list, bu_id, co_id):
    """Scrape one lhfdc.gov.cn room page per code and save it as a House.

    Each entry of ``house_url_list`` is appended to the ``RoomInfo.aspx``
    URL. The fields are read from the returned HTML by element id, the
    record is tagged with ``bu_id`` and ``co_id`` and ``insert_db()`` is
    called. Any exception for one room is printed and the loop continues.
    """
    base_url = 'http://www.lhfdc.gov.cn/templets/lh/aspx/hpms/RoomInfo.aspx?code='
    flags = re.S | re.M
    # (House attribute, regex) pairs, applied in this order.
    rules = (
        ('ho_name', 'id="ROOM_ROOMNO">(.*?)<'),
        ('ho_room_type', 'id="ROOM_FWHX">(.*?)<'),
        ('ho_type', 'id="ROOM_GHYT">(.*?)<'),
        ('ho_build_size', 'id="ROOM_YCJZMJ">(.*?)<'),
        ('ho_true_size', 'id="ROOM_YCTNMJ">(.*?)<'),
        ('ho_share_size', 'id="ROOM_YCFTMJ">(.*?)<'),
    )
    for code in house_url_list:
        try:
            house = House(co_index)
            house_detail_url = base_url + code
            html = requests.get(house_detail_url, headers=self.headers).text
            for attr, pattern in rules:
                setattr(house, attr, re.search(pattern, html, flags).group(1))
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print(
                '房号错误,co_index={},url={}'.format(co_index, house_detail_url),
                e)
def get_house_detail(self, house_detail_url_list, co_id, bu_id):
    """Scrape yzfdc.cn house pages through the session ``self.s``.

    For every relative path in ``house_detail_url_list`` the method waits
    three seconds, downloads the page, extracts the labelled fields by
    regex, tags the House with ``bu_id``/``co_id`` and calls
    ``insert_db()``. A failure on one page is printed and skipped.
    """
    flags = re.S | re.M
    # (House attribute, regex) pairs, applied in this order.
    rules = (
        ('co_name', 'lblxmmc.*?>(.*?)<'),
        ('bu_num', 'lbldh.*?>(.*?)<'),
        ('ho_name', 'lblfh.*?>(.*?)<'),
        ('ho_build_size', 'lbljzmj.*?>(.*?)<'),
        ('ho_true_size', 'lbltnmj.*?>(.*?)<'),
        ('ho_share_size', 'lblftmj.*?>(.*?)<'),
        ('ho_type', 'lblfwxz.*?>(.*?)<'),
        ('ho_room_type', 'lblhuxin.*?>(.*?)<'),
    )
    for path in house_detail_url_list:
        detail_url = 'http://www.yzfdc.cn/' + path
        try:
            house = House(co_index)
            time.sleep(3)  # fixed pause before every request
            html = self.s.get(detail_url, headers=self.headers).text
            for attr, pattern in rules:
                setattr(house, attr, re.search(pattern, html, flags).group(1))
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, detail_url), e)
def house_info(self, co_id, bu_id, house_url_list):
    """Scrape njhouse.com.cn house pages via ``Proxy_contact``.

    Each relative path in ``house_url_list`` is fetched through a
    ``Proxy_contact`` request, decoded as GBK and parsed by regex. When
    fetching or parsing fails the error is logged and the house is
    skipped; otherwise ``insert_db()`` is called outside the guarded
    section, so storage errors propagate to the caller.
    """
    flags = re.S | re.M
    # (House attribute, regex) pairs, applied in this order.
    rules = (
        ('ho_name', '房号.*?;">(.*?)</td'),
        ('ho_price', '价格.*?<td>(.*?)元'),
        ('ho_floor', '楼层.*?;">(.*?)</td'),
        ('ho_build_size', '建筑面积.*?<td>(.*?)m'),
        ('ho_true_size', '套内面积.*?<td>(.*?)m'),
        ('ho_share_size', '分摊面积.*?<td>(.*?)m'),
        ('ho_type', '房屋类型.*?<td>(.*?)</td'),
    )
    for path in house_url_list:
        house_url = "http://www.njhouse.com.cn/2016/spf/" + path
        try:
            contact = Proxy_contact(app_name="nanjing", method='get',
                                    url=house_url, headers=self.headers)
            page = contact.contact().decode('gbk')
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            for attr, pattern in rules:
                setattr(ho, attr, re.search(pattern, page, flags).group(1))
        except Exception as e:
            log.error("房屋详情页错误{}".format(e))
        else:
            ho.insert_db()
def get_house_detail(self, house_detail_url, co_id, bu_id):
    """Fetch one yfci.gov.cn house page and store it as a House.

    Args:
        house_detail_url: Relative path appended to the ``HousePresell/``
            base URL.
        co_id: Value stored on the House as ``co_id``.
        bu_id: Value stored on the House as ``bu_id``.

    Returns ``None``. Pages containing the "找不到记录" marker are skipped
    silently. Any other failure is printed together with the URL.
    """
    # Built before the ``try`` so the error message can always include it.
    house_detail_url_ = 'http://www.yfci.gov.cn:8080/HousePresell/' + house_detail_url
    flags = re.S | re.M
    try:
        house = House(co_index)
        html = requests.get(house_detail_url_, headers=self.headers).text
        if '找不到记录' in html:
            return
        house.ho_name = re.search('id="HouseNO".*?>(.*?)<', html, flags).group(1)
        house.ho_true_size = re.search('id="HouseArea".*?>(.*?)<', html, flags).group(1)
        house.ho_build_size = re.search('id="SumBuildArea1".*?>(.*?)<', html, flags).group(1)
        house.ho_type = re.search('id="HouseUse".*?>(.*?)<', html, flags).group(1)
        # The "CHX" element feeds ``orientation`` only. The original also
        # assigned it to ``ho_type``, clobbering the "HouseUse" value above.
        house.orientation = re.search('id="CHX".*?>(.*?)<', html, flags).group(1)
        house.co_id = co_id
        house.bu_id = bu_id
        house.insert_db()
    except Exception as e:
        print(
            '房号错误,co_index={},url={}'.format(co_index, house_detail_url_),
            e)
def get_house_info(self, house_url_list):
    """Scrape jmfc.com.cn building pages and store every listed house.

    The building id is taken from the ``lzbm`` query parameter of the
    page URL. Three parallel regex scans produce the room names, sizes
    and types, which are combined by position. Errors are printed per
    house (inner handler) and per page (outer handler).
    """
    flags = re.S | re.M
    for path in house_url_list:
        try:
            build_url = 'http://www.jmfc.com.cn' + path
            html = requests.get(build_url, headers=self.headers).text
            bu_id = re.search('lzbm=(.*?)&', build_url).group(1)
            names = re.findall('width="35%".*?房号:.*?<TD.*?>(.*?)<', html, flags)
            true_sizes = re.findall(
                'width="35%".*?房号:.*?<TD.*?<TD.*?<TD.*?>(.*?)<', html, flags)
            types = re.findall(
                'width="35%".*?房号:.*?<font.*?<TD.*?<TD.*?>(.*?)<', html, flags)
            for idx, raw_name in enumerate(names):
                try:
                    house = House(co_index)
                    house.ho_name = raw_name.strip()
                    # Indexing raises (and is reported) when a list is shorter.
                    house.ho_true_size = true_sizes[idx].strip()
                    house.ho_type = types[idx].strip()
                    house.bu_id = bu_id
                    house.insert_db()
                except Exception as e:
                    print(e)
        except Exception as e:
            print(e)
def house_parse(self, bu_id, co_id, sid, propertyid):
    """Walk the paginated price search of tmsf.qzfdcgl.com for a building.

    Args:
        bu_id: Building id, posted as ``buildingid`` and stored on houses.
        co_id: Value stored on every House as ``co_id``.
        sid: Posted as ``sid``.
        propertyid: Posted as ``propertyid``.

    The first response provides the page count and is reused as page 1.
    When no page count is found, only that first page is parsed. Area and
    price cells carry ``span`` class names that are decoded by
    ``self.number_replace``.
    """
    search_url = 'http://tmsf.qzfdcgl.com/newhouse/property_pricesearch.htm'
    data = {
        'propertyid': propertyid,
        'sid': sid,
        'buildingid': bu_id,
        'tid': 'price',
        'page': 1,
    }
    first_res = requests.post(search_url, data=data, headers=self.headers)
    page_match = re.search(r'页数.*?/(\d+)', first_res.text)
    page_count = int(page_match.group(1)) if page_match else 1
    for page in range(1, page_count + 1):
        if page == 1:
            ho_res = first_res  # already downloaded above
        else:
            data['page'] = page
            ho_res = requests.post(search_url, data=data, headers=self.headers)
        ho_html = etree.HTML(ho_res.text)
        for row in ho_html.xpath("//tr[@onmouseout]"):
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = row.xpath("./td[3]/a/div/text()")[0]
            ho.unit = row.xpath("./td[2]/a/div/text()")[0]
            ho.ho_build_size = self.number_replace(
                row.xpath("./td[4]/a/div/span/@class"))
            ho.ho_true_size = self.number_replace(
                row.xpath("./td[5]/a/div/span/@class"))
            ho.ho_price = self.number_replace(
                row.xpath("./td[9]/a/div/span/@class"))
            ho.insert_db()
def house_info(self, ho_url, co_id, bu_id):
    """Scrape every house linked from a building page on 222.77.178.63:7002.

    Args:
        ho_url: Relative path of the building page.
        co_id: Value stored on every House as ``co_id``.
        bu_id: Value stored on every House as ``bu_id``.

    Both the listing and the detail pages are decoded as GBK. A failure
    on one detail page is printed and the remaining houses are processed.
    """
    base = "http://222.77.178.63:7002/"
    flags = re.S | re.M
    url = base + ho_url
    # NOTE(review): the original called ``url.rstrip('=')`` and threw the
    # result away, so the URL was always requested unstripped. The dead
    # call is removed and the effective behaviour kept; confirm whether
    # trailing '=' characters were meant to be dropped.
    res = requests.get(url, headers=self.headers)
    res.encoding = 'gbk'
    html = etree.HTML(res.text)
    for house_detail in html.xpath("//td/a[@target]/@href"):
        try:
            detail_res = requests.get(base + house_detail, headers=self.headers)
            detail_res.encoding = 'gbk'
            con = detail_res.text
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = re.search('室号.*?">(.*?)<', con, flags).group(1)
            ho.ho_floor = re.search('实际层.*?">(.*?)<', con, flags).group(1)
            ho.ho_type = re.search('房屋类型.*?">(.*?)<', con, flags).group(1)
            ho.ho_build_size = re.search('预测建筑面积.*?">(.*?)<', con, flags).group(1)
            ho.ho_true_size = re.search('预测套内面积.*?">(.*?)<', con, flags).group(1)
            ho.ho_share_size = re.search('预测分摊面积.*?">(.*?)<', con, flags).group(1)
            ho.ho_price = re.search('总价.*?">(.*?)<', con, flags).group(1)
            ho.insert_db()
        except Exception as e:
            print('房屋信息错误{}'.format(e))
def ho_info(self, url, co_id, bu_id):
    """Scrape the room table of an aqhouse.net building page via a proxy.

    Args:
        url: Relative path of the building page.
        co_id: Value stored on every House as ``co_id``.
        bu_id: Value stored on every House as ``bu_id``.

    The page request is retried with a randomly chosen entry of
    ``self.proxies`` until it succeeds. Room data is read from the
    ``title`` attribute of each table cell. A room that cannot be parsed
    is reported and skipped.
    """
    ho_url = 'http://www.aqhouse.net/' + url
    while True:
        try:
            # ``random.choice`` adapts to the actual number of proxies. The
            # original drew a hard-coded index from 0-9.
            proxy = random.choice(self.proxies)
            ho_res = requests.get(ho_url, headers=self.headers, proxies=proxy)
            break
        except Exception as e:
            print(e)
    ho_html = etree.HTML(ho_res.text)
    for room in ho_html.xpath("//td[@nowrap]/a/.."):
        try:
            room_info = room.xpath("./@title")[0]
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = room.xpath("./a/text()")[0]
            ho.ho_build_size = re.search('建筑面积:(.*?)平方米', room_info).group(1)
            ho.ho_true_size = re.search('套内面积:(.*?)平方米', room_info).group(1)
            ho.ho_share_size = re.search('分摊面积:(.*?)平方米', room_info).group(1)
            ho.ho_room_type = re.search('套型:(.*)', room_info).group(1)
            ho.ho_price = re.search('价格.*?:(.*?)元/平方米', room_info).group(1)
            ho.insert_db()
        except Exception as e:
            # Narrowed from a bare ``except:`` so Ctrl-C still interrupts;
            # the cause is now printed as well.
            print('房屋解析失败', e)
def house_parse(self, house_url, co_id, bu_id):
    """Scrape the room list of a spf.tlfdc.cn building and each room's floor.

    Args:
        house_url: Relative path of the building page.
        co_id: Value stored on every House as ``co_id``.
        bu_id: Value stored on every House as ``bu_id``.

    Name, layout, area and price come from parallel regex scans of the
    list page. The floor is read from each room's own detail page, which
    is decoded as GB2312. A room that fails is reported and skipped.
    """
    base = "http://spf.tlfdc.cn/"
    flags = re.S | re.M
    url = base + house_url
    con = requests.get(url, headers=self.headers).text
    ho_name = re.findall('室号:(.*?)套', con, flags)
    ho_room_type = re.findall('套型:(.*?)建', con, flags)
    ho_build_size = re.findall('建筑面积:(.*?)参', con, flags)
    ho_price = re.findall('价格:(.*?)元', con, flags)
    ho_detail = re.findall(r'href="(show.*?\?id=\d+&id2=\d+&prjid=\d+)"', con, flags)
    for index, name in enumerate(ho_name):
        try:
            # A fresh record per room, so no field can leak between rooms.
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = name
            ho.ho_room_type = ho_room_type[index]
            ho.ho_build_size = ho_build_size[index]
            ho.ho_price = ho_price[index]
            ho_detail_url = base + ho_detail[index]
            detail = requests.get(ho_detail_url, headers=self.headers)
            detail_text = detail.content.decode('gb2312')
            ho.ho_floor = re.findall('楼层.*?">(.*?)</td>', detail_text, flags)[0].strip()
            ho.insert_db()
        except Exception as e:
            # The original bare ``except:`` never bound ``e``, so this print
            # raised NameError and aborted the whole loop.
            print('房号错误,co_index={},url={}'.format(co_index, url), e)
            continue
def get_build_info(self, url, response, co_id, bu_id):
    """Store the houses listed in a JSON response from fsfc.fsjw.gov.cn.

    ``response.text`` is parsed as a JSON list of room dicts. For each
    room the basic fields are copied onto a single shared House, the
    room's detail page is fetched for the shared area and total price,
    and ``insert_db()`` is called. If the detail request or parsing
    fails, the error is printed and that room is not inserted. ``url``
    is accepted but not used.
    """
    house = House(co_index)
    flags = re.S | re.M
    detail_base = "http://fsfc.fsjw.gov.cn/hpms_project/roomview.jhtml?id="
    for room in json.loads(response.text):
        room_no = room['roomno']      # room number
        usage = room['ghyt']          # usage
        inner_area = room['tnmj']     # predicted inner area
        floor = room['floorindex']    # floor
        build_area = room['jzmj']     # building area
        house.co_id = co_id
        house.bu_id = bu_id
        house_code = room["fwcode"]
        house.ho_name = room_no
        house.ho_type = usage
        house.ho_true_size = inner_area
        house.ho_floor = floor
        house.ho_build_size = build_area
        house_detail_url = detail_base + str(house_code)
        try:
            res = requests.get(house_detail_url, headers=self.headers)
            house.ho_share_size = re.search(
                '实测分摊面积.*?<td>(.*?)</td>', res.text, flags).group(1)
            house.ho_price = re.search(
                '总价.*?<td>(.*?)</td>', res.text, flags).group(1)
        except Exception as e:
            print("co_index={},房屋详情页{}请求失败!".format(co_index, house_detail_url))
            print(e)
        else:
            house.insert_db()
def get_house_info(self, build_url_list):
    """Scrape the house table of each building on old.newhouse.cnnbfdc.com.

    The ``qrykey`` query parameter of every building URL selects the
    ``GetHouseTable.aspx`` page and is also stored as the house's
    ``bu_id``. Each "房号:" snippet of that page yields one House. A
    snippet that cannot be parsed is reported and skipped.
    """
    flags = re.S | re.M
    for build_url in build_url_list:
        qrykey = re.search('qrykey=(.*?)&', build_url).group(1)
        house_url = 'http://old.newhouse.cnnbfdc.com/GetHouseTable.aspx?qrykey=' + qrykey
        html = requests.get(house_url, headers=self.headers).text
        for info in re.findall('(房号:.*?")', html, flags):
            try:
                house = House(co_index)
                house.ho_name = re.search('房号:(.*?)&', info, flags).group(1)
                house.ho_build_size = re.search('建筑面积:(.*?)&', info, flags).group(1)
                house.ho_share_size = re.search('分摊面积:(.*?)&', info, flags).group(1)
                house.info = info
                house.bu_id = qrykey
                house.insert_db()
            except Exception as e:
                print(
                    'co_index={},房号错误,url ={} '.format(
                        co_index, house_url), e)
def get_house_info(self, house_url_list, bu_id, co_id):
    """Scrape fjnpfdc.com house pages and store each one as a House.

    Every relative path in ``house_url_list`` is fetched, decoded as GBK
    and parsed by label-based regexes. A page that fails is reported and
    skipped.
    """
    flags = re.S | re.M
    # (House attribute, regex) pairs, applied in this order.
    rules = (
        ('bu_num', '幢 号:.*?<td>(.*?)<'),
        ('ho_name', '房 号:.*?<td>(.*?)<'),
        ('co_name', '项目名称:.*?<td>(.*?)<'),
        ('ho_build_size', '建筑面积:.*?<td>(.*?)<'),
        ('ho_true_size', '套内面积:.*?<td>(.*?)<'),
        ('ho_share_size', '分摊面积:.*?<td>(.*?)<'),
        ('ho_floor', '所 在 层:.*?<td>(.*?)<'),
    )
    for path in house_url_list:
        try:
            house = House(co_index)
            house_url = 'http://www.fjnpfdc.com/House/' + path
            house_con = requests.get(house_url, headers=self.headers).content.decode('gbk')
            house.bu_id = bu_id
            house.co_id = co_id
            for attr, pattern in rules:
                setattr(house, attr, re.search(pattern, house_con, flags).group(1))
            house.insert_db()
        except Exception as e:
            print("co_index={},房屋{}错误".format(co_index, path), e)
def get_house_info(self, form_data_list):
    """Post each form to gafdc.cn and store the houses in the reply.

    Args:
        form_data_list: Iterable of form payloads for
            ``GetBuildTableByAjax.ashx``.

    The building id is the last ``GetData('..','<id>')`` argument found
    before the ``overflow-x:auto;`` marker. Every ``<td width='95'``
    cell yields one House. Errors are printed per cell and per request.
    """
    house_url = 'http://www.gafdc.cn/newhouse/GetBuildTableByAjax.ashx'
    flags = re.S | re.M
    for data in form_data_list:
        try:
            html = requests.post(url=house_url, data=data, headers=self.headers).text
            ho_info_html = re.findall("<td width='95'.*?</td>", html, flags)
            bu_id_html = re.search("^.*?overflow-x:auto;", html, flags).group()
            bu_id = re.findall(r"GetData\('.*?','(.*?)'\)", bu_id_html, flags)[-1]
            for cell in ho_info_html:
                try:
                    h = House(co_index)
                    h.bu_id = bu_id
                    h.ho_name = re.search('<td.*?>(.*?)<', cell, flags).group(1)
                    h.ho_type = re.search('物业类别:(.*?) ', cell, flags).group(1)
                    # Search this cell, not the whole page: the original
                    # scanned ``html`` and so gave every house the first
                    # area found on the page.
                    h.ho_build_size = re.search('建筑面积:(.*?) ', cell, flags).group(1)
                    h.insert_db()
                except Exception as e:
                    print(
                        '房屋报错,co_index={},url={}'.format(
                            co_index, house_url), e)
        except Exception as e:
            print('房屋报错,co_index={},url={}'.format(co_index, house_url), e)
def get_house_info(self, bu_id, co_id):
    """Request the building table from xyfdc.gov.cn and store its rooms.

    An XML payload naming ``GetBuildHTMLEx`` and ``bu_id`` is posted with
    a ``text/xml`` content type. Each ``title='...'`` attribute attached
    to a ``clickRoom`` handler describes one room and is parsed by regex.
    A room that cannot be parsed is reported and skipped.
    """
    house_url = "http://www.xyfdc.gov.cn/wsba/Common/Agents/ExeFunCommon.aspx"
    payload = (
        '<?xml version="1.0" encoding="utf-8" standalone="yes"?>\r\n'
        '<param funname="SouthDigital.Wsba.CBuildTableEx.GetBuildHTMLEx">\r\n'
        '<item>' + bu_id + '</item>\r\n'
        '<item>1</item>\r\n'
        '<item>1</item>\r\n'
        '<item>80</item>\r\n'
        '<item>840</item>\r\n'
        '<item>g_oBuildTable</item>\r\n'
        '<item> 1=1</item>\r\n'
        '<item>1</item>\r\n'
        '<item>false</item>\r\n'
        '</param>\r\n'
    )
    headers = {'Content-Type': "text/xml"}
    html = requests.request("POST", house_url, data=payload, headers=headers).text
    flags = re.S | re.M
    room_titles = re.findall(
        "onclick=.g_oBuildTable.clickRoom.*? title='(.*?)'", html, flags)
    for title in room_titles:
        try:
            house = House(co_index)
            house.ho_name = re.search('房号:(.*?)单元:', title, flags).group(1)
            house.ho_build_size = re.search('总面积:(.*?)平方米', title, flags).group(1)
            house.ho_type = re.search('用途:(.*?)户型', title, flags).group(1)
            house.ho_room_type = re.search('户型:(.*?)状态', title, flags).group(1)
            house.info = title
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print(
                '房号错误,co_index={},url={},data={}'.format(
                    co_index, house_url, payload), e)
def get_house_info(self, house_url_list, bu_id, co_id):
    """Fetch each absolute house URL and store the parsed House.

    Fields are located by their Chinese labels followed by the next
    ``<td>`` cell. The record is tagged with ``bu_id``/``co_id`` and
    saved with ``insert_db()``. Failures are printed per URL.
    """
    flags = re.S | re.M
    # (House attribute, regex) pairs, applied in this order.
    rules = (
        ('ho_name', '门牌号:.*?<td.*?>(.*?)<'),
        ('ho_floor', '所在层:.*?<td.*?>(.*?)<'),
        ('ho_type', '房屋性质:.*?<td.*?>(.*?)<'),
        ('ho_build_size', '预测建筑面积:.*?<td.*?>(.*?)<'),
        ('ho_true_size', '预测套内面积:.*?<td.*?>(.*?)<'),
        ('ho_share_size', '预测分摊面积:.*?<td.*?>(.*?)<'),
        ('co_address', '房屋坐落:.*?<td.*?>(.*?)<'),
    )
    for house_url in house_url_list:
        try:
            house = House(co_index)
            html = requests.get(house_url, headers=self.headers).text
            for attr, pattern in rules:
                setattr(house, attr, re.search(pattern, html, flags).group(1))
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_house_info(self, house_id_list, bu_id, co_id):
    """Scrape hbczfdc.com ``RoomInfo.aspx`` pages by room code.

    Each code in ``house_id_list`` is appended to the page URL. Fields
    are read by element id and the House is saved with ``insert_db()``.
    Failures are printed per room.
    """
    flags = re.S | re.M
    # (House attribute, regex) pairs, applied in this order.
    rules = (
        ('ho_name', 'id="ROOM_HH">(.*?)<'),
        ('ho_floor', 'id="ROOM_MYC">(.*?)<'),
        ('ho_type', 'id="ROOM_FWYT">(.*?)<'),
        ('ho_room_type', 'id="ROOM_HX">(.*?)<'),
        ('ho_build_size', 'id="ROOM_YCJZMJ">(.*?)<'),
        ('ho_true_size', 'id="ROOM_YCTNJZMJ">(.*?)<'),
        ('ho_share_size', 'id="ROOM_YCFTJZMJ">(.*?)<'),
    )
    for code in house_id_list:
        house_url = 'http://www.hbczfdc.com:4993/HPMS/RoomInfo.aspx?code=' + code
        try:
            house = House(co_index)
            html = requests.get(house_url, headers=self.headers).text
            house.bu_id = bu_id
            house.co_id = co_id
            for attr, pattern in rules:
                setattr(house, attr, re.search(pattern, html, flags).group(1))
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_house_info(self, house_url_list):
    """Scrape the floor view of each gzbjfc.com building and store its rooms.

    Args:
        house_url_list: URLs carrying ``dongID`` (at the end of the
            string) and ``yszh`` query parameters.

    The ``FloorView.aspx`` page is fetched for those parameters. Every
    ``<div class...</table></div>`` fragment after the ``HouseFloorView``
    marker is treated as a room, unless it contains "层" or "单元".
    Any failure abandons the current building and is printed.
    """
    flags = re.S | re.M
    for i in house_url_list:
        # Fall back to the input URL so the error message below is always
        # printable. The original raised NameError in the handler when the
        # dongID/yszh lookup failed before ``house_url`` was built.
        house_url = i
        try:
            dong_ID = re.search('dongID=(.*?)$', i).group(1)
            yszh = re.search('yszh=(.*?)&', i).group(1)
            house_url = ('http://www.gzbjfc.com/Controls/HouseControls/FloorView.aspx?dongID='
                         + dong_ID + '&qu=%E6%AF%95%E8%8A%82&yszh=' + yszh
                         + '&zhlx=xs&danyuan=all')
            html = requests.get(house_url, headers=self.headers).text
            bu_id = re.findall('dongID=(.*?)&', html, flags)[0]
            info_str = re.search('<div class="HouseFloorView".*', html, flags).group()
            for k in re.findall('<div class.*?</table></div>', info_str, flags):
                # Skip fragments containing these markers before doing any work.
                if '层' in k or '单元' in k:
                    continue
                house = House(co_index)
                print(k)
                house.info = k
                house.ho_name = re.search('span.*?>(.*?)</span>', k, flags).group(1)
                house.ho_true_size = re.search('title.*\n(.*?)\n', k).group(1)
                house.bu_id = bu_id
                house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_build_info(self, url, co_id):
    """Scrape one building page: store the Building, then its houses.

    Args:
        url: Absolute building page URL. It must end in ``?<digits>``,
            which becomes the building id.
        co_id: Value stored on the Building and on every House.

    Building fields are read by element id with XPath. Houses are parsed
    from the table that follows ``<tr id="row3">``, where each ``<td>``
    holds ``<br>``-separated values. Errors are printed per house and
    for the building as a whole.
    """
    flags = re.S | re.M
    try:
        building = Building(co_index)
        html = requests.get(url).text
        tree = etree.HTML(html)
        co_name = tree.xpath('//*[@id="PageB_Location"]/text()')[0]  # community name
        print(co_name)
        bu_name = tree.xpath('//*[@id="ItemName"]/text()')[0]  # building name
        bu_num = tree.xpath('//*[@id="PageB_HouseNo"]/text()')[0]  # building number
        bu_all_house = tree.xpath('//*[@id="lb_countbulidtaoshu"]/text()')[0]  # total units
        bu_floor = self.is_none(tree.xpath('//*[@id="cell3-1"]/text()'))  # floors
        bu_build_size = tree.xpath('//*[@id="lb_countbulidarea"]/text()')[0]  # building area
        bu_live_size = tree.xpath('//*[@id="lb_buildarea"]/text()')[0]  # residential area
        bu_price = self.is_none(tree.xpath('//*[@id="lb_buildavg"]/text()'))  # residential price
        bu_id = re.search(r'\?(\d+)$', url).group(1)  # building id
        building.co_id = co_id
        building.bu_name = bu_name
        building.bu_num = bu_num
        building.bu_all_house = bu_all_house
        building.bu_floor = bu_floor
        building.bu_build_size = bu_build_size
        building.bu_live_size = bu_live_size
        building.bu_price = bu_price
        building.bu_id = bu_id
        building.insert_db()

        house_info_html = re.findall('<tr id="row3">(.*)$', html, flags)[0]
        for cell in re.findall('(<td.*?>.*?</td>)', house_info_html, flags):
            if '<br>' not in cell:
                continue
            ho_name_list = re.findall('<td.*?>(.*?)<br>', cell, flags)
            ho_true_size_list = re.findall('<td.*?>.*?<br>(.*?)<br>', cell, flags)
            ho_type = re.findall('<td.*?>.*?<br>.*?<br>(.*?)<br>', cell, flags)[0]
            for idx, raw_name in enumerate(ho_name_list):
                try:
                    if 'font' in raw_name:
                        ho_name = re.sub('<font.*?>', '', raw_name)
                    else:
                        ho_name = raw_name
                    # NOTE(review): hard-coded 8 while Building uses
                    # ``co_index``; presumably the same value, confirm.
                    house = House(8)
                    house.ho_name = ho_name
                    house.ho_true_size = ho_true_size_list[idx]
                    house.co_id = co_id
                    house.bu_id = bu_id
                    house.ho_type = ho_type
                    house.insert_db()
                except Exception as e:
                    print(e)
    except Exception as e:
        # Narrowed from BaseException so KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        print(e)
def get_house_info(self, house_url_list):
    """Hand each zjjg.0557fdc.com room page to ``ProducerListUrl``.

    Args:
        house_url_list: URLs carrying ``dongid`` and ``roomid`` query
            parameters, each followed by ``&``.

    The House is used as a container of regex rules, not of values: each
    attribute is set to a pattern and ``house.to_dict()`` is passed as
    ``analyzer_rules_dict``. The fetch is started by ``p.get_details()``.
    Failures are printed per URL.
    """
    for i in house_url_list:
        # Fall back to the input URL so the error message below is always
        # printable. The original raised NameError in the handler when
        # dongid/roomid could not be extracted.
        house_url = i
        try:
            dongid = re.search('dongid=(.*?)&', i).group(1)
            roomid = re.search('roomid=(.*?)&', i).group(1)
            house_url = ('http://zjjg.0557fdc.com:9555/xiaoqu/roominfo.aspx?dongid='
                         + dongid + '&roomid=' + roomid)
            house = House(co_index)
            house.co_name = 'Labelxqmc">(.*?)<'
            house.area = 'Labelxzq">(.*?)<'
            house.bu_num = 'Labeldongmc">(.*?)<'
            house.ho_type = 'Labelyxyongtu">(.*?)<'
            house.ho_name = '<span id="Labelroommc".*?>(.*?)</span>'
            house.ho_build_size = 'Labeljzmianji">(.*?)<'
            house.ho_true_size = 'Labeltaonei">(.*?)<'
            house.ho_share_size = 'Labelgongtan">(.*?)<'
            house.ho_room_type = 'Labelhuxing">(.*?)<'
            house.bu_id = 'dongid=(.*?)&'
            p = ProducerListUrl(page_url=house_url,
                                request_type='get', encode='utf-8',
                                analyzer_rules_dict=house.to_dict(),
                                analyzer_type='regex',
                                headers=self.headers)
            p.get_details()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def house_parse(self, bu_id, co_id):
    """Post to ``NBView.do`` on ys.tyfdc.gov.cn and store the listed houses.

    Args:
        bu_id: Posted as ``nid`` and stored on every House.
        co_id: Posted as ``projectid`` and stored on every House.

    If the request fails the error is printed and the method returns.
    Otherwise the page is scanned by parallel regexes and one House is
    inserted per ``getHouseBaseInfo('<id>')`` match.
    """
    house_url = "http://ys.tyfdc.gov.cn/Firsthand/tyfc/publish/probld/NBView.do?"
    formdata = {"nid": bu_id, "projectid": co_id}
    try:
        res = requests.post(house_url, data=formdata, headers=self.headers)
    except Exception as e:
        print("co_index={},房屋详情页无法访问".format(co_index), e)
        # Nothing to parse. The original carried on and crashed on the
        # unbound ``res``.
        return
    con = res.text
    flags = re.S | re.M
    ho_name = re.findall(r"""'\);">(.*?) """, con, flags)
    ho_build_size = re.findall('<span.*?建筑面积:(.*?)㎡', con, flags)
    ho_true_size = re.findall('<span.*?套内面积:(.*?)分', con, flags)
    ho_share_size = re.findall('<span.*?分摊面积:(.*?)㎡', con, flags)
    ho_type = re.findall('<span.*?用途:(.*?)房', con, flags)
    ho_price = re.findall('<span.*?单价:(.*?)"', con, flags)
    ho_id = re.findall(r"getHouseBaseInfo\('(.*?)'\)", con, flags)
    for index, house_id in enumerate(ho_id):
        # A fresh record per house instead of one shared instance.
        ho = House(co_index)
        ho.co_id = co_id
        ho.bu_id = bu_id
        ho.ho_name = ho_name[index]
        ho.ho_build_size = ho_build_size[index]
        ho.ho_type = ho_type[index]
        ho.ho_share_size = ho_share_size[index]
        ho.ho_price = ho_price[index]
        ho.ho_true_size = ho_true_size[index]
        ho.ho_num = house_id
        ho.insert_db()
def house_parse(self, ho_url, co_id, bu_id):
    """Scrape every house linked from a building page on 61.143.241.154.

    The listing page and each detail page are decoded as GBK. Requests
    use the ``headers`` name from the enclosing scope, not
    ``self.headers``. A detail page that fails is logged and skipped.
    """
    site = 'http://61.143.241.154/'
    flags = re.S | re.M
    # (House attribute, regex) pairs, applied in this order.
    rules = (
        ('ho_name', '房屋号.*?">(.*?)</td'),
        ('ho_true_size', '套内面积.*?">(.*?)</td'),
        ('ho_build_size', '建筑面积.*?">(.*?)</td'),
        ('orientation', '房屋朝向.*?">(.*?)</td'),
        ('ho_type', '用途.*?">(.*?)</td'),
        ('ho_price', '申报总价.*?">(.*?)</td'),
    )
    house_url = "http://61.143.241.154/" + ho_url
    listing = requests.get(house_url, headers=headers)
    html = etree.HTML(listing.content.decode('gbk'))
    for detail in html.xpath("//td[@height='80']/a/@href"):
        try:
            detail_url = site + detail
            con = requests.get(detail_url, headers=headers).content.decode('gbk')
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            for attr, pattern in rules:
                setattr(ho, attr, re.search(pattern, con, flags).group(1))
            ho.insert_db()
        except Exception as e:
            log.error("{}房屋请求解析失败{}".format(detail, e))
def get_house_info(self, ho_con=None, headers=None, bu_id=None, url=None):
    """Scrape every house linked from a qyfgj.cn building page.

    Args:
        ho_con: Already downloaded page text. When it is ``None`` the
            page is fetched from ``url`` and decoded as GBK.
        headers: Request headers for all requests made here.
        bu_id: Value stored on every House as ``bu_id``.
        url: Building page URL, used only when ``ho_con`` is ``None``.

    Request and parsing errors are not caught here and propagate to the
    caller.
    """
    # Identity check: ``None`` is a singleton, so ``is`` is the correct
    # comparison (PEP 8).
    if ho_con is None:
        ho_con = requests.get(url, headers=headers).content.decode('gbk')
    html = etree.HTML(ho_con)
    flags = re.S | re.M
    for ho_url in html.xpath("//td[@width='120']/a/@href"):
        ho_detail = 'http://www.qyfgj.cn/newys/' + ho_url
        con = requests.get(ho_detail, headers=headers).content.decode('gbk')
        ho = House(co_index)
        ho.bu_id = bu_id
        ho.ho_num = re.search('房屋号.*?">(.*?)</td', con, flags).group(1)
        ho.ho_build_size = re.search('建筑面积.*?">(.*?)m', con, flags).group(1)
        ho.ho_true_size = re.search('套内面积.*?">(.*?)m', con, flags).group(1)
        ho.ho_type = re.search('房屋用途.*?">(.*?)</td', con, flags).group(1)
        ho.insert_db()
def ho_parse(self, bid, co_id):
    """Request the building table from hbsfdc.com and store its rooms.

    Args:
        bid: Building id, embedded in the XML payload and stored as
            ``bu_id`` on every House.
        co_id: Value stored on every House as ``co_id``.

    The URL-quoted XML payload is posted. If the request fails the error
    is logged and the method returns. Each ``title='...'>`` attribute in
    the response describes one room. The price is optional and stored as
    ``None`` when absent.
    """
    payload = ('<?xml version="1.0" encoding="utf-8" standalone="yes"?>'
               '<param funname="SouthDigital.CMS.CBuildTableEx.GetBuildHTMLEx"><item>'
               + bid +
               '</item><item>1</item><item>1</item><item>100</item><item>1000</item>'
               '<item>g_oBuildTable</item><item> 1=1</item><item>1</item></param>')
    payload = parse.quote(payload)
    try:
        res = requests.post(
            'http://www.hbsfdc.com/Common/Agents/ExeFunCommon.aspx',
            data=payload, headers=self.headers)
    except Exception:
        log.error("{}楼栋请求失败".format(bid))
        # Nothing to parse. The original carried on and crashed on the
        # unbound ``res``.
        return
    con = res.content.decode()
    for ho in re.findall("title='(.*?)'>", con, re.S | re.M):
        house = House(co_index)
        house.co_id = co_id
        house.bu_id = bid
        house.ho_name = re.search('房号:(.*)', ho).group(1)
        house.ho_type = re.search('用途:(.*)', ho).group(1)
        house.ho_room_type = re.search('户型:(.*)', ho).group(1)
        house.ho_build_size = re.search('总面积:(.*)', ho).group(1)
        price = re.search('售价:(.*)', ho)  # searched once, reused below
        house.ho_price = price.group(1) if price else None
        house.insert_db()
def get_house_info(self, bu_id, co_id):
    """Load the house table of an fzfgj.cn building from its XML feeds.

    Args:
        bu_id: Building id used in the first ``GetData.aspx`` request and
            stored on every House.
        co_id: Value stored on every House as ``co_id``.

    The first XML document supplies ``LOGICBUILDING_ID``. The second,
    requested with that id, holds one ``T_HOUSE`` element per house.
    Errors are printed per house and for the building as a whole.
    """
    base = 'http://www.fzfgj.cn/website/presale/home/HouseTableControl/GetData.aspx'
    url = base + '?Building_ID=' + bu_id
    try:
        tree = etree.XML(requests.get(url=url, headers=self.headers).text)
        logo = tree.xpath('//LOGICBUILDING_ID/text()')[0]
        url_2 = base + '?LogicBuilding_ID=' + logo
        tree_2 = etree.XML(requests.get(url_2, headers=self.headers).text)
        for node in tree_2.xpath('T_HOUSE'):
            try:
                # NOTE(review): hard-coded 11 while the messages below use
                # ``co_index``; presumably the same value, confirm.
                house = House(11)
                ho_name = node.xpath('ROOM_NUMBER/text()')[0]
                ho_build_size = node.xpath('BUILD_AREA/text()')[0]
                ho_true_size = node.xpath('BUILD_AREA_INSIDE/text()')[0]
                ho_share_size = node.xpath('BUILD_AREA_SHARE/text()')[0]
                ho_floor = node.xpath('FLOOR_REALRIGHT/text()')[0]
                ho_type = node.xpath('USE_FACT/text()')[0]
                house.co_id = co_id
                house.bu_id = bu_id
                house.ho_build_size = ho_build_size
                house.ho_true_size = ho_true_size
                house.ho_share_size = ho_share_size
                house.ho_floor = ho_floor
                house.ho_name = ho_name
                house.ho_type = ho_type
                house.insert_db()
            except Exception as e:
                print('房号错误,co_index={},url={}'.format(co_index, url_2), e)
    except Exception as e:
        # Narrowed from BaseException so KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        print('房号错误,co_index={},url={}'.format(co_index, url), e)
def get_house_info(self, co_id, bu_id):
    """Post to ``ShowNewBuildingTable.aspx`` and store the houses returned.

    Args:
        co_id: Value stored on every House as ``co_id``.
        bu_id: Building id, sent as ``BuidID`` and stored on every House.

    Each ``HouseID...}`` fragment of the response is parsed as one
    house. Any failure stops processing of this building and is printed
    with the URL, the payload and the exception.
    """
    house_url = "http://202.103.219.149:7000/LeadingEstate/buildingtable/ShowNewBuildingTable.aspx"
    payload = "IsShowHouse=1&BuidID=" + bu_id
    headers = {'Content-Type': "application/x-www-form-urlencoded"}
    flags = re.S | re.M
    try:
        html = requests.request("POST", house_url, data=payload, headers=headers).text
        for fragment in re.findall(r'HouseID.*?\}', html, flags):
            house = House(co_index)
            house.bu_id = bu_id
            house.co_id = co_id
            house.ho_name = re.search('"YCHouseNo":"(.*?)"', fragment, flags).group(1)
            house.ho_floor = re.search('"ActFLoor":"(.*?)"', fragment, flags).group(1)
            house.ho_build_size = re.search('"YCJZArea":"(.*?)"', fragment, flags).group(1)
            house.ho_true_size = re.search('"YCTNJZArea":"(.*?)"', fragment, flags).group(1)
            house.ho_share_size = re.search('"YCFTJZArea":"(.*?)"', fragment, flags).group(1)
            house.insert_db()
    except Exception as e:
        # The exception is printed too; the original discarded it.
        print('请求错误,url={},data={}'.format(house_url, payload), e)
def get_house_info(self, co_id, bu_id, id):
    """Scrape every house opened from an xx.yyfdcw.com building page.

    Args:
        co_id: Value stored on every House as ``co_id``.
        bu_id: Value stored on every House as ``bu_id``.
        id: Page id appended to ``fdc_xxdxx.asp?id=``. The name shadows
            the builtin and is kept for caller compatibility.

    The listing page is decoded as GBK and detail links are taken from
    ``onClick=...open('<path>',`` handlers. A house whose request or
    parsing fails is reported and skipped.
    """
    flags = re.S | re.M
    house_list_url = "http://xx.yyfdcw.com/hetong/fdc_xxdxx.asp?id=" + str(id)
    res = requests.get(house_list_url, headers=self.headers)
    con = res.content.decode('gbk')
    for house_ in re.findall(r"onClick=.*?open\('(.*?)',", con, flags):
        # The guard now covers the request and the parsing. The original
        # wrapped only the URL concatenation, so one bad page aborted the
        # whole building.
        try:
            house_url = "http://xx.yyfdcw.com/hetong/" + house_
            ho_con = requests.get(house_url, headers=self.headers).content.decode('gbk')
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = re.search('室号.*?fafa>(.*?)</TD', ho_con, flags).group(1)
            ho.ho_floor = re.search('实际层.*?fafa>(.*?)</TD', ho_con, flags).group(1)
            ho.ho_build_size = re.search('建筑面积.*?fafa>(.*?)</TD', ho_con, flags).group(1)
            ho.ho_true_size = re.search('套内面积.*?fafa>(.*?)</TD', ho_con, flags).group(1)
            ho.ho_share_size = re.search('分摊面积.*?fafa>(.*?)</TD', ho_con, flags).group(1)
            ho.ho_price = re.search('价格.*?fafa>(.*?)</TD', ho_con, flags).group(1)
            ho.ho_type = re.search('用途.*?fafa>(.*?)</TD', ho_con, flags).group(1)
            ho.insert_db()
        except Exception as e:
            print("co_index={},房屋信息错误".format(co_index), e)
            continue
def house_parse(self, co_id, bu_id, bu_con):
    """Store one name-only House per styled link found in ``bu_con``.

    Args:
        co_id: Value stored on every House as ``co_id``.
        bu_id: Value stored on every House as ``bu_id``.
        bu_con: Page text that is scanned for ``<a style...)>name</a``.
    """
    # Raw string: ``\)`` is an invalid escape in a normal string literal.
    for name in re.findall(r'<a style.*?\)>(.*?)</a', bu_con):
        ho = House(co_index)
        ho.co_id = co_id
        ho.bu_id = bu_id
        ho.ho_name = name
        ho.insert_db()
def get_house_info(self, house_url, ho_name, bu_id, co_id):
    """Build a House for one bjjs.gov.cn room and store it.

    When the full URL contains no ``#``, the House is first passed to
    ``self.get_house_detail`` and replaced by its return value. The
    name, building id and community id are then set and ``insert_db()``
    is called.
    """
    house = House(co_index)
    detail_url = 'http://www.bjjs.gov.cn' + house_url
    has_fragment = '#' in detail_url
    if not has_fragment:
        house = self.get_house_detail(detail_url, house)
    for attr, value in (('ho_name', ho_name), ('bu_id', bu_id), ('co_id', co_id)):
        setattr(house, attr, value)
    house.insert_db()
def get_build_info(self, comm_url_list):
    """Scrape buildings and their houses from jjzzfdc.com.cn.

    Args:
        comm_url_list: Strings containing at least two ``+<digits>+``
            groups. The first is used as ``sid`` and the second as
            ``pid``.

    For each entry the building page is parsed and stored, then the
    house list for ``sid`` is fetched and one House stored per
    ``<Result>`` element. Houses are linked to the building through
    ``pid`` and the building number parsed in the same iteration.
    """
    flags = re.S | re.M
    for i in comm_url_list:
        try:
            ids = re.findall(r'\+(\d+)\+', i)
            sid, pid = ids[0], ids[1]
        except Exception as e:
            # Without both ids neither request can be built. The original
            # went on with unbound or stale ``sid``/``build_url``.
            print('co_index={}, 楼栋错误,url={}'.format(co_index, i), e)
            continue

        build_url = ('http://www.jjzzfdc.com.cn/WebClient/ClientService/bldg_query.aspx?pid='
                     + pid + '&sid=' + sid)
        bu_num = None  # stays None when the building page cannot be parsed
        try:
            html = requests.get(build_url).text
            build = Building(co_index)
            build.bu_id = pid
            # The same label feeds both fields, as in the original.
            bu_num = re.search('楼栋座落.*?<td.*?>(.*?)<', html, flags).group(1)
            build.bu_num = bu_num
            build.bu_address = bu_num
            build.bu_pre_sale = re.search('预售证号.*?">(.*?) ', html, flags).group(1)
            build.bu_pre_sale_date = re.search('时间.*?">(.*?) ', html, flags).group(1)
            build.bu_all_house = re.search('dM.*?">(.*?) ', html, flags).group(1)
            build.insert_db()
        except Exception as e:
            print('co_index={}, 楼栋错误,url={}'.format(co_index, build_url), e)

        # ``&params=`` restored: the source contained the mojibake "¶ms",
        # which is what an HTML-decoded "&para" looks like.
        house_url = ('http://www.jjzzfdc.com.cn/WebClient/ClientService/proxp.aspx'
                     '?key=WWW_LPB_001&params=' + sid)
        html_ = requests.get(house_url).text
        for house_info in re.findall('<Result.*?</Result>', html_, flags):
            try:
                house = House(co_index)
                # Use this iteration's values, never a Building object left
                # over from a previous entry.
                house.bu_id = pid
                house.bu_num = bu_num
                house.ho_name = re.search('<ONAME>(.*?)</ONAME>', house_info, flags).group(1)
                house.ho_num = re.search('<OSEQ>(.*?)</OSEQ>', house_info, flags).group(1)
                house.ho_build_size = re.search('<BAREA>(.*?)</BAREA>', house_info, flags).group(1)
                house.ho_floor = re.search('<FORC>(.*?)</FORC>', house_info, flags).group(1)
                house.ho_true_size = re.search('<PAREA>(.*?)</PAREA>', house_info, flags).group(1)
                house.insert_db()
            except Exception as e:
                print('co_index={}, 房号错误'.format(co_index), e)
def house(self, house_url, bu_id, co_id):
    """Scrape the room table of a syfc.com.cn building and store its rooms.

    Args:
        house_url: Relative path of the building page.
        bu_id: Value stored on every House as ``bu_id``.
        co_id: Value stored on every House as ``co_id``.

    The building page embeds the room table in an ``iframe``. If either
    page cannot be loaded the error is printed and the method returns.
    Cells with a link are enriched from the room's detail page. When
    that fails, or the cell has no link, a House carrying only the
    cell's own text as its name is stored instead.
    """
    site = "http://www.syfc.com.cn"
    flags = re.S | re.M
    try:
        res = requests.get(site + house_url, headers=self.headers)
        ho_detail_url = etree.HTML(res.text).xpath("//iframe/@src")[0]
        response = requests.get(ho_detail_url, headers=self.headers)
    except Exception as e:
        print("co_index={},楼栋详情页无法访问".format(co_index), e)
        # Nothing to parse. The original carried on and crashed on the
        # unbound ``response``.
        return
    html = etree.HTML(response.text)
    for td in html.xpath("//td[@width='70']"):
        ho = House(co_index)
        try:
            room_url = td.xpath("./a/@href")[0]
            ho.ho_name = td.xpath("./a/text()")[0]
            ho.bu_id = bu_id
            ho.co_id = co_id
            room_url = site + room_url
            try:
                room_html = requests.get(room_url, headers=self.headers).text
            except Exception as e:
                print("co_idnex={},房屋详情页无法访问".format(co_index), e)
                # Re-raise into the fallback below. The original kept
                # parsing a stale ``con`` left over from earlier code.
                raise
            ho.ho_build_size = re.search('建筑面积.*?">(.*?)<', room_html, flags).group(1)
            ho.ho_share_size = re.search('分摊面积.*?">(.*?)<', room_html, flags).group(1)
            ho.ho_true_size = re.search('套内面积.*?">(.*?)<', room_html, flags).group(1)
            ho.ho_type = re.search('类型.*?">(.*?)<', room_html, flags).group(1)
            ho.insert_db()
        except Exception:
            # Fallback: store the cell with its plain text as the name.
            # Narrowed from a bare ``except:`` so Ctrl-C still interrupts.
            ho.bu_id = bu_id
            ho.co_id = co_id
            ho.ho_name = td.xpath("./text()")[0]
            ho.insert_db()