def get_house_info(self, house_url_list, bu_id, co_id):
    """Download each house page and store one ``House`` row per page.

    Every entry of ``house_url_list`` is appended to the
    ``http://www.fjnpfdc.com/House/`` prefix, fetched with ``self.headers``
    and decoded as GBK.  The labelled table cells are pulled out with
    regular expressions and copied onto a fresh ``House`` together with the
    given ``bu_id`` and ``co_id``; the record is then saved via
    ``insert_db()``.

    Any exception raised while handling one entry is printed and the loop
    moves on to the next entry.
    """
    flags = re.S | re.M
    # (House attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('bu_num', '幢 号:.*?<td>(.*?)<'),
        ('ho_name', '房 号:.*?<td>(.*?)<'),
        ('co_name', '项目名称:.*?<td>(.*?)<'),
        ('ho_build_size', '建筑面积:.*?<td>(.*?)<'),
        ('ho_true_size', '套内面积:.*?<td>(.*?)<'),
        ('ho_share_size', '分摊面积:.*?<td>(.*?)<'),
        ('ho_floor', '所 在 层:.*?<td>(.*?)<'),
    )
    for relative_url in house_url_list:
        try:
            house = House(co_index)
            page_url = 'http://www.fjnpfdc.com/House/' + relative_url
            response = requests.get(page_url, headers=self.headers)
            page = response.content.decode('gbk')
            house.bu_id = bu_id
            house.co_id = co_id
            for attr_name, pattern in field_patterns:
                # A missing label makes re.search return None, so .group
                # raises and the whole record is skipped by the handler.
                setattr(house, attr_name,
                        re.search(pattern, page, flags).group(1))
            house.insert_db()
        except Exception as e:
            print("co_index={},房屋{}错误".format(co_index, relative_url), e)
def get_house_detail(self, house_url_list):
    """Parse building pages and store one ``House`` row per room.

    For each URL in ``house_url_list`` the page is fetched and decoded as
    GBK, then:

    * the building number and project name are read from the page,
    * a mapping from room id to room number is built from the room links,
    * every ``<Room>`` element yields a ``House`` (created with
      ``self.co_index``) that is saved via ``insert_db()``.

    Fetching and page-level parsing happen outside the ``try`` block, so a
    page without the expected building/project markers raises to the
    caller.  Errors for an individual room are printed and that room is
    skipped.
    """
    flags = re.S | re.M
    for i in house_url_list:
        res = requests.get(i)
        html = res.content.decode('gbk')
        bu_name = re.search('楼号:.*?HouseNum">(.*?)</span>', html, flags).group(1)
        co_name = re.search('项目名称.*?PrjName">(.*?)</span>', html, flags).group(1)
        # Raw string: "\?" is an invalid escape in a normal string literal.
        ho_id = re.findall(r"aspx\?Room=(.*?)'.*?<b>(.*?)</b>", html, flags)
        # Map room id -> room number (later duplicates overwrite earlier ones).
        ho_id_dict = dict(ho_id)
        house_info = re.findall(
            "<Room><Cell RoomID='(.*?)'.*?BArea='(.*?)'.*?HouseUse='(.*?)'.*?</Room>",
            html, flags)
        for j in house_info:
            try:
                h = House(self.co_index)
                h.ho_name = ho_id_dict[j[0]]
                h.ho_true_size = j[1]
                h.ho_type = j[2]
                h.co_name = co_name
                h.bu_num = bu_name
                h.insert_db()
            except Exception as e:
                # Use self.co_index, as everywhere else in this method; a
                # bare ``co_index`` here could raise inside the handler.
                print('房屋错误,co_index={},url={}'.format(self.co_index, i), e)
def get_house_info(self, house_url_list, co_name, bu_num):
    """Download each house page and store one ``House`` row per page.

    Every entry of ``house_url_list`` is appended to the
    ``http://www.sxczfdc.com/pubinfo/`` prefix and fetched with
    ``self.headers``.  The first match of each ``HouseInfo1_lbl*`` pattern
    in the response text is copied onto a fresh ``House`` that already
    carries the given ``co_name`` and ``bu_num``; the record is then saved
    via ``insert_db()``.

    Any exception raised for one entry is printed and the loop continues.
    """
    flags = re.S | re.M
    # (House attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('ho_floor', 'HouseInfo1_lblFwlc">(.*?)<'),
        ('ho_name', 'HouseInfo1_lblFwfh">(.*?)<'),
        ('ho_type', 'HouseInfo1_lblFwlx">(.*?)<'),
        ('ho_room_type', 'HouseInfo1_lblFwhx">(.*?)<'),
        ('ho_build_size', 'HouseInfo1_lblycfwjzmj">(.*?)<'),
        ('ho_true_size', 'HouseInfo1_lblycfwtnmj">(.*?)<'),
        ('ho_share_size', 'HouseInfo1_lblycfwftmj">(.*?)<'),
        ('orientation', 'HouseInfo1_lblCx">(.*?)<'),
    )
    for relative_url in house_url_list:
        try:
            house = House(co_index)
            house.co_name = co_name
            house.bu_num = bu_num
            page_url = 'http://www.sxczfdc.com/pubinfo/' + relative_url
            page = requests.get(page_url, headers=self.headers).text
            for attr_name, pattern in field_patterns:
                # [0] raises IndexError when the label is absent, which
                # drops the whole record through the handler below.
                setattr(house, attr_name,
                        re.findall(pattern, page, flags)[0])
            house.insert_db()
        except Exception as e:
            print(e)
def get_house_info(self, house_url_list):
    """Queue one regex-driven scrape per room link.

    For each link in ``house_url_list`` the ``dongid`` and ``roomid`` query
    parameters are extracted and used to build the room-info URL.  A
    ``House`` object is filled with regex *rules* (not values); its
    ``to_dict()`` result is handed to ``ProducerListUrl`` as
    ``analyzer_rules_dict`` with ``analyzer_type='regex'``, and
    ``get_details()`` is called on it.

    Any exception raised for one link is printed together with the URL
    being processed, and the loop continues with the next link.
    """
    for i in house_url_list:
        # Bind the URL before the try block so the error message below can
        # never hit an unbound name or report the previous item's URL when
        # the id extraction itself fails.
        house_url = i
        try:
            dongid = re.search('dongid=(.*?)&', i).group(1)
            roomid = re.search('roomid=(.*?)&', i).group(1)
            house_url = ('http://zjjg.0557fdc.com:9555/xiaoqu/roominfo.aspx?dongid='
                         + dongid + '&roomid=' + roomid)
            house = House(co_index)
            # Each attribute holds the regex rule used to extract that field.
            house.co_name = 'Labelxqmc">(.*?)<'
            house.area = 'Labelxzq">(.*?)<'
            house.bu_num = 'Labeldongmc">(.*?)<'
            house.ho_type = 'Labelyxyongtu">(.*?)<'
            house.ho_name = '<span id="Labelroommc".*?>(.*?)</span>'
            house.ho_build_size = 'Labeljzmianji">(.*?)<'
            house.ho_true_size = 'Labeltaonei">(.*?)<'
            house.ho_share_size = 'Labelgongtan">(.*?)<'
            house.ho_room_type = 'Labelhuxing">(.*?)<'
            house.bu_id = 'dongid=(.*?)&'
            p = ProducerListUrl(page_url=house_url,
                                request_type='get', encode='utf-8',
                                analyzer_rules_dict=house.to_dict(),
                                analyzer_type='regex',
                                headers=self.headers)
            p.get_details()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_house_detail(self, house_detail_url_list, co_id, bu_id):
    """Download each house detail page and store one ``House`` row per page.

    Every entry of ``house_detail_url_list`` is appended to the
    ``http://www.yzfdc.cn/`` prefix.  After a three-second pause the page
    is fetched through the session ``self.s`` with ``self.headers``; the
    ``lbl*`` fields are extracted from the response text and copied onto a
    fresh ``House`` along with ``bu_id`` and ``co_id``, which is then saved
    via ``insert_db()``.

    Any exception raised for one entry is printed with its URL and the loop
    continues.
    """
    flags = re.S | re.M
    # (House attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('co_name', 'lblxmmc.*?>(.*?)<'),
        ('bu_num', 'lbldh.*?>(.*?)<'),
        ('ho_name', 'lblfh.*?>(.*?)<'),
        ('ho_build_size', 'lbljzmj.*?>(.*?)<'),
        ('ho_true_size', 'lbltnmj.*?>(.*?)<'),
        ('ho_share_size', 'lblftmj.*?>(.*?)<'),
        ('ho_type', 'lblfwxz.*?>(.*?)<'),
        ('ho_room_type', 'lblhuxin.*?>(.*?)<'),
    )
    for relative_url in house_detail_url_list:
        detail_url = 'http://www.yzfdc.cn/' + relative_url
        try:
            house = House(co_index)
            time.sleep(3)
            page = self.s.get(detail_url, headers=self.headers).text
            for attr_name, pattern in field_patterns:
                # A missing field makes re.search return None, so .group
                # raises and the record is skipped by the handler below.
                setattr(house, attr_name,
                        re.search(pattern, page, flags).group(1))
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, detail_url), e)
def get_build_info(self, build_url_list, bu_pre_sale_list, co_name, co_id):
    """Store every building of a community together with its houses.

    ``build_url_list`` and ``bu_pre_sale_list`` are read in parallel by
    index.  For each building the page is fetched (GBK), a ``Building`` row
    is saved, and then every house link found on that page is fetched and
    saved as a ``House`` row that inherits the building's id and number.

    Errors are reported per item: a failing house prints a house-level
    message and the remaining houses continue; a failing building prints a
    building-level message and the remaining buildings continue.
    """
    flags = re.S | re.M
    base_url = 'http://221.2.144.162:8090/'
    for build_idx, build_path in enumerate(build_url_list):
        try:
            build = Building(co_index)
            build.co_id = co_id
            build.co_name = co_name
            # Indexed lookup (not zip) so a shorter pre-sale list still
            # surfaces as an error for the affected building.
            build.bu_pre_sale = bu_pre_sale_list[build_idx]
            # Raw string: "\d" is an invalid escape in a normal literal.
            build.bu_id = re.search(r'lh=(\d+)', build_path).group(1)
            build_url = base_url + build_path
            response = requests.get(build_url, headers=self.headers)
            html = response.content.decode('gbk')
            build.bu_num = re.findall('<font color=white.*?><b>(.*?)<', html, flags)[0]
            build.bu_address = re.findall('坐落位置:</b>(.*?)<', html, flags)[0]
            build.insert_db()
            ho_url_list = re.findall('background-.*?href=(.*?) ', html, flags)
            ho_name_list = re.findall('background-color.*?<a.*?>(.*?)<', html, flags)
            # Separate index name: the original reused the outer loop
            # variable here.
            for house_idx, house_path in enumerate(ho_url_list):
                try:
                    house = House(co_index)
                    house_url = base_url + house_path
                    result = requests.get(
                        house_url, headers=self.headers).content.decode('gbk')
                    house.bu_id = build.bu_id
                    house.co_id = co_id
                    house.ho_type = re.findall(
                        '用 途:.*?<td.*?>(.*?)<', result, flags)[0]
                    house.ho_build_size = re.findall(
                        '建筑面积:.*?<td>(.*?)<', result, flags)[0]
                    house.bu_num = build.bu_num
                    house.co_name = co_name
                    house.ho_name = ho_name_list[house_idx]
                    house.insert_db()
                except Exception as e:
                    print("co_index={},房屋信息错误".format(co_index), e)
        except Exception as e:
            print("co_index={},楼栋信息错误".format(co_index), e)
def get_house_info(self, house_url_list):
    """Queue one regex-driven scrape per house link.

    Each entry of ``house_url_list`` is appended to the
    ``http://www.ndjsj.gov.cn/House/`` prefix.  A ``House`` object is
    filled with regex *rules* (not values); its ``to_dict()`` result is
    handed to ``ProducerListUrl`` as ``analyzer_rules_dict`` with
    ``analyzer_type='regex'``, and ``get_details()`` is called on it.

    Any exception raised for one entry is printed together with the URL
    being processed, and the loop continues with the next entry.
    """
    for i in house_url_list:
        # Bind the URL before the try block so the handler below never
        # reads an unbound name or the previous iteration's URL.
        house_url = i
        try:
            house = House(co_index)
            house_url = 'http://www.ndjsj.gov.cn/House/' + i
            # Each attribute holds the regex rule used to extract that field.
            house.bu_num = '幢 号:.*?<td.*?>(.*?)<'
            house.ho_name = '房 号:.*?<td.*?>(.*?)<'
            house.co_name = '项目名称:.*?<td.*?>(.*?)<'
            house.ho_build_size = '建筑面积:.*?<td.*?>(.*?)<'
            house.ho_true_size = '套内面积:.*?<td.*?>(.*?)<'
            house.ho_share_size = '分摊面积:.*?<td.*?>(.*?)<'
            house.ho_type = '房屋用途:.*?<td.*?>(.*?)<'
            house.ho_floor = '所 在 层:.*?<td.*?>(.*?)<'
            house.ho_room_type = '房屋户型:.*?<td.*?>(.*?)<'
            p = ProducerListUrl(page_url=house_url,
                                request_type='get', encode='utf-8',
                                analyzer_rules_dict=house.to_dict(),
                                analyzer_type='regex',
                                headers=self.headers)
            p.get_details()
        except Exception as e:
            print('宁德房号错误,url={}'.format(house_url), e)
def get_house_info(self, code, co_name):
    """Request a building's house table and store one ``House`` per cell.

    ``code[0]`` is embedded as the first ``<item>`` of an XML payload that
    is POSTed to the ``ExeFunCommon.aspx`` endpoint; ``code[1]`` is used as
    the building number of every resulting house.  Each ``title='...'``
    attribute in the response is parsed for room number, usage, layout and
    total area, and the resulting ``House`` is saved via ``insert_db()``.

    Any exception raised for one title is printed and the loop continues.
    """
    endpoint = 'http://house.bffdc.gov.cn/Common/Agents/ExeFunCommon.aspx?'
    payload_head = (
        '<?xml version="1.0" encoding="utf-8" standalone="yes"?>\r\n'
        '<param funname="SouthDigital.Wsba.CBuildTableEx.GetBuildHTMLEx">\r\n'
        '<item>'
    )
    payload_tail = (
        '</item>\r\n'
        '<item>1</item>\r\n'
        '<item>1</item>\r\n'
        '<item>55</item>\r\n'
        '<item>840</item>\r\n'
        '<item>g_oBuildTable</item>\r\n'
        '<item>false</item>\r\n'
        '<item> 1=1</item>\r\n'
        '</param>\r\n'
    )
    payload = payload_head + code[0] + payload_tail
    request_headers = {'Content-Type': "text/xml"}
    reply = requests.post(url=endpoint, data=payload, headers=request_headers)
    titles = re.findall("title='(.*?)'", reply.text, re.S | re.M)
    # (House attribute, pattern) pairs; each value ends at the next CRLF.
    field_patterns = (
        ('ho_name', '房号:(.*?)\r\n'),
        ('ho_type', '用途:(.*?)\r\n'),
        ('ho_room_type', '户型:(.*?)\r\n'),
        ('ho_build_size', '总面积:(.*?)\r\n'),
    )
    for title in titles:
        try:
            house = House(co_index)
            house.bu_num = code[1]
            for attr_name, pattern in field_patterns:
                setattr(house, attr_name,
                        re.search(pattern, title).group(1))
            house.co_name = co_name
            house.insert_db()
        except Exception as e:
            print(e)
def get_build_url_list(self, url_list):
    """Walk listing pages and store communities, buildings and houses.

    For every listing URL the page is fetched (GBK) and split into project
    blocks.  Each block produces a ``Comm`` row, bumps and prints the
    module-level ``count``, and leads to the project page, whose building
    table rows each produce a ``Building`` row.  Every building page
    contains an iframe whose ``Default`` URL is rewritten to ``GetData`` to
    obtain the logic-building id; that id is used to fetch the room data,
    and each ``<ROOM_NUMBER>`` becomes a ``House`` row.

    Failures are swallowed at every level: an exception abandons only the
    current listing page, project, building or house, and processing
    continues with the next one.
    """
    global count
    flags = re.S | re.M
    for listing_url in url_list:
        try:
            listing_html = requests.get(listing_url).content.decode('gbk')
            for project_block in re.findall('项目名称.*?</dl>', listing_html, flags):
                try:
                    comm = Comm(self.co_index)
                    co_name = re.search('html">(.*?)</a>', project_block, flags).group(1)
                    comm.co_name = co_name
                    comm.co_address = re.search(
                        'class="address"(.*?)</dd>', project_block, flags).group(1)
                    comm.area = re.search(
                        '"city">(.*?)</dd>', project_block, flags).group(1)
                    comm.co_develops = re.search(
                        '"average">(.*?)</dd>', project_block, flags).group(1)
                    comm.insert_db()
                    count += 1
                    print(count)

                    project_path = re.search(
                        'a href="(.*?)">', project_block, flags).group(1)
                    project_url = self.url_source + project_path
                    project_html = requests.get(project_url).content.decode('gbk')
                    build_info_str = re.search(
                        '楼盘表</td>(.*?)合 计', project_html, flags).group(1)
                    for row in re.findall('<tr.*?</tr>', build_info_str, flags):
                        try:
                            building = Building(self.co_index)
                            building.co_name = co_name
                            building.bu_all_house = re.search(
                                'absmiddle" />(.*?)</a>', row, flags).group(1)
                            bu_num = re.search(
                                '="absmiddle" />(.*?)</a></strong></',
                                row, flags).group(1)
                            building.bu_num = bu_num
                            building.bu_build_size = re.search(
                                'td class="t_c">.*?td class="t_c">(.*?㎡)</td>',
                                row, flags).group(1)
                            building.insert_db()

                            building_path = re.search(
                                'a href="(.*?)"', row, flags).group(1)
                            building_url = self.url_source + building_path
                            building_html = requests.get(
                                building_url).content.decode('gbk')
                            # Parse the HTML to get the iframe form data.
                            house_url = self.url_source + re.search(
                                '<iframe.*?"(.*?)"', building_html, flags).group(1)
                            logic_house_url = house_url.replace('Default', 'GetData')
                            logic_house_html = requests.get(
                                url=logic_house_url).content.decode()
                            logic_id = re.search(
                                '<LOGICBUILDING_ID>(.*?)<',
                                logic_house_html, flags).group(1)
                            final_url = 'http://www.yingtanfdc.com/website/presale/home/HouseTableControl/GetData.aspx?LogicBuilding_ID=' + logic_id
                            final_html = requests.get(
                                url=final_url).content.decode('gbk')
                            room_numbers = re.findall(
                                '<ROOM_NUMBER>(.*?)</ROOM_NUMBER>',
                                final_html, flags)
                            for room_number in room_numbers:
                                try:
                                    house = House(self.co_index)
                                    # The whole room-data document is kept
                                    # on every house row.
                                    house.info = final_html
                                    house.ho_name = room_number
                                    house.co_name = co_name
                                    house.bu_num = bu_num
                                    house.insert_db()
                                except Exception:
                                    continue
                        except Exception:
                            continue
                except Exception:
                    continue
        except Exception:
            continue