def house_crawler(self, house_url, bu_num, co_id, bu_id):
    """Scrape one listing page and call ``insert_db()`` for each room found.

    Args:
        house_url: Path appended to ``self.url`` to build the page URL.
        bu_num: Passed to ``House`` as ``bu_num``.
        co_id: Passed to ``House`` as ``co_id``.
        bu_id: Passed to ``House`` as ``bu_id``.
    """
    ho = House(co_index, bu_num=bu_num, co_id=co_id, bu_id=bu_id)
    url = self.url + house_url
    page = requests.get(url, headers=self.headers).text

    ho_name = re.findall('室号:(.*?)户', page, re.S | re.M)  # room label, e.g. "3单元403"
    ho_floor = re.findall(r'(\d+)层', page)  # floor numbers
    ho_type = re.findall('房屋属性:(.*?)"', page, re.S | re.M)  # usage: residence / garage-storage
    ho_room_type = re.findall('户型:(.*?)所', page, re.S | re.M)  # layout
    ho_build_size = re.findall('建筑面积:(.*?)房', page, re.S | re.M)  # building area

    for floor in ho_floor:
        ho.ho_floor = floor
        try:
            # The original range ended at len(ho_name) + 1, so its last pass
            # always raised IndexError and relied on a bare except to stop.
            # NOTE(review): index 0 is still skipped, exactly as before;
            # confirm whether the first match really should be ignored.
            for index in range(1, len(ho_name)):
                ho.ho_name = ho_name[index]
                ho.ho_type = ho_type[index]
                ho.ho_room_type = ho_room_type[index]
                ho.ho_build_size = ho_build_size[index]
                ho.insert_db()
        except Exception as e:
            # A shorter sibling list or a failed insert abandons the rest of
            # this floor (as before) but is now reported instead of hidden.
            print('房号错误,co_index={},url={}'.format(co_index, url), e)
            continue
def get_house_info(self, house_id_list, bu_id, co_id):
    """Fetch each room page by code and call ``insert_db()`` on the parsed record.

    A failure on one room is printed and the loop moves on to the next.

    Args:
        house_id_list: Iterable of room codes appended to the RoomInfo URL.
        bu_id: Stored on each record as ``bu_id``.
        co_id: Stored on each record as ``co_id``.
    """
    # (attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('ho_name', 'id="ROOM_HH">(.*?)<'),
        ('ho_floor', 'id="ROOM_MYC">(.*?)<'),
        ('ho_type', 'id="ROOM_FWYT">(.*?)<'),
        ('ho_room_type', 'id="ROOM_HX">(.*?)<'),
        ('ho_build_size', 'id="ROOM_YCJZMJ">(.*?)<'),
        ('ho_true_size', 'id="ROOM_YCTNJZMJ">(.*?)<'),
        ('ho_share_size', 'id="ROOM_YCFTJZMJ">(.*?)<'),
    )
    for room_code in house_id_list:
        house_url = 'http://www.hbczfdc.com:4993/HPMS/RoomInfo.aspx?code=' + room_code
        try:
            house = House(co_index)
            html = requests.get(house_url, headers=self.headers).text
            house.bu_id = bu_id
            house.co_id = co_id
            for attr, pattern in field_patterns:
                setattr(house, attr, re.search(pattern, html, re.S | re.M).group(1))
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_build_info(self, url, response, co_id, bu_id):
    """Call ``insert_db()`` for every room in a JSON building response.

    ``response.text`` is parsed as a JSON sequence of room objects.  For each
    room the detail page is fetched to read the shared area and total price;
    rooms whose detail page cannot be fetched or parsed are printed and
    skipped.

    Args:
        url: Not used by this method.
        response: Object whose ``text`` attribute holds the JSON room list.
        co_id: Stored on the record as ``co_id``.
        bu_id: Stored on the record as ``bu_id``.
    """
    house = House(co_index)  # a single instance, refilled for every room
    for room in json.loads(response.text):
        room_no = room['roomno']  # room number
        usage = room['ghyt']  # usage
        inner_area = room['tnmj']  # predicted inner area
        floor = room['floorindex']  # floor
        build_area = room['jzmj']  # building area
        house.co_id = co_id
        house.bu_id = bu_id
        house_code = room["fwcode"]
        house.ho_name = room_no
        house.ho_type = usage
        house.ho_true_size = inner_area
        house.ho_floor = floor
        house.ho_build_size = build_area
        house_detail_url = "http://fsfc.fsjw.gov.cn/hpms_project/roomview.jhtml?id=" + str(house_code)
        try:
            detail = requests.get(house_detail_url, headers=self.headers)
            house.ho_share_size = re.search(
                '实测分摊面积.*?<td>(.*?)</td>', detail.text, re.S | re.M).group(1)
            house.ho_price = re.search(
                '总价.*?<td>(.*?)</td>', detail.text, re.S | re.M).group(1)
        except Exception as e:
            print("co_index={},房屋详情页{}请求失败!".format(co_index, house_detail_url))
            print(e)
        else:
            house.insert_db()
def house_info(self, ho_url, co_id, bu_id):
    """Follow every room link on a building page and insert the parsed rooms.

    The building page and each detail page are decoded as GBK.  A failure on
    one detail page is printed and the remaining links are still processed.

    Args:
        ho_url: Path appended to the site root to build the building page URL.
        co_id: Stored on each record as ``co_id``.
        bu_id: Stored on each record as ``bu_id``.
    """
    site_root = "http://222.77.178.63:7002/"
    url = site_root + ho_url
    # NOTE(review): the original had a bare ``url.rstrip('=')`` here.  Strings
    # are immutable, so that statement never changed the requested URL; it is
    # dropped to keep the request identical.  If trailing '=' really must be
    # stripped, assign the result back to ``url``.
    res = requests.get(url, headers=self.headers)
    res.encoding = 'gbk'
    html = etree.HTML(res.text)

    # (attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('ho_name', '室号.*?">(.*?)<'),
        ('ho_floor', '实际层.*?">(.*?)<'),
        ('ho_type', '房屋类型.*?">(.*?)<'),
        ('ho_build_size', '预测建筑面积.*?">(.*?)<'),
        ('ho_true_size', '预测套内面积.*?">(.*?)<'),
        ('ho_share_size', '预测分摊面积.*?">(.*?)<'),
        ('ho_price', '总价.*?">(.*?)<'),
    )
    for house_detail in html.xpath("//td/a[@target]/@href"):
        try:
            detail_res = requests.get(site_root + house_detail, headers=self.headers)
            detail_res.encoding = 'gbk'
            con = detail_res.text
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            for attr, pattern in field_patterns:
                setattr(ho, attr, re.search(pattern, con, re.S | re.M).group(1))
            ho.insert_db()
        except Exception as e:
            print('房屋信息错误{}'.format(e))
def get_house_info(self, house_url_list, bu_id, co_id):
    """Fetch each room URL and call ``insert_db()`` on the parsed record.

    A failure on one URL is printed and the loop continues.

    Args:
        house_url_list: Iterable of complete room page URLs.
        bu_id: Stored on each record as ``bu_id``.
        co_id: Stored on each record as ``co_id``.
    """
    # (attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('ho_name', '门牌号:.*?<td.*?>(.*?)<'),
        ('ho_floor', '所在层:.*?<td.*?>(.*?)<'),
        ('ho_type', '房屋性质:.*?<td.*?>(.*?)<'),
        ('ho_build_size', '预测建筑面积:.*?<td.*?>(.*?)<'),
        ('ho_true_size', '预测套内面积:.*?<td.*?>(.*?)<'),
        ('ho_share_size', '预测分摊面积:.*?<td.*?>(.*?)<'),
        ('co_address', '房屋坐落:.*?<td.*?>(.*?)<'),
    )
    for room_url in house_url_list:
        try:
            house = House(co_index)
            html = requests.get(room_url, headers=self.headers).text
            for attr, pattern in field_patterns:
                setattr(house, attr, re.search(pattern, html, re.S | re.M).group(1))
            house.bu_id = bu_id
            house.co_id = co_id
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, room_url), e)
def get_house_info(self, house_url_list, co_name, bu_num):
    """Fetch each room page and call ``insert_db()`` on the parsed record.

    Every field is the first ``re.findall`` hit for its label id; a missing
    hit (or any other error) is printed and the room is skipped.

    Args:
        house_url_list: Iterable of paths appended to the ``pubinfo/`` base URL.
        co_name: Stored on each record as ``co_name``.
        bu_num: Stored on each record as ``bu_num``.
    """
    # (attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('ho_floor', 'HouseInfo1_lblFwlc">(.*?)<'),
        ('ho_name', 'HouseInfo1_lblFwfh">(.*?)<'),
        ('ho_type', 'HouseInfo1_lblFwlx">(.*?)<'),
        ('ho_room_type', 'HouseInfo1_lblFwhx">(.*?)<'),
        ('ho_build_size', 'HouseInfo1_lblycfwjzmj">(.*?)<'),
        ('ho_true_size', 'HouseInfo1_lblycfwtnmj">(.*?)<'),
        ('ho_share_size', 'HouseInfo1_lblycfwftmj">(.*?)<'),
        ('orientation', 'HouseInfo1_lblCx">(.*?)<'),
    )
    for path in house_url_list:
        try:
            house = House(co_index)
            house.co_name = co_name
            house.bu_num = bu_num
            house_url = 'http://www.sxczfdc.com/pubinfo/' + path
            html = requests.get(house_url, headers=self.headers).text
            for attr, pattern in field_patterns:
                setattr(house, attr, re.findall(pattern, html, re.S | re.M)[0])
            house.insert_db()
        except Exception as e:
            print(e)
def house_parse(self, house_url, co_id, bu_id):
    """Parse a building page and insert one record per listed room.

    The listing page supplies name, layout, area, price and a detail link per
    room; the detail page (decoded as gb2312) supplies the floor.  A failure
    on one room is printed and the next room is processed.

    Args:
        house_url: Path appended to the site root to build the listing URL.
        co_id: Stored on the record as ``co_id``.
        bu_id: Stored on the record as ``bu_id``.
    """
    ho = House(co_index)  # one instance, refilled for every room
    site_root = "http://spf.tlfdc.cn/"
    url = site_root + house_url
    con = requests.get(url, headers=self.headers).text

    ho_name = re.findall('室号:(.*?)套', con, re.S | re.M)
    ho_room_type = re.findall('套型:(.*?)建', con, re.S | re.M)
    ho_build_size = re.findall('建筑面积:(.*?)参', con, re.S | re.M)
    ho_price = re.findall('价格:(.*?)元', con, re.S | re.M)
    ho_detail = re.findall(r'href="(show.*?\?id=\d+&id2=\d+&prjid=\d+)"', con, re.S | re.M)

    for index in range(len(ho_name)):
        try:
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = ho_name[index]
            ho.ho_room_type = ho_room_type[index]
            ho.ho_build_size = ho_build_size[index]
            ho.ho_price = ho_price[index]
            ho_detail_url = site_root + ho_detail[index]
            detail = requests.get(ho_detail_url, headers=self.headers).content.decode('gb2312')
            ho.ho_floor = re.findall('楼层.*?">(.*?)</td>', detail, re.S | re.M)[0].strip()
            ho.insert_db()
        except Exception as e:
            # The original used a bare ``except:`` yet printed ``e``, which
            # was never bound; the handler itself raised NameError and
            # aborted the whole loop.
            print('房号错误,co_index={},url={}'.format(co_index, url), e)
            continue
def get_house_info(self, bu_id, co_id):
    """Fetch the XML room table of a building and insert every room.

    Two requests are made: the first resolves ``LOGICBUILDING_ID`` for the
    building, the second lists its ``T_HOUSE`` elements.  A failure on one
    room is printed and skipped; a failure of either request is printed and
    ends the method.

    Args:
        bu_id: Building id appended to the first URL and stored on each record.
        co_id: Stored on each record as ``co_id``.
    """
    endpoint = 'http://www.fzfgj.cn/website/presale/home/HouseTableControl/GetData.aspx'
    url = endpoint + '?Building_ID=' + bu_id
    try:
        response = requests.get(url=url, headers=self.headers)
        tree = etree.XML(response.text)
        logo = tree.xpath('//LOGICBUILDING_ID/text()')[0]
        url_2 = endpoint + '?LogicBuilding_ID=' + logo
        result = requests.get(url_2, headers=self.headers)
        tree_2 = etree.XML(result.text)
        for node in tree_2.xpath('T_HOUSE'):
            try:
                # NOTE(review): 11 is hard-coded here while the messages
                # below use ``co_index``; confirm they are the same value.
                house = House(11)
                ho_name = node.xpath('ROOM_NUMBER/text()')[0]
                ho_build_size = node.xpath('BUILD_AREA/text()')[0]
                ho_true_size = node.xpath('BUILD_AREA_INSIDE/text()')[0]
                ho_share_size = node.xpath('BUILD_AREA_SHARE/text()')[0]
                ho_floor = node.xpath('FLOOR_REALRIGHT/text()')[0]
                ho_type = node.xpath('USE_FACT/text()')[0]
                house.co_id = co_id
                house.bu_id = bu_id
                house.ho_build_size = ho_build_size
                house.ho_true_size = ho_true_size
                house.ho_share_size = ho_share_size
                house.ho_floor = ho_floor
                house.ho_name = ho_name
                house.ho_type = ho_type
                house.insert_db()
            except Exception as e:
                print('房号错误,co_index={},url={}'.format(co_index, url_2), e)
    except Exception as e:
        # Was ``except BaseException``, which also swallowed
        # KeyboardInterrupt and SystemExit.
        print('房号错误,co_index={},url={}'.format(co_index, url), e)
def get_house_info(self, house_url_list, bu_id, co_id):
    """Fetch each room page (GBK) and call ``insert_db()`` on the parsed record.

    A failure on one room is printed and the loop continues.

    Args:
        house_url_list: Iterable of paths appended to the ``House/`` base URL.
        bu_id: Stored on each record as ``bu_id``.
        co_id: Stored on each record as ``co_id``.
    """
    # (attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('bu_num', '幢 号:.*?<td>(.*?)<'),
        ('ho_name', '房 号:.*?<td>(.*?)<'),
        ('co_name', '项目名称:.*?<td>(.*?)<'),
        ('ho_build_size', '建筑面积:.*?<td>(.*?)<'),
        ('ho_true_size', '套内面积:.*?<td>(.*?)<'),
        ('ho_share_size', '分摊面积:.*?<td>(.*?)<'),
        ('ho_floor', '所 在 层:.*?<td>(.*?)<'),
    )
    for path in house_url_list:
        try:
            house = House(co_index)
            house_url = 'http://www.fjnpfdc.com/House/' + path
            house_con = requests.get(house_url, headers=self.headers).content.decode('gbk')
            house.bu_id = bu_id
            house.co_id = co_id
            for attr, pattern in field_patterns:
                setattr(house, attr, re.search(pattern, house_con, re.S | re.M).group(1))
            house.insert_db()
        except Exception as e:
            print("co_index={},房屋{}错误".format(co_index, path), e)
def get_house_detail(self, house_id, bu_id):
    """Fetch one room's detail page and call ``insert_db()`` on the record.

    Any failure while fetching, parsing or inserting is printed together
    with the detail URL.

    Args:
        house_id: Room id appended to the detail URL.
        bu_id: Stored on the record as ``bu_id``.
    """
    # Built before the ``try`` so the handler can always reference it; in the
    # original it was bound inside the ``try`` after ``House(...)``, so an
    # early failure made the handler itself raise UnboundLocalError.
    detail_url = 'http://222.223.160.199:8088/website/Hutu?id=' + house_id
    # (attribute, pattern) pairs, applied in this order.  The original
    # searched for the floor twice with the same pattern; once is enough.
    field_patterns = (
        ('ho_floor', '层号.*?value="(.*?)"'),
        ('ho_build_size', '总面积.*?value="(.*?)"'),
        ('ho_share_size', '分摊面积.*?value="(.*?)"'),
        ('ho_true_size', '套内面积.*?value="(.*?)"'),
        ('ho_type', '房屋用途.*?value="(.*?)"'),
    )
    try:
        house = House(co_index)
        html = requests.get(detail_url, headers=self.headers).text
        for attr, pattern in field_patterns:
            setattr(house, attr, re.search(pattern, html, re.S | re.M).group(1))
        house.bu_id = bu_id
        house.insert_db()
    except Exception as e:
        print('请求错误,url={}'.format(detail_url), e)
def house_info(self, co_id, bu_id, house_url_list):
    """Fetch each room page through ``Proxy_contact`` and insert the record.

    The body returned by ``contact()`` is decoded as GBK.  A room whose page
    cannot be fetched or parsed is logged and skipped.

    Args:
        co_id: Stored on each record as ``co_id``.
        bu_id: Stored on each record as ``bu_id``.
        house_url_list: Iterable of paths appended to the ``2016/spf/`` base URL.
    """
    # (attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('ho_name', '房号.*?;">(.*?)</td'),
        ('ho_price', '价格.*?<td>(.*?)元'),
        ('ho_floor', '楼层.*?;">(.*?)</td'),
        ('ho_build_size', '建筑面积.*?<td>(.*?)m'),
        ('ho_true_size', '套内面积.*?<td>(.*?)m'),
        ('ho_share_size', '分摊面积.*?<td>(.*?)m'),
        ('ho_type', '房屋类型.*?<td>(.*?)</td'),
    )
    for path in house_url_list:
        house_url = "http://www.njhouse.com.cn/2016/spf/" + path
        try:
            fetcher = Proxy_contact(app_name="nanjing", method='get', url=house_url, headers=self.headers)
            page = fetcher.contact()
            page = page.decode('gbk')
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            for attr, pattern in field_patterns:
                setattr(ho, attr, re.search(pattern, page, re.S | re.M).group(1))
        except Exception as e:
            log.error("房屋详情页错误{}".format(e))
        else:
            # Only reached when every field above was parsed.
            ho.insert_db()
def get_house_info(self, co_id, bu_id, id):
    """Fetch a building's room list (GBK) and insert every room it links to.

    A room whose page cannot be fetched, parsed or inserted is printed and
    skipped so the remaining rooms are still processed.

    Args:
        co_id: Stored on each record as ``co_id``.
        bu_id: Stored on each record as ``bu_id``.
        id: Value appended (via ``str``) to the room-list URL.  The name
            shadows the builtin but is kept for callers passing it by keyword.
    """
    house_list_url = "http://xx.yyfdcw.com/hetong/fdc_xxdxx.asp?id=" + str(id)
    res = requests.get(house_list_url, headers=self.headers)
    con = res.content.decode('gbk')
    house_list = re.findall(r"onClick=.*?open\('(.*?)',", con, re.S | re.M)

    # (attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('ho_name', '室号.*?fafa>(.*?)</TD'),
        ('ho_floor', '实际层.*?fafa>(.*?)</TD'),
        ('ho_build_size', '建筑面积.*?fafa>(.*?)</TD'),
        ('ho_true_size', '套内面积.*?fafa>(.*?)</TD'),
        ('ho_share_size', '分摊面积.*?fafa>(.*?)</TD'),
        ('ho_price', '价格.*?fafa>(.*?)</TD'),
        ('ho_type', '用途.*?fafa>(.*?)</TD'),
    )
    for house_ in house_list:
        # The original ``try`` wrapped only the URL concatenation, leaving
        # the request and every regex unguarded; one bad page therefore
        # aborted the whole building.  The guard now covers the real work.
        try:
            house_url = "http://xx.yyfdcw.com/hetong/" + house_
            ho_con = requests.get(house_url, headers=self.headers).content.decode('gbk')
            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            for attr, pattern in field_patterns:
                setattr(ho, attr, re.search(pattern, ho_con, re.S | re.M).group(1))
            ho.insert_db()
        except Exception as e:
            print("co_index={},房屋信息错误".format(co_index), e)
            continue
def get_house_info(self, co_id, bu_id):
    """POST for a building's room table and insert every room found.

    The response is split into per-room fragments with a regex and each
    fragment's fields are extracted.  Any failure is printed together with
    the URL and payload, and ends the method.

    Args:
        co_id: Stored on each record as ``co_id``.
        bu_id: Appended to the POST payload and stored on each record.
    """
    house_url = "http://202.103.219.149:7000/LeadingEstate/buildingtable/ShowNewBuildingTable.aspx"
    payload = "IsShowHouse=1&BuidID=" + bu_id
    headers = {'Content-Type': "application/x-www-form-urlencoded"}
    # (attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('ho_name', '"YCHouseNo":"(.*?)"'),
        ('ho_floor', '"ActFLoor":"(.*?)"'),
        ('ho_build_size', '"YCJZArea":"(.*?)"'),
        ('ho_true_size', '"YCTNJZArea":"(.*?)"'),
        ('ho_share_size', '"YCFTJZArea":"(.*?)"'),
    )
    try:
        response = requests.request("POST", house_url, data=payload, headers=headers)
        html = response.text
        for fragment in re.findall(r'HouseID.*?\}', html, re.S | re.M):
            house = House(co_index)
            house.bu_id = bu_id
            house.co_id = co_id
            for attr, pattern in field_patterns:
                setattr(house, attr, re.search(pattern, fragment, re.S | re.M).group(1))
            house.insert_db()
    except Exception as e:
        # The exception was caught but never reported, so request, parse and
        # insert failures were indistinguishable; it is now printed too.
        print('请求错误,url={},data={}'.format(house_url, payload), e)
def get_build_info(self, comm_url_list):
    """Insert the building, then its rooms, for every entry in the list.

    Each entry must contain two ``+<digits>+`` groups; the first is used as
    ``sid`` and the second as ``pid``.  The building page is parsed into a
    ``Building`` record, then the room list for ``sid`` is fetched and each
    ``<Result>`` element becomes a ``House`` record.

    Args:
        comm_url_list: Iterable of strings carrying the ``sid``/``pid`` pair.
    """
    site_root = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/'
    for entry in comm_url_list:
        # Reset per entry.  The original left these bound from the previous
        # iteration (or unbound on the first), so a failure here either
        # crashed the handler with NameError or attached this entry's rooms
        # to the previous building.
        build = None
        build_url = entry  # fallback for the error message below
        try:
            ids = re.findall(r'\+(\d+)\+', entry)
            sid, pid = ids[0], ids[1]
            build_url = site_root + 'bldg_query.aspx?pid=' + pid + '&sid=' + sid
            html = requests.get(build_url).text
            build = Building(co_index)
            build.bu_id = pid
            # NOTE(review): bu_num and bu_address use the same pattern, as in
            # the original; confirm that is intended.
            build.bu_num = re.search('楼栋座落.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            build.bu_address = re.search('楼栋座落.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            build.bu_pre_sale = re.search('预售证号.*?">(.*?) ', html, re.S | re.M).group(1)
            build.bu_pre_sale_date = re.search('时间.*?">(.*?) ', html, re.S | re.M).group(1)
            build.bu_all_house = re.search('dM.*?">(.*?) ', html, re.S | re.M).group(1)
            build.insert_db()
        except Exception as e:
            print('co_index={}, 楼栋错误,url={}'.format(co_index, build_url), e)
        if build is None:
            # Ids or the building page were unavailable, so there is no
            # building context to attach rooms to.
            continue

        # The literal previously read 'WWW_LPB_001¶ms=': '&para' had been
        # turned into the pilcrow character, destroying the 'params' query
        # parameter.
        house_url = site_root + 'proxp.aspx?key=WWW_LPB_001&params=' + sid
        html_ = requests.get(house_url).text
        for house_info in re.findall('<Result.*?</Result>', html_, re.S | re.M):
            try:
                house = House(co_index)
                house.bu_id = build.bu_id
                house.bu_num = build.bu_num
                house.ho_name = re.search('<ONAME>(.*?)</ONAME>', house_info, re.S | re.M).group(1)
                house.ho_num = re.search('<OSEQ>(.*?)</OSEQ>', house_info, re.S | re.M).group(1)
                house.ho_build_size = re.search('<BAREA>(.*?)</BAREA>', house_info, re.S | re.M).group(1)
                house.ho_floor = re.search('<FORC>(.*?)</FORC>', house_info, re.S | re.M).group(1)
                house.ho_true_size = re.search('<PAREA>(.*?)</PAREA>', house_info, re.S | re.M).group(1)
                house.insert_db()
            except Exception as e:
                print('co_index={}, 房号错误'.format(co_index), e)
def room_crawler(self, room):
    """Parse a building's room table and insert one record per room cell.

    The community id and building id are taken from the two digit groups in
    ``room``.  A cell whose link is a single character carries its data in
    the anchor's ``wf`` attribute; any other link is fetched as a detail page.

    Args:
        room: Path appended to the site root to build the table URL.
    """
    house_url = "http://www.hzszjj.gov.cn" + room
    res = requests.get(house_url, )
    page = etree.HTML(res.text)
    floor_rows = page.xpath("//tr[@bgcolor='#fbf3e6']")
    for floor_row in floor_rows[1:-1]:  # first and last matching rows are skipped
        ho_floor = floor_row.xpath("./td[@align='center']/text()")[0]
        for cell in floor_row.xpath(".//tr/td[@height='40']"):
            # A fresh record per room.  The original reused one instance for
            # the whole page, and the two branches below set different field
            # sets, so values from an earlier room (shared/inner area, or
            # layout) leaked into later rooms.
            ho = House(co_index)
            ho.ho_floor = ho_floor  # floor
            id_num = re.search(r"(\d+)&\w+=(\d+)", room)
            ho.co_id = id_num.group(1)  # community id
            ho.bu_id = id_num.group(2)  # building id
            ho_url = cell.xpath("./a/@href")[0]
            if len(ho_url) == 1:
                ho_info = cell.xpath("./a/@wf")[0]
                ho.ho_name = cell.xpath("./a/text()")[0]
                info = re.search(
                    r":(.*?)<br>.*?:(.*?)<br>(.*?)<br><hr>.*?:(.*?)m.*?<br>.*?:(.*?)<br>.*?:(.*?)m",
                    ho_info)
                ho.ho_type = info.group(5)
                ho.ho_build_size = info.group(4)
                ho.ho_room_type = info.group(2)
            else:
                detail_url = "http://www.hzszjj.gov.cn/ts_web_dremis/web_house_dir/" + ho_url
                detail = etree.HTML(requests.get(detail_url).text)
                ho.ho_name = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_name']/text()")[0]
                ho.ho_type = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_type']/text()")[0]
                ho.ho_build_size = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_build_area']/text()")[0]
                ho.ho_share_size = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_share_area']/text()")[0]
                ho.ho_true_size = detail.xpath(
                    "//span[@id='ctl00_ContentPlaceHolder2_lb_house_inside_area']/text()")[0]
            ho.insert_db()
def get_house_info(self, bu_url, bu_id):
    """Insert every room of a building, pausing two seconds between pages.

    The ``qrykey`` query value is taken from ``bu_url`` to request the room
    table; each ``select_room`` code found there is fetched as a detail page.
    A page that cannot be fetched or parsed is printed and skipped.

    Args:
        bu_url: Building URL containing a ``qrykey=...&`` parameter.
        bu_id: Stored on each record as ``bu_id``.
    """
    qrykey = re.search('qrykey=(.*?)&', bu_url).group(1)
    table_url = 'http://old.newhouse.cnnbfdc.com/GetHouseTable.aspx?qrykey=' + qrykey
    table_html = requests.get(table_url, headers=self.headers).text
    room_codes = re.findall("onclick=select_room\('(.*?)'", table_html, re.S | re.M)

    # (attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('ho_name', '室号.*?">(.*?)</td>'),
        ('ho_floor', '楼层.*?">(.*?)</td>'),
        ('ho_room_type', '户型.*?">(.*?)</td>'),
        ('ho_type', '用途.*?">(.*?)</td>'),
        ('ho_build_size', '预测建筑面积.*?">(.*?)</td>'),
        ('ho_true_size', '预测套内面积.*?">(.*?)</td>'),
        ('ho_share_size', '预测分摊面积.*?">(.*?)</td>'),
    )
    for code in room_codes:
        house_detail_url = 'http://old.newhouse.cnnbfdc.com/openRoomData.aspx?roomId=' + str(code)
        try:
            res = requests.get(house_detail_url, headers=self.headers)
        except Exception as e:
            print("{}城市无法访问房屋页面{}".format(city, house_detail_url), e)
            continue
        time.sleep(2)  # fixed pause after every successful request
        content = res.text
        ho = House(co_index)
        ho.bu_id = bu_id
        try:
            for attr, pattern in field_patterns:
                setattr(ho, attr, re.search(pattern, content, re.S | re.M).group(1))
            ho.insert_db()
        except Exception as e:
            print("{}房号错误,请求频繁,当前页面{}未提取".format(city, house_detail_url), e)
            continue
def get_house_info(self, house_url, bu_id, co_id):
    """POST to one room URL (GBK response) and insert the parsed record.

    Any failure is printed together with the URL.

    Args:
        house_url: Complete room page URL.
        bu_id: Stored on the record as ``bu_id``.
        co_id: Stored on the record as ``co_id``.
    """
    # (attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('ho_floor', '所在楼层:.*?<td>(.*?)<'),
        ('ho_name', '房号:.*?<td>(.*?)<'),
        ('ho_build_size', '预测总面积:.*?<td>(.*?)<'),
        ('ho_true_size', '预测套内面积.*?<td>(.*?)<'),
        ('ho_share_size', '预测公摊面积.*?<td>(.*?)<'),
    )
    try:
        house = House(co_index)
        house.bu_id = bu_id
        house.co_id = co_id
        html = requests.post(house_url, headers=self.headers).content.decode('gbk')
        for attr, pattern in field_patterns:
            setattr(house, attr, re.search(pattern, html, re.M | re.S).group(1))
        house.insert_db()
    except Exception as e:
        print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def comm(self, id):
    """Insert the building identified by ``id`` and then each of its rooms.

    Three JSON endpoints under ``self.start_url`` are queried for the
    building (summary, details, room groups); a fourth is queried once per
    room.  A failure on one room is printed and the next room is processed.

    Args:
        id: Block/panel number appended (via ``str``) to the API URLs and
            stored as ``bu_id`` on the building and on every room.
    """
    bu = Building(co_index)
    api = self.start_url + "/api/buildInfos/"
    house_url = api + "getHouseInfosByPannelNumber?pannelNumber=" + str(id)
    comm_url = api + "getHomePageBuildingInfo?blockNumber=" + str(id)
    comm_detail_url = api + "getDetailsBuildingInfo?blockNumber=" + str(id)

    comm_res = requests.get(comm_url)
    comm_detail_res = requests.get(comm_detail_url)
    house_res = requests.get(house_url)
    summary = json.loads(comm_res.text)
    details = json.loads(comm_detail_res.text)
    rooms_payload = json.loads(house_res.text)

    bu.bu_id = id
    bu.bu_num = summary["data"]["nameBuildings"]
    bu.area = details['data']['houseingArea']
    bu.bu_address = summary["data"]["houseaddress"]
    bu.bu_pre_sale = details["data"]["yszh"]
    bu.bu_type = summary["data"]["propertycategory"]
    bu.bo_develops = summary["data"]["companyName"]
    bu.insert_db()

    for group in rooms_payload["data"]:
        ho = House(co_index)  # shared by all rooms of this group
        entries = group["data"]
        if len(entries) == 0:
            continue
        for entry in entries:
            try:
                room_id = entry["houseNumber"]
                room_url = api + "getHouseInfoByHouseNumber?houseNumber=" + str(room_id)
                res = requests.get(room_url, headers=self.headers)
                room_info = json.loads(res.text)
                ho.bu_id = id
                ho.ho_name = room_info["data"]["houseNo"]
                ho.ho_build_size = room_info["data"]["buildArea"]
                ho.ho_true_size = room_info["data"]["jacketArea"]
                ho.ho_share_size = room_info["data"]["apportionedArea"]
                ho.ho_floor = room_info["data"]["nominalLevel"]
                ho.insert_db()
            except Exception as e:
                print(e)
def get_house_info(self, co_id, bu_id):
    """POST for a building's rooms (JSON) and insert one record per row.

    Each field is read from the row through ``self.dict_get``.

    Args:
        co_id: Appended to the payload as ``activityId`` and stored on each record.
        bu_id: Appended to the payload as ``buildingGuid`` and stored on each record.
    """
    house_url = 'http://www.yanjifc.com/jdi'
    payload = "page=1&rows=10000&module=jtsActHouses&buildingGuid=" + bu_id + "&activityId=" + co_id
    rows = requests.post(house_url, data=payload, headers=self.headers).json()['ROWS']['ROW']

    # (attribute, row key) pairs, applied in this order.
    field_keys = (
        ('ho_build_size', 'BUILDING_AREA'),
        ('ho_floor', 'UNIT'),
        ('ho_type', 'PLANNING_USAGE'),
        ('ho_true_size', 'INNER_AREA'),
        ('co_build_structural', 'STRUCTURE'),
        ('ho_name', 'PART'),
    )
    for row in rows:
        house = House(co_index)
        for attr, key in field_keys:
            setattr(house, attr, self.dict_get(row, key))
        house.bu_id = bu_id
        house.co_id = co_id
        house.insert_db()
def get_house_info(self, house_url, bu_id, co_id):
    """Parse a building's room table (GBK) and insert every room cell.

    Each cell yields an area and a name.  When the name still contains an
    ``<a`` tag, its link is followed and the detail page overrides the name
    and area and supplies the remaining fields.  A failure on one cell is
    printed and the next cell is processed.

    Args:
        house_url: Path appended to the site root to build the table URL.
        bu_id: Stored on each record as ``bu_id``.
        co_id: Stored on each record as ``co_id``.
    """
    site_root = 'http://www.fangdi.com.cn/'
    ho_url = site_root + house_url
    html = requests.get(ho_url, headers=self.headers).content.decode('gbk')
    house_html = re.search('室号 <.*?</table>.*?</table>', html, re.S | re.M).group()

    # (attribute, pattern) pairs for the detail page, applied in this order.
    detail_patterns = (
        ('ho_floor', '实际层.*?<TD.*?>(.*?)<'),
        ('ho_name', '室号.*?<TD.*?>(.*?)<'),
        ('ho_type', '房屋类型.*?<TD.*?>(.*?)<'),
        ('ho_room_type', '房型.*?<TD.*?>(.*?)<'),
        ('ho_build_size', '实测建筑面积.*?<TD.*?>(.*?)<'),
        ('ho_true_size', '实测套内面积.*?<TD.*?>(.*?)<'),
        ('ho_share_size', '实测分摊面积.*?<TD.*?>(.*?)<'),
    )
    for cell in re.findall('title.*?</td>', house_html, re.S | re.M):
        try:
            house = House(co_index)
            house.ho_build_size = re.search('实测面积:(.*?)>', cell, re.S | re.M).group(1)
            house.ho_name = re.search('实测面积.*?>(.*?)<br>', cell, re.S | re.M).group(1).strip()
            house.bu_id = bu_id
            house.co_id = co_id
            if '<a' in house.ho_name:
                link = re.search('href="(.*?)"', house.ho_name, re.S | re.M).group(1)
                house_detail_url = site_root + link
                html_str = requests.get(house_detail_url, headers=self.headers).content.decode('gbk')
                for attr, pattern in detail_patterns:
                    setattr(house, attr, re.search(pattern, html_str, re.S | re.M).group(1))
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, ho_url), e)
def house_info(self, house_list, bu_id, co_id):
    """Fetch each room page and call ``insert_db()`` on the parsed record.

    No errors are caught here: a failed request or a pattern that does not
    match propagates to the caller.

    Args:
        house_list: Iterable of paths appended to the ``bol/`` base URL; each
            must contain ``id=<digits>``.
        bu_id: Stored on the record as ``bu_id``.
        co_id: Stored on the record as ``co_id``.
    """
    ho = House(co_index)  # one instance, refilled for every room
    for house_url in house_list:
        url = "http://ris.szpl.gov.cn/bol/" + house_url
        res = requests.get(url, headers=self.headers)
        ho.ho_num = re.search(r'id=(\d+)', house_url).group(1)
        con = res.text
        ho.bu_num = re.search('情况.*?">(.*?)&', con).group(1)
        ho.bu_id = bu_id
        ho.co_id = co_id
        ho.ho_floor = re.search(r'楼层.*?">(\d+)&', con).group(1)
        # The original assigned this room-number match to ``ho_num`` as well,
        # overwriting the id taken from the URL a few lines earlier and never
        # setting ``ho_name``.
        # NOTE(review): confirm that ho_name is the intended target.
        ho.ho_name = re.search(r'房号.*?">(\d+)&', con).group(1)
        ho.ho_type = re.search(r'用途.*?">(\d+)&', con).group(1)
        ho.ho_room_type = re.search(r'户型.*?">(\d+)&', con).group(1)
        ho.ho_build_size = re.search(r'建筑面积<.*?">(\d+.\d+)平方米', con).group(1)
        ho.ho_true_size = re.search(r'户内面积<.*?">(\d+.\d+)平方米', con).group(1)
        ho.insert_db()
def house_info(self, bu_id, bu_url, co_id):
    """Fetch a building table through ``Proxy_contact`` and insert its rooms.

    The body is decoded as gb18030.  Every white-background row supplies a
    floor (its third ``td`` text node); each highlighted cell in that row is
    inserted as a room.  A failure on one row is logged and the next row is
    processed.

    Args:
        bu_id: Stored on each record as ``bu_id``.
        bu_url: URL of the building table page.
        co_id: Stored on each record as ``co_id``.
    """
    fetcher = Proxy_contact(app_name='wuhan', method='get', url=bu_url, headers=self.headers)
    raw = fetcher.contact()
    tree = etree.HTML(raw.decode('gb18030'))
    for row in tree.xpath("//tr[@bgcolor='#FFFFFF']"):
        try:
            ho = House(co_index)
            ho.bu_id = bu_id
            ho.co_id = co_id
            ho.ho_floor = row.xpath("./td/text()")[2]
            # The same record is re-inserted once per room name in the row.
            for cell in row.xpath("./td[@bgcolor='#CCFFFF']"):
                ho.ho_name = cell.xpath(".//a/text()")[0]
                ho.insert_db()
        except Exception as e:
            log.error('房号错误{}'.format(e))
def get_house_info(self, house_url, co_id, bu_id):
    """Insert one record per room embedded as JSON in a building page.

    The room list is the argument of the ``eval(...)`` call that follows
    ``var houselist =`` in the page source.  A failure on one room is printed
    and the next room is processed.

    Args:
        house_url: Complete URL of the building page.
        co_id: Stored on each record as ``co_id``.
        bu_id: Stored on each record as ``bu_id``.
    """
    html = requests.get(house_url).text
    info = re.search('var houselist =.*?eval\((.*?)\);', html, re.S | re.M).group(1)

    # (attribute, JSON key) pairs, applied in this order.
    field_keys = (
        ('ho_name', 'HouseName'),
        ('unit', 'UnitName'),
        ('co_build_structural', 'StruTypeName'),
        ('ho_build_size', 'PreBuildArea'),
        ('ho_true_size', 'PreInnerArea'),
        ('ho_share_size', 'PreApportionArea'),
        ('ho_floor', 'FloorName'),
        ('ho_type', 'LayoutTypeName'),
    )
    for data in json.loads(info):
        try:
            house = House(co_index)
            for attr, key in field_keys:
                setattr(house, attr, data[key])
            house.co_id = co_id
            house.bu_id = bu_id
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_house_info(self, house_url_list, bu_id, co_id):
    """Fetch each room page and call ``insert_db()`` on the parsed record.

    A failure on one room is printed and the loop continues.

    Args:
        house_url_list: Iterable of paths appended to the site root.
        bu_id: Stored on each record as ``bu_id``.
        co_id: Stored on each record as ``co_id``.
    """
    # (attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('ho_name', '房 号:.*?<td.*?>(.*?)<'),
        ('ho_build_size', '建筑面积:.*?<td.*?>(.*?)<'),
        ('ho_true_size', '套内面积:.*?<td.*?>(.*?)<'),
        ('ho_share_size', '分摊面积:.*?<td.*?>(.*?)<'),
        ('ho_floor', '所 在 层:.*?<td.*?>(.*?)<'),
    )
    for path in house_url_list:
        house_url = 'http://www.fjlyfdc.com.cn/' + path
        try:
            html = requests.get(house_url, headers=self.headers).text
            house = House(co_index)
            house.bu_id = bu_id
            house.co_id = co_id
            for attr, pattern in field_patterns:
                setattr(house, attr, re.search(pattern, html, re.S | re.M).group(1))
            house.insert_db()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_house_info(self, house_url_list):
    """Insert one record per centred table row on each listed page.

    Fields are taken by position from the row's ``td`` text nodes; the
    building id is the ``ID=<digits>`` value in the page URL.  A failure on
    one row is printed and the next row is processed.

    Args:
        house_url_list: Iterable of complete page URLs.
    """
    # (attribute, text-node index) pairs, applied in this order.
    field_positions = (
        ('ho_name', 1),
        ('ho_floor', 0),
        ('ho_build_size', 3),
        ('ho_true_size', 4),
        ('ho_share_size', 5),
        ('ho_room_type', 2),
        ('ho_price', -1),
        ('orientation', -2),
    )
    for url in house_url_list:
        tree = etree.HTML(requests.get(url).text)
        for row in tree.xpath("//tr[@align='center']"):
            try:
                house = House(co_index)
                texts = row.xpath("./td/text()")
                for attr, position in field_positions:
                    setattr(house, attr, texts[position])
                house.bu_id = re.search('ID=(\d+)',url).group(1)
                house.insert_db()
            except Exception as e:
                print('房号错误,co_index={},url={}'.format(co_index, url), e)
def get_house_info(self, house_url_list):
    """Hand each room URL to ``ProducerListUrl`` with regex extraction rules.

    The ``House`` attributes are set to regex patterns, not values;
    ``house.to_dict()`` is passed as the rule dictionary and
    ``get_details()`` is called on the producer.  A failure on one URL is
    printed and the loop continues.

    Args:
        house_url_list: Iterable of paths appended to the ``House/`` base URL.
    """
    # (attribute, pattern) pairs, applied in this order.
    rule_patterns = (
        ('bu_num', '幢 号:.*?<td.*?>(.*?)<'),
        ('ho_name', '房 号:.*?<td.*?>(.*?)<'),
        ('co_name', '项目名称:.*?<td.*?>(.*?)<'),
        ('ho_build_size', '建筑面积:.*?<td.*?>(.*?)<'),
        ('ho_true_size', '套内面积:.*?<td.*?>(.*?)<'),
        ('ho_share_size', '分摊面积:.*?<td.*?>(.*?)<'),
        ('ho_type', '房屋用途:.*?<td.*?>(.*?)<'),
        ('ho_floor', '所 在 层:.*?<td.*?>(.*?)<'),
        ('ho_room_type', '房屋户型:.*?<td.*?>(.*?)<'),
    )
    for path in house_url_list:
        # Built before the ``try`` so the handler always reports the URL of
        # the current item; the original bound it after ``House(...)``, so an
        # early failure printed a stale URL or raised NameError.
        house_url = 'http://www.ndjsj.gov.cn/House/' + path
        try:
            house = House(co_index)
            for attr, pattern in rule_patterns:
                setattr(house, attr, pattern)
            p = ProducerListUrl(page_url=house_url,
                                request_type='get', encode='utf-8',
                                analyzer_rules_dict=house.to_dict(),
                                analyzer_type='regex',
                                headers=self.headers)
            p.get_details()
        except Exception as e:
            print('宁德房号错误,url={}'.format(house_url), e)
def get_house_info(self, bu_con):
    """Insert one record per room anchor found in a building page.

    Each anchor's ``value`` attribute is followed to a detail page.  If the
    request fails the room is skipped.  If the link is missing or the detail
    page cannot be parsed, a minimal record is inserted instead, named after
    the anchor's ``id`` attribute.

    Args:
        bu_con: HTML source of the building page; the building id is the
            digit run after "编号" in it.
    """
    bu_html = etree.HTML(bu_con)
    anchors = bu_html.xpath("//tr[@height='30']//span/a")
    bu_id = re.search(r'编号.*?>(\d+)<', bu_con, re.S | re.M).group(1)

    # (attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('ho_name', '房号.*?<td>(.*?)<'),
        ('ho_floor', '所在层.*?<td>(.*?)<'),
        ('ho_share_size', '分摊共有面积.*?<td>(.*?)<'),
        ('ho_build_size', '建筑面积.*?<td>(.*?)<'),
        ('ho_true_size', '套内面积.*?<td>(.*?)<'),
        ('ho_type', '房屋用途.*?<td>(.*?)<'),
        ('bu_num', '幢号.*?<td>(.*?)<'),
    )
    for ho_info in anchors:
        # A fresh record per room.  The original reused one instance for the
        # whole page, so a fallback insert carried the previous room's floor
        # and areas along with the new name.
        house = House(co_index)
        try:
            ho_detail = "http://www.hcsfcglj.com/Templets/BoZhou/aspx/" + ho_info.xpath("./@value")[0]
            try:
                ho_con = requests.get(ho_detail, headers=self.headers).text
            except Exception as e:
                print("co_index={},房屋详情页{}请求失败".format(co_index, ho_detail))
                print(e)
                continue
            for attr, pattern in field_patterns:
                setattr(house, attr, re.search(pattern, ho_con, re.S | re.M).group(1))
            house.bu_id = bu_id
        except Exception:
            # Was a bare ``except:``, which also trapped KeyboardInterrupt
            # and SystemExit.
            house.ho_name = ho_info.xpath("./@id")[0]
            house.bu_id = bu_id
        house.insert_db()
def ho_parse(self, co_id, bu_id, ho_list):
    """Fetch each room link, insert the record, then sleep 0-3 seconds.

    A room whose request fails is reported and skipped.  Parsing errors are
    not caught here and propagate to the caller.

    Args:
        co_id: Stored on each record as ``co_id``.
        bu_id: Stored on each record as ``bu_id``.
        ho_list: Iterable of elements whose ``href`` attribute is the room path.
    """
    # (attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('ho_name', '房 号.*?<td>(.*?)</td'),
        ('ho_build_size', '建筑面积.*?<td>(.*?)</td'),
        ('ho_true_size', '套内面积.*?<td>(.*?)</td'),
        ('ho_share_size', '分摊面积.*?<td>(.*?)</td'),
        ('ho_floor', '所 在 层.*?<td>(.*?)</td'),
        ('ho_price', '申报单价.*?">(.*?)</td'),
        ('ho_type', '房屋用途.*?<td>(.*?)</td'),
    )
    for ho in ho_list:
        ho_url = ho.xpath("./@href")[0]
        house_url = "http://110.89.45.7:8082" + ho_url
        try:
            ho_res = requests.get(house_url, headers=self.headers)
        except Exception as e:
            # Was a silent bare ``except:``: failures left no trace and
            # Ctrl-C during a request was swallowed too.
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
            continue
        con = ho_res.text
        house = House(co_index)
        house.co_id = co_id
        house.bu_id = bu_id
        for attr, pattern in field_patterns:
            setattr(house, attr, re.search(pattern, con, re.S | re.M).group(1))
        house.insert_db()
        time.sleep(random.randint(0, 3))  # random pause between rooms
def get_house_info(self, co_id, bu_id, house_detail_list):
    """Fetch each room page and call ``insert_db()`` on the parsed record.

    A room whose request fails or does not answer with HTTP 200 is reported
    and skipped.  Parsing errors are not caught here and propagate to the
    caller.

    Args:
        co_id: Stored on each record as ``co_id``.
        bu_id: Stored on each record as ``bu_id``.
        house_detail_list: Iterable of paths appended to ``self.url``.
    """
    # (attribute, pattern) pairs, applied in this order.
    field_patterns = (
        ('ho_name', '房号.*?fh">(.*?)</span'),
        ('orientation', '朝向.*?Cx">(.*?)</span'),
        ('ho_floor', '层.*?lc">(.*?)</span'),
        ('ho_room_type', '房型.*?hx">(.*?)</span'),
        ('ho_build_size', '建筑面积.*?jzmj">(.*?)</span'),
        ('ho_share_size', '分摊面积.*?ftmj">(.*?)</span'),
        ('ho_true_size', '套内面积.*?tnmj">(.*?)</span'),
        ('ho_type', '用途.*?lx">(.*?)</span'),
    )
    for house_detail in house_detail_list:
        house_url = self.url + house_detail
        try:
            house_res = requests.get(house_url, headers=self.headers)
        except Exception as e:
            print("co_index={},房屋信息错误".format(co_index), e)
            continue
        # The original evaluated ``house_res.status_code == 200`` and threw
        # the result away, so error pages went on to be parsed.  The check
        # now actually gates the parse.
        if house_res.status_code != 200:
            print("co_index={},房屋信息错误".format(co_index), house_res.status_code)
            continue
        house_con = house_res.text
        ho = House(co_index)
        ho.co_id = co_id
        ho.bu_id = bu_id
        for attr, pattern in field_patterns:
            setattr(ho, attr, re.search(pattern, house_con, re.S | re.M).group(1))
        ho.insert_db()
def start_crawler(self):
    """Walk list pages 1-20 and insert every building with its rooms.

    Each list page is requested with a JSON-like POST body; every bracketed
    entry in the response yields a ``sid``/``pid`` pair.  The building page
    is parsed into a ``Building`` record, then the room list for ``sid`` is
    requested and each bracketed row becomes a ``House`` record.  Failures
    are logged and the affected page, building or room is skipped.
    """
    list_url = 'http://zzx.zzfc.com/ajaxpro/xy_ysxk_more,App_Web_mjeeodb-.ashx'
    ho_url = 'http://zzx.zzfc.com/ajaxpro/xy_housetag,App_Web_xg4ulr9n.ashx'
    room_headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
        'X-AjaxPro-Method': 'GETLPBDS'
    }
    for page_no in range(1, 21):
        payload = "{\"pageNo\":" + str(page_no) + ",\"pageSize\":30,\"rowcount\":589}"
        try:
            listing = requests.post(list_url, data=payload, headers=self.headers).content.decode()
        except Exception as e:
            log.error('楼栋请求失败{}'.format(e))
            continue

        for comm in re.findall('\[\d+,.*?\d+\]', listing):
            # --- building ---
            try:
                sid = re.search('\[(\d+),', comm).group(1)
                pid = re.search('",(\d+),', comm).group(1)
                bu_url = 'http://zzx.zzfc.com/xy_bldg.aspx?pid=' + pid + '&sid=' + sid
                bu_con = requests.get(bu_url, headers=self.headers).content.decode()
                bu = Building(co_index)
                bu.bu_id = sid
                bu.bu_address = re.search('楼栋座落.*?">(.*?) ', bu_con, re.S | re.M).group(1)
                bu.bu_pre_sale = re.search('预售证号.*?">(.*?) ', bu_con, re.S | re.M).group(1)
                bu.bu_pre_sale_date = re.search('预售日期.*?">(.*?) ', bu_con, re.S | re.M).group(1)
                bu.bu_all_house = re.search('套数.*?">(.*?) ', bu_con, re.S | re.M).group(1)
                bu.insert_db()
            except Exception as e:
                log.error("{}楼栋解析失败{}".format(comm, e))
                continue

            # --- rooms of that building ---
            data = "{\"m_key\":\"WWW_LPB_001\",\"m_param\":\"" + sid + "\"}"
            try:
                ho_con = requests.post(ho_url, data=data, headers=room_headers).content.decode()
            except Exception as e:
                log.error("房屋请求失败{}".format(e))
                continue

            for row in re.findall('\["\d+.*?\d+\]', ho_con):
                try:
                    ho = House(co_index)
                    ho.bu_id = sid
                    columns = row.split(",")
                    ho.ho_name = columns[4]
                    ho.ho_floor = re.search('(\d+)层', row).group(1)
                    ho.ho_build_size = columns[-3]
                    ho.ho_true_size = columns[-2]
                    ho.insert_db()
                except Exception as e:
                    log.error("{}房屋解析错误{}".format(row, e))
                    continue