def detail_parse(self, id, build_list):
    """Parse every building entry, persist it, then hand the page to house_detail.

    A failed entry is logged and skipped; house_detail only runs for entries
    that were stored successfully.
    """
    for entry in build_list:
        href = re.search('<a href="(.*?)"', entry).group(1)
        detail_url = self.start_url + href
        try:
            resp = requests.get(detail_url, headers=self.headers)
            time.sleep(2)  # throttle requests to the remote server
            page = resp.content.decode()
            building = Building(co_index)
            building.bu_num = re.search('幢号:(.*?) 许', page).group(1)
            building.bu_pre_sale = re.search('许可证号:<span>(.*?)</span>', page).group(1)
            building.bu_id = int(building.bu_pre_sale)
            building.bu_all_house = re.search('套数:<span>(.*?)</span', page).group(1)
            building.bu_floor = re.search('地上层数:<span>(.*?)</span', page).group(1)
            building.bo_build_end_time = re.search('竣工日期:<span>(.*?)</span', page).group(1)
            building.bu_build_size = re.search('预售许可面积:<span>(.*?)</span', page).group(1)
            building.bu_type = re.search('用途:<span>(.*?)</span', page).group(1)
            building.insert_db()
        except Exception as e:
            log.error("楼栋出错{}".format(e))
            continue
        self.house_detail(page, id, building.bu_id)
def bu_info(self, bu_list, co_id):
    """Fetch each building page, store its record, then pass the rooms on.

    Failed pages are reported and skipped; ho_info only runs on success.
    """
    flags = re.S | re.M
    for path in bu_list:
        try:
            page_url = 'http://www.fxfdcw.com/' + path
            response = requests.get(page_url, headers=self.headers)
            text = response.content.decode('gbk')
            doc = etree.HTML(text)
            build = Building(co_index)
            build.co_id = co_id
            build.bu_id = re.search('bdid=(\d+)', path).group(1)
            build.bu_num = re.search('楼号.*?">(.*?)</', text, flags).group(1)
            build.bu_address = re.search('坐落.*?">(.*?)</', text, flags).group(1)
            build.bu_floor = re.search('地上层数.*?">(.*?)</', text, flags).group(1)
            build.bu_build_size = re.search('建筑面积.*?wrap">(.*?)</', text, flags).group(1)
            build.bu_all_house = re.search('套 数.*?">(.*?)</', text, flags).group(1)
            build.bu_type = re.search('用 途.*?wrap">(.*?)</', text, flags).group(1)
            build.insert_db()
            ho_list = doc.xpath("//span[@title]")
        except Exception as e:
            # log.error("楼栋信息错误{}".format(e))
            print("楼栋信息错误{}".format(e))
            continue
        self.ho_info(ho_list, co_id, build.bu_id)
def get_build_info(self, co_id):
    # Query the LeadingMIS CommonQuery AJAX endpoint for a project's building
    # table, then store every data row and recurse into its houses.
    build_url = "http://202.103.219.149:7000/ajax/LeadingMIS.CommonModel.CommonQuery.WebUI.AjaxManage.QueryDataParser,LeadingMIS.CommonModel.CommonQuery.WebUI.ashx"
    querystring = {"_method": "GetDataToDynamicInXml", "_session": "rw"}
    # Double-percent-encoded XML query description; the project id is spliced
    # into the QueryString attribute (ProjectID=<co_id>).
    payload = "xmlInfo=%263Croot%2620QueryCode%263D%2622BuildingsInfo%2622%2620PageIndex%263D%26221%2622%2620PageSize%263D%262215%2622%2620SortField%263D%2622%2620ORDER%2620BY%2620Name%2622%2620QueryString%263D%2622QueryCode%263DBuildingsInfo%2626amp%263BProjectID%263D" + co_id + "%2622%2620BeginDate%263D%2622%262000%263A00%263A00%2622%2620EndDate%263D%2622%262023%263A59%263A59%2622%2620Flag%263D%2622TitleBody%2622%2620TitlesWidthInfo%263D%2622BuildNo%267C0%2624Name%267C0%2624FloorCount%267C0%2624RoomCount%267C0%2624YCJZArea%267C0%2624Structure%267C0%2624YSXKCer%267C0%2624ZJJG%267C0%2622%2620IsUseOCache%263D%26220%2622%2620IsUserID%263D%26220%2622%2620SiteId%263D%26228907bd13-1d14-4f9e-8c01-e482d9590d10%2622%2620LockedColumn%263D%26220%2622%2620IsLocked%263D%26220%2622%2620ClientWidth%263D%26221601%2622%2620ShowModeCode%263D%2622default%2622%2620Language%263D%2622chinese%2622/%263E"
    try:
        response = requests.request("POST", build_url, data=payload, params=querystring)
        html = response.text
        # [1:] skips the header row of the returned table
        build_info_list = re.findall('<tr.*?>.*?</tr>', html, re.S | re.M)[1:]
        for i in build_info_list:
            build = Building(co_index)
            build.co_id = co_id
            # Positional parsing: the Nth "spanctfield" span in the row holds
            # the field — the repeated prefixes below skip N-1 spans.
            build.bu_num = re.search(
                '<span class="spanctfield".*?<span class="spanctfield".*?>.*?<a.*?>(.*?)<', i,
                re.S | re.M).group(1)
            build.bu_floor = re.search(
                '<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?>(.*?)<', i,
                re.S | re.M).group(1)
            build.bu_pre_sale = re.search(
                '<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?>(.*?)<', i,
                re.S | re.M).group(1)
            # Row id attribute carries the building id: id="Tr_<bu_id>"
            build.bu_id = re.search('id="Tr_(.*?)"', i, re.S | re.M).group(1)
            build.insert_db()
            self.get_house_info(co_id, build.bu_id)
    except Exception as e:
        print('请求错误,url={},data={},params={}'.format(
            build_url, payload, querystring))
def get_build_info(self, build_id_list, co_id):
    """POST each building id to the Haikou endpoint, parse and store the record.

    Fixes: the except branch lacked ``continue``, so after a failed request the
    code fell through to ``build_info.text`` and raised NameError; a single
    Building instance was also shared across all iterations — each building
    now gets its own.
    """
    # request headers are loop-invariant, build them once
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
        'Referer': 'http://hkrealestate.haikou.gov.cn/wp_myself/housequery/projectBuildingList.php'
    }
    for build_id in build_id_list:
        formdata = {"action": "qeurySingleBuilding", "pk": str(build_id)}
        try:
            build_info = self.s.post(
                'http://hkrealestate.haikou.gov.cn/wp_myself/housequery/projectBuildHouseAction.php',
                data=formdata,
                headers=header)
        except Exception as e:
            print("co_idnex={},楼栋错误".format(co_index), e)
            continue  # no response to parse — skip this building
        build_con = build_info.text
        bu = Building(co_index)
        bu.bu_id = build_id
        bu.co_id = co_id
        bu.bu_num = re.search('幢名称.*?<td>(.*?)<', build_con, re.S | re.M).group(1)
        bu.bu_floor = re.search('总层数.*?<td>(.*?)<', build_con, re.S | re.M).group(1)
        bu.bu_build_size = re.search('>建筑面积.*?<td>(.*?)<', build_con, re.S | re.M).group(1)
        bu.bo_develops = re.search('房地产企业.*?">(.*?)</td', build_con, re.S | re.M).group(1)
        bu.insert_db()
        self.get_house_info(build_con, co_id, build_id)
def get_build_info(self, build_url_list):
    """Assemble regex extraction rules per building and delegate to ProducerListUrl.

    Fix: ``build_url`` is initialised before the try-block; previously a
    failure on the first statements left it unbound, so the except handler
    itself raised NameError instead of printing the error.
    """
    for i in build_url_list:
        build_url = None  # defined up-front so the except message can always reference it
        try:
            build = Building(co_index)
            build_code = re.search('xqbm=(.*?)$', i).group(1)
            build_url = 'http://zjjg.0557fdc.com:9555/xiaoqu/donginfo.aspx?xqbm=' + build_code
            # attribute values are regex rules, consumed via build.to_dict()
            build.bu_num = 'Labeldongmc">(.*?)<'
            build.bu_pre_sale = 'Labelyszheng">(.*?)<'
            build.bu_floor = 'Labelsceng">(.*?)<'
            build.bu_address = 'Label1zuoluo">(.*?)<'
            build.bo_build_start_time = 'Label1kaigong">(.*?)<'
            build.co_build_structural = 'Labeljiegou">(.*?)<'
            build.co_id = 'donginfo.aspx\?xqbm=(.*?)"'
            build.bu_id = 'id="DropDownList1".*?value="(.*?)"'
            p = ProducerListUrl(page_url=build_url,
                                request_type='get', encode='utf-8',
                                analyzer_rules_dict=build.to_dict(),
                                current_url_rule='location\.href=(.*?)"',
                                analyzer_type='regex',
                                headers=self.headers)
            house_url_list = p.get_details()
            self.get_house_info(house_url_list)
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
def build_parse(self, co_id):
    """List a project's buildings, store each one, then parse its houses.

    Fix: the except branch lacked ``continue`` — after a parse failure the
    loop fell through and called ``house_parse`` with an undefined (first
    iteration) or stale (later iterations) ``bu_con``.
    """
    list_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/loulist.jsp?Id_xmxq=' + co_id
    res = requests.get(list_url, headers=self.headers)
    con = res.content.decode()
    build_id_list = re.findall("searchByLid\('(\d+)'\)", con)
    for build_id in build_id_list:
        try:
            bu_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/lpbxx_new.jsp?lid=' + build_id
            bu_res = requests.get(bu_url, headers=self.headers)
            bu_con = bu_res.content.decode('gbk')
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_id = build_id
            bu.bu_num = re.search('楼栋名称.*?">(.*?)</td', bu_con, re.S | re.M).group(1)
            bu.bu_all_house = re.search('总套数.*?">总(.*?)套</td', bu_con, re.S | re.M).group(1)
            bu.bu_floor = re.search('地上层数.*?">共(.*?)层</td', bu_con, re.S | re.M).group(1)
            bu.bu_build_size = re.search('总建筑面积.*?">(.*?)</td', bu_con, re.S | re.M).group(1)
            bu.bu_pre_sale = re.search("searchysxk\('(.*?)'\)", bu_con, re.S | re.M).group(1)
            bu.bu_type = re.search('房屋用途.*?">(.*?)</td', bu_con, re.S | re.M).group(1)
            bu.insert_db()
        except Exception as e:
            log.error('{}楼栋错误{}'.format(build_id, e))
            continue  # fix: do not parse houses from a failed/stale page
        self.house_parse(co_id, build_id, bu_con)
def get_build_info(self, bu_address_list, bu_num_list, bu_floor_list, bu_url_list, co_id):
    """Store one building per parallel-list entry, then collect its house links."""
    base = 'http://183.63.60.194:8808/public/web/'
    for idx, rel_url in enumerate(bu_url_list):
        build = Building(co_index)
        build.bu_address = bu_address_list[idx]
        build.bu_num = bu_num_list[idx]
        build.bu_floor = bu_floor_list[idx]
        build.co_id = co_id
        # response = self.request_proxy('http://183.63.60.194:8808/public/web/' + bu_url_list[i])
        time.sleep(1)  # throttle requests
        response = self.s.get(base + rel_url, headers=self.headers)
        build.bu_id = re.search('ljzid=(.*?)$', rel_url).group(1)
        build.insert_db()
        html = response.text
        # the house table is embedded in a JS variable on the page
        house_html = re.search('var _table_html_.*?</script>', html, re.S | re.M).group()
        house_url_list = re.findall('房屋号:<a.*?href="(.*?)"', house_html, re.S | re.M)
        try:
            self.get_house_info(house_url_list, build.bu_id)
        except Exception as e:
            print(
                '房号错误,co_index={},url={}'.format(
                    co_index, base + rel_url), e)
def get_build_info(self, build_url_list, comm):
    """Fetch each presell page, update the community record, store every row.

    Fix: the outer and inner loops both used ``i`` — the inner loop variable
    is renamed so the two iterations can no longer be confused.
    """
    for path in build_url_list:
        try:
            build_url = 'http://58.51.240.121:8503/' + path
            response = requests.get(build_url, headers=self.headers)
            html = response.text
            comm.co_pre_sale = re.search(
                'id="PresellInfo1_lblXkzh">(.*?)<', html, re.S | re.M).group(1)
            comm.co_pre_sale_date = re.search(
                'id="PresellInfo1_lblFzrq">(.*?)<', html, re.S | re.M).group(1)
            comm.insert_db()
            build_info_list = re.findall('<tr bgcolor="#FFFFFF">.*?</tr>', html, re.S | re.M)
            for row in build_info_list:
                build = Building(co_index)
                build.co_id = comm.co_id
                # positional <td> parsing of the row
                build.bu_num = re.search('<td.*?>(.*?)<', row, re.S | re.M).group(1)
                build.bu_floor = re.search('<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
                build.bu_all_house = re.search('<td.*?<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
                build.bu_id = re.search('PresellId=(.*?)$', build_url).group(1)
                build.insert_db()
                house_url = re.search('a href="(.*?)"', row, re.S | re.M).group(1)
                self.get_house_info(house_url, comm.co_id, build.bu_id)
        except Exception as e:
            print('请求错误,co_index={},url={}'.format(co_index, build_url), e)
def get_build_info(self, build_url_list):
    """Fetch each building detail page and persist the parsed record."""
    flags = re.S | re.M
    for item in build_url_list:
        try:
            build = Building(co_index)
            build_url = ('http://222.223.160.199:8088/website/buildquery/selectBuild.jsp?buildID='
                         + item[0])
            page = requests.get(build_url, headers=self.headers).text
            build.bu_id = item[0]
            build.co_build_structural = re.search('结构类型.*?<td.*?>(.*?)<', page, flags).group(1)
            build.bo_build_end_time = re.search('建成年份.*?<td.*?>(.*?)<', page, flags).group(1)
            build.bu_build_size = re.search('总建筑面积.*?<td.*?>(.*?)<', page, flags).group(1)
            build.bu_num = re.search('幢号.*?<td.*?>(.*?)<', page, flags).group(1)
            build.size = re.search('占地面积.*?<td>(.*?)<', page, flags).group(1)
            build.bu_floor = re.search('房屋层数.*?<td>(.*?)<', page, flags).group(1)
            build.bu_all_house = re.search('房屋套数.*?<td>(.*?)<', page, flags).group(1)
            build.area = re.search('坐落区.*?<td>(.*?)<', page, flags).group(1)
            build.insert_db()
            self.get_house_info(build.bu_id)
        except Exception as e:
            print('请求错误,url={}'.format(build_url), e)
def get_build_info(self, build_lis, co_id):
    """Fetch each building page; request failures are skipped, parse errors propagate."""
    for path in build_lis:
        build_url = "http://xx.yyfdcw.com" + path
        try:
            build_res = requests.get(build_url, headers=self.headers)
        except Exception as e:
            print("co_index={},楼栋信息错误".format(co_index), e)
            continue
        con = build_res.text
        bu = Building(co_index)
        bu.co_id = co_id
        bu.bu_id = re.search('Bid=(\d+)', path).group(1)
        bu.bu_num = re.search('名称.*?">(.*?)</spa', con).group(1)
        bu.bu_pre_sale = re.search("编.*?red'>(.*?)</a", con).group(1)
        bu.bu_pre_sale_date = re.search('颁发日期.*?Date">(.*?)</span', con).group(1)
        bu.bo_build_start_time = re.search('开工日期.*?">(.*?)</span', con).group(1)
        bu.bo_build_end_time = re.search('竣工日期.*?">(.*?)</span', con).group(1)
        bu.bo_develops = re.search('单位.*?">(.*?)</span', con).group(1)
        bu.bu_floor = re.search('层数.*?">(.*?)</span', con).group(1)
        bu.bu_live_size = re.search('住宅面积.*?">(.*?)</span', con).group(1)
        bu.size = re.search('总面积.*?">(.*?)</span', con).group(1)
        bu.insert_db()
        # the survey number keys the house lookup on the next page
        survey_id = re.search('测量号.*?">(.*?)</span', con).group(1)
        self.get_house_info(co_id, bu.bu_id, survey_id)
def bu_parse(self,bu_url,co_id,co_url):
    # Parse one building page: the presale certificate fields are shared by
    # every block row in the #donglist table, which is then walked row by row.
    build_url = "http://61.143.241.154/" + bu_url
    # NOTE(review): mutates a module-level `headers` global — presumably shared
    # with house_parse / other callers; verify before refactoring.
    global headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
        'Referer': co_url
    }
    bu_res = requests.get(build_url,headers=headers)
    bu_con = bu_res.content.decode('gbk')
    # presale licence number and validity date, common to all rows on this page
    bu_pre_sale = re.search('预售许可证编号.*?blank">(.*?)</a',bu_con,re.S|re.M).group(1)
    bu_pre_sale_date = re.search('预售证有效日期.*?">(.*?)</td',bu_con,re.S|re.M).group(1)
    bu_html = etree.HTML(bu_con)
    bu_list = bu_html.xpath("//table[@id='donglist']//tr")
    for bo in bu_list:
        bu = Building(co_index)
        bu.co_id = co_id
        bo_url = bo.xpath("./td/a/@href")[0]
        # building id comes from the dbh= query parameter of the row link
        bu.bu_id = re.search('dbh=(.*?)&', bo_url).group(1)
        bu.bu_num = bo.xpath("./td[3]/text()")[0]
        bu.bu_floor = bo.xpath("./td[4]/text()")[0]
        bu.bu_pre_sale = bu_pre_sale
        bu.bu_pre_sale_date = bu_pre_sale_date
        bu.insert_db()
        self.house_parse(bo_url,co_id,bu.bu_id)
def build_info(self, build_detail, co_id):
    """Fetch the Wuhan building table via the proxy helper and store every row."""
    proxy = Proxy_contact(app_name='wuhan', method='get', url=build_detail,
                          headers=self.headers)
    # build_res = requests.get(build_detail,headers=self.headers)
    raw = proxy.contact()
    doc = etree.HTML(raw.decode('gb18030'))
    for row in doc.xpath("//tr[@bgcolor='#FFFFFF']"):
        try:
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_floor = row.xpath('./td[3]/text()')[0]
            bu.bu_all_house = row.xpath('./td[4]/text()')[0]
            bu.bu_num = row.xpath('./td//span/text()')[0]
            temp_url = row.xpath('./td/a/@href')[0]
            bu.bu_id = re.search('HouseDengjh=(.*?\d+)', temp_url).group(1)
            bu.insert_db()
        except Exception as e:
            log.error('楼栋错误{}'.format(e))
            continue
        # re-encode both registration numbers as GBK percent-escapes for the house page
        dengjh = parse.quote(re.search('DengJh=(.*?\d+)&', temp_url).group(1), encoding='gbk')
        house_dengjh = parse.quote(re.search('HouseDengjh=(.*?\d+)', temp_url).group(1), encoding='gbk')
        bu_url = 'http://scxx.fgj.wuhan.gov.cn/5.asp?DengJh=' + dengjh + '&HouseDengjh=' + house_dengjh
        self.house_info(bu.bu_id, bu_url, co_id)
        time.sleep(3)
def get_build_info(self, url, co_id):
    """Scrape one building page (plus its embedded house table) and persist it.

    Fixes: the cell loop and the index loop both reused ``i`` (now distinct
    names via a helper), and the over-broad ``except BaseException`` — which
    also swallowed KeyboardInterrupt/SystemExit — is narrowed to ``Exception``.
    House parsing is split into ``_save_houses``.
    """
    try:
        building = Building(co_index)
        response = requests.get(url)
        html = response.text
        tree = etree.HTML(html)
        co_name = tree.xpath('//*[@id="PageB_Location"]/text()')[0]  # community name
        print(co_name)
        bu_name = tree.xpath('//*[@id="ItemName"]/text()')[0]  # building name
        bu_num = tree.xpath('//*[@id="PageB_HouseNo"]/text()')[0]  # building/block number
        bu_all_house = tree.xpath('//*[@id="lb_countbulidtaoshu"]/text()')[
            0]  # total units
        bu_floor = self.is_none(tree.xpath('//*[@id="cell3-1"]/text()'))  # floors
        bu_build_size = tree.xpath('//*[@id="lb_countbulidarea"]/text()')[
            0]  # built area
        bu_live_size = tree.xpath('//*[@id="lb_buildarea"]/text()')[
            0]  # residential area
        bu_price = self.is_none(tree.xpath('//*[@id="lb_buildavg"]/text()'))  # average price
        bu_id = re.search('\?(\d+)$', url).group(1)  # building id from the URL query
        building.co_id = co_id
        building.bu_name = bu_name
        building.bu_num = bu_num
        building.bu_all_house = bu_all_house
        building.bu_floor = bu_floor
        building.bu_build_size = bu_build_size
        building.bu_live_size = bu_live_size
        building.bu_price = bu_price
        building.bu_id = bu_id
        building.insert_db()
        self._save_houses(html, co_id, bu_id)
    except Exception as e:
        print(e)

def _save_houses(self, html, co_id, bu_id):
    """Parse the house table that follows <tr id="row3"> and store each unit."""
    house_info_html = re.findall('<tr id="row3">(.*)$', html, re.S | re.M)[0]
    for cell in re.findall('(<td.*?>.*?</td>)', house_info_html, re.S | re.M):
        if '<br>' not in cell:
            continue  # cells without <br> carry no house data
        ho_name_list = re.findall('<td.*?>(.*?)<br>', cell, re.S | re.M)
        ho_true_size_list = re.findall('<td.*?>.*?<br>(.*?)<br>', cell, re.S | re.M)
        ho_type = re.findall('<td.*?>.*?<br>.*?<br>(.*?)<br>', cell, re.S | re.M)[0]
        for idx, raw_name in enumerate(ho_name_list):
            try:
                if 'font' in raw_name:
                    # strip opening <font ...> tags (closing tags were never
                    # removed in the original either — behavior preserved)
                    ho_name = re.sub('<font.*?>', '', raw_name)
                else:
                    ho_name = raw_name
                house = House(8)
                house.ho_name = ho_name
                house.ho_true_size = ho_true_size_list[idx]
                house.co_id = co_id
                house.bu_id = bu_id
                house.ho_type = ho_type
                house.insert_db()
            except Exception as e:
                print(e)
def build_info(self, bu_list, co_id):
    """Fetch each building list page and persist every block ("栋") row.

    Fix: the outer loop variable ``bu`` (an lxml element) was clobbered inside
    the inner loop by a Building instance of the same name; the two are now
    named distinctly.
    """
    for row in bu_list:
        bu_url = row.xpath("./td[4]/a/@href")[0]
        build_url = self.start_url + '/' + bu_url
        bu_res = requests.get(build_url, headers=self.headers)
        bu_res.encoding = 'gbk'
        con = bu_res.text
        # presale licence number / validity date shared by all blocks on the page
        bu_pre_sale = re.search('预售许可证编号.*?blank">(.*?)</a', con, re.S | re.M).group(1)
        bu_pre_sale_date = re.search('预售证有效日期.*?">(.*?)</td', con, re.S | re.M).group(1)
        bu_html = etree.HTML(con)
        donglist = bu_html.xpath("//table[@id='donglist']/tr")
        for dong in donglist:
            dong_url = dong.xpath("./td/a/@href")[0]
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_id = re.search('ID={(.*?)}', dong_url).group(1)
            bu.bu_num = dong.xpath("./td[3]/text()")[0]
            bu.bu_floor = dong.xpath("./td[4]/text()")[0]
            bu.bu_pre_sale = bu_pre_sale
            bu.bu_pre_sale_date = bu_pre_sale_date
            bu.insert_db()
            self.house_info(co_id, bu.bu_id, dong_url)
def comm_info(self, con):  # community and its buildings
    """Store the community record from *con*, then every building row.

    Returns the list of house-page URLs collected from the building table.
    Fix: a single Building instance was reused for every table row; each row
    now gets its own instance, consistent with the other parsers in this file.
    """
    comm = Comm(co_index)
    comm.co_name = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_web_item_retail1_lb_item_name']/text()"
    )[0]  # community name
    co_id_str = con.xpath("//form[@id='aspnetForm']/@action")[0]  # community id source
    comm.co_id = re.search(r"\d+", co_id_str).group(0)
    comm.co_address = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_seat']/text()")[
            0]  # community address
    comm.co_develops = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_enter_name']/text()")[
            0]  # developer
    comm.co_size = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_area']/text()")[0]  # total area
    comm.co_build_size = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_area']/text()")[
            0]  # built area
    comm.co_build_end_time = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_ew_date']/text()")[
            0]  # completion date
    comm.co_plan_pro = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_program_pcode']/text()")[
            0]  # land-planning permit
    comm.co_work_pro = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_jg']/text()")[0]  # construction permit
    comm.co_green = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_green_rate']/text()"
    )[0]  # green-space percentage
    comm.co_land_use = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_td']/text()")[0]  # land-use certificate
    comm.insert_db()
    build_table = con.xpath("//tr[@style='color:#000066;']")
    room_list = []
    for build_row in build_table:
        build = Building(co_index)  # fresh instance per row (was shared across rows)
        build.co_id = comm.co_id
        build.co_name = comm.co_name
        build_info = build_row.xpath("./td/text()")
        build.bu_id = build_info[0]
        build.bu_num = build_info[1]
        build.bu_all_house = build_info[2]
        build.size = build_info[3]
        build.bu_floor = build_info[4]
        build.bu_pre_sale = build_info[5]
        build.insert_db()
        room_url = build_row.xpath("./td/a/@href")[0]
        room_list.append(room_url)
    return room_list
def get_build_info(self, co_id, co_name):
    """Walk the presell list for a project, storing each building and its houses.

    Fixes: ``status_code is not 200`` identity comparisons replaced with
    ``!= 200`` (``is`` on int literals relies on CPython small-int caching and
    emits a SyntaxWarning on 3.8+); the three nested loops no longer all
    reuse the name ``i``.
    """
    url = 'http://www.czhome.com.cn/Presell.asp?projectID=' + co_id + '&projectname=' + co_name
    response = requests.get(url, headers=self.headers)
    html = response.content.decode('gbk')
    tree = etree.HTML(html)
    xpath_list = tree.xpath('//tr[@class="indextabletxt"]')
    for presell_row in xpath_list[1:]:
        build_url = presell_row.xpath('td[2]/a/@href')[0]
        url = 'http://www.czhome.com.cn/' + build_url
        result = requests.get(url, headers=self.headers)
        if result.status_code != 200:
            print("co_index={},预售url:{}连接失败".format(co_index, url))
            continue
        html = result.content.decode('gbk')
        tree = etree.HTML(html)
        # building rows (total-unit table)
        bu_xpath = tree.xpath('/html/body/table/tr/td/table/tr/td/table/tr')[1:]
        for bu_row in bu_xpath:
            try:
                building = Building(7)
                global building_id
                building_id += 1  # module-level running id for buildings
                building.bu_id = building_id
                bu_all_house = bu_row.xpath('td[7]/text()')[0]
                bu_url = bu_row.xpath('td[1]/a/@href')[0]
                url = 'http://www.czhome.com.cn/' + bu_url
                response = requests.get(url, headers=self.headers)
                if response.status_code != 200:
                    print("co_index={},楼栋url:{}连接失败".format(co_index, url))
                    continue
                html = response.content.decode('gbk')
                tree = etree.HTML(html)
                # floor count (last <u> entry of the house table header)
                bu_floor = tree.xpath('//*[@id="Table4"]/tr[2]/td/table[3]/tr/td[1]/u/text()')[-1]
                house_url_list = tree.xpath('//*[@id="Table4"]/tr[2]/td/table[3]/tr/td/a/@href')
                bu_address = re.search('<center><font color=.*? (.*?)<', html, re.S | re.M).group(1)
                building.bu_all_house = bu_all_house
                building.bu_address = bu_address
                building.bu_floor = bu_floor
                building.bu_id = building_id
                building.co_id = co_id
                building.insert_db()
                for house_href in house_url_list:
                    try:
                        house = House(7)
                        house_url = 'http://www.czhome.com.cn/' + house_href
                        self.get_house_info(house_url, house, co_id, building_id, building)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print(e)
def build_info(self, bu_list, co_id):
    """Store each building element of *bu_list* and recurse into its houses.

    Fix: the building number was extracted with ``zh=(.*?)`` — a lazy group at
    the very end of a pattern always matches the empty string, so ``bu_num``
    was always ``''``. Now ``zh=([^&]*)`` captures up to the next query
    parameter (TODO confirm against a live URL).
    """
    for bo in bu_list:
        ho_url = bo.xpath("./@href")[0]
        floor = bo.xpath(".//p[2]/text()")[0]
        bu = Building(co_index)
        bu.bu_pre_sale = bo.xpath(".//p[3]/text()")[0]
        bu.bu_num = re.search('zh=([^&]*)', ho_url).group(1)
        bu.bu_id = re.search('n=(\d+)', ho_url).group(1)
        bu.co_id = co_id
        bu.bu_floor = re.search('总层数.*?(\d+)', floor).group(1)
        bu.insert_db()
        house_url = "http://www.ggsfcw.com/" + ho_url
        self.ho_info(house_url, co_id, bu.bu_id)
def get_comm_detail(self, detail_url, area):
    # Fetch one presell project page: store the community (Comm) record, then
    # every row of its #donglist building table, then recurse into the houses.
    try:
        comm = Comm(co_index)
        comm_detail_url = 'http://www.yfci.gov.cn:8080/HousePresell/' + detail_url
        response = requests.get(comm_detail_url, headers=self.headers)
        html = response.text
        comm.co_develops = re.search('id="kfsmc".*?<a.*?>(.*?)<', html, re.S | re.M).group(1)  # developer
        comm.co_name = re.search('id="PresellName".*?<a.*?>(.*?)<', html, re.S | re.M).group(1)  # project name
        comm.co_address = re.search('id="HouseRepose".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_build_size = re.search('id="PresellArea".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_all_house = re.search('id="djrqtd".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_land_use = re.search('id="landinfo".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_type = re.search('id="zczjtd".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_pre_sale = re.search('id="bookid".*?<a.*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_pre_sale_date = re.search('id="FZDatebegin".*?>(.*?)<', html, re.S | re.M).group(1)
        comm.co_open_time = re.search('id="kpdate".*?>(.*?)<', html, re.S | re.M).group(1)
        # community id comes from the FD= query parameter of the incoming link
        comm.co_id = re.search('FD=(.*?)&', detail_url, re.S | re.M).group(1)
        comm.area = area
        comm.insert_db()
        # building table section of the page
        build_html = re.search('id="donglist".*?</table>', html, re.S | re.M).group()
        build_info_list = re.findall('<tr.*?</tr>', build_html, re.S | re.M)
        for i in build_info_list:
            build = Building(co_index)
            build.co_id = comm.co_id
            # positional <td> parsing — each extra "<td.*?" prefix skips one cell
            build.bu_address = re.search('<td.*?<td.*?>(.*?)<', i, re.S | re.M).group(1)
            build.bu_num = re.search('<td.*?<td.*?<td.*?>(.*?)<', i, re.S | re.M).group(1)
            build.bu_floor = re.search('<td.*?<td.*?<td.*?<td.*?>(.*?)<', i, re.S | re.M).group(1)
            house_url = re.search('href="(.*?)"', i, re.S | re.M).group(1)
            build.bu_id = re.search("LID=(.*?)$", house_url, re.S | re.M).group(1)
            build.insert_db()
            self.get_house_info(house_url, comm.co_id, build.bu_id)
    except Exception as e:
        print('小区错误,co_index={},url={}'.format(co_index, comm_detail_url), e)
def bu_parse(self, co_id, page, co_url, co_res, path_url):
    """Walk an ASP.NET-paginated building list and store every building row.

    Fix: the per-page ``__VIEWSTATE``/``__VIEWSTATEGENERATOR``/
    ``__EVENTVALIDATION`` tokens were re-read from the stale initial ``html``
    instead of the freshly fetched ``page_html``, so every POST after the
    first carried outdated tokens.
    """
    html = etree.HTML(co_res.text)
    viewstate = html.xpath("//input[@id='__VIEWSTATE']/@value")[0]
    generator = html.xpath("//input[@id='__VIEWSTATEGENERATOR']/@value")[0]
    valid = html.xpath("//input[@id='__EVENTVALIDATION']/@value")[0]
    formdata = {
        "__VIEWSTATE": viewstate,
        "__EVENTTARGET": 'ctl00$MainContent$OraclePager1$ctl11$PageList',
        "__VIEWSTATEGENERATOR": generator,
        "__EVENTVALIDATION": valid,
        "ctl00$MainContent$OraclePager1$ctl11$PageList": 0
    }
    self.headers['Referer'] = co_url
    for i in range(1, int(page) + 1):
        page_res = requests.post(co_url, data=formdata, headers=self.headers)
        page_html = etree.HTML(page_res.text)
        # refresh tokens from the page just fetched (was: stale `html`)
        view_state = page_html.xpath("//input[@id='__VIEWSTATE']/@value")[0]
        generator_ = page_html.xpath(
            "//input[@id='__VIEWSTATEGENERATOR']/@value")[0]
        valid_ = page_html.xpath("//input[@id='__EVENTVALIDATION']/@value")[0]
        # NOTE(review): PageList=i-1 appears to re-request the page just
        # fetched; `i` looks intended — confirm against the server's paging.
        formdata = {
            "__VIEWSTATE": view_state,
            "__EVENTTARGET": 'ctl00$MainContent$OraclePager1$ctl11$PageList',
            "__VIEWSTATEGENERATOR": generator_,
            "__EVENTVALIDATION": valid_,
            "ctl00$MainContent$OraclePager1$ctl11$PageList": i - 1
        }
        bu_list = page_html.xpath(
            "//table[@id='ctl00_MainContent_OraclePager1']//tr")
        for bu in bu_list[1:]:
            build = Building(co_index)
            build.co_id = co_id
            build.bu_num = bu.xpath("./td/a/text()")[0]
            build.bu_build_size = bu.xpath("./td[2]/text()")[0]
            build.bu_floor = bu.xpath("./td[4]/text()")[0]
            build.bu_all_house = bu.xpath("./td[3]/text()")[0]
            tmp_url = bu.xpath("./td/a/@href")[0]
            build.bu_id = re.search('PBTAB_ID=(.*?)&', tmp_url).group(1)
            build.insert_db()
            house_url = path_url.replace('SaleInfoProListIndex.aspx', '') + tmp_url
            self.ho_parse(co_id, build.bu_id, house_url)
def get_build_info(self, build_url_list, co_name):
    """Walk each listing page, then every building-detail link found on it."""
    flags = re.S | re.M
    base = 'http://www.sxczfdc.com/pubinfo/'
    for listing_path in build_url_list:
        try:
            build = Building(co_index)
            build.co_name = co_name
            build_url = base + listing_path
            listing_html = requests.get(build_url, headers=self.headers).text
            # build_detail_url = re.findall('(Pub_dtxx.aspx\?ProjectBuildingID=.*?)"', html, re.S | re.M)[0]
            for detail_path in re.findall('(Pub_dtxx.aspx\?ProjectBuildingID=.*?)"',
                                          listing_html, flags):
                try:
                    content = requests.get(base + detail_path, headers=self.headers).text
                    build.bu_num = re.findall(
                        'BuildingInfo1_lblBuildingName">(.*?)<', content, flags)[0]
                    build.bu_all_house = re.findall(
                        'BuildingInfo1_lblZts">(.*?)<', content, flags)[0]
                    build.bu_floor = re.findall(
                        'BuildingInfo1_lblZcs">(.*?)<', content, flags)[0]
                    build.bu_build_size = re.findall(
                        'BuildingInfo1_lblJzmj">(.*?)<', content, flags)[0]
                    build.bu_live_size = re.findall(
                        'BuildingInfo1_lblZzmj">(.*?)<', content, flags)[0]
                    build.bu_pre_sale = re.findall(
                        'BuildingInfo1_lblYsxkzh">(.*?)<', content, flags)[0]
                    build.bu_pre_sale_date = re.findall(
                        'BuildingInfo1_lblYsxkzfzrq">(.*?)<', content, flags)[0]
                    build.insert_db()
                    house_url_list = re.findall(
                        "onClick=.getMoreHouseInfo\('(.*?)'\)", content, flags)
                    self.get_house_info(house_url_list, co_name, build.bu_num)
                except Exception as e:
                    print(e)
        except Exception as e:
            print(e)
def build_parse(self, co_id):  # parse building info
    """POST the building-list query for a project and store each row.

    Fix: a single Building instance was created once before the loop and
    shared across all iterations; each row now gets its own instance,
    consistent with the other parsers in this file.
    """
    build_info_url = "http://ys.tyfdc.gov.cn/Firsthand/tyfc/publish/ProNBList.do"
    formdata = {"pid": co_id, "pageNo": "1", "pageSize": "50"}
    res = requests.post(build_info_url, data=formdata, headers=self.headers)
    con = res.text
    info = re.findall('<tr objid.*?</tr>', con, re.S | re.M)
    for i in info:
        bu = Building(co_index)
        bu.co_id = co_id
        bu.bu_id = re.search('objid="(\d+)"', i).group(1)
        bu.bu_num = re.findall('<span>(.*?)<', i)[1]
        bu.bu_floor = re.search('<td>(\d+)\(', i).group(1)
        bu.bu_address = re.findall('<td>(.*?)</td>', i)[-1]
        bu.insert_db()
        self.house_parse(bu.bu_id, co_id)
def get_build_info(self, co_id):
    """Query the Yanji JSON endpoint for a project's buildings and store them."""
    payload = "activityId=" + str(co_id) + "&module=jtsActBuildingInfo"
    result = requests.post(url='http://www.yanjifc.com/jdi',
                           data=payload,
                           headers=self.headers)
    for row in result.json()['ROWS']['ROW']:
        build = Building(co_index)
        build.bu_all_size = self.dict_get(row, 'BUILDING_AREA')
        build.bu_address = self.dict_get(row, 'LOCATION')
        build.bu_num = self.dict_get(row, 'LOCATION')  # same source field as the address
        build.bu_floor = self.dict_get(row, 'TOTAL_FLOORS')
        build.bu_all_house = self.dict_get(row, 'TOTAL_SET')
        build.co_build_structural = self.dict_get(row, 'STRUCTURE')
        build.bu_id = self.dict_get(row, 'RESOURCE_GUID')
        build.co_id = co_id
        build.insert_db()
        self.get_house_info(co_id, build.bu_id)
def get_build_info(self, presell_url_list, co_id):
    """For each presell page collect its building links, store every building,
    then forward the house detail ids found on it."""
    flags = re.S | re.M
    for presell_url in presell_url_list:
        listing = requests.get(self.url + presell_url, headers=self.headers)
        build_url_list = re.findall('【<a href="(.*?)" target="_self"',
                                    listing.text, flags)
        for build_url in build_url_list:
            build_info_url = self.url + build_url
            try:
                page = requests.get(build_info_url, headers=self.headers)
                con = page.text
                bu = Building(co_index)
                bu.co_id = co_id
                bu.bu_id = re.search('ID=(\d+)', build_url).group(1)
                bu.bu_num = re.search('栋.*?号.*?BuildingName">(.*?)</span', con, flags).group(1)
                bu.bu_floor = re.search('总 层 数.*?(\d+)</span', con, flags).group(1)
                bu.bu_build_size = re.search('建筑面积.*?Jzmj">(.*?)</span', con, flags).group(1)
                bu.bu_live_size = re.search('住宅面积.*?Zzmj">(.*?)</span', con, flags).group(1)
                bu.bu_not_live_size = re.search('非住宅面积.*?Fzzmj">(.*?)</span', con, flags).group(1)
                bu.bu_pre_sale = re.search('预售许可证.*?xkzh">(.*?)</span', con, flags).group(1)
                bu.bu_pre_sale_date = re.search('发证日期.*?fzrq">(.*?)</span', con, flags).group(1)
                bu.bu_type = re.search('项目类型.*?Type">(.*?)</span', con, flags).group(1)
                bu.insert_db()
            except Exception as e:
                print("co_index={},楼栋信息错误".format(co_index), e)
                continue
            house_detail_list = re.findall("getMoreHouseInfo\('(.*?)'\)\"", con, flags)
            self.get_house_info(co_id, bu.bu_id, house_detail_list)
def bu_parse(self, co_id, bulist):
    """Fetch each building page and persist its record plus house links.

    Improvement: per-building try/except (matching every other parser in this
    file) so one failed or malformed page no longer aborts the whole project.
    """
    for bo in bulist:
        try:
            bu_url = "http://110.89.45.7:8082" + bo
            bu_res = requests.get(bu_url, headers=self.headers)
            con = bu_res.text
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_id = re.search('buildingInfoID=(.*?)&', bo).group(1)
            bu.bu_num = re.search('幢号.*?">(.*?)</', con, re.S | re.M).group(1)
            bu.bu_floor = re.search('总 层 数.*?">(.*?)</', con, re.S | re.M).group(1)
            bu.bu_live_size = re.search('批准销售.*?">.*?</td.*?">(.*?)</td', con,
                                        re.S | re.M).group(1)
            bu.bu_all_size = re.search('总面积.*?">(.*?)</', con, re.S | re.M).group(1)
            bu.bu_type = re.search('设计用途.*?">(.*?)</', con, re.S | re.M).group(1)
            bu.insert_db()
            bu_html = etree.HTML(con)
            ho_list = bu_html.xpath("//td[@style]/a")
            self.ho_parse(co_id, bu.bu_id, ho_list)
        except Exception as e:
            print('楼栋错误,co_index={}'.format(co_index), e)
def get_build_info(self, build_url_list, co_id):
    """Scrape each Longyan building page and store the parsed record."""
    flags = re.S | re.M
    for path in build_url_list:
        build_url = 'http://www.fjlyfdc.com.cn/' + path
        try:
            build = Building(co_index)
            build.bu_id = re.search('buildingInfoID=(.*?)&', build_url).group(1)
            build.co_id = co_id
            html = requests.get(build_url, headers=self.headers).text
            build.bo_develops = re.search('开发商:.*?<td.*?>(.*?)<', html, flags).group(1)
            build.co_name = re.search('项目名称:.*?<td.*?>(.*?)<', html, flags).group(1)
            build.bu_address = re.search('坐落位置:.*?<td.*?>(.*?)<', html, flags).group(1)
            build.bu_num = re.search('幢号:.*?<td.*?>(.*?)<', html, flags).group(1)
            build.co_build_structural = re.search('建筑结构:.*?<td.*?>(.*?)<', html, flags).group(1)
            build.bu_type = re.search('设计用途:.*?<td.*?>(.*?)<', html, flags).group(1)
            build.bu_floor = re.search('总 层 数:.*?<td.*?>(.*?)<', html, flags).group(1)
            build.co_all_size = re.search('总面积:.*?<td.*?>(.*?)<', html, flags).group(1)
            build.bo_build_start_time = re.search('开工日期:.*?<td.*?>(.*?)<', html, flags).group(1)
            build.insert_db()
            house_url_list = re.findall('href="(/House/HouseInfo\?HouseCenterID=.*?)"',
                                        html, flags)
            self.get_house_info(house_url_list, build.bu_id, co_id)
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
def get_build_info(self, build_url_list):
    """Assemble regex extraction rules per building and delegate to ProducerListUrl.

    Fix: ``build_url`` is built before the try-block so the except message can
    never hit an unbound name (previously ``Building(co_index)`` failing on
    the first statement left it undefined).
    """
    for i in build_url_list:
        build_url = 'http://www.ndjsj.gov.cn/House/' + i
        try:
            build = Building(co_index)
            # attribute values are regex rules, consumed via build.to_dict()
            build.co_name = '项目名称:.*?<td.*?>(.*?)<'
            build.bu_num = '幢 号:.*?<td.*?>(.*?)<'
            build.bu_address = '坐落位置:.*?<td.*?>(.*?)<'
            build.co_build_structural = '建筑结构:.*?<td.*?>(.*?)<'
            build.bu_floor = '总 层 数:.*?<td.*?>(.*?)<'
            build.bu_build_size = '总 面 积:.*?<td.*?>(.*?)<'
            # build.bu_type = '设计用途:.*?<td.*?>(.*?)<'
            build.bu_all_house = '批准销售:.*?<td.*?>(.*?)<'
            p = ProducerListUrl(page_url=build_url,
                                request_type='get', encode='utf-8',
                                analyzer_rules_dict=build.to_dict(),
                                current_url_rule='javascript:ShowTitle.*?href="(.*?)"',
                                analyzer_type='regex',
                                headers=self.headers)
            house_url_list = p.get_details()
            self.get_house_info(house_url_list)
        except Exception as e:
            print('宁德楼栋错误,url={}'.format(build_url), e)
def get_build_info(self, build_url_list):
    """Fetch each Nanping building page, persist it, then walk its house links."""
    flags = re.S | re.M
    for path in build_url_list:
        try:
            build = Building(co_index)
            build_url = 'http://www.fjnpfdc.com/House/' + path
            page = requests.get(build_url, headers=self.headers).content.decode('gbk')
            build.co_name = re.search("项目名称:.*?<td.*?>(.*?)<", page, flags).group(1)
            build.bu_num = re.search("幢 号:.*?<td.*?>(.*?)<", page, flags).group(1)
            build.co_use = re.search("设计用途:.*?<td.*?>(.*?)<", page, flags).group(1)
            build.co_build_structural = re.search("建筑结构:.*?<td.*?>(.*?)<", page, flags).group(1)
            build.bu_floor = re.search("总 层 数:.*?<td.*?>(.*?)<", page, flags).group(1)
            build.bu_build_size = re.search("总 面 积:.*?<td.*?>(.*?)<", page, flags).group(1)
            build.co_build_end_time = re.search("竣工日期:.*?<td.*?>(.*?)<", page, flags).group(1)
            house_url_list = re.findall('<a href="(HouseInfo.*?)"', page)
            # ids come from the incoming link's query string
            build.co_id = re.search('ProjectId=(.*?)&', path).group(1)
            build.bu_id = re.search('BuildingId=(.*?)&P', path).group(1)
            build.insert_db()
            self.get_house_info(house_url_list, build.bu_id, build.co_id)
        except Exception as e:
            print("co_index={},楼栋{}错误".format(co_index, path), e)
def get_build_info(self, bu_pre_sale, bo_develops, bu_co_name, bu_con):
    """Parse one building (楼栋) record out of an HTML fragment and persist it.

    :param bu_pre_sale: pre-sale permit number, stored as-is
    :param bo_develops: developer name, stored as-is
    :param bu_co_name: community/project name, stored as-is
    :param bu_con: HTML fragment containing the building's field table
    :raises AttributeError: if any expected field is missing from ``bu_con``
        (``re.search`` returns ``None`` and ``.group`` fails)
    """
    build = Building(co_index)
    build.bu_id = re.search(r'编号.*?>(\d+)<', bu_con, re.S | re.M).group(1)
    build.bu_num = re.search(r'幢号.*?>(\d+)<', bu_con, re.S | re.M).group(1)
    build.bu_floor = re.search(r'总层数.*?>(\d+)<', bu_con, re.S | re.M).group(1)
    # FIX: the original pattern r'\d+.\d+' left the dot unescaped, so it
    # matched ANY character (e.g. captured "123x45" whole). The optional
    # group below matches both "123.45" and plain integers like "1234"
    # (which the old pattern also accepted) without swallowing junk.
    build.bu_build_size = re.search(r'预售建筑面积.*?>(\d+(?:\.\d+)?)<', bu_con, re.S | re.M).group(1)
    build.bu_address = re.search(r'楼房坐落.*?;">(.*?)</span', bu_con, re.S | re.M).group(1)
    build.bu_live_size = re.search(r'住宅建筑面积.*?>(\d+(?:\.\d+)?)<', bu_con, re.S | re.M).group(1)
    build.bu_not_live_size = re.search(r'非住宅建筑面积.*?;">(.*?)</span', bu_con, re.S | re.M).group(1)
    build.bo_build_start_time = re.search(r'开工日期.*?;">(.*?)</span', bu_con, re.S | re.M).group(1)
    build.bu_all_house = re.search(r'总套数.*?>(\d+)<', bu_con, re.S | re.M).group(1)
    build.bu_pre_sale = bu_pre_sale
    build.bo_develops = bo_develops
    build.co_name = bu_co_name
    build.insert_db()
def get_build_info(self, more_build_url):
    """Crawl building (楼栋) listing pages for Jiangmen (jmfc.com.cn).

    Each ``Building`` attribute is set to a regex pattern (not data);
    ``ProducerListUrl`` fetches the page and applies the patterns, and
    the house links it extracts are handed to ``get_house_info``.

    :param more_build_url: iterable of relative URL fragments appended
        to ``http://www.jmfc.com.cn/``
    """
    for rel_url in more_build_url:
        try:
            page_url = 'http://www.jmfc.com.cn/' + rel_url
            build = Building(co_index)
            # field -> extraction pattern (applied later by ProducerListUrl)
            extraction_rules = {
                'bu_num': '<tr bgcolor="#FFFFFF">.*?<td.*?>(.*?)<',
                'co_id': '楼盘首页.*?aid-(.*?)/',
                'bu_id': '&addno=12&action=loupantable&lzbm=(.*?)&ql_xh=',
                'bu_pre_sale': '<tr bgcolor="#FFFFFF">.*?<td.*?>.*?<.*?<td.*?>(.*?)<',
                'bu_floor': '<tr bgcolor="#FFFFFF">.*?<td.*?>.*?<.*?<td.*?>.*?<.*?<td.*?>(.*?)<',
                'bu_all_house': '<tr bgcolor="#FFFFFF">.*?<td.*?>.*?<.*?<td.*?>.*?<.*?<td.*?>.*?<.*?<td.*?>(.*?)<',
            }
            for field, pattern in extraction_rules.items():
                setattr(build, field, pattern)
            producer = ProducerListUrl(
                page_url=page_url,
                request_type='get',
                encode='gbk',
                analyzer_rules_dict=build.to_dict(),
                current_url_rule='<tr bgcolor="#FFFFFF">.*?align="left".*?href="(.*?)"',
                analyzer_type='regex',
                headers=self.headers)
            self.get_house_info(producer.get_details())
        except Exception as e:
            print(e)
def get_comm_info(self, comm_info):
    """Parse one community (小区) row from qyfgj.cn, persist it, then walk
    its building list, dispatching to current-sale (现售) or pre-sale
    (预售) handling per building.

    :param comm_info: HTML fragment for one community table row
    """
    co = Comm(co_index)
    co.co_name = re.search('_blank">(.*?)</a', comm_info).group(1)
    try:
        co.co_address = re.findall('px">(.*?)</td', comm_info)[1]
    except Exception:
        co.co_address = None
    co.area = re.search('center">(.*?)</td>', comm_info).group(1)
    co_detail_url = re.search("href='(.*?)'", comm_info).group(1)
    co_url = "http://www.qyfgj.cn/newys/" + co_detail_url
    try:
        # FIX: timeout so a stalled server cannot hang the crawl.
        res = requests.get(co_url, headers=self.headers, timeout=30)
    except Exception as e:
        print("co_index={}小区未请求到".format(co_index), e)
        # FIX: the original fell through here and crashed with a
        # NameError on the unbound `res` below.
        return
    con = res.content.decode('gbk')
    try:
        co.co_develops = re.search('开发商名称.*?px;">(.*?)</a', con, re.S | re.M).group(1)
        co.co_all_house = re.search(r'总套数.*?">(\d+) ', con, re.S | re.M).group(1)
        # FIX: unescaped dot in r'\d+.\d+' matched any character; the
        # optional decimal part also still accepts integer areas.
        co.co_all_size = re.search(r'总面积.*?">(\d+(?:\.\d+)?) m', con, re.S | re.M).group(1)
    except Exception:
        print("小区无开发商等信息")
    co.insert_db()
    try:
        build = re.findall('<tr bgcolor="white">(.*?)</tr>', con, re.S | re.M)
    except Exception:
        # FIX: bind an empty list so the loop below never sees an
        # unbound `build` (the original only printed here).
        build = []
        print("小区没有楼栋信息")
    # The site requires a fixed session cookie and a per-request Referer.
    _ua = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
           '(KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36')
    _cookie = 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j'
    build_headers = {'User-Agent': _ua, 'Cookie': _cookie, 'Referer': co_url}
    for build_info in build:
        if "进入" not in build_info:
            print("楼栋无链接地址")
            continue
        build_url = re.search('href="(.*?)"><font', build_info).group(1)
        build_url = "http://www.qyfgj.cn/newys/" + build_url
        ho_headers = {'User-Agent': _ua, 'Cookie': _cookie, 'Referer': build_url}
        build_res = requests.get(build_url, headers=build_headers, timeout=30)
        build_con = build_res.content.decode('gbk')
        if re.search(r'ID=(\d+)', build_url):
            # 现售 (current sale): only the id is available; houses come
            # from the building page itself.
            bu = Building(co_index)
            bu_id = re.search(r'ID=(\d+)', build_url).group(1)
            bu.bu_id = bu_id
            bu.co_name = co.co_name
            bu.insert_db()
            self.get_house_info(headers=ho_headers, bu_id=bu_id, url=build_url)
        else:
            # 预售 (pre-sale): permit fields on the page, one table row
            # per building number.
            bu = Building(co_index)
            bu.co_name = co.co_name
            bu.bu_type = re.search('用途.*?">(.*?)</td>', build_con, re.S | re.M).group(1)
            bu.bu_pre_sale = re.search('许可证编号.*?_blank">(.*?)</a>', build_con, re.S | re.M).group(1)
            bu.bu_pre_sale_date = re.search('有效日期.*?">(.*?)</td>', build_con, re.S | re.M).group(1)
            bu.bu_address = re.search('项目座落.*?">(.*?)</td>', build_con, re.S | re.M).group(1)
            ret = re.findall('<tr onmouseover(.*?)</tr', build_con, re.S | re.M)
            for row in ret:
                house_url = re.search('href="(.*?)"', row).group(1)
                house_url = "http://www.qyfgj.cn/newys/" + house_url
                bu.bu_id = re.search('dbh=(.*?)&', row).group(1)
                bu.bu_num = re.search('<td width="89.*?">(.*?)</', row).group(1)
                bu.bu_floor = re.search(r'<td width="84.*?">(\d+)</td', row).group(1)
                bu.insert_db()
                ho_res = requests.get(house_url, headers=ho_headers, timeout=30)
                ho_con = ho_res.content.decode('gbk')
                new_headers = {'User-Agent': _ua, 'Cookie': _cookie, 'Referer': house_url}
                self.get_house_info(ho_con=ho_con, headers=new_headers, bu_id=bu.bu_id)