def get_build_info(self, build_lis, co_id):
    """Download each building page, store its details and crawl its houses.

    Every entry of ``build_lis`` is appended to ``http://xx.yyfdcw.com`` and
    fetched with ``self.headers``.  The fields are cut out of the page with
    regular expressions, written to a ``Building`` record via ``insert_db()``
    and the building is then handed to ``self.get_house_info``.

    A failed request is printed and that entry is skipped.  A pattern that
    does not match is not handled here: ``re.search`` returns ``None`` and
    the ``.group(1)`` call raises ``AttributeError``.

    :param build_lis: iterable of building links, each containing ``Bid=<digits>``.
    :param co_id: community id stored on every record and passed on to
        ``get_house_info``.
    """

    def first_group(pattern, text):
        # Text of the first capture group; AttributeError when nothing matches.
        return re.search(pattern, text).group(1)

    for link in build_lis:
        page_url = "http://xx.yyfdcw.com" + link
        try:
            reply = requests.get(page_url, headers=self.headers)
        except Exception as err:
            print("co_index={},楼栋信息错误".format(co_index), err)
            continue

        page = reply.text
        record = Building(co_index)
        record.co_id = co_id
        # The building id comes from the link itself, everything else from the page.
        record.bu_id = first_group(r'Bid=(\d+)', link)
        record.bu_num = first_group('名称.*?">(.*?)</spa', page)
        record.bu_pre_sale = first_group("编.*?red'>(.*?)</a", page)
        record.bu_pre_sale_date = first_group('颁发日期.*?Date">(.*?)</span', page)
        record.bo_build_start_time = first_group('开工日期.*?">(.*?)</span', page)
        record.bo_build_end_time = first_group('竣工日期.*?">(.*?)</span', page)
        record.bo_develops = first_group('单位.*?">(.*?)</span', page)
        record.bu_floor = first_group('层数.*?">(.*?)</span', page)
        record.bu_live_size = first_group('住宅面积.*?">(.*?)</span', page)
        record.size = first_group('总面积.*?">(.*?)</span', page)
        record.insert_db()

        # Value shown next to the "测量号" label; forwarded as the third argument.
        survey_no = first_group('测量号.*?">(.*?)</span', page)
        self.get_house_info(co_id, record.bu_id, survey_no)
def get_build_info(self, presell_url_list, co_id):
    """Walk pre-sale pages, store every linked building and crawl its houses.

    Each pre-sale path is fetched relative to ``self.url``; the building
    links found on it are fetched in turn.  Building fields are extracted
    with regular expressions and saved through ``Building.insert_db()``.
    The ``getMoreHouseInfo('...')`` arguments found on the building page are
    passed to ``self.get_house_info``.

    Any exception while fetching or parsing a building page is printed and
    that building is skipped.  The request for the pre-sale page itself is
    not guarded.

    :param presell_url_list: iterable of pre-sale page paths.
    :param co_id: community id stored on each building record.
    """
    flags = re.S | re.M

    for presell_path in presell_url_list:
        listing = requests.get(self.url + presell_path, headers=self.headers)
        building_paths = re.findall('【<a href="(.*?)" target="_self"', listing.text, flags)

        for building_path in building_paths:
            detail_url = self.url + building_path
            try:
                page = requests.get(detail_url, headers=self.headers).text

                def cell(pattern):
                    # First capture group on the current building page.
                    return re.search(pattern, page, flags).group(1)

                record = Building(co_index)
                record.co_id = co_id
                record.bu_id = re.search(r'ID=(\d+)', building_path).group(1)
                record.bu_num = cell('栋.*?号.*?BuildingName">(.*?)</span')
                record.bu_floor = cell(r'总 层 数.*?(\d+)</span')
                record.bu_build_size = cell('建筑面积.*?Jzmj">(.*?)</span')
                record.bu_live_size = cell('住宅面积.*?Zzmj">(.*?)</span')
                record.bu_not_live_size = cell('非住宅面积.*?Fzzmj">(.*?)</span')
                record.bu_pre_sale = cell('预售许可证.*?xkzh">(.*?)</span')
                record.bu_pre_sale_date = cell('发证日期.*?fzrq">(.*?)</span')
                record.bu_type = cell('项目类型.*?Type">(.*?)</span')
                record.insert_db()
            except Exception as err:
                print("co_index={},楼栋信息错误".format(co_index), err)
                continue

            house_keys = re.findall("getMoreHouseInfo\('(.*?)'\)\"", page, flags)
            self.get_house_info(co_id, record.bu_id, house_keys)
def get_build_info(self, co_id):
    """Query the building list of one project and store every row.

    A fixed, pre-encoded ``xmlInfo`` form body (with ``co_id`` spliced in as
    the ``ProjectID``) is POSTed to the AJAX endpoint.  Every ``<tr>`` of the
    response except the first is parsed into a ``Building`` record, saved
    with ``insert_db()`` and passed to ``self.get_house_info``.

    Any exception (request or parsing) aborts the remaining rows and is
    printed together with the request details.  The exception itself is now
    included in that output; previously it was captured but discarded, which
    made failures impossible to diagnose.

    :param co_id: project id as a string (it is concatenated into the payload).
    """
    build_url = "http://202.103.219.149:7000/ajax/LeadingMIS.CommonModel.CommonQuery.WebUI.AjaxManage.QueryDataParser,LeadingMIS.CommonModel.CommonQuery.WebUI.ashx"
    querystring = {"_method": "GetDataToDynamicInXml", "_session": "rw"}
    payload = "xmlInfo=%263Croot%2620QueryCode%263D%2622BuildingsInfo%2622%2620PageIndex%263D%26221%2622%2620PageSize%263D%262215%2622%2620SortField%263D%2622%2620ORDER%2620BY%2620Name%2622%2620QueryString%263D%2622QueryCode%263DBuildingsInfo%2626amp%263BProjectID%263D" + co_id + "%2622%2620BeginDate%263D%2622%262000%263A00%263A00%2622%2620EndDate%263D%2622%262023%263A59%263A59%2622%2620Flag%263D%2622TitleBody%2622%2620TitlesWidthInfo%263D%2622BuildNo%267C0%2624Name%267C0%2624FloorCount%267C0%2624RoomCount%267C0%2624YCJZArea%267C0%2624Structure%267C0%2624YSXKCer%267C0%2624ZJJG%267C0%2622%2620IsUseOCache%263D%26220%2622%2620IsUserID%263D%26220%2622%2620SiteId%263D%26228907bd13-1d14-4f9e-8c01-e482d9590d10%2622%2620LockedColumn%263D%26220%2622%2620IsLocked%263D%26220%2622%2620ClientWidth%263D%26221601%2622%2620ShowModeCode%263D%2622default%2622%2620Language%263D%2622chinese%2622/%263E"
    flags = re.S | re.M
    try:
        response = requests.request("POST", build_url, data=payload, params=querystring)
        html = response.text
        # The first <tr> is dropped; only the following rows are parsed.
        build_info_list = re.findall('<tr.*?>.*?</tr>', html, flags)[1:]
        for row in build_info_list:
            build = Building(co_index)
            build.co_id = co_id
            # Columns are addressed by counting "spanctfield" spans from the row start.
            build.bu_num = re.search(
                '<span class="spanctfield".*?<span class="spanctfield".*?>.*?<a.*?>(.*?)<',
                row, flags).group(1)
            build.bu_floor = re.search(
                '<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?>(.*?)<',
                row, flags).group(1)
            build.bu_pre_sale = re.search(
                '<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?>(.*?)<',
                row, flags).group(1)
            build.bu_id = re.search('id="Tr_(.*?)"', row, flags).group(1)
            build.insert_db()
            self.get_house_info(co_id, build.bu_id)
    except Exception as e:
        # Report the cause as well; the original dropped ``e`` entirely.
        print('请求错误,url={},data={},params={}'.format(
            build_url, payload, querystring), e)
def get_build_info(self, build_url_list, co_id):
    """Parse the building table of each project page and store its rows.

    For every link the project page is fetched from ``gold.ncfdc.com.cn``
    (with ``amp;`` stripped from the link).  The project name and the section
    between "项目楼栋列表" and ``ctl17_fLinks_pDataShow`` are extracted; each
    ``<tr>`` in that section containing ``href`` becomes a ``Building``
    record.  Row-level failures are printed and the row is skipped.  The
    house links found on the whole page are then passed to
    ``self.get_house_info``.

    The page request and the two page-level ``re.search`` calls are not
    guarded; a miss raises ``AttributeError``.

    :param build_url_list: iterable of project page links.
    :param co_id: community id stored on each building record.
    """
    flags = re.S | re.M

    for raw_link in build_url_list:
        build_url = 'http://gold.ncfdc.com.cn/' + raw_link.replace('amp;', '')
        res = requests.get(build_url)
        co_name = re.search('ctl15_proname">(.*?)<', res.text, flags).group(1)
        table_html = re.search('项目楼栋列表.*?ctl17_fLinks_pDataShow', res.text, flags).group()

        for row in re.findall('<tr>.*?</tr>', table_html, flags):
            # Rows without a link are not building rows.
            if 'href' not in row:
                continue
            try:
                record = Building(co_index)
                record.co_name = co_name
                record.bu_num = re.search('<tr>.*?<td>.*?<a href=.*?>(.*?)<', row, flags).group(1)
                record.bu_pre_sale = re.search('onclick="BinSHouseInfo.*?>(.*?)<', row, flags).group(1)
                record.bu_pre_sale_date = re.search('onclick="BinSHouseInfo.*?<td>(.*?)<', row, flags).group(1)
                record.bu_all_house = re.search('color:#ec5f00;">(.*?)<', row, flags).group(1)
                record.bu_id = re.search("DisplayB_ld&hrefID=(.*?)'", row, flags).group(1)
                record.co_id = co_id
                record.insert_db()
            except Exception as err:
                print('楼栋错误,co_index={},url={}'.format(co_index, build_url), err)

        # House links are collected from the whole page, once per project page.
        house_url_list = re.findall(
            "</span>.*?</td><td>.*?<a href='(.*?xs.*?)' target=\"_blank\">.*?查看",
            res.text, flags)
        self.get_house_info(house_url_list)
def build_parse(self, co_id):
    """Store every building of a project and hand each page to ``house_parse``.

    The building list page of ``co_id`` is fetched and all
    ``searchByLid('<digits>')`` ids are collected.  Each building page is
    downloaded (decoded as GBK), parsed into a ``Building`` record and saved.
    ``self.house_parse`` is then called with the page content.

    Error handling:

    * If the building page cannot be fetched or decoded, the error is logged
      and the building is skipped.  Previously execution fell through to
      ``house_parse`` with ``bu_con`` either unbound (``NameError`` on the
      first building) or still holding the *previous* building's page.
    * If only the field extraction fails, the error is logged and
      ``house_parse`` is still called with the page that was fetched, as
      before.

    The request for the list page itself is not guarded.

    :param co_id: project id as a string (it is concatenated into the URL).
    """
    list_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/loulist.jsp?Id_xmxq=' + co_id
    res = requests.get(list_url, headers=self.headers)
    con = res.content.decode()
    build_id_list = re.findall(r"searchByLid\('(\d+)'\)", con)
    flags = re.S | re.M

    for build_id in build_id_list:
        bu_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/lpbxx_new.jsp?lid=' + build_id
        try:
            bu_res = requests.get(bu_url, headers=self.headers)
            bu_con = bu_res.content.decode('gbk')
        except Exception as e:
            log.error('{}楼栋错误{}'.format(build_id, e))
            # No page for this building: never reuse a stale ``bu_con``.
            continue

        try:
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_id = build_id
            bu.bu_num = re.search('楼栋名称.*?">(.*?)</td', bu_con, flags).group(1)
            bu.bu_all_house = re.search('总套数.*?">总(.*?)套</td', bu_con, flags).group(1)
            bu.bu_floor = re.search('地上层数.*?">共(.*?)层</td', bu_con, flags).group(1)
            bu.bu_build_size = re.search('总建筑面积.*?">(.*?)</td', bu_con, flags).group(1)
            bu.bu_pre_sale = re.search(r"searchysxk\('(.*?)'\)", bu_con, flags).group(1)
            bu.bu_type = re.search('房屋用途.*?">(.*?)</td', bu_con, flags).group(1)
            bu.insert_db()
        except Exception as e:
            log.error('{}楼栋错误{}'.format(build_id, e))

        self.house_parse(co_id, build_id, bu_con)
def get_build_info(self, build_url_list):
    """Crawl each building page through ``ProducerListUrl`` and follow its houses.

    For every link the ``xqbm`` code is extracted and the building page URL
    is rebuilt from it.  A ``Building`` object is filled with *regex rules*
    (not values); ``ProducerListUrl`` receives them via ``build.to_dict()``
    together with the rule used to collect follow-up URLs.  Whatever
    ``get_details()`` returns is passed to ``self.get_house_info``.

    Any exception is printed with the URL being processed and the loop moves
    on.  ``build_url`` is pre-set to the raw link so the handler can always
    format its message; previously a failure before the URL was built raised
    ``UnboundLocalError`` inside the handler (first item) or reported the
    previous item's URL.

    :param build_url_list: iterable of links ending in ``xqbm=<code>``.
    """
    for i in build_url_list:
        # Fallback for the error message in case the code cannot be extracted.
        build_url = i
        try:
            build = Building(co_index)
            build_code = re.search(r'xqbm=(.*?)$', i).group(1)
            build_url = 'http://zjjg.0557fdc.com:9555/xiaoqu/donginfo.aspx?xqbm=' + build_code
            # These attributes hold extraction patterns that are passed to
            # ProducerListUrl below, not parsed values.
            build.bu_num = 'Labeldongmc">(.*?)<'
            build.bu_pre_sale = 'Labelyszheng">(.*?)<'
            build.bu_floor = 'Labelsceng">(.*?)<'
            build.bu_address = 'Label1zuoluo">(.*?)<'
            build.bo_build_start_time = 'Label1kaigong">(.*?)<'
            build.co_build_structural = 'Labeljiegou">(.*?)<'
            build.co_id = r'donginfo.aspx\?xqbm=(.*?)"'
            build.bu_id = 'id="DropDownList1".*?value="(.*?)"'
            p = ProducerListUrl(page_url=build_url,
                                request_type='get', encode='utf-8',
                                analyzer_rules_dict=build.to_dict(),
                                current_url_rule=r'location\.href=(.*?)"',
                                analyzer_type='regex',
                                headers=self.headers)
            house_url_list = p.get_details()
            self.get_house_info(house_url_list)
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
def bu_parse(self, bu_url, co_id, co_url):
    """Store every building listed on a pre-sale page and crawl its houses.

    The page at ``http://61.143.241.154/`` + ``bu_url`` is fetched with a
    ``Referer`` of ``co_url`` and decoded as GBK.  The pre-sale permit number
    and its validity date are read once from the page and copied onto every
    building found in the ``donglist`` table.  Each building is saved and
    its link is passed to ``self.house_parse``.

    Side effect: the module-level ``headers`` is rebound to the headers
    built here.

    Nothing is guarded: a missing pattern raises ``AttributeError`` and a row
    without the expected cells raises ``IndexError``.

    :param bu_url: path of the pre-sale page.
    :param co_id: community id stored on each building record.
    :param co_url: URL sent as the ``Referer`` header.
    """
    global headers
    headers = {
        'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
        'Referer':
            co_url
    }
    flags = re.S | re.M

    page = requests.get("http://61.143.241.154/" + bu_url, headers=headers).content.decode('gbk')
    permit_no = re.search('预售许可证编号.*?blank">(.*?)</a', page, flags).group(1)
    permit_date = re.search('预售证有效日期.*?">(.*?)</td', page, flags).group(1)

    rows = etree.HTML(page).xpath("//table[@id='donglist']//tr")
    for row in rows:
        record = Building(co_index)
        record.co_id = co_id
        house_link = row.xpath("./td/a/@href")[0]
        record.bu_id = re.search('dbh=(.*?)&', house_link).group(1)
        record.bu_num = row.xpath("./td[3]/text()")[0]
        record.bu_floor = row.xpath("./td[4]/text()")[0]
        record.bu_pre_sale = permit_no
        record.bu_pre_sale_date = permit_date
        record.insert_db()
        self.house_parse(house_link, co_id, record.bu_id)
def comm_info(self, con):
    """Store the community and its buildings; return the room-page links.

    ``con`` is queried with XPath.  The community fields are read from
    ``<span>`` elements by id, the community id is the first run of digits
    in the ``aspnetForm`` action, and the record is saved with
    ``insert_db()``.  Every ``<tr style='color:#000066;'>`` row is then
    stored as a building, using the row's cell texts by position.

    A single ``Building`` object is created before the loop and re-filled
    for every row.  Missing elements raise ``IndexError``.

    :param con: parsed document exposing ``.xpath()``.
    :return: list with the first ``<a href>`` of every building row.
    """

    def span_text(span_id):
        # First text node of the <span> carrying the given id.
        return con.xpath("//span[@id='" + span_id + "']/text()")[0]

    community = Comm(co_index)
    # Community name
    community.co_name = span_text('ctl00_ContentPlaceHolder2_web_item_retail1_lb_item_name')
    # Community id: first digits found in the form action
    form_action = con.xpath("//form[@id='aspnetForm']/@action")[0]
    community.co_id = re.search(r"\d+", form_action).group(0)
    # Address
    community.co_address = span_text('ctl00_ContentPlaceHolder2_lb_item_seat')
    # Developer
    community.co_develops = span_text('ctl00_ContentPlaceHolder2_lb_enter_name')
    # Total area
    community.co_size = span_text('ctl00_ContentPlaceHolder2_lb_area')
    # Building area
    community.co_build_size = span_text('ctl00_ContentPlaceHolder2_lb_item_area')
    # Completion date
    community.co_build_end_time = span_text('ctl00_ContentPlaceHolder2_lb_item_ew_date')
    # Land-use planning permit
    community.co_plan_pro = span_text('ctl00_ContentPlaceHolder2_lb_program_pcode')
    # Construction permit
    community.co_work_pro = span_text('ctl00_ContentPlaceHolder2_lb_jg')
    # Green-space percentage
    community.co_green = span_text('ctl00_ContentPlaceHolder2_lb_item_green_rate')
    # Land-use certificate
    community.co_land_use = span_text('ctl00_ContentPlaceHolder2_lb_td')
    community.insert_db()

    record = Building(co_index)
    rows = con.xpath("//tr[@style='color:#000066;']")
    room_list = []
    for row in rows:
        record.co_id = community.co_id
        record.co_name = community.co_name
        cells = row.xpath("./td/text()")
        record.bu_id = cells[0]
        record.bu_num = cells[1]
        record.bu_all_house = cells[2]
        record.size = cells[3]
        record.bu_floor = cells[4]
        record.bu_pre_sale = cells[5]
        record.insert_db()
        room_list.append(row.xpath("./td/a/@href")[0])
    return room_list
def bu_info(self, bu_list, co_id):
    """Store one building per table row (first row skipped) and crawl its houses.

    Cell values are read by XPath relative to the row; the building id is the
    ``buildid=<digits>`` part of the row's link.  After ``insert_db()`` the
    link is passed to ``self.ho_info``.

    Nothing is guarded: a missing cell raises ``IndexError`` and a link
    without ``buildid`` raises ``AttributeError``.

    :param bu_list: sequence of row elements exposing ``.xpath()``; element 0
        is ignored.
    :param co_id: community id stored on each record and passed on.
    """
    data_rows = bu_list[1:]
    for row in data_rows:
        record = Building(co_index)
        record.co_id = co_id
        record.bu_num = row.xpath("./td/a/text()")[0]
        record.bu_pre_sale = row.xpath("./td[2]/text()")[0]
        record.bu_type = row.xpath("./td[4]/text()")[0]
        link = row.xpath("./td/a/@href")[0]
        record.bu_id = re.search(r'buildid=(\d+)', link).group(1)
        record.insert_db()
        self.ho_info(link, co_id, record.bu_id)
def get_build_info(self, comm_url_list):
    """Store each building and all of its houses.

    Every entry must contain two ``+<digits>+`` groups: the first is used as
    ``sid``, the second as ``pid``.  The building page is fetched and parsed
    into a ``Building``; then the house XML for ``sid`` is fetched and every
    ``<Result>`` element is stored as a ``House``.

    Fixes over the previous version:

    * The house URL contained the mangled text ``¶ms=`` (an HTML-entity
      decoding of ``&params=``); the query parameter is restored.
    * When a building failed, processing fell through to the house request
      with ``sid``/``build`` unbound or left over from the previous
      building, and the handler itself could fail on an unbound
      ``build_url``.  A failed building is now reported and skipped.
    * The delimiter after the cell value in three patterns was a bare
      whitespace character.  Raw HTML normally carries ``&nbsp;`` there, so
      the delimiter now accepts ``&nbsp;``, U+00A0 or a plain space.

    The house request itself is not guarded; house-level parse errors are
    printed and the house is skipped.

    :param comm_url_list: iterable of strings carrying ``+sid+`` and ``+pid+``.
    """
    flags = re.S | re.M
    # Explicit, tolerant form of the whitespace that ends a cell value.
    gap = r'(?:&nbsp;|[ \xa0])'

    for i in comm_url_list:
        # Fallback so the error message can always be formatted.
        build_url = i
        try:
            ids = re.findall(r'\+(\d+)\+', i)
            sid = ids[0]
            pid = ids[1]
            build_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/bldg_query.aspx?pid=' + pid + '&sid=' + sid
            response = requests.get(build_url)
            html = response.text
            build = Building(co_index)
            build.bu_id = pid
            # NOTE(review): name and address are read with the same pattern,
            # as in the original code — confirm this is intended.
            build.bu_num = re.search('楼栋座落.*?<td.*?>(.*?)<', html, flags).group(1)
            build.bu_address = re.search('楼栋座落.*?<td.*?>(.*?)<', html, flags).group(1)
            build.bu_pre_sale = re.search('预售证号.*?">(.*?)' + gap, html, flags).group(1)
            build.bu_pre_sale_date = re.search('时间.*?">(.*?)' + gap, html, flags).group(1)
            build.bu_all_house = re.search('dM.*?">(.*?)' + gap, html, flags).group(1)
            build.insert_db()
        except Exception as e:
            print('co_index={}, 楼栋错误,url={}'.format(co_index, build_url), e)
            # Without a valid building there is nothing to attach houses to.
            continue

        house_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/proxp.aspx?key=WWW_LPB_001&params=' + sid
        result = requests.get(house_url)
        html_ = result.text
        for house_info in re.findall('<Result.*?</Result>', html_, flags):
            try:
                house = House(co_index)
                house.bu_id = build.bu_id
                house.bu_num = build.bu_num
                house.ho_name = re.search('<ONAME>(.*?)</ONAME>', house_info, flags).group(1)
                house.ho_num = re.search('<OSEQ>(.*?)</OSEQ>', house_info, flags).group(1)
                house.ho_build_size = re.search('<BAREA>(.*?)</BAREA>', house_info, flags).group(1)
                house.ho_floor = re.search('<FORC>(.*?)</FORC>', house_info, flags).group(1)
                house.ho_true_size = re.search('<PAREA>(.*?)</PAREA>', house_info, flags).group(1)
                house.insert_db()
            except Exception as e:
                print('co_index={}, 房号错误'.format(co_index), e)
def get_comm_info(self, comm_info_list):
    """Store one building per HTML fragment and crawl its houses.

    Each fragment is parsed with regular expressions: the first ``href`` is
    the house-page link, the first link text is the building number, the
    second and third ``<td>`` contents are address and pre-sale permit, and
    ``slbh=...&`` supplies the building id.  After ``insert_db()`` the link
    and id go to ``self.get_house_info``.

    Nothing is guarded; a pattern that does not match raises
    ``AttributeError``.

    :param comm_info_list: iterable of HTML fragments (strings).
    """
    flags = re.S | re.M

    for fragment in comm_info_list:

        def pick(pattern):
            # First capture group within the current fragment.
            return re.search(pattern, fragment, flags).group(1)

        record = Building(co_index)
        house_url = pick('href="(.*?)"')
        record.bu_num = pick('<a.*?>(.*?)<')
        record.bu_address = pick('<td.*?<td.*?>(.*?)<')
        record.bu_pre_sale = pick('<td.*?<td.*?<td.*?>(.*?)<')
        record.bu_id = pick('slbh=(.*?)&')
        record.insert_db()
        self.get_house_info(house_url, record.bu_id)
def build_info(self, bu_list, co_id):
    """Store one building per link element and crawl its houses.

    The element's ``href`` supplies the building number (``zh=``) and id
    (``n=<digits>``); its second and third ``<p>`` supply the floor text and
    the pre-sale permit.  After ``insert_db()`` the absolute house URL is
    passed to ``self.ho_info``.

    Fix: the building number was read with ``'zh=(.*?)'``.  A lazy group at
    the very end of a pattern always matches the empty string, so ``bu_num``
    was always ``''``.  The value is now captured up to the next ``&`` (or
    the end of the link).

    Nothing is guarded: missing nodes raise ``IndexError`` and missing
    patterns raise ``AttributeError``.

    :param bu_list: iterable of elements exposing ``.xpath()``.
    :param co_id: community id stored on each record and passed on.
    """
    for bo in bu_list:
        ho_url = bo.xpath("./@href")[0]
        floor = bo.xpath(".//p[2]/text()")[0]
        bu = Building(co_index)
        bu.bu_pre_sale = bo.xpath(".//p[3]/text()")[0]
        # Query-string value of "zh": everything up to the next parameter.
        bu.bu_num = re.search(r'zh=([^&]*)', ho_url).group(1)
        bu.bu_id = re.search(r'n=(\d+)', ho_url).group(1)
        bu.co_id = co_id
        bu.bu_floor = re.search(r'总层数.*?(\d+)', floor).group(1)
        bu.insert_db()
        house_url = "http://www.ggsfcw.com/" + ho_url
        self.ho_info(house_url, co_id, bu.bu_id)
def comm(self, id):
    """Store the building identified by ``id`` and every house inside it.

    Three JSON endpoints under ``self.start_url`` are queried (home-page
    info, detail info, house list).  The building record is filled from the
    first two and saved.  For every group in the house list, each entry's
    ``houseNumber`` is looked up through a fourth endpoint and stored as a
    ``House``.

    One ``House`` object is created per group and re-filled for each entry.
    Errors while handling a single house are printed and skipped; the three
    initial requests and the building fields are not guarded.

    :param id: block / panel number; converted with ``str()`` for the URLs
        and stored unchanged as ``bu_id``.
    """
    api = self.start_url + "/api/buildInfos/"
    key = str(id)

    building = Building(co_index)
    houses_url = api + "getHouseInfosByPannelNumber?pannelNumber=" + key
    summary_url = api + "getHomePageBuildingInfo?blockNumber=" + key
    detail_url = api + "getDetailsBuildingInfo?blockNumber=" + key

    summary_res = requests.get(summary_url)
    detail_res = requests.get(detail_url)
    houses_res = requests.get(houses_url)
    summary = json.loads(summary_res.text)
    detail = json.loads(detail_res.text)
    houses = json.loads(houses_res.text)

    building.bu_id = id
    building.bu_num = summary["data"]["nameBuildings"]
    building.area = detail['data']['houseingArea']
    building.bu_address = summary["data"]["houseaddress"]
    building.bu_pre_sale = detail["data"]["yszh"]
    building.bu_type = summary["data"]["propertycategory"]
    building.bo_develops = summary["data"]["companyName"]
    building.insert_db()

    for group in houses["data"]:
        house = House(co_index)
        entries = group["data"]
        if len(entries) == 0:
            continue
        for entry in entries:
            try:
                room_id = entry["houseNumber"]
                room_url = api + "getHouseInfoByHouseNumber?houseNumber=" + str(room_id)
                room = json.loads(requests.get(room_url, headers=self.headers).text)
                house.bu_id = id
                house.ho_name = room["data"]["houseNo"]
                house.ho_build_size = room["data"]["buildArea"]
                house.ho_true_size = room["data"]["jacketArea"]
                house.ho_share_size = room["data"]["apportionedArea"]
                house.ho_floor = room["data"]["nominalLevel"]
                house.insert_db()
            except Exception as err:
                print(err)
def get_build_info(self, build_url_list, co_name):
    """Store every building reachable from the given pages and crawl houses.

    Each page is fetched from ``http://www.sxczfdc.com/pubinfo/``; every
    ``Pub_dtxx.aspx?ProjectBuildingID=...`` link on it is fetched as a
    building detail page.  The detail fields are read from
    ``BuildingInfo1_*`` elements, saved, and the ``getMoreHouseInfo``
    arguments are passed to ``self.get_house_info`` along with the community
    and building names.

    One ``Building`` object is created per outer page and re-filled for each
    detail link.  Exceptions are printed at both levels: an inner failure
    skips one building, an outer failure skips the page.

    :param build_url_list: iterable of page paths.
    :param co_name: community name stored on each record and passed on.
    """
    flags = re.S | re.M
    base = 'http://www.sxczfdc.com/pubinfo/'

    for page_path in build_url_list:
        try:
            record = Building(co_index)
            record.co_name = co_name
            build_url = base + page_path
            listing = requests.get(build_url, headers=self.headers).text

            for detail_path in re.findall('(Pub_dtxx.aspx\?ProjectBuildingID=.*?)"', listing, flags):
                try:
                    detail_url = base + detail_path
                    content = requests.get(detail_url, headers=self.headers).text

                    def first(pattern):
                        # First match on the current detail page; IndexError if none.
                        return re.findall(pattern, content, flags)[0]

                    record.bu_num = first('BuildingInfo1_lblBuildingName">(.*?)<')
                    record.bu_all_house = first('BuildingInfo1_lblZts">(.*?)<')
                    record.bu_floor = first('BuildingInfo1_lblZcs">(.*?)<')
                    record.bu_build_size = first('BuildingInfo1_lblJzmj">(.*?)<')
                    record.bu_live_size = first('BuildingInfo1_lblZzmj">(.*?)<')
                    record.bu_pre_sale = first('BuildingInfo1_lblYsxkzh">(.*?)<')
                    record.bu_pre_sale_date = first('BuildingInfo1_lblYsxkzfzrq">(.*?)<')
                    record.insert_db()

                    house_url_list = re.findall("onClick=.getMoreHouseInfo\('(.*?)'\)", content, flags)
                    self.get_house_info(house_url_list, co_name, record.bu_num)
                except Exception as err:
                    print(err)
        except Exception as err:
            print(err)
def get_build_detail(self, all_building_url_list):
    """Store the buildings of each project page and collect house-table URLs.

    For every URL the page is fetched and the project-level values
    (developer, sale area, permit, floors, house count, usage, project name)
    are read once.  Each building row found between ``houseTable_1`` and
    "当前共有" is saved as a ``Building`` carrying those shared values plus
    its own id and address.  ``area`` is set to the last entry of
    ``self.area_list`` contained in the address, if any.

    Sale area and permit stay as an empty list when their XPath finds
    nothing.  Errors are printed at row level and at page level.

    :param all_building_url_list: iterable of absolute page URLs.
    :return: list of absolute ``roomTable.aspx`` URLs; links of later pages
        are placed before those of earlier pages.
    """
    flags = re.S | re.M
    collected = []

    for page_url in all_building_url_list:
        try:
            html = requests.get(page_url, headers=self.headers).text
            tree = etree.HTML(html)

            def first_row_cell(column):
                # Link text nodes in the given column of the table's second row.
                return tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[%d]/a/text()' % column)

            # Developer
            developer = tree.xpath('//*[@id="content_1"]/div[3]/text()[2]')[0]
            # Sale area (optional)
            sale_area = first_row_cell(6)
            if sale_area:
                sale_area = sale_area[0]
            # Pre-sale permit (optional)
            permit = first_row_cell(1)
            if permit:
                permit = permit[0]
            floors = first_row_cell(3)[0]       # total floors
            house_count = first_row_cell(4)[0]  # total houses
            usage = first_row_cell(5)[0]        # usage type

            table_html = re.search('houseTable_1.*?当前共有', html, flags).group()
            rows = re.findall('class.*?</a></td>.*?</a></td>.*?</a></td>', table_html, flags)
            project_name = re.findall('项目名称:</b>(.*?)</div>', html, flags)[0].strip()

            page_links = []
            for row in rows:
                try:
                    record = Building(co_index)
                    record.bu_id = re.search("href='roomTable.aspx\?id=(.*?)&", row, flags).group(1)
                    record.bu_address = re.search(
                        "_blank.*?_blank'>(.*?)</a></td><td>", row, flags).group(1).strip()
                    record.bo_develops = developer
                    record.bu_build_size = sale_area
                    record.bu_pre_sale = permit
                    record.bu_num = project_name
                    record.bu_floor = floors
                    record.bu_all_house = house_count
                    record.bu_type = usage
                    # Every area name is checked; a later match overwrites an earlier one.
                    for area_name in self.area_list:
                        if area_name in record.bu_address:
                            record.area = area_name
                    record.insert_db()
                    house_path = re.search("(roomTable.aspx\?id=.*?&vc=.*?)'", row, flags).group(1)
                    page_links.append('http://dgfc.dg.gov.cn/dgwebsite_v2/Vendition/' + house_path)
                except Exception as err:
                    print('楼栋错误,co_index={},url={}'.format(co_index, page_url), err)
            collected = page_links + collected
        except Exception as err:
            print('楼栋错误,co_index={},url={}'.format(co_index, page_url), err)

    return collected
def bu_parse(self, detail_url, co_id):
    """Store every building behind a project's pre-sale permits.

    The pre-sale page (``detail_url`` with ``lp`` replaced by ``presell``)
    lists permits; each permit page lists buildings; each building page is
    parsed into a ``Building``, saved, and its response is passed to
    ``self.ho_parse``.  Permit and building pages are fetched through a
    randomly chosen entry of ``self.proxies`` and retried until a request
    succeeds.

    Changes over the previous version:

    * The retry loop used a bare ``except:``, which also swallowed
      ``KeyboardInterrupt``/``SystemExit`` and made the crawler impossible
      to stop while a host was down.  Only ``Exception`` is retried now.
    * The proxy index was hard-coded to ``0..9``; it now follows
      ``len(self.proxies)``.  An empty pool raises ``ValueError`` instead of
      retrying forever.
    * The duplicated retry loop is a single local helper.

    NOTE(review): retries are still unbounded, as before — a permanently
    failing URL blocks the crawl.

    :param detail_url: project detail URL containing ``lp``.
    :param co_id: community id stored on each building record.
    """

    def fetch_via_proxy(url):
        # Keep trying random proxies until one request goes through.
        while True:
            proxy = self.proxies[random.randrange(len(self.proxies))]
            try:
                return requests.get(url, headers=self.headers, proxies=proxy, timeout=10)
            except Exception:
                continue

    pre_url = detail_url.replace('lp', 'presell')
    pre_res = requests.get(pre_url, headers=self.headers)
    pre_html = etree.HTML(pre_res.text)
    bu_pre_list = pre_html.xpath("//dt/strong/a")

    for bu_pre in bu_pre_list:
        bu_pre_url = bu_pre.xpath("./@href")[0]
        bu_pre_sale = bu_pre.xpath("./text()")[0]
        bu_url = 'http://www.zstmsf.com' + bu_pre_url
        bu_res = fetch_via_proxy(bu_url)
        bu_html = etree.HTML(bu_res.text)
        bu_list = bu_html.xpath("//tr//strong/a/@href")

        for bo_url in bu_list:
            ho_url = "http://www.zstmsf.com" + bo_url
            ho_res = fetch_via_proxy(ho_url)
            build = Building(co_index)
            build.co_id = co_id
            build.bu_id = re.search(r'zid=.*?(\d+)', ho_url).group(1)
            build.bu_num = re.search('幢名称:<strong>(.*?)<', ho_res.text).group(1)
            build.bu_all_house = re.search("幢总套数.*?'>(.*?)</", ho_res.text).group(1)
            build.bu_all_size = re.findall("面积.*?'>(.*?)</", ho_res.text)[0]
            build.bu_pre_sale = bu_pre_sale
            build.insert_db()
            self.ho_parse(co_id, build.bu_id, ho_res)
def get_build_info(self, co_id):
    """Store every building returned for a project and crawl its houses.

    The query endpoint is called with ``PCODE=co_id``.  Each match of the
    ``bid=... onclick="BuildChange(..., '<permit>', '<number>')"`` pattern
    yields one building, which is saved and then passed to
    ``self.get_house_info``.

    Errors for a single building are printed and the loop continues; the
    request itself is not guarded.

    :param co_id: project code as a string (it is concatenated into the URL).
    """
    build_url = 'http://www.lhfdc.gov.cn/Templets/LH/aspx/HPMS/GetQueryResult.ashx?type=0&PCODE=' + co_id
    html = requests.get(build_url, headers=self.headers).text
    matches = re.findall(
        "bid=(.*?) .*?onclick=\"BuildChange\(.*?,'(.*?)','(.*?)'\)",
        html, re.S | re.M)

    # Each match is a (building id, pre-sale permit, building number) tuple.
    for building_id, permit, number in matches:
        try:
            record = Building(co_index)
            record.co_id = co_id
            record.bu_pre_sale = permit
            record.bu_num = number
            record.bu_id = building_id
            record.insert_db()
            self.get_house_info(co_id, record.bu_id)
        except Exception as err:
            print('楼栋错误,co_index={},url={}'.format(co_index, build_url), err)
def get_build_info(self, build_url_list, bu_pre_sale_list, co_name, co_id):
    """Store each building and the houses listed on its page.

    ``build_url_list`` and ``bu_pre_sale_list`` are read in parallel by
    index.  Each building page (GBK) supplies the building number and
    address; the house links and names found on the same page are then
    fetched one by one and stored as ``House`` records.

    A failure while handling a building (including a missing entry in
    ``bu_pre_sale_list``) is printed and the building is skipped; a failure
    for a single house is printed and the house is skipped.

    :param build_url_list: sequence of building page paths containing ``lh=<digits>``.
    :param bu_pre_sale_list: sequence of pre-sale permits, index-aligned with
        ``build_url_list``.
    :param co_name: community name stored on every record.
    :param co_id: community id stored on every record.
    """
    flags = re.S | re.M
    host = 'http://221.2.144.162:8090/'

    for b_idx, build_path in enumerate(build_url_list):
        try:
            record = Building(co_index)
            record.co_id = co_id
            record.co_name = co_name
            record.bu_pre_sale = bu_pre_sale_list[b_idx]
            record.bu_id = re.search(r'lh=(\d+)', build_path).group(1)
            build_url = host + build_path
            html = requests.get(build_url, headers=self.headers).content.decode('gbk')
            record.bu_num = re.findall('<font color=white.*?><b>(.*?)<', html, flags)[0]
            record.bu_address = re.findall('坐落位置:</b>(.*?)<', html, flags)[0]
            record.insert_db()

            ho_url_list = re.findall('background-.*?href=(.*?) ', html, flags)
            ho_name_list = re.findall('background-color.*?<a.*?>(.*?)<', html, flags)
            for h_idx, house_path in enumerate(ho_url_list):
                try:
                    house = House(co_index)
                    house_url = host + house_path
                    detail = requests.get(house_url, headers=self.headers).content.decode('gbk')
                    house.bu_id = record.bu_id
                    house.co_id = co_id
                    house.ho_type = re.findall('用 途:.*?<td.*?>(.*?)<', detail, flags)[0]
                    house.ho_build_size = re.findall('建筑面积:.*?<td>(.*?)<', detail, flags)[0]
                    house.bu_num = record.bu_num
                    house.co_name = co_name
                    house.ho_name = ho_name_list[h_idx]
                    house.insert_db()
                except Exception as err:
                    print("co_index={},房屋信息错误".format(co_index), err)
        except Exception as err:
            print("co_index={},楼栋信息错误".format(co_index), err)
def build_parse(self, co_id):
    """Store every building of a project and crawl the matching house page.

    The project's left-hand navigation page is fetched; building links are
    the ``<td colspan='2'>`` anchors with the first four and the last one
    dropped.  The house link for building *n* is taken from the *n*-th
    ``<td width='54%'>`` cell.  Each building page is parsed, saved, and the
    house link is passed to ``self.house_parse``.

    Changes over the previous version:

    * Errors in the house step were swallowed with a silent ``continue``;
      they are now printed, as the building errors already were.
    * A fresh ``Building`` is created per iteration, as the sibling crawlers
      do, instead of re-filling one shared object.

    The request for the navigation page is not guarded.

    :param co_id: project id; converted with ``str()`` for the URL.
    """
    url = "http://spf.tlfdc.cn/prjleft.aspx?projectid=" + str(co_id)
    res = requests.get(url, headers=self.headers)
    con_html = etree.HTML(res.text)
    build_url_list = con_html.xpath("//td[@colspan='2']/a/@href")[4:-1]
    house_cells = con_html.xpath("//td[@width='54%']")
    flags = re.S | re.M

    for index, build_path in enumerate(build_url_list):
        build_info_url = "http://spf.tlfdc.cn/" + build_path
        try:
            con = requests.get(build_info_url, headers=self.headers).text
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_pre_sale_date = re.search('发证日期.*?Date">(.*?)<', con, flags).group(1)
            bu.bu_num = re.search('幢.*?did">(.*?)<', con, flags).group(1)
            bu.bu_pre_sale = re.search('编号.*?no">(.*?)<', con, flags).group(1)
            bu.bu_address = re.search('位置.*?ss">(.*?)<', con, flags).group(1)
            bu.bu_build_size = re.search('面积.*?Area">(.*?)<', con, flags).group(1)
            bu.bu_type = re.search('性质.*?type">(.*?)<', con, flags).group(1)
            bu.bu_all_house = re.search('套数.*?number">(.*?)<', con, flags).group(1)
            bu.bu_id = re.search(r'id=(\d+)', build_path).group(1)
            bu.insert_db()
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, build_info_url), e)
            continue

        try:
            house_url = house_cells[index].xpath("./a/@href")[0]
            self.house_parse(house_url, co_id, bu.bu_id)
        except Exception as e:
            # Still best-effort, but no longer silent.
            print("co_index={},房屋信息错误".format(co_index), e)
def start_crawler(self):
    """Crawl all communities on the start page, then each community's buildings.

    The start page (module-level ``url``) is fetched and every
    ``<a class="a_name">`` link other than ``#`` is treated as a community
    page.  ``Comm`` and ``Building`` objects are filled with *regex rules*;
    ``ProducerListUrl`` receives them via ``to_dict()`` and
    ``get_details()`` returns the URLs matched by ``current_url_rule``.

    Fix: the building request was a copy of the community request — it used
    ``comm_url`` and ``comm.to_dict()``, so ``build_detail_url`` and the
    building rules were never used and the community page was simply
    fetched again.  The building request now uses the building URL and the
    building rules.

    NOTE(review): ``current_url_rule`` for the building page is kept from
    the original call because the correct follow-up rule is not visible
    here; its result was, and still is, unused.
    """
    response = requests.get(url)
    html = response.text
    tree = etree.HTML(html)
    all_url = tree.xpath('//a[@class="a_name"]/@href')

    for href in all_url:
        comm = Comm(co_index)
        # Placeholder links carry no community page.
        if href == '#':
            continue
        comm_url = 'http://www.lzfc.com.cn:8080' + href
        # Extraction rules passed to ProducerListUrl, not parsed values.
        comm.co_name = "cc0.innerHTML='(.*?)'"
        comm.co_address = "cc1.innerHTML='(.*?)'"
        comm.area = "cc2.innerHTML='(.*?)'"
        comm.co_use = "cc4.innerHTML='(.*?)'"
        comm.co_develops = "cc5.innerHTML='(.*?)'"
        comm.co_open_time = "cc6.innerHTML='(.*?)'"
        comm.co_all_house = "cc9.innerHTML='(.*?)'"
        comm.co_build_size = "cc11.innerHTML='(.*?)'"
        comm.co_id = "BaseCode=(.*?)'"
        p = ProducerListUrl(page_url=comm_url,
                            request_type='get', encode='gbk',
                            analyzer_rules_dict=comm.to_dict(),
                            current_url_rule="queryBuildHerf1.href='(.*?)'",
                            analyzer_type='regex')
        build_url = p.get_details()

        for build_path in build_url:
            build = Building(co_index)
            build_detail_url = 'http://www.lzfc.com.cn:8080' + build_path
            build.bu_num = 'onclick=comInfoView.*?center">(.*?)<'
            build.co_use = 'onclick=comInfoView.*?center.*?center">(.*?)<'
            build.bu_pre_sale = 'onclick=comInfoView.*?center.*?center.*?center"><.*?>(.*?)<'
            build.bu_all_house = 'onclick=comInfoView.*?center.*?center.*?center.*?center">(.*?)<'
            build.co_name = 'fontbg_red">(.*?)<'
            build.bu_id = r"onclick=comInfoView\('(.*?)'\)"
            build_producer = ProducerListUrl(page_url=build_detail_url,
                                             request_type='get', encode='gbk',
                                             analyzer_rules_dict=build.to_dict(),
                                             current_url_rule="queryBuildHerf1.href='(.*?)'",
                                             analyzer_type='regex')
            build_producer.get_details()
def build_info(self, bu_list, co_id):
    """Store the buildings behind each listing row and crawl their houses.

    For every row the link in the fourth cell is fetched relative to
    ``self.start_url`` and decoded as GBK.  The pre-sale permit number and
    validity date are read once per page and copied onto every building in
    the page's ``donglist`` table.  Each building is saved and its link is
    passed to ``self.house_info``.

    Nothing is guarded: missing cells raise ``IndexError`` and missing
    patterns raise ``AttributeError``.

    :param bu_list: iterable of row elements exposing ``.xpath()``.
    :param co_id: community id stored on each record and passed on.
    """
    flags = re.S | re.M

    for listing_row in bu_list:
        page_path = listing_row.xpath("./td[4]/a/@href")[0]
        page_res = requests.get(self.start_url + '/' + page_path, headers=self.headers)
        page_res.encoding = 'gbk'
        page = page_res.text

        permit_no = re.search('预售许可证编号.*?blank">(.*?)</a', page, flags).group(1)
        permit_date = re.search('预售证有效日期.*?">(.*?)</td', page, flags).group(1)

        for dong in etree.HTML(page).xpath("//table[@id='donglist']/tr"):
            dong_url = dong.xpath("./td/a/@href")[0]
            record = Building(co_index)
            record.co_id = co_id
            record.bu_id = re.search('ID={(.*?)}', dong_url).group(1)
            record.bu_num = dong.xpath("./td[3]/text()")[0]
            record.bu_floor = dong.xpath("./td[4]/text()")[0]
            record.bu_pre_sale = permit_no
            record.bu_pre_sale_date = permit_date
            record.insert_db()
            self.house_info(co_id, record.bu_id, dong_url)
def get_build_info(self, more_build_url):
    """Crawl each building list page through ``ProducerListUrl``.

    For every path a ``Building`` object is filled with *regex rules* (not
    values).  ``ProducerListUrl`` receives them via ``build.to_dict()``
    together with the rule used to collect house links, and whatever
    ``get_details()`` returns is passed to ``self.get_house_info``.

    Any exception is printed and the loop moves on to the next path.

    :param more_build_url: iterable of paths relative to
        ``http://www.jmfc.com.cn/``.
    """
    row = '<tr bgcolor="#FFFFFF">'
    # Attribute name -> extraction rule, in the order they are assigned.
    rules = (
        ('bu_num', row + '.*?<td.*?>(.*?)<'),
        ('co_id', '楼盘首页.*?aid-(.*?)/'),
        ('bu_id', '&addno=12&action=loupantable&lzbm=(.*?)&ql_xh='),
        ('bu_pre_sale', row + '.*?<td.*?>.*?<.*?<td.*?>(.*?)<'),
        ('bu_floor', row + '.*?<td.*?>.*?<.*?<td.*?>.*?<.*?<td.*?>(.*?)<'),
        ('bu_all_house', row + '.*?<td.*?>.*?<.*?<td.*?>.*?<.*?<td.*?>.*?<.*?<td.*?>(.*?)<'),
    )

    for path in more_build_url:
        try:
            record = Building(co_index)
            build_url = 'http://www.jmfc.com.cn/' + path
            for attr, rule in rules:
                setattr(record, attr, rule)
            producer = ProducerListUrl(
                page_url=build_url,
                request_type='get',
                encode='gbk',
                analyzer_rules_dict=record.to_dict(),
                current_url_rule=row + '.*?align="left".*?href="(.*?)"',
                analyzer_type='regex',
                headers=self.headers)
            self.get_house_info(producer.get_details())
        except Exception as err:
            print(err)
def get_build_info(self, bu_pre_sale, bo_develops, bu_co_name, bu_con):
    """Parse one building page and store it.

    The numeric and text fields are cut out of ``bu_con`` with regular
    expressions; permit, developer and community name are taken from the
    arguments.  The record is saved with ``insert_db()``.

    Nothing is guarded: a pattern that does not match raises
    ``AttributeError``.

    :param bu_pre_sale: pre-sale permit stored on the record.
    :param bo_develops: developer stored on the record.
    :param bu_co_name: community name stored as ``co_name``.
    :param bu_con: HTML of the building page.
    """
    flags = re.S | re.M

    def field(pattern):
        # First capture group in the page; AttributeError when absent.
        return re.search(pattern, bu_con, flags).group(1)

    record = Building(co_index)
    record.bu_id = field(r'编号.*?>(\d+)<')
    record.bu_num = field(r'幢号.*?>(\d+)<')
    record.bu_floor = field(r'总层数.*?>(\d+)<')
    record.bu_build_size = field(r'预售建筑面积.*?>(\d+.\d+)<')
    record.bu_address = field('楼房坐落.*?;">(.*?)</span')
    record.bu_live_size = field(r'住宅建筑面积.*?>(\d+.\d+)<')
    record.bu_not_live_size = field('非住宅建筑面积.*?;">(.*?)</span')
    record.bo_build_start_time = field('开工日期.*?;">(.*?)</span')
    record.bu_all_house = field(r'总套数.*?>(\d+)<')
    record.bu_pre_sale = bu_pre_sale
    record.bo_develops = bo_develops
    record.co_name = bu_co_name
    record.insert_db()
def get_comm_info(self, comm_info):
    """Store a community, its buildings, and trigger the house crawl.

    ``comm_info`` is one listing row.  Name, address, area and detail link
    are read from it; the detail page (GBK) supplies developer, house count
    and total size, after which the community is saved.  Every
    ``<tr bgcolor="white">`` row on the detail page that contains "进入" is
    followed:

    * link with ``ID=<digits>`` (现售, current sale): a building with that
      id is saved and ``self.get_house_info`` is called with the link;
    * otherwise (预售, pre-sale): the permit page is parsed, each
      ``<tr onmouseover`` row is saved as a building, its house page is
      fetched and handed to ``self.get_house_info``.

    Fixes over the previous version:

    * A failed detail-page request was printed and then crashed on the
      unbound ``res``; the method now returns after reporting it.
    * Bare ``except:`` clauses are narrowed so ``KeyboardInterrupt`` is not
      swallowed.
    * ``re.findall`` cannot fail, so the ``try`` around it was dead code and
      its "no buildings" message could never print; the message is now
      printed when no row is found.
    * Two patterns ended in a bare whitespace character after the number.
      Raw HTML normally carries ``&nbsp;`` there, so the delimiter now
      accepts ``&nbsp;``, U+00A0 or a plain space.

    NOTE(review): the session cookie is hard-coded, as before.

    :param comm_info: HTML of one community row.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36'
    session_cookie = 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j'
    base = "http://www.qyfgj.cn/newys/"
    flags = re.S | re.M
    # Explicit, tolerant form of the whitespace that follows a cell value.
    gap = r'(?:&nbsp;|[ \xa0])'

    def headers_for(referer):
        # Same browser headers for every request; only the Referer differs.
        return {
            'User-Agent': user_agent,
            'Cookie': session_cookie,
            'Referer': referer
        }

    co = Comm(co_index)
    co.co_name = re.search('_blank">(.*?)</a', comm_info).group(1)
    try:
        co.co_address = re.findall('px">(.*?)</td', comm_info)[1]
    except IndexError:
        co.co_address = None
    co.area = re.search('center">(.*?)</td>', comm_info).group(1)
    co_detail_url = re.search("href='(.*?)'", comm_info).group(1)
    co_url = base + co_detail_url

    try:
        res = requests.get(co_url, headers=self.headers)
    except Exception as e:
        print("co_index={}小区未请求到".format(co_index), e)
        # Nothing to parse without the detail page.
        return
    con = res.content.decode('gbk')

    try:
        co.co_develops = re.search('开发商名称.*?px;">(.*?)</a', con, flags).group(1)
        co.co_all_house = re.search(r'总套数.*?">(\d+)' + gap, con, flags).group(1)
        co.co_all_size = re.search(r'总面积.*?">(\d+.\d+)' + gap + 'm', con, flags).group(1)
    except Exception:
        print("小区无开发商等信息")
    co.insert_db()

    build = re.findall('<tr bgcolor="white">(.*?)</tr>', con, flags)
    if not build:
        print("小区没有楼栋信息")

    build_headers = headers_for(co_url)
    for build_info in build:
        if "进入" not in build_info:
            print("楼栋无链接地址")
            continue

        build_url = base + re.search('href="(.*?)"><font', build_info).group(1)
        ho_headers = headers_for(build_url)
        build_res = requests.get(build_url, headers=build_headers)
        build_con = build_res.content.decode('gbk')

        id_match = re.search(r'ID=(\d+)', build_url)
        if id_match:
            # Current sale: the link itself identifies the building.
            bu = Building(co_index)
            bu_id = id_match.group(1)
            bu.bu_id = bu_id
            bu.co_name = co.co_name
            bu.insert_db()
            self.get_house_info(headers=ho_headers, bu_id=bu_id, url=build_url)
        else:
            # Pre-sale: one permit page listing several buildings.
            bu = Building(co_index)
            bu.co_name = co.co_name
            bu.bu_type = re.search('用途.*?">(.*?)</td>', build_con, flags).group(1)
            bu.bu_pre_sale = re.search('许可证编号.*?_blank">(.*?)</a>', build_con, flags).group(1)
            bu.bu_pre_sale_date = re.search('有效日期.*?">(.*?)</td>', build_con, flags).group(1)
            bu.bu_address = re.search('项目座落.*?">(.*?)</td>', build_con, flags).group(1)

            ret = re.findall('<tr onmouseover(.*?)</tr', build_con, flags)
            for i in ret:
                house_url = base + re.search('href="(.*?)"', i).group(1)
                # The same record is re-filled and saved once per row.
                bu.bu_id = re.search('dbh=(.*?)&', i).group(1)
                bu.bu_num = re.search('<td width="89.*?">(.*?)</', i).group(1)
                bu.bu_floor = re.search(r'<td width="84.*?">(\d+)</td', i).group(1)
                bu.insert_db()
                ho_res = requests.get(house_url, headers=ho_headers)
                ho_con = ho_res.content.decode('gbk')
                new_headers = headers_for(house_url)
                self.get_house_info(ho_con=ho_con, headers=new_headers, bu_id=bu.bu_id)
def start_crawler(self):
    """Crawl 20 result pages of pre-sale permits, their buildings and houses.

    Pages 1..20 are requested from the AjaxPro endpoint with a fixed page
    size of 30.  Every ``[<sid>, ..., <n>]`` record on a page yields one
    building page, which is parsed into a ``Building`` and saved.  The
    houses of the building are then requested for its ``sid``; each record
    is split on commas and stored as a ``House``.

    Every step is best-effort: a failed page request, building, house
    request or house record is logged with ``log.error`` and skipped.

    Fix: the four building-page patterns ended in a bare whitespace
    character after the cell value.  Raw HTML normally carries ``&nbsp;``
    there, so the delimiter now accepts ``&nbsp;``, U+00A0 or a plain space.
    """
    url = 'http://zzx.zzfc.com/ajaxpro/xy_ysxk_more,App_Web_mjeeodb-.ashx'
    flags = re.S | re.M
    # Explicit, tolerant form of the whitespace that ends a cell value.
    gap = r'(?:&nbsp;|[ \xa0])'

    for i in range(1, 21):
        payload = "{\"pageNo\":" + str(i) + ",\"pageSize\":30,\"rowcount\":589}"
        try:
            response = requests.post(url, data=payload, headers=self.headers)
            con = response.content.decode()
        except Exception as e:
            log.error('楼栋请求失败{}'.format(e))
            continue

        co_list = re.findall(r'\[\d+,.*?\d+\]', con)
        for comm in co_list:
            try:
                sid = re.search(r'\[(\d+),', comm).group(1)
                pid = re.search(r'",(\d+),', comm).group(1)
                bu_url = 'http://zzx.zzfc.com/xy_bldg.aspx?pid=' + pid + '&sid=' + sid
                bu_res = requests.get(bu_url, headers=self.headers)
                bu_con = bu_res.content.decode()
                bu = Building(co_index)
                bu.bu_id = sid
                bu.bu_address = re.search('楼栋座落.*?">(.*?)' + gap, bu_con, flags).group(1)
                bu.bu_pre_sale = re.search('预售证号.*?">(.*?)' + gap, bu_con, flags).group(1)
                bu.bu_pre_sale_date = re.search('预售日期.*?">(.*?)' + gap, bu_con, flags).group(1)
                bu.bu_all_house = re.search('套数.*?">(.*?)' + gap, bu_con, flags).group(1)
                bu.insert_db()
            except Exception as e:
                log.error("{}楼栋解析失败{}".format(comm, e))
                continue

            ho_url = 'http://zzx.zzfc.com/ajaxpro/xy_housetag,App_Web_xg4ulr9n.ashx'
            data = "{\"m_key\":\"WWW_LPB_001\",\"m_param\":\"" + sid + "\"}"
            headers = {
                'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                'X-AjaxPro-Method': 'GETLPBDS'
            }
            try:
                ho_res = requests.post(ho_url, data=data, headers=headers)
                ho_con = ho_res.content.decode()
            except Exception as e:
                log.error("房屋请求失败{}".format(e))
                continue

            ho_list = re.findall(r'\["\d+.*?\d+\]', ho_con)
            for house in ho_list:
                try:
                    ho = House(co_index)
                    ho.bu_id = sid
                    # Fields are addressed by position in the comma-split record.
                    info_list = house.split(",")
                    ho.ho_name = info_list[4]
                    ho.ho_floor = re.search(r'(\d+)层', house).group(1)
                    ho.ho_build_size = info_list[-3]
                    ho.ho_true_size = info_list[-2]
                    ho.insert_db()
                except Exception as e:
                    log.error("{}房屋解析错误{}".format(house, e))
                    continue