def get_build_info(self, url, co_id):
    """Scrape one building page and store the building and its houses.

    Fetches ``url``, reads the building attributes from fixed element ids,
    inserts a ``Building`` record, then walks the table cells that follow
    ``<tr id="row3">`` and inserts one ``House`` per entry found in a cell.

    Args:
        url: Building page URL. The building id is the run of digits after
            the final ``?``.
        co_id: Community id stored on the building and on every house.

    Returns:
        None. Errors are printed instead of raised: a failure on a single
        house skips that house, any other failure abandons the page.
        ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed.
    """
    flags = re.S | re.M
    try:
        building = Building(co_index)
        response = requests.get(url)
        html = response.text
        tree = etree.HTML(html)

        co_name = tree.xpath('//*[@id="PageB_Location"]/text()')[0]  # community name
        print(co_name)
        bu_name = tree.xpath('//*[@id="ItemName"]/text()')[0]  # building name
        bu_num = tree.xpath('//*[@id="PageB_HouseNo"]/text()')[0]  # building number
        bu_all_house = tree.xpath('//*[@id="lb_countbulidtaoshu"]/text()')[0]  # total units
        bu_floor = self.is_none(tree.xpath('//*[@id="cell3-1"]/text()'))  # floors
        bu_build_size = tree.xpath('//*[@id="lb_countbulidarea"]/text()')[0]  # building area
        bu_live_size = tree.xpath('//*[@id="lb_buildarea"]/text()')[0]  # residential area
        bu_price = self.is_none(tree.xpath('//*[@id="lb_buildavg"]/text()'))  # residential price
        bu_id = re.search(r'\?(\d+)$', url).group(1)  # building id

        building.co_id = co_id
        building.bu_name = bu_name
        building.bu_num = bu_num
        building.bu_all_house = bu_all_house
        building.bu_floor = bu_floor
        building.bu_build_size = bu_build_size
        building.bu_live_size = bu_live_size
        building.bu_price = bu_price
        building.bu_id = bu_id
        building.insert_db()

        house_info_html = re.findall('<tr id="row3">(.*)$', html, flags)[0]
        for cell in re.findall('(<td.*?>.*?</td>)', house_info_html, flags):
            # Only cells containing <br> separators are parsed as houses.
            if '<br>' not in cell:
                continue
            ho_name_list = re.findall('<td.*?>(.*?)<br>', cell, flags)
            ho_true_size_list = re.findall('<td.*?>.*?<br>(.*?)<br>', cell, flags)
            ho_type = re.findall('<td.*?>.*?<br>.*?<br>(.*?)<br>', cell, flags)[0]
            # A distinct loop variable keeps the outer cell from being shadowed.
            for idx, raw_name in enumerate(ho_name_list):
                try:
                    if 'font' in raw_name:
                        ho_name = re.sub('<font.*?>', '', raw_name)
                    else:
                        ho_name = raw_name
                    house = House(8)
                    house.ho_name = ho_name
                    # Indexing (not zip) so a missing size is still reported.
                    house.ho_true_size = ho_true_size_list[idx]
                    house.co_id = co_id
                    house.bu_id = bu_id
                    house.ho_type = ho_type
                    house.insert_db()
                except Exception as e:
                    print(e)
    except Exception as e:
        print(e)
def build_parse(self, co_id):
    """Scrape every building of a community, then hand each page to ``house_parse``.

    The building ids are read from the community's building list page; each
    building detail page is fetched, parsed into a ``Building`` record and
    passed on to ``self.house_parse``.

    Args:
        co_id: Community id as a string (it is concatenated into the URL).

    Returns:
        None. A building whose page cannot be fetched is logged and skipped.
        A building whose fields cannot be parsed is logged, but its page is
        still passed to ``house_parse``.
    """
    flags = re.S | re.M
    list_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/loulist.jsp?Id_xmxq=' + co_id
    res = requests.get(list_url, headers=self.headers)
    con = res.content.decode()
    build_id_list = re.findall(r"searchByLid\('(\d+)'\)", con)

    for build_id in build_id_list:
        bu_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/lpbxx_new.jsp?lid=' + build_id
        # Fetch separately: without a page there is nothing to give to
        # house_parse, and reusing the previous building's page would attach
        # its houses to the wrong building id.
        try:
            bu_res = requests.get(bu_url, headers=self.headers)
            bu_con = bu_res.content.decode('gbk')
        except Exception as e:
            log.error('{}楼栋错误{}'.format(build_id, e))
            continue

        try:
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_id = build_id
            bu.bu_num = re.search('楼栋名称.*?">(.*?)</td', bu_con, flags).group(1)
            bu.bu_all_house = re.search('总套数.*?">总(.*?)套</td', bu_con, flags).group(1)
            bu.bu_floor = re.search('地上层数.*?">共(.*?)层</td', bu_con, flags).group(1)
            bu.bu_build_size = re.search('总建筑面积.*?">(.*?)</td', bu_con, flags).group(1)
            bu.bu_pre_sale = re.search(r"searchysxk\('(.*?)'\)", bu_con, flags).group(1)
            bu.bu_type = re.search('房屋用途.*?">(.*?)</td', bu_con, flags).group(1)
            bu.insert_db()
        except Exception as e:
            log.error('{}楼栋错误{}'.format(build_id, e))

        self.house_parse(co_id, build_id, bu_con)
def get_build_info(self,presell_url_list,co_id):
    """Walk pre-sale pages, store each linked building and parse its houses.

    Args:
        presell_url_list: Paths of pre-sale pages, appended to ``self.url``.
        co_id: Community id stored on each building and passed on to
            ``self.get_house_info``.

    Returns:
        None. A building that fails to download or parse is printed and
        skipped; its houses are not processed.
    """
    flags = re.S|re.M
    # Attribute name -> pattern whose first group holds the value.
    field_rules = (
        ('bu_num', '栋.*?号.*?BuildingName">(.*?)</span'),
        ('bu_floor', '总 层 数.*?(\d+)</span'),
        ('bu_build_size', '建筑面积.*?Jzmj">(.*?)</span'),
        ('bu_live_size', '住宅面积.*?Zzmj">(.*?)</span'),
        ('bu_not_live_size', '非住宅面积.*?Fzzmj">(.*?)</span'),
        ('bu_pre_sale', '预售许可证.*?xkzh">(.*?)</span'),
        ('bu_pre_sale_date', '发证日期.*?fzrq">(.*?)</span'),
        ('bu_type', '项目类型.*?Type">(.*?)</span'),
    )
    for presell_url in presell_url_list:
        presell_page = requests.get(self.url + presell_url,headers=self.headers)
        building_paths = re.findall('【<a href="(.*?)" target="_self"',presell_page.text,flags)
        for build_url in building_paths:
            try:
                page = requests.get(self.url+build_url,headers=self.headers)
                con = page.text
                bu = Building(co_index)
                bu.co_id = co_id
                bu.bu_id = re.search('ID=(\d+)',build_url).group(1)
                for attr, pattern in field_rules:
                    setattr(bu, attr, re.search(pattern,con,flags).group(1))
                bu.insert_db()
            except Exception as e:
                print("co_index={},楼栋信息错误".format(co_index), e)
            else:
                # Houses are only parsed for buildings that were stored.
                house_detail_list = re.findall("getMoreHouseInfo\('(.*?)'\)\"",con,flags)
                self.get_house_info(co_id,bu.bu_id,house_detail_list)
def get_build_info(self, build_id_list, co_id):
    """Query each building by id, store it, and parse its houses.

    Every id is POSTed to the building endpoint through ``self.s``
    (presumably a ``requests.Session`` -- confirm against the class). The
    response text is parsed into the shared ``Building`` record and then
    passed to ``self.get_house_info``.

    Args:
        build_id_list: Iterable of building ids; each is sent as ``pk``.
        co_id: Community id stored on each building.

    Returns:
        None. A building whose request fails is printed and skipped.

    Raises:
        AttributeError: If an expected field is missing from a response;
            as before, parsing errors are not caught here.
    """
    flags = re.S | re.M
    bu = Building(co_index)
    # Loop-invariant request headers.
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
        'Referer': 'http://hkrealestate.haikou.gov.cn/wp_myself/housequery/projectBuildingList.php'
    }
    for build_id in build_id_list:
        # NOTE(review): "qeurySingleBuilding" looks misspelled but is
        # presumably the action name the server expects -- confirm before
        # changing it.
        formdata = {"action": "qeurySingleBuilding", 'pk': str(build_id)}
        try:
            build_info = self.s.post(
                'http://hkrealestate.haikou.gov.cn/wp_myself/housequery/projectBuildHouseAction.php',
                data=formdata, headers=header)
        except Exception as e:
            print("co_idnex={},楼栋错误".format(co_index), e)
            # Without a response there is nothing to parse; falling through
            # would hit an unbound name or reuse the previous building's page.
            continue

        build_con = build_info.text
        bu.bu_id = build_id
        bu.co_id = co_id
        bu.bu_num = re.search('幢名称.*?<td>(.*?)<', build_con, flags).group(1)
        bu.bu_floor = re.search('总层数.*?<td>(.*?)<', build_con, flags).group(1)
        bu.bu_build_size = re.search('>建筑面积.*?<td>(.*?)<', build_con, flags).group(1)
        bu.bo_develops = re.search('房地产企业.*?">(.*?)</td', build_con, flags).group(1)
        bu.insert_db()
        self.get_house_info(build_con, co_id, build_id)
def get_build_info(self, build_info_list, co_id, comm_html, url):
    """Store one building per parsed row and follow its detail link.

    Args:
        build_info_list: Indexable rows; positions 1, 3, 5 and 9 are read as
            building name, total units, area and price.
        co_id: Community id stored on each building.
        comm_html: Community page HTML from which the detail link and the
            building id are extracted.
        url: Community page URL, used only in the error message.

    Returns:
        None. A row that fails is printed and skipped.
    """
    for row in build_info_list:
        try:
            record = Building(2)
            name = row[1]
            record.bu_name = name  # building name
            record.bu_num = name.split('#')[0]  # building number: text before '#'
            record.bu_all_house = row[3]  # total units
            record.bu_build_size = row[5]  # area
            record.bu_price = row[9]  # price
            record.co_id = co_id  # community id

            # NOTE(review): the link is taken from comm_html only, so every
            # row resolves to the same first match -- confirm this is intended.
            listing = re.search(r'楼盘表(.*?)个楼栋信息', comm_html).group(1)
            detail_url = re.search(r'<ahref="(.*?)">查看信息<', listing).group(1)
            record.bu_id = re.search('buildingId=(.*?)$', detail_url).group(1)  # building id

            record.insert_db()
            self.get_build_detail(detail_url, co_id)
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, url), e)
def get_build_info(self, build_url_list):
    """Run the rule-driven building scraper for each building path.

    The ``Building`` object is used here as a container of extraction rules:
    each attribute is set to a regex pattern, and ``build.to_dict()`` is
    handed to ``ProducerListUrl`` as ``analyzer_rules_dict``. The URLs
    returned by ``get_details()`` are passed to ``self.get_house_info``.

    Args:
        build_url_list: Paths appended to ``http://www.ndjsj.gov.cn/House/``.

    Returns:
        None. A building that fails is printed and skipped.
    """
    for i in build_url_list:
        # Bound before the try block so the error message can always be
        # formatted, even when the failure happens before the URL is built.
        build_url = None
        try:
            build_url = 'http://www.ndjsj.gov.cn/House/' + i
            build = Building(co_index)
            build.co_name = '项目名称:.*?<td.*?>(.*?)<'
            build.bu_num = '幢 号:.*?<td.*?>(.*?)<'
            build.bu_address = '坐落位置:.*?<td.*?>(.*?)<'
            build.co_build_structural = '建筑结构:.*?<td.*?>(.*?)<'
            build.bu_floor = '总 层 数:.*?<td.*?>(.*?)<'
            build.bu_build_size = '总 面 积:.*?<td.*?>(.*?)<'
            # Disabled rule kept for reference:
            # build.bu_type = '设计用途:.*?<td.*?>(.*?)<'
            build.bu_all_house = '批准销售:.*?<td.*?>(.*?)<'
            p = ProducerListUrl(
                page_url=build_url,
                request_type='get',
                encode='utf-8',
                analyzer_rules_dict=build.to_dict(),
                current_url_rule='javascript:ShowTitle.*?href="(.*?)"',
                analyzer_type='regex',
                headers=self.headers)
            house_url_list = p.get_details()
            self.get_house_info(house_url_list)
        except Exception as e:
            print('宁德楼栋错误,url={}'.format(build_url), e)
def get_comm_info(self, url, comm):
    """Scrape a community page, store the community and its buildings.

    Community fields are read from the table under
    ``ctl00_CPH_M_sm_spfBox3``; buildings come from the ``hobuild`` rows
    under ``ctl00_CPH_M_sm_spfBox1``. After each building is stored its
    houses are parsed via ``self.get_house_info``.

    Args:
        url: Community page URL; the community id is the text between
            ``home/`` and ``.html``.
        comm: Community record that is filled in and inserted.

    Returns:
        None. Errors are printed instead of raised: a failing building row
        is skipped, any other failure abandons the page.
        ``KeyboardInterrupt`` and ``SystemExit`` are no longer swallowed.
    """
    try:
        response = requests.get(url=url, headers=self.headers)
        html = response.text
        tree = etree.HTML(html)

        def info_cell(row, col):
            # Stripped text of one cell of the community info table.
            path = '//*[@id="ctl00_CPH_M_sm_spfBox3"]/div/table/tr[{}]/td[{}]/text()'.format(row, col)
            return tree.xpath(path)[0].strip()

        co_name = info_cell(1, 2)  # community name
        co_address = info_cell(2, 2)  # community address
        co_build_start_time = info_cell(3, 2)  # construction start
        co_build_end_time = info_cell(3, 4)  # completion date
        co_build_structural = info_cell(4, 2)  # building structure
        co_volumetric = info_cell(6, 4)  # plot ratio
        co_green = info_cell(6, 2)  # greening rate
        co_size = info_cell(5, 2)  # land area
        co_id = re.search('home/(.*?).html', url).group(1)

        comm.co_name = co_name
        comm.co_address = co_address
        comm.co_build_start_time = co_build_start_time
        comm.co_build_end_time = co_build_end_time
        comm.co_build_structural = co_build_structural
        comm.co_volumetric = co_volumetric
        comm.co_green = co_green
        comm.co_size = co_size
        comm.co_id = co_id
        comm.insert_db()

        build_info_list = tree.xpath('//*[@id="ctl00_CPH_M_sm_spfBox1"]/div/table/tr[@class="hobuild"]')
        for row in build_info_list:
            try:
                build = Building(11)
                # XPath string() already returns a str; indexing it with [0]
                # kept only the first character of the building name.
                bu_name = row.xpath('string(td[1])').strip()
                bu_all_house = row.xpath('td[2]/text()')[0]
                href = row.xpath('td[1]/strong/a/@href')[0]
                bu_id = re.search('building_id=(.*?)$', href).group(1)  # building id
                # Drop the square-metre marker, both in its mis-decoded form
                # (U+FFFD followed by 'O') and as the real character.
                bu_build_size = (row.xpath('string(td[3])')
                                 .replace('\ufffdO', '')
                                 .replace('㎡', ''))
                build.co_id = co_id
                build.bu_id = bu_id
                build.bu_all_house = bu_all_house
                build.bu_name = bu_name
                build.bu_build_size = bu_build_size
                build.insert_db()
                self.get_house_info(bu_id, co_id)
            except Exception as e:
                print('楼栋错误,co_index={},url={}'.format(co_index, url), e)
    except Exception as e:
        print('楼栋错误,co_index={},url={}'.format(co_index, url), e)
def get_build_info(self, build_url_list, co_name):
    """Store a building record for every detail page linked from each project page.

    Args:
        build_url_list: Paths appended to ``http://www.sxczfdc.com/pubinfo/``.
        co_name: Community name stored on each building and passed on to
            ``self.get_house_info``.

    Returns:
        None. Errors are printed; a failing detail page is skipped, a
        failing project page abandons that project only.
    """
    base = 'http://www.sxczfdc.com/pubinfo/'
    flags = re.S | re.M
    # Attribute name -> pattern; the first match of each is stored.
    field_rules = (
        ('bu_num', 'BuildingInfo1_lblBuildingName">(.*?)<'),
        ('bu_all_house', 'BuildingInfo1_lblZts">(.*?)<'),
        ('bu_floor', 'BuildingInfo1_lblZcs">(.*?)<'),
        ('bu_build_size', 'BuildingInfo1_lblJzmj">(.*?)<'),
        ('bu_live_size', 'BuildingInfo1_lblZzmj">(.*?)<'),
        ('bu_pre_sale', 'BuildingInfo1_lblYsxkzh">(.*?)<'),
        ('bu_pre_sale_date', 'BuildingInfo1_lblYsxkzfzrq">(.*?)<'),
    )
    for path in build_url_list:
        try:
            # One record object is reused for every detail page of a project.
            build = Building(co_index)
            build.co_name = co_name
            project_html = requests.get(base + path, headers=self.headers).text
            detail_paths = re.findall('(Pub_dtxx.aspx\?ProjectBuildingID=.*?)"', project_html, flags)
            for detail_path in detail_paths:
                try:
                    content = requests.get(base + detail_path, headers=self.headers).text
                    for attr, pattern in field_rules:
                        setattr(build, attr, re.findall(pattern, content, flags)[0])
                    build.insert_db()
                    house_url_list = re.findall(
                        "onClick=.getMoreHouseInfo\('(.*?)'\)", content, flags)
                    self.get_house_info(house_url_list, co_name, build.bu_num)
                except Exception as e:
                    print(e)
        except Exception as e:
            print(e)
def get_build_detail(self, all_building_url_list):
    """Store the buildings listed on each project page and collect house URLs.

    Args:
        all_building_url_list: Absolute URLs of project pages.

    Returns:
        list: Absolute room-table URLs. The URLs of each page are placed in
        front of those gathered from earlier pages.
    """
    flags = re.S | re.M
    house_url_list = []
    for i in all_building_url_list:
        try:
            html = requests.get(i, headers=self.headers).text
            tree = etree.HTML(html)

            bo_develops = tree.xpath('//*[@id="content_1"]/div[3]/text()[2]')[0]  # developer
            # Sale area and pre-sale licence may be absent; the empty xpath
            # result is then stored as-is.
            size_hits = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[6]/a/text()')
            bu_build_size = size_hits[0] if size_hits else size_hits
            licence_hits = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[1]/a/text()')
            bu_pre_sale = licence_hits[0] if licence_hits else licence_hits
            bu_floor = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[3]/a/text()')[0]  # floors
            bu_all_house = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[4]/a/text()')[0]  # total units
            bu_type = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[5]/a/text()')[0]  # usage

            table_html = re.search('houseTable_1.*?当前共有', html, flags).group()
            row_chunks = re.findall('class.*?</a></td>.*?</a></td>.*?</a></td>', table_html, flags)
            bu_num = re.findall('项目名称:</b>(.*?)</div>', html, flags)[0].strip()

            page_urls = []
            for chunk in row_chunks:
                try:
                    build = Building(co_index)
                    build.bu_id = re.search("href='roomTable.aspx\?id=(.*?)&", chunk, flags).group(1)
                    address_match = re.search("_blank.*?_blank'>(.*?)</a></td><td>", chunk, flags)
                    build.bu_address = address_match.group(1).strip()
                    build.bo_develops = bo_develops
                    build.bu_build_size = bu_build_size
                    build.bu_pre_sale = bu_pre_sale
                    build.bu_num = bu_num
                    build.bu_floor = bu_floor
                    build.bu_all_house = bu_all_house
                    build.bu_type = bu_type
                    # No early exit: the last matching area name wins.
                    for area_name in self.area_list:
                        if area_name in build.bu_address:
                            build.area = area_name
                    build.insert_db()
                    house_path = re.search("(roomTable.aspx\?id=.*?&vc=.*?)'", chunk, flags).group(1)
                    page_urls.append('http://dgfc.dg.gov.cn/dgwebsite_v2/Vendition/' + house_path)
                except Exception as e:
                    print('楼栋错误,co_index={},url={}'.format(co_index, i), e)
            house_url_list = page_urls + house_url_list
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, i), e)
    return house_url_list
def build_parse(self, co_id):
    """Scrape the buildings of a project, then parse each building's houses.

    Building links are the ``td[colspan=2]`` anchors of the project page
    (the first four and the last one are skipped). The house link for a
    building is taken from the ``td[width=54%]`` cell at the same position.

    Args:
        co_id: Project id, stored on each building and passed on to
            ``self.house_parse``.

    Returns:
        None. A building that fails is printed and skipped together with
        its houses. A house-level failure is printed as well instead of
        being silently discarded.
    """
    flags = re.S | re.M
    bu = Building(co_index)
    url = "http://spf.tlfdc.cn/prjleft.aspx?projectid=" + str(co_id)
    res = requests.get(url, headers=self.headers)
    con_html = etree.HTML(res.text)
    build_url_list = con_html.xpath("//td[@colspan='2']/a/@href")[4:-1]
    house_cells = con_html.xpath("//td[@width='54%']")

    for index, build_href in enumerate(build_url_list):
        build_info_url = "http://spf.tlfdc.cn/" + build_href
        try:
            res = requests.get(build_info_url, headers=self.headers)
            con = res.text
            bu.co_id = co_id
            bu.bu_pre_sale_date = re.search('发证日期.*?Date">(.*?)<', con, flags).group(1)
            bu.bu_num = re.search('幢.*?did">(.*?)<', con, flags).group(1)
            bu.bu_pre_sale = re.search('编号.*?no">(.*?)<', con, flags).group(1)
            bu.bu_address = re.search('位置.*?ss">(.*?)<', con, flags).group(1)
            bu.bu_build_size = re.search('面积.*?Area">(.*?)<', con, flags).group(1)
            bu.bu_type = re.search('性质.*?type">(.*?)<', con, flags).group(1)
            bu.bu_all_house = re.search('套数.*?number">(.*?)<', con, flags).group(1)
            bu.bu_id = re.search(r'id=(\d+)', build_href).group(1)
            bu.insert_db()
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, build_info_url), e)
            continue

        try:
            # Indexing stays inside the try so a missing cell is reported too.
            house_url = house_cells[index].xpath("./a/@href")[0]
            self.house_parse(house_url, co_id, bu.bu_id)
        except Exception as e:
            print('房屋错误,co_index={},url={}'.format(co_index, build_info_url), e)
def get_comm_info(self, comm_url, comm):
    """Store the community's developer, then scrape its buildings per pre-sale permit.

    Args:
        comm_url: Path appended to ``http://www.fangdi.com.cn/``.
        comm: Community record. ``comm.co_id`` is read (and concatenated
            into a URL), ``comm.co_develops`` is written, and the record is
            inserted.

    Returns:
        None. A building row that fails is printed and skipped.

    Raises:
        AttributeError: If the developer name is not found on the page;
            failures outside the building loop are not caught here.
    """
    flags = re.S | re.M
    site = 'http://www.fangdi.com.cn/'

    community_html = requests.get(site + comm_url, headers=self.headers).content.decode('gbk')
    comm.co_develops = re.search('企业名称:.*?<a.*?>(.*?)<', community_html, flags).group(1)
    comm.insert_db()

    presell_url = 'http://www.fangdi.com.cn/Presell.asp?projectID=' + comm.co_id
    presell_html = requests.get(presell_url, headers=self.headers).content.decode('gbk')
    permits = re.findall(
        "javascript:SetSelect\(.*?,.*?,.*?,.*?,.*?,'(.*?)','(.*?)'\)", presell_html, flags)

    for presell_id, start_id in permits:
        # NOTE(review): the ProjectID query value is a fixed literal for
        # every community -- confirm that the site ignores it.
        build_detail_url = 'http://www.fangdi.com.cn/building.asp?ProjectID=OTU4OHwyMDE4LTQtNHwxNw&PreSell_ID=' + presell_id + '&Start_ID=' + start_id
        listing_html = requests.get(build_detail_url, headers=self.headers).content.decode('gbk')
        for row in re.findall('class="indextabletxt">.*?</tr>', listing_html, flags):
            try:
                build = Building(co_index)
                build.bu_num = re.search('<a.*?>(.*?)</a>', row, flags).group(1)
                build.bu_all_house = re.search(
                    '<a.*?<td.*?<td.*?<td.*?>(.*?)<', row, flags).group(1)
                build.bu_build_size = re.search(
                    '<a.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', row, flags).group(1)
                build.bu_id = re.search('Param=(.*?)=', row, flags).group(1)
                build.co_id = comm.co_id
                build.insert_db()
                house_url = re.search('href="(.*?)"', row, flags).group(1)
                self.get_house_info(house_url, build.bu_id, build.co_id)
            except Exception as e:
                print(
                    '楼栋错误,co_index={},url={}'.format(
                        co_index, build_detail_url), e)
def get_build_info(self, build_url_list):
    """Fetch each building page, store the building and parse its houses.

    Args:
        build_url_list: Iterable of indexable entries; element 0 of each is
            the building id (concatenated into the URL, so expected to be a
            string).

    Returns:
        None. A building that fails is printed and skipped.
    """
    flags = re.S | re.M
    for i in build_url_list:
        # Bound before the try block so the error message can always be
        # formatted, even when the failure happens before the URL is built.
        build_url = None
        try:
            build_url = 'http://222.223.160.199:8088/website/buildquery/selectBuild.jsp?buildID=' + i[0]
            build = Building(co_index)
            response = requests.get(build_url, headers=self.headers)
            html = response.text
            build.bu_id = i[0]
            build.co_build_structural = re.search('结构类型.*?<td.*?>(.*?)<', html, flags).group(1)
            build.bo_build_end_time = re.search('建成年份.*?<td.*?>(.*?)<', html, flags).group(1)
            build.bu_build_size = re.search('总建筑面积.*?<td.*?>(.*?)<', html, flags).group(1)
            build.bu_num = re.search('幢号.*?<td.*?>(.*?)<', html, flags).group(1)
            build.size = re.search('占地面积.*?<td>(.*?)<', html, flags).group(1)
            build.bu_floor = re.search('房屋层数.*?<td>(.*?)<', html, flags).group(1)
            build.bu_all_house = re.search('房屋套数.*?<td>(.*?)<', html, flags).group(1)
            build.area = re.search('坐落区.*?<td>(.*?)<', html, flags).group(1)
            build.insert_db()
            self.get_house_info(build.bu_id)
        except Exception as e:
            print('请求错误,url={}'.format(build_url),e)
def get_build_detail(self, build_url, co_id):
    """Store one building whose area is the sum of its table rows, then parse houses.

    Args:
        build_url: Path appended to ``http://www.yzfdc.cn/``; the building
            id is everything after ``GCZHId=``.
        co_id: Community id stored on the building and passed on to
            ``self.get_house_info``.

    Returns:
        None.

    Raises:
        AttributeError: If an expected pattern is missing from the page.
        ValueError: If a non-empty row value is not a number.
    """
    flags = re.S | re.M
    bu_url = 'http://www.yzfdc.cn/' + build_url
    html = self.s.get(bu_url, headers=self.headers).text

    build = Building(co_index)
    build.bu_num = re.search('查询幢号:.*?<span.*?<span.*?>(.*?)<', html, flags).group(1)

    table_html = re.search('<div align="center">已售已备案.*?</table>', html, flags).group()
    total = 0
    for row_html in re.findall('<tr.*?</tr>', table_html, flags):
        # The value is the text that follows the sixth <div in the row.
        cell = re.search(
            '<div.*?<div.*?<div.*?<div.*?<div.*?<div.*?>(.*?)<', row_html, flags).group(1)
        if not cell:
            continue
        total += float(cell)

    build.bu_build_size = total
    build.co_id = co_id
    build.bu_id = re.search('GCZHId=(.*?)$', bu_url).group(1)
    build.insert_db()
    self.get_house_info(co_id, build.bu_id)
def build_info(self, co_id, bu_id):
    """Fetch a building page, store the building and every house listed on it.

    Args:
        co_id: Project id (string; concatenated into the URL).
        bu_id: Building id (string; concatenated into the URL).

    Returns:
        None.

    Raises:
        AttributeError: If a building field is missing from the page.
        IndexError: If a house tag lacks its name or size element.
    """
    flags = re.S | re.M
    bu_url = 'http://www.lsjs.gov.cn/WebLSZFGB/ZNInfo.aspx?YSZID=' + bu_id + "&YSXMID=" + co_id
    bu_res = requests.get(bu_url, headers=self.headers)
    con = bu_res.text

    bu = Building(co_index)
    bu.co_id = co_id
    bu.bu_id = bu_id
    bu.bu_num = re.search('znxx">(.*?)</span', con).group(1)
    # The label contains parentheses. Unescaped, "(销)" was itself capture
    # group 1, so group(1) could never be the value. The character classes
    # accept ASCII or full-width parentheses, or none at all.
    bu.bu_all_house = re.search(
        r'纳入网上预[((]?销[))]?售总套数.*?">(.*?)</', con, flags).group(1)
    bu.bu_build_size = re.search(
        r'纳入网上预[((]?销[))]?售总面积.*?">(.*?)</', con, flags).group(1)
    bu.insert_db()

    html = etree.HTML(con)
    house_list = html.xpath("//span[@class='syt-span']")
    for tag in house_list:
        ho = House(co_index)
        ho.bu_id = bu_id
        ho.co_id = co_id
        ho.ho_name = tag.xpath(".//p[@class='ewb-num']/text()")[0]
        ho.ho_build_size = tag.xpath(".//p[@class='ewb-con']/text()")[0]
        ho.insert_db()
def get_build_info(self, build_url_list):
    """Fetch each building page, store the building and parse its houses.

    Args:
        build_url_list: Paths appended to ``http://www.fjnpfdc.com/House/``.
            The project id and building id are read from the path itself
            (``ProjectId=...&`` and ``BuildingId=...&P``).

    Returns:
        None. A building that fails is printed and skipped.
    """
    flags = re.S | re.M
    # Attribute name -> pattern whose first group holds the value.
    field_rules = (
        ('co_name', "项目名称:.*?<td.*?>(.*?)<"),
        ('bu_num', "幢 号:.*?<td.*?>(.*?)<"),
        ('co_use', "设计用途:.*?<td.*?>(.*?)<"),
        ('co_build_structural', "建筑结构:.*?<td.*?>(.*?)<"),
        ('bu_floor', "总 层 数:.*?<td.*?>(.*?)<"),
        ('bu_build_size', "总 面 积:.*?<td.*?>(.*?)<"),
        ('co_build_end_time', "竣工日期:.*?<td.*?>(.*?)<"),
    )
    for i in build_url_list:
        try:
            build = Building(co_index)
            page = requests.get('http://www.fjnpfdc.com/House/' + i, headers=self.headers)
            con = page.content.decode('gbk')
            for attr, pattern in field_rules:
                setattr(build, attr, re.search(pattern, con, flags).group(1))
            house_url_list = re.findall('<a href="(HouseInfo.*?)"', con)
            build.co_id = re.search('ProjectId=(.*?)&', i).group(1)
            build.bu_id = re.search('BuildingId=(.*?)&P', i).group(1)
            build.insert_db()
            self.get_house_info(house_url_list, build.bu_id, build.co_id)
        except Exception as e:
            print("co_index={},楼栋{}错误".format(co_index, i), e)
def bu_info(self,bu_list,co_id):
    """Fetch each building page, store the building and parse its houses.

    Args:
        bu_list: Paths appended to ``http://www.fxfdcw.com/``; the building
            id is the digits after ``bdid=``.
        co_id: Community id stored on each building and passed on to
            ``self.ho_info``.

    Returns:
        None. A building that fails is printed and skipped; its houses are
        not processed.
    """
    flags = re.S|re.M
    # Attribute name -> pattern whose first group holds the value.
    field_rules = (
        ('bu_num', '楼号.*?">(.*?)</'),
        ('bu_address', '坐落.*?">(.*?)</'),
        ('bu_floor', '地上层数.*?">(.*?)</'),
        ('bu_build_size', '建筑面积.*?wrap">(.*?)</'),
        ('bu_all_house', '套 数.*?">(.*?)</'),
        ('bu_type', '用 途.*?wrap">(.*?)</'),
    )
    for bu in bu_list:
        try:
            page = requests.get('http://www.fxfdcw.com/'+bu,headers=self.headers)
            con = page.content.decode('gbk')
            html = etree.HTML(con)
            build = Building(co_index)
            build.co_id = co_id
            build.bu_id = re.search('bdid=(\d+)',bu).group(1)
            for attr, pattern in field_rules:
                setattr(build, attr, re.search(pattern,con,flags).group(1))
            build.insert_db()
            ho_list = html.xpath("//span[@title]")
        except Exception as e:
            print("楼栋信息错误{}".format(e))
        else:
            # Houses are only parsed for buildings that were stored.
            self.ho_info(ho_list,co_id,build.bu_id)
def get_build_info(self, bu_pre_sale, bo_develops, bu_co_name, bu_con):
    """Parse one building from already-downloaded HTML and store it.

    Args:
        bu_pre_sale: Pre-sale licence value stored on the building.
        bo_develops: Developer value stored on the building.
        bu_co_name: Community name stored as ``co_name``.
        bu_con: HTML text of the building page.

    Returns:
        None.

    Raises:
        AttributeError: If any expected field is missing from ``bu_con``.
    """
    flags = re.S | re.M
    # Attribute name -> pattern whose first group holds the value.
    field_rules = (
        ('bu_id', '编号.*?>(\d+)<'),
        ('bu_num', '幢号.*?>(\d+)<'),
        ('bu_floor', '总层数.*?>(\d+)<'),
        ('bu_build_size', '预售建筑面积.*?>(\d+.\d+)<'),
        ('bu_address', '楼房坐落.*?;">(.*?)</span'),
        ('bu_live_size', '住宅建筑面积.*?>(\d+.\d+)<'),
        ('bu_not_live_size', '非住宅建筑面积.*?;">(.*?)</span'),
        ('bo_build_start_time', '开工日期.*?;">(.*?)</span'),
        ('bu_all_house', '总套数.*?>(\d+)<'),
    )
    build = Building(co_index)
    for attr, pattern in field_rules:
        setattr(build, attr, re.search(pattern, bu_con, flags).group(1))
    build.bu_pre_sale = bu_pre_sale
    build.bo_develops = bo_develops
    build.co_name = bu_co_name
    build.insert_db()
def get_comm_detail(self, href, comm):
    """Fill a community record from its detail page and store its buildings.

    Args:
        href: Path appended to ``self.URL_FRONT``.
        comm: Community record to populate. It is inserted only after the
            building loop, so a page without building links returns early
            without inserting it.

    Returns:
        None. A building that fails is reported by URL and skipped.
    """
    response = requests.get(url=self.URL_FRONT + href, headers=self.headers)
    # Community id: the text after the first '=' of the final URL.
    co_id = int(response.url.split('=')[1])
    html = response.content.decode('gbk')

    # Attribute name -> pattern handed to self.regex_common.
    comm_rules = (
        ('co_name', r'项目名称.*?<td.*?>(.*?)</td>'),  # community name
        ('co_owner', r'房屋所有权证号.*?<td.*?>(.*?)</td>'),
        ('co_use', r'用 途.*?<td.*?>(.*?)</td>'),
        ('co_develops', r'开 发 商.*?<td.*?>(.*?)</td>'),
        ('co_address', r'项目位置.*?<td.*?>(.*?)</td>'),
        ('co_pre_sale', r'预售证号.*?<td.*?>(.*?)</td>'),
        ('co_land_use', r'土地使用权证.*?<td.*?>(.*?)</td>'),
        ('co_land_type', r'土地权证类型.*?<td.*?>(.*?)</td>'),
        ('co_handed_time', r'终止日期.*?<td.*?>(.*?)</td>'),
        ('co_plan_pro', r'规划许可证.*?<td.*?>(.*?)</td>'),
        ('co_work_pro', r'施工许可证.*?<td.*?>(.*?)</td>'),
        ('co_type', r'项目类型.*?<td.*?>(.*?)</td>'),  # community type
        ('co_size', r'批准面积.*?<td.*?>(.*?)</td>'),  # land area
    )
    # Extract everything first, so the record is only touched afterwards.
    extracted = {attr: self.regex_common(pattern, html) for attr, pattern in comm_rules}
    comm.co_id = co_id
    for attr, value in extracted.items():
        setattr(comm, attr, value)

    # Building detail links.
    build_url_list = re.findall(r"<td><a href='(.*?)'", html, re.M | re.S)
    if not build_url_list:
        return

    detail_keys = ('bu_num', 'bu_build_size', 'co_address',
                   'co_build_end_time', 'co_build_type')
    for build_url in build_url_list:
        try:
            building = Building(self.CO_INDEX)
            # NOTE(review): these searches run over the whole page, so every
            # building gets the same id, unit count and price -- confirm.
            build_id = re.search(r'<td>(\d{2,6})</td>', html, re.M | re.S).group(1)  # building id
            bu_all_house = re.search(r'<td>(\d{1,3})</td>', html, re.M | re.S).group(1)  # total units
            price_cell = re.findall('<td>[\.\d]+</td>', html, re.M | re.S)[4]
            bu_price = re.search('\d+', price_cell).group()

            detail = self.get_build_detail(build_url)
            bu_num, bu_build_size, address, end_time, build_type = (
                detail[key] for key in detail_keys)

            if not end_time:
                building.co_is_build = '1'
            # Detail-page values overwrite those of the community page.
            comm.co_address = address
            comm.co_build_end_time = end_time
            comm.bu_build_size = bu_build_size
            comm.co_build_type = build_type

            building.bu_num = bu_num
            building.bu_build_size = bu_build_size
            building.bu_all_house = bu_all_house
            building.bu_id = build_id
            building.co_id = co_id
            building.bu_price = bu_price
            building.insert_db()
        except Exception as e:
            build_detail_url = self.URL_FRONT + build_url
            print('楼栋错误:', build_detail_url)
    comm.insert_db()
def get_build_url_list(self, url_list):
    """Crawl listing pages and store communities, buildings and houses.

    For every listing page each community block is stored, its page is
    fetched, each building row is stored, and the building's room data is
    fetched through the iframe's ``GetData`` endpoint; one house is stored
    per ``ROOM_NUMBER`` element.

    Args:
        url_list: Absolute URLs of listing pages.

    Returns:
        None. Any failure silently skips the current page, community,
        building or house. The module-level ``count`` is incremented and
        printed for every stored community.
    """
    global count
    flags = re.S | re.M
    for list_url in url_list:
        try:
            list_html = requests.get(list_url).content.decode('gbk')
            for comm_block in re.findall('项目名称.*?</dl>', list_html, flags):
                try:
                    comm_name = re.search('html">(.*?)</a>', comm_block, flags).group(1)
                    c = Comm(self.co_index)
                    c.co_name = comm_name
                    c.co_address = re.search('class="address"(.*?)</dd>', comm_block, flags).group(1)
                    c.area = re.search('"city">(.*?)</dd>', comm_block, flags).group(1)
                    c.co_develops = re.search('"average">(.*?)</dd>', comm_block, flags).group(1)
                    c.insert_db()
                    count += 1
                    print(count)

                    comm_path = re.search('a href="(.*?)">', comm_block, flags).group(1)
                    comm_html = requests.get(self.url_source + comm_path).content.decode('gbk')
                    build_table = re.search('楼盘表</td>(.*?)合 计', comm_html, flags).group(1)
                    for build_row in re.findall('<tr.*?</tr>', build_table, flags):
                        try:
                            b = Building(self.co_index)
                            b.co_name = comm_name
                            b.bu_all_house = re.search(
                                'absmiddle" />(.*?)</a>', build_row, flags).group(1)
                            bu_num = re.search(
                                '="absmiddle" />(.*?)</a></strong></', build_row, flags).group(1)
                            b.bu_num = bu_num
                            b.bu_build_size = re.search(
                                'td class="t_c">.*?td class="t_c">(.*?㎡)</td>', build_row, flags).group(1)
                            b.insert_db()

                            build_path = re.search('a href="(.*?)"', build_row, flags).group(1)
                            build_html = requests.get(self.url_source + build_path).content.decode('gbk')
                            # The room table lives in an iframe; its data
                            # endpoint is the same URL with Default -> GetData.
                            house_url = self.url_source + re.search(
                                '<iframe.*?"(.*?)"', build_html, flags).group(1)
                            logic_house_url = house_url.replace('Default', 'GetData')
                            logic_house_html = requests.get(url=logic_house_url).content.decode()
                            logic_id = re.search(
                                '<LOGICBUILDING_ID>(.*?)<', logic_house_html, flags).group(1)
                            final_url = 'http://www.yingtanfdc.com/website/presale/home/HouseTableControl/GetData.aspx?LogicBuilding_ID=' + logic_id
                            final_html = requests.get(url=final_url).content.decode('gbk')
                            for room_number in re.findall(
                                    '<ROOM_NUMBER>(.*?)</ROOM_NUMBER>', final_html, flags):
                                try:
                                    h = House(self.co_index)
                                    h.info = final_html
                                    h.ho_name = room_number
                                    h.co_name = comm_name
                                    h.bu_num = bu_num
                                    h.insert_db()
                                except Exception:
                                    continue
                        except Exception:
                            continue
                except Exception:
                    continue
        except Exception:
            continue