def get_build_info(self, build_logo_list, preid):
    """Scrape the Qingdao building-list page for each building id and
    persist one Building row per entry, then crawl its houses.

    build_logo_list: iterable of building ids appended to the list URL.
    preid: pre-sale project id, stored as Building.co_id.
    """
    for build_logo in build_logo_list:
        try:
            build_url = 'https://www.qdfd.com.cn/qdweb/realweb/fh/FhBuildingList.jsp?preid=' + build_logo
            html = requests.get(build_url, headers=self.headers).text
            bu_num_list = re.findall('javascript:showHouseStatus.*?>(.*?)</a', html, re.S | re.M)
            bu_all_house_list = re.findall(
                'javascript:showHouseStatus.*?center.*?center.*?center.*?center.*?center.*?>(.*?)<',
                html, re.S | re.M)
            house_code_list = re.findall("javascript:showHouseStatus\((.*?)\)'>", html, re.S | re.M)
            for idx, bu_num in enumerate(bu_num_list):
                try:
                    build = Building(co_index)
                    # quoted args of the showHouseStatus(...) js call
                    bu_code_list = re.findall('"(.*?)"', house_code_list[idx])
                    build.bu_num = bu_num
                    build.bu_all_house = bu_all_house_list[idx]
                    build.co_id = preid
                    build.bu_id = bu_code_list[0]
                    build.insert_db()
                    # NOTE(review): [2]→co_id, [1]→house_id per original order; confirm.
                    co_id = bu_code_list[2]
                    house_id = bu_code_list[1]
                    self.get_house_info(build.bu_id, co_id, house_id)
                except Exception as e:
                    print(e)
        except Exception as e:
            print('青岛楼栋问题,url:={}'.format(build_url), e)
def get_build_info(self, build_info_list, co_id, comm_html, url):
    """Persist one Building per pre-split row tuple, then crawl the
    building detail page located inside comm_html.

    build_info_list: indexable rows (name@1, units@3, size@5, price@9).
    url: source page URL, used only in the error message.
    """
    for row in build_info_list:
        try:
            building = Building(2)
            bu_name = row[1]                          # building name
            building.bu_name = bu_name
            building.bu_num = bu_name.split('#')[0]   # building number
            building.bu_all_house = row[3]            # total units
            building.bu_build_size = row[5]           # floor area
            building.bu_price = row[9]                # price
            building.co_id = co_id                    # community id
            build_html = re.search(r'楼盘表(.*?)个楼栋信息', comm_html).group(1)
            # NOTE(review): '<ahref=' has no space — confirm the source HTML
            # really omits it.
            build_url = re.search(r'<ahref="(.*?)">查看信息<', build_html).group(1)
            building.bu_id = re.search('buildingId=(.*?)$', build_url).group(1)
            building.insert_db()
            self.get_build_detail(build_url, co_id)
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, url), e)
def build_parse(self, co_id):
    """Crawl each building of a Changchun project, insert a Building row,
    then parse the houses from the same building page content."""
    list_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/loulist.jsp?Id_xmxq=' + co_id
    con = requests.get(list_url, headers=self.headers).content.decode()
    flags = re.S | re.M
    for build_id in re.findall("searchByLid\('(\d+)'\)", con):
        try:
            bu_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/lpbxx_new.jsp?lid=' + build_id
            bu_con = requests.get(bu_url, headers=self.headers).content.decode('gbk')
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_id = build_id
            bu.bu_num = re.search('楼栋名称.*?">(.*?)</td', bu_con, flags).group(1)
            bu.bu_all_house = re.search('总套数.*?">总(.*?)套</td', bu_con, flags).group(1)
            bu.bu_floor = re.search('地上层数.*?">共(.*?)层</td', bu_con, flags).group(1)
            bu.bu_build_size = re.search('总建筑面积.*?">(.*?)</td', bu_con, flags).group(1)
            bu.bu_pre_sale = re.search("searchysxk\('(.*?)'\)", bu_con, flags).group(1)
            bu.bu_type = re.search('房屋用途.*?">(.*?)</td', bu_con, flags).group(1)
            bu.insert_db()
        except Exception as e:
            log.error('{}楼栋错误{}'.format(build_id, e))
        # NOTE(review): runs even after a failure above — bu_con may then be
        # stale (previous iteration) or unbound on the first one; confirm intent.
        self.house_parse(co_id, build_id, bu_con)
def get_build_info(self, build_url_list, comm):
    """Fetch each pre-sale page, update the community's pre-sale fields,
    then insert a Building row per table row and crawl its houses.

    Fix: the inner loop reused ``i``, shadowing the outer loop's URL
    fragment; the inner variable is now ``row``.
    """
    for i in build_url_list:
        try:
            build_url = 'http://58.51.240.121:8503/' + i
            response = requests.get(build_url, headers=self.headers)
            html = response.text
            # pre-sale licence number and issue date for the community
            comm.co_pre_sale = re.search(
                'id="PresellInfo1_lblXkzh">(.*?)<', html, re.S | re.M).group(1)
            comm.co_pre_sale_date = re.search(
                'id="PresellInfo1_lblFzrq">(.*?)<', html, re.S | re.M).group(1)
            comm.insert_db()
            build_info_list = re.findall('<tr bgcolor="#FFFFFF">.*?</tr>', html, re.S | re.M)
            for row in build_info_list:
                build = Building(co_index)
                build.co_id = comm.co_id
                # positional <td> cells: number, floors, total units
                build.bu_num = re.search('<td.*?>(.*?)<', row, re.S | re.M).group(1)
                build.bu_floor = re.search('<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
                build.bu_all_house = re.search('<td.*?<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
                build.bu_id = re.search('PresellId=(.*?)$', build_url).group(1)
                build.insert_db()
                house_url = re.search('a href="(.*?)"', row, re.S | re.M).group(1)
                self.get_house_info(house_url, comm.co_id, build.bu_id)
        except Exception as e:
            print('请求错误,co_index={},url={}'.format(co_index, build_url), e)
def build_crawler(self, co_id, co_name, comm_con):
    """Parse the building table embedded in a community page, insert each
    row, and crawl house data for each successfully parsed building.

    Fix: the bare ``except:`` also swallowed SystemExit/KeyboardInterrupt;
    narrowed to ``except Exception``.
    """
    bu = Building(co_index, co_id=co_id, co_name=co_name)
    build_list = re.search('查看楼盘表.*?<tr>(.*?)</table>', comm_con, re.S | re.M).group(1)
    for bul in re.findall('<tr>(.*?)</tr>', build_list, re.S | re.M):
        try:
            buli = etree.HTML(bul).xpath("//td/text()")
            bu.bu_num = bu_num = buli[1]
            bu.bu_all_house = buli[2]
            bu.size = buli[3]
            house_url = re.search(r'"(.*?)" t', bul).group(1)
            bu.bu_id = bu_id = re.search('-(\d+)', house_url).group(1)
            bu.insert_db()
        except Exception:
            # row didn't match the expected layout — skip it
            continue
        self.house_crawler(house_url, bu_num, co_id, bu_id)
def get_build_info(self, url, co_id):
    """Parse one building detail page into a Building row, then parse the
    embedded house table into House rows.

    Fix: the innermost ``for i in range(...)`` shadowed the td-HTML string
    ``i`` of the enclosing loop; the index is now ``idx``.
    """
    try:
        building = Building(co_index)
        response = requests.get(url)
        html = response.text
        tree = etree.HTML(html)
        co_name = tree.xpath('//*[@id="PageB_Location"]/text()')[0]  # community name
        print(co_name)
        bu_name = tree.xpath('//*[@id="ItemName"]/text()')[0]        # building name
        bu_num = tree.xpath('//*[@id="PageB_HouseNo"]/text()')[0]    # building number
        bu_all_house = tree.xpath('//*[@id="lb_countbulidtaoshu"]/text()')[0]  # total units
        bu_floor = self.is_none(tree.xpath('//*[@id="cell3-1"]/text()'))       # floors
        bu_build_size = tree.xpath('//*[@id="lb_countbulidarea"]/text()')[0]   # built area
        bu_live_size = tree.xpath('//*[@id="lb_buildarea"]/text()')[0]         # residential area
        bu_price = self.is_none(tree.xpath('//*[@id="lb_buildavg"]/text()'))   # avg price
        bu_id = re.search('\?(\d+)$', url).group(1)                  # building id from URL
        building.co_id = co_id
        building.bu_name = bu_name
        building.bu_num = bu_num
        building.bu_all_house = bu_all_house
        building.bu_floor = bu_floor
        building.bu_build_size = bu_build_size
        building.bu_live_size = bu_live_size
        building.bu_price = bu_price
        building.bu_id = bu_id
        building.insert_db()
        # house table: everything from the row3 marker to end of page
        house_info_html = re.findall('<tr id="row3">(.*)$', html, re.S | re.M)[0]
        for cell in re.findall('(<td.*?>.*?</td>)', house_info_html, re.S | re.M):
            if '<br>' not in cell:
                continue
            ho_name_list = re.findall('<td.*?>(.*?)<br>', cell, re.S | re.M)
            ho_true_size_list = re.findall('<td.*?>.*?<br>(.*?)<br>', cell, re.S | re.M)
            ho_type = re.findall('<td.*?>.*?<br>.*?<br>(.*?)<br>', cell, re.S | re.M)[0]
            for idx in range(len(ho_name_list)):
                try:
                    if 'font' in ho_name_list[idx]:
                        # strip the highlight tag from sold/locked units
                        ho_name = re.sub('<font.*?>', '', ho_name_list[idx])
                    else:
                        ho_name = ho_name_list[idx]
                    house = House(8)
                    house.ho_name = ho_name
                    house.ho_true_size = ho_true_size_list[idx]
                    house.co_id = co_id
                    house.bu_id = bu_id
                    house.ho_type = ho_type
                    house.insert_db()
                except Exception as e:
                    print(e)
    except BaseException as e:
        print(e)
def build_info(self, co_id, temp_url_list):
    """Fetch each building-list page, insert a Building per table row,
    then crawl houses for the last row of each page."""
    for temp_url in temp_url_list:
        try:
            build_url = "http://222.77.178.63:7002/" + temp_url
            res = requests.get(build_url, headers=self.headers)
            html = etree.HTML(res.content.decode('gbk'))
            for build_info in html.xpath("//tr[@class='indextabletxt']"):
                bu = Building(co_index)
                ho_url = build_info.xpath("./td/a/@href")[0]
                bu.co_id = co_id
                bu.bu_id = re.search('Param=(.*)', ho_url).group(1)
                bu.bu_num = build_info.xpath("./td/a/text()")[0]
                bu.bu_all_house = build_info.xpath("./td[2]/text()")[0]
                # optional cells: total / residential area may be absent
                try:
                    bu.bu_all_size = build_info.xpath("./td[3]/text()")[0]
                except:
                    bu.bu_all_size = None
                try:
                    bu.bu_live_size = build_info.xpath("./td[5]/text()")[0]
                except:
                    bu.bu_live_size = None
                bu.insert_db()
        except Exception as e:
            print('楼栋信息错误{}'.format(e))
            continue
        # NOTE(review): uses ho_url/bu from the LAST row only — confirm intent.
        self.house_info(ho_url, co_id, bu.bu_id)
def build_info(self, build_url_list, co_id):
    """Fetch each Nanjing building page through the proxy helper (retrying
    forever on parse failure), insert the Building, then crawl its houses."""
    for build_ in build_url_list:
        build_url = "http://www.njhouse.com.cn/2016/spf/" + build_
        while True:
            build_pro = Proxy_contact(app_name="nanjing", method='get',
                                      url=build_url, headers=self.headers)
            build_con = build_pro.contact().decode('gbk')
            html = etree.HTML(build_con)
            bu = Building(co_index)
            bu.co_id = co_id
            try:
                bu.bu_id = re.search('buildid=(\d+)', build_).group(1)
                bu.bu_all_house = html.xpath("//tr[@class='yll']/td/text()")[1]
                bu.bu_num = re.search('13px;">(.*?)&nbsp;&nbsp;', build_con).group(1)
                bu.insert_db()
                house_url_list = html.xpath("//td/a[1]/@href")
                break
            except Exception as e:
                # NOTE(review): unbounded retry on a persistently bad page
                log.error("楼栋请求失败{}".format(e))
                continue
        self.house_info(co_id, bu.bu_id, house_url_list)
def get_build_info(self, build_url_list, co_id):
    """Parse the building-list section of each Nanchang project page and
    insert Building rows, then crawl the houses of each page.

    Fix: a local was named ``str``, shadowing the builtin; renamed to
    ``section``.
    """
    for i in build_url_list:
        build_url = 'http://gold.ncfdc.com.cn/' + i.replace('amp;', '')
        res = requests.get(build_url)
        co_name = re.search('ctl15_proname">(.*?)<', res.text, re.S | re.M).group(1)
        # the building table sits between these two markers
        section = re.search('项目楼栋列表.*?ctl17_fLinks_pDataShow', res.text, re.S | re.M).group()
        for info in re.findall('<tr>.*?</tr>', section, re.S | re.M):
            if 'href' not in info:
                continue
            try:
                build = Building(co_index)
                build.co_name = co_name
                build.bu_num = re.search('<tr>.*?<td>.*?<a href=.*?>(.*?)<', info, re.S | re.M).group(1)
                build.bu_pre_sale = re.search('onclick="BinSHouseInfo.*?>(.*?)<', info, re.S | re.M).group(1)
                build.bu_pre_sale_date = re.search('onclick="BinSHouseInfo.*?<td>(.*?)<', info,
                                                   re.S | re.M).group(1)
                build.bu_all_house = re.search('color:#ec5f00;">(.*?)<', info, re.S | re.M).group(1)
                build.bu_id = re.search("DisplayB_ld&hrefID=(.*?)'", info, re.S | re.M).group(1)
                build.co_id = co_id
                build.insert_db()
            except Exception as e:
                print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
        house_url_list = re.findall(
            "</span>.*?</td><td>.*?<a href='(.*?xs.*?)' target=\"_blank\">.*?查看",
            res.text, re.S | re.M)
        self.get_house_info(house_url_list)
def get_comm_info(self, comm_url, comm):
    """Fill the Comm object from the project page, insert it, then parse
    the building table and recurse into each building's house page."""
    try:
        html = requests.get(comm_url, headers=self.headers).text
        flags = re.S | re.M
        comm.co_id = re.search('jectcode=(.*?)"', html, flags).group(1)
        comm.co_name = re.search("项目名称:.*?<td.*?>(.*?)<", html, flags).group(1)
        comm.co_address = re.search('项目地址:.*?<td.*?>(.*?)<', html, flags).group(1)
        comm.co_develops = re.search('开发企业:.*?<td.*?>(.*?)<', html, flags).group(1)
        comm.co_owner = re.search('国土证书:.*?<td.*?>(.*?)<', html, flags).group(1)
        comm.area = re.search('行政区划:</th>.*?<td.*?>(.*?)<', html, flags).group(1)
        comm.insert_db()
        build_html = re.search('套房信息.*?</table>', html, flags).group()
        for row in re.findall('<tr.*?>.*?</tr>', build_html, flags):
            try:
                build = Building(co_index)
                build.co_id = comm.co_id
                build.bu_num = re.search('<td.*?>(.*?)</td', row, flags).group(1)
                build.bu_id = re.search('buildingcode=(.*?)&', row, flags).group(1)
                # positional cells: structure, total units, floors
                build.co_build_structural = re.search('<td.*?<td.*?<td.*?>(.*?)<', row, flags).group(1)
                build.bu_all_house = re.search('<td.*?<td.*?<td.*?<td.*?>(.*?)<', row, flags).group(1)
                build.bu_floor = re.search('<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', row, flags).group(1)
                build.insert_db()
                house_url = re.search('href="(.*?)"', row, flags).group(1)
                self.get_build_info(house_url, build.bu_id, comm.co_id)
            except Exception as e:
                print('楼栋错误,co_index={},url={}'.format(co_index, comm_url), e)
    except Exception as e:
        print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
def build_info(self, build_detail, co_id):
    """Fetch a Wuhan building-list page via the proxy helper, insert each
    building row, then crawl each building's house page (rate-limited)."""
    proxy = Proxy_contact(app_name='wuhan', method='get',
                          url=build_detail, headers=self.headers)
    build_res = proxy.contact()
    html = etree.HTML(build_res.decode('gb18030'))
    for info in html.xpath("//tr[@bgcolor='#FFFFFF']"):
        try:
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_floor = info.xpath('./td[3]/text()')[0]
            bu.bu_all_house = info.xpath('./td[4]/text()')[0]
            bu.bu_num = info.xpath('./td//span/text()')[0]
            temp_url = info.xpath('./td/a/@href')[0]
            bu.bu_id = re.search('HouseDengjh=(.*?\d+)', temp_url).group(1)
            bu.insert_db()
        except Exception as e:
            log.error('楼栋错误{}'.format(e))
            continue
        # re-encode both ids as GBK percent-escapes for the house endpoint
        a = parse.quote(re.search('DengJh=(.*?\d+)&', temp_url).group(1), encoding='gbk')
        b = parse.quote(re.search('HouseDengjh=(.*?\d+)', temp_url).group(1), encoding='gbk')
        bu_url = 'http://scxx.fgj.wuhan.gov.cn/5.asp?DengJh=' + a + '&HouseDengjh=' + b
        self.house_info(bu.bu_id, bu_url, co_id)
        time.sleep(3)  # throttle requests to the house endpoint
def get_build_info(self, build_url_list):
    """Drive ProducerListUrl over each Ningde building page.

    The Building attributes are deliberately set to REGEX PATTERNS here:
    ``build.to_dict()`` feeds them to ProducerListUrl as extraction rules,
    not as values.
    """
    for i in build_url_list:
        try:
            build = Building(co_index)
            build_url = 'http://www.ndjsj.gov.cn/House/' + i
            build.co_name = '项目名称:.*?<td.*?>(.*?)<'
            build.bu_num = '幢&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;号:.*?<td.*?>(.*?)<'
            build.bu_address = '坐落位置:.*?<td.*?>(.*?)<'
            build.co_build_structural = '建筑结构:.*?<td.*?>(.*?)<'
            build.bu_floor = '总&nbsp;层&nbsp;数:.*?<td.*?>(.*?)<'
            build.bu_build_size = '总&nbsp;面&nbsp;积:.*?<td.*?>(.*?)<'
            build.bu_all_house = '批准销售:.*?<td.*?>(.*?)<'
            producer = ProducerListUrl(
                page_url=build_url,
                request_type='get',
                encode='utf-8',
                analyzer_rules_dict=build.to_dict(),
                current_url_rule='javascript:ShowTitle.*?href="(.*?)"',
                analyzer_type='regex',
                headers=self.headers)
            house_url_list = producer.get_details()
            self.get_house_info(house_url_list)
        except Exception as e:
            print('宁德楼栋错误,url={}'.format(build_url), e)
def comm_info(self, con):
    """Insert the community parsed from *con* (an lxml tree), insert its
    building rows, and return the list of house-page URLs."""
    comm = Comm(co_index)
    comm.co_name = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_web_item_retail1_lb_item_name']/text()")[0]  # name
    co_id_str = con.xpath("//form[@id='aspnetForm']/@action")[0]
    comm.co_id = re.search(r"\d+", co_id_str).group(0)  # numeric id from the form action
    comm.co_address = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_seat']/text()")[0]       # address
    comm.co_develops = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_enter_name']/text()")[0]      # developer
    comm.co_size = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_area']/text()")[0]            # total area
    comm.co_build_size = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_area']/text()")[0]       # built area
    comm.co_build_end_time = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_ew_date']/text()")[0]    # completion date
    comm.co_plan_pro = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_program_pcode']/text()")[0]   # planning permit
    comm.co_work_pro = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_jg']/text()")[0]              # construction permit
    comm.co_green = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_item_green_rate']/text()")[0] # green ratio
    comm.co_land_use = con.xpath(
        "//span[@id='ctl00_ContentPlaceHolder2_lb_td']/text()")[0]              # land-use cert
    comm.insert_db()
    # NOTE(review): one Building instance is reused and re-inserted per row.
    build = Building(co_index)
    room_list = []
    for build_list in con.xpath("//tr[@style='color:#000066;']"):
        build.co_id = comm.co_id
        build.co_name = comm.co_name
        build_info = build_list.xpath("./td/text()")
        build.bu_id = build_info[0]
        build.bu_num = build_info[1]
        build.bu_all_house = build_info[2]
        build.size = build_info[3]
        build.bu_floor = build_info[4]
        build.bu_pre_sale = build_info[5]
        build.insert_db()
        room_list.append(build_list.xpath("./td/a/@href")[0])
    return room_list
def get_build_info(self, co_id, co_name):
    """Walk the Changzhou pre-sale list for a project, insert a Building per
    row, and crawl every house page.

    Fixes:
    - ``status_code is not 200`` compared an int by identity (wrong
      operator, implementation-dependent) — now ``!= 200``.
    - the three nested loops all reused ``i``; renamed to row/bu_row/href.
    - ``global building_id`` declared once at function top.
    """
    global building_id
    url = 'http://www.czhome.com.cn/Presell.asp?projectID=' + co_id + '&projectname=' + co_name
    response = requests.get(url, headers=self.headers)
    tree = etree.HTML(response.content.decode('gbk'))
    for row in tree.xpath('//tr[@class="indextabletxt"]')[1:]:
        build_url = row.xpath('td[2]/a/@href')[0]
        url = 'http://www.czhome.com.cn/' + build_url
        result = requests.get(url, headers=self.headers)
        if result.status_code != 200:
            print("co_index={},预售url:{}连接失败".format(co_index, url))
            continue
        html = result.content.decode('gbk')
        tree = etree.HTML(html)
        # building rows on the pre-sale page (skip the header)
        bu_xpath = tree.xpath('/html/body/table/tr/td/table/tr/td/table/tr')[1:]
        for bu_row in bu_xpath:
            try:
                building = Building(7)
                building_id += 1  # synthetic sequential id
                building.bu_id = building_id
                bu_all_house = bu_row.xpath('td[7]/text()')[0]  # total units
                bu_url = bu_row.xpath('td[1]/a/@href')[0]
                url = 'http://www.czhome.com.cn/' + bu_url
                response = requests.get(url, headers=self.headers)
                if response.status_code != 200:
                    print("co_index={},楼栋url:{}连接失败".format(co_index, url))
                    continue
                html = response.content.decode('gbk')
                tree = etree.HTML(html)
                # floors: last underlined cell of the third table
                bu_floor = tree.xpath(
                    '//*[@id="Table4"]/tr[2]/td/table[3]/tr/td[1]/u/text()')[-1]
                house_url_list = tree.xpath(
                    '//*[@id="Table4"]/tr[2]/td/table[3]/tr/td/a/@href')
                bu_address = re.search(
                    '<center><font color=.*? (.*?)<', html, re.S | re.M).group(1)
                building.bu_all_house = bu_all_house
                building.bu_address = bu_address
                building.bu_floor = bu_floor
                building.bu_id = building_id
                building.co_id = co_id
                building.insert_db()
                for href in house_url_list:
                    try:
                        house = House(7)
                        house_url = 'http://www.czhome.com.cn/' + href
                        self.get_house_info(house_url, house, co_id, building_id, building)
                    except Exception as e:
                        print(e)
            except Exception as e:
                print(e)
def get_comm_info(self, url, comm):
    """Fill and insert the Comm from the project page, then insert each
    building row and crawl its houses.

    Fix: ``i.xpath('string(td[1])')[0]`` indexed the FIRST CHARACTER of the
    building name (xpath('string(...)') already returns a str — compare the
    td[3] usage below); the ``[0]`` is removed.
    """
    try:
        response = requests.get(url=url, headers=self.headers)
        html = response.text
        tree = etree.HTML(html)
        box = '//*[@id="ctl00_CPH_M_sm_spfBox3"]/div/table/'
        co_name = tree.xpath(box + 'tr[1]/td[2]/text()')[0].strip()              # name
        co_address = tree.xpath(box + 'tr[2]/td[2]/text()')[0].strip()           # address
        co_build_start_time = tree.xpath(box + 'tr[3]/td[2]/text()')[0].strip()  # start date
        co_build_end_time = tree.xpath(box + 'tr[3]/td[4]/text()')[0].strip()    # end date
        co_build_structural = tree.xpath(box + 'tr[4]/td[2]/text()')[0].strip()  # structure
        co_volumetric = tree.xpath(box + 'tr[6]/td[4]/text()')[0].strip()        # plot ratio
        co_green = tree.xpath(box + 'tr[6]/td[2]/text()')[0].strip()             # green ratio
        co_size = tree.xpath(box + 'tr[5]/td[2]/text()')[0].strip()              # land area
        co_id = re.search('home/(.*?).html', url).group(1)
        comm.co_name = co_name
        comm.co_address = co_address
        comm.co_build_start_time = co_build_start_time
        comm.co_build_end_time = co_build_end_time
        comm.co_build_structural = co_build_structural
        comm.co_volumetric = co_volumetric
        comm.co_green = co_green
        comm.co_size = co_size
        comm.co_id = co_id
        comm.insert_db()
        build_info_list = tree.xpath(
            '//*[@id="ctl00_CPH_M_sm_spfBox1"]/div/table/tr[@class="hobuild"]')
        for i in build_info_list:
            try:
                build = Building(11)
                bu_name = i.xpath('string(td[1])')          # building name (full string)
                bu_all_house = i.xpath('td[2]/text()')[0]   # total units
                bu_id = i.xpath('td[1]/strong/a/@href')[0]
                bu_id = re.search('building_id=(.*?)$', bu_id).group(1)
                # NOTE(review): '�O' is mojibake of an area-unit suffix — confirm.
                bu_build_size = i.xpath('string(td[3])').replace('�O', '')
                build.co_id = co_id
                build.bu_id = bu_id
                build.bu_all_house = bu_all_house
                build.bu_name = bu_name
                build.bu_build_size = bu_build_size
                build.insert_db()
                self.get_house_info(bu_id, co_id)
            except Exception as e:
                print('楼栋错误,co_index={},url={}'.format(co_index, url), e)
    except BaseException as e:
        print('楼栋错误,co_index={},url={}'.format(co_index, url), e)
def get_comm_info(self, comm_id_list):
    """For each project id: insert the Comm, then each building row of its
    table, then the houses reachable from the table's link.

    Fix: the inner loop reused ``i``, shadowing the project id that had
    just been stored in ``comm.co_id``; the inner variable is now ``row``.
    """
    for i in comm_id_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://web.xxfdc.gov.cn/onlineQuery/projectInformation.do?xmId=' + i
            html = requests.get(comm_url, headers=self.headers).text
            flags = re.S | re.M
            comm.co_name = re.search('项目名称:.*?<td.*?>(.*?)<', html, flags).group(1)
            comm.co_address = re.search('项目地址:.*?<td.*?>(.*?)<', html, flags).group(1)
            comm.co_develops = re.search('开发商:.*?<td.*?>(.*?)<', html, flags).group(1)
            comm.co_all_house = re.search('已售总套数:.*?<td.*?>(.*?)<', html, flags).group(1)
            comm.co_build_size = re.search('已售总面积:.*?<td.*?>(.*?)<', html, flags).group(1)
            comm.area = re.search('行政区别:.*?<td.*?>(.*?)<', html, flags).group(1)
            comm.co_volumetric = re.search('容积率:.*?<td.*?>(.*?)<', html, flags).group(1)
            comm.co_id = i
            comm.insert_db()
            bu_html = re.search(
                '<table class="table table-bordered itemInfoDetail.*?</table>',
                html, flags).group()
            for row in re.findall('<tr>.*?</tr>', bu_html, flags)[1:]:
                try:
                    build = Building(co_index)
                    build.bu_num = re.search('<td>(.*?)<', row, flags).group(1)
                    build.bu_all_house = re.search('<td>.*?<td>.*?<td>(.*?)<', row, flags).group(1)
                    build.bu_id = re.search('buildId=(.*?)&', row, flags).group(1)
                    build.co_id = comm.co_id
                    build.insert_db()
                    # NOTE(review): searched in bu_html (whole table), not in
                    # this row — every building gets the FIRST link; confirm.
                    house_url = re.search('<a href="(.*?)"', bu_html, flags).group(1)
                    html = requests.get(house_url, headers=self.headers).text
                    house_url_list = re.findall(
                        '<td width="110">.*?<a.*?href="(.*?)"', html, flags)
                    self.get_house_info(house_url_list, build.bu_id, comm.co_id)
                except Exception as e:
                    print('楼栋错误,co_index={},url={}'.format(co_index, house_url), e)
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
def get_build_info(self, comm_url_list):
    """For each community URL extract (sid, pid), insert the building's
    record, then insert every house parsed from the XML house feed."""
    for i in comm_url_list:
        try:
            ids = re.findall('\+(\d+)\+', i)
            sid, pid = ids[0], ids[1]
            build_url = ('http://www.jjzzfdc.com.cn/WebClient/ClientService/'
                         'bldg_query.aspx?pid=' + pid + '&sid=' + sid)
            html = requests.get(build_url).text
            flags = re.S | re.M
            build = Building(co_index)
            build.bu_id = pid
            # NOTE(review): bu_num and bu_address use the same pattern — confirm.
            build.bu_num = re.search('楼栋座落.*?<td.*?>(.*?)<', html, flags).group(1)
            build.bu_address = re.search('楼栋座落.*?<td.*?>(.*?)<', html, flags).group(1)
            build.bu_pre_sale = re.search('预售证号.*?">(.*?)&nbsp', html, flags).group(1)
            build.bu_pre_sale_date = re.search('时间.*?">(.*?)&nbsp', html, flags).group(1)
            build.bu_all_house = re.search('dM.*?">(.*?)&nbsp', html, flags).group(1)
            build.insert_db()
        except Exception as e:
            print('co_index={}, 楼栋错误,url={}'.format(co_index, build_url), e)
        house_url = ('http://www.jjzzfdc.com.cn/WebClient/ClientService/'
                     'proxp.aspx?key=WWW_LPB_001&params=' + sid)
        html_ = requests.get(house_url).text
        for house_info in re.findall('<Result.*?</Result>', html_, re.S | re.M):
            try:
                house = House(co_index)
                house.bu_id = build.bu_id
                house.bu_num = build.bu_num
                house.ho_name = re.search('<ONAME>(.*?)</ONAME>', house_info, re.S | re.M).group(1)
                house.ho_num = re.search('<OSEQ>(.*?)</OSEQ>', house_info, re.S | re.M).group(1)
                house.ho_build_size = re.search('<BAREA>(.*?)</BAREA>', house_info, re.S | re.M).group(1)
                house.ho_floor = re.search('<FORC>(.*?)</FORC>', house_info, re.S | re.M).group(1)
                house.ho_true_size = re.search('<PAREA>(.*?)</PAREA>', house_info, re.S | re.M).group(1)
                house.insert_db()
            except Exception as e:
                print('co_index={}, 房号错误'.format(co_index), e)
def build_info(self, bu_info_list, co_id):
    """Insert one Building per row element of *bu_info_list* (lxml rows).

    Fix: ``log.error('楼栋信息错误', e)`` passed the exception as a
    %-format argument with no placeholder in the message; pre-format with
    ``str.format`` as the rest of this module does.
    """
    for bu_info in bu_info_list:
        try:
            bu = Building(co_index)
            url = bu_info.xpath("./@onclick")[0]
            bu.bu_id = re.search('dbh=(\d+)', url).group(1)  # id from the onclick handler
            bu.co_id = co_id
            bu.bu_num = bu_info.xpath("./td[@class='org']/text()")[0]
            bu.bu_all_house = bu_info.xpath("./td[3]/text()")[0]
            bu.size = bu_info.xpath("./td[2]/text()")[0]
            bu.insert_db()
        except Exception as e:
            log.error('楼栋信息错误{}'.format(e))
def get_build_info(self, build_info_list, co_id):
    """Insert a Building per HTML fragment and crawl each building's
    house page."""
    for fragment in build_info_list:
        try:
            flags = re.S | re.M
            build = Building(co_index)
            # positional <td> cells: number, total units, total area
            build.bu_num = re.search('<td>(.*?)</td>', fragment, flags).group(1)
            build.bu_all_house = re.search('<td>.*?<td>(.*?)</td>', fragment, flags).group(1)
            build.bu_all_size = re.search('<td>.*?<td>.*?<td>(.*?)</td>', fragment, flags).group(1)
            build.bu_id = re.search('\?id=(.*?)"', fragment, flags).group(1)
            build.co_id = co_id
            build.insert_db()
            house_url = re.search('href="(.*?)"', fragment, flags).group(1)
            self.get_house_info(house_url, co_id, build.bu_id)
        except Exception as e:
            print('楼栋错误,co_index={},str={}'.format(co_index, fragment), e)
def get_comm_info(self, comm_url_list):
    """Insert the Comm parsed from each Ningbo page, then one Building per
    window.open entry, and crawl the house pages."""
    for i in comm_url_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://old.newhouse.cnnbfdc.com/' + i
            html = requests.get(comm_url, headers=self.headers).text
            flags = re.S | re.M
            comm.co_name = re.findall('项目名称:.*?<span.*?>(.*?)<', html, flags)[0].strip()
            comm.co_address = re.findall('项目地址:.*?<td.*?>(.*?)<', html, flags)[0].strip()
            comm.co_develops = re.findall('开发公司:.*?<td.*?>(.*?)<', html, flags)[0].strip()
            comm.co_pre_sale = re.findall('预\(现\)售证名称:.*?<td.*?>(.*?)<', html, flags)[0].strip()
            comm.co_build_size = re.findall(
                '纳入网上可售面积:.*?<img.*?>(.*?)<', html, flags)[0].replace('m²', '').strip()
            comm.co_all_house = re.findall(
                '纳入网上可售套数:.*?<img.*?>(.*?)<', html, flags)[0].replace('套', '').strip()
            comm.area = re.findall('所在区县:.*?<td.*?>(.*?)<', html, flags)[0].strip()
            comm.co_id = re.findall('mobanshow.aspx\?projectid=(.*?)"', html, flags)[0].strip()
            comm.insert_db()
            global count
            count += 1          # running total of communities processed
            print(count)
            build_url_list = re.findall("window.open\('(.*?)'", html, flags)
            bu_name_list = re.findall("window.open.*?<font.*?>(.*?)<", html, flags)
            bu_all_house_list = re.findall("window.open.*?<td.*?>(.*?)<", html, flags)
            qrykey = re.findall("qrykey=(.*?)&", html, flags)
            for index in range(len(build_url_list)):
                try:
                    build = Building(co_index)
                    build.bu_name = bu_name_list[index].strip()
                    build.bu_all_house = bu_all_house_list[index].strip()
                    build.co_id = comm.co_id
                    build.bu_id = qrykey[index].strip()
                    build.insert_db()
                except Exception as e:
                    print(e)
            self.get_house_info(build_url_list)
        except Exception as e:
            print(e)
def get_build_info(self, build_url_list, co_name):
    """Follow each building list to its detail pages, insert one Building
    per detail page, then crawl its houses."""
    for i in build_url_list:
        try:
            build = Building(co_index)
            build.co_name = co_name
            build_url = 'http://www.sxczfdc.com/pubinfo/' + i
            html = requests.get(build_url, headers=self.headers).text
            for k in re.findall('(Pub_dtxx.aspx\?ProjectBuildingID=.*?)"', html, re.S | re.M):
                try:
                    detail_url = 'http://www.sxczfdc.com/pubinfo/' + k
                    content = requests.get(detail_url, headers=self.headers).text
                    flags = re.S | re.M
                    build.bu_num = re.findall('BuildingInfo1_lblBuildingName">(.*?)<', content, flags)[0]
                    build.bu_all_house = re.findall('BuildingInfo1_lblZts">(.*?)<', content, flags)[0]
                    build.bu_floor = re.findall('BuildingInfo1_lblZcs">(.*?)<', content, flags)[0]
                    build.bu_build_size = re.findall('BuildingInfo1_lblJzmj">(.*?)<', content, flags)[0]
                    build.bu_live_size = re.findall('BuildingInfo1_lblZzmj">(.*?)<', content, flags)[0]
                    build.bu_pre_sale = re.findall('BuildingInfo1_lblYsxkzh">(.*?)<', content, flags)[0]
                    build.bu_pre_sale_date = re.findall(
                        'BuildingInfo1_lblYsxkzfzrq">(.*?)<', content, flags)[0]
                    build.insert_db()
                    house_url_list = re.findall(
                        "onClick=.getMoreHouseInfo\('(.*?)'\)", content, flags)
                    self.get_house_info(house_url_list, co_name, build.bu_num)
                except Exception as e:
                    print(e)
        except Exception as e:
            print(e)
def get_build_detail(self, all_building_url_list):
    """Parse each Dongguan building page, insert Building rows, and return
    the accumulated list of room-table URLs."""
    house_url_list = []
    for i in all_building_url_list:
        try:
            html = requests.get(i, headers=self.headers).text
            tree = etree.HTML(html)
            bo_develops = tree.xpath('//*[@id="content_1"]/div[3]/text()[2]')[0]  # developer
            bu_build_size = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[6]/a/text()')  # sale area
            if bu_build_size:
                bu_build_size = bu_build_size[0]
            bu_pre_sale = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[1]/a/text()')    # permit
            if bu_pre_sale:
                bu_pre_sale = bu_pre_sale[0]
            bu_floor = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[3]/a/text()')[0]    # floors
            bu_all_house = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[4]/a/text()')[0]  # units
            bu_type = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[5]/a/text()')[0]     # usage
            build_html = re.search('houseTable_1.*?当前共有', html, re.S | re.M).group()
            build_detail_html = re.findall(
                'class.*?</a></td>.*?</a></td>.*?</a></td>', build_html, re.S | re.M)
            bu_num = re.findall('项目名称:</b>(.*?)</div>', html, re.S | re.M)[0].strip()
            url_list = []
            for bu in build_detail_html:
                try:
                    build = Building(co_index)
                    build.bu_id = re.search("href='roomTable.aspx\?id=(.*?)&", bu, re.S | re.M).group(1)
                    build.bu_address = re.search(
                        "_blank.*?_blank'>(.*?)</a></td><td>", bu, re.S | re.M).group(1).strip()
                    build.bo_develops = bo_develops
                    build.bu_build_size = bu_build_size
                    build.bu_pre_sale = bu_pre_sale
                    build.bu_num = bu_num
                    build.bu_floor = bu_floor
                    build.bu_all_house = bu_all_house
                    build.bu_type = bu_type
                    for k in self.area_list:
                        if k in build.bu_address:
                            build.area = k
                            # NOTE(review): no-op as last statement — probably
                            # meant break; preserved as-is.
                            continue
                    build.insert_db()
                    house_url = re.search("(roomTable.aspx\?id=.*?&vc=.*?)'", bu, re.S | re.M).group(1)
                    url_list.append('http://dgfc.dg.gov.cn/dgwebsite_v2/Vendition/' + house_url)
                except Exception as e:
                    print('楼栋错误,co_index={},url={}'.format(co_index, i), e)
            house_url_list = url_list + house_url_list
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, i), e)
    return house_url_list
def get_build_info(self, all_build_url_list):
    """Drive ProducerListUrl with regex extraction rules and return the
    house URL list.

    The Building attributes here are REGEX PATTERNS consumed via
    ``to_dict()`` as extraction rules, not values.
    """
    b = Building(co_index)
    b.co_id = "onclick=GetData\('(.*?)',"
    b.bu_id = "onclick=GetData\('.*?','(.*?)'"
    b.bu_num = "font12yellow-leftA'>.*?</span>套</td><td>.*?</td><td>(.*?)<"
    b.bu_all_house = "font12yellow-leftA'>(.*?)<"
    producer = ProducerListUrl(
        page_url=all_build_url_list,
        request_type='get',
        encode='utf-8',
        analyzer_rules_dict=b.to_dict(),
        current_url_rule="onclick=GetData\('(.*?)','(.*?)'\)",
        analyzer_type='regex',
        headers=self.headers)
    return producer.get_details()
def get_comm_info(self, comm_url_list):
    """Insert each Ningbo community, then one Building per window.open
    entry, then crawl the building's house page."""
    for i in comm_url_list:
        comm = Comm(co_index)
        comm_url = 'http://old.newhouse.cnnbfdc.com/' + i
        try:
            response = requests.get(comm_url, headers=self.headers)
        except Exception as e:
            print("{}城市无法访问小区{}".format(city, comm_url), e)
            continue
        html = response.text
        con = etree.HTML(html)
        flags = re.S | re.M
        comm.co_id = re.search('id=(\d+)', i).group(1)
        comm.co_name = re.findall('项目名称:.*?<span.*?>(.*?)<', html, flags)[0]
        comm.co_address = re.findall('项目地址:.*?<td.*?>(.*?)<', html, flags)[0]
        comm.co_develops = re.findall('开发公司:.*?<td.*?>(.*?)<', html, flags)[0]
        comm.co_pre_sale = re.findall('售证名称:.*?<td.*?>(.*?)<', html, flags)[0]
        comm.co_build_size = re.findall('纳入网上可售面积:.*?<img.*?>(.*?)<', html, flags)[0]
        comm.co_all_house = re.findall('纳入网上可售套数:.*?<img.*?>(.*?)<', html, flags)[0]
        comm.area = re.findall('所在区县:.*?<td.*?>(.*?)<', html, flags)[0]
        comm.insert_db()
        bu_all_house_list = re.findall('window.open.*?center.*?center.*?>(.*?)<', html, flags)
        try:
            # NOTE(review): findall doesn't raise on no match — this except
            # never fires as intended; preserved as-is.
            bu_url_list = re.findall("window\.open\('(.*?)'", html, flags)
        except Exception as e:
            print("{}城市{}小区无楼栋".format(city, comm.co_name), e)
            continue
        for idx in range(len(bu_url_list)):
            build = Building(co_index)
            bu_url = bu_url_list[idx]
            build.bu_all_house = bu_all_house_list[idx]
            build.co_name = comm.co_name
            build.bu_num = con.xpath("//a[@href='#']/@title")[idx]
            build.bu_id = re.search('key=(\d+)&', bu_url).group(1)
            build.co_id = comm.co_id
            build.insert_db()
            self.get_house_info(bu_url, build.bu_id)
def bu_parse(self, detail_url, co_id):
    """Walk the pre-sale pages of a Zhoushan project, insert a Building per
    block page (retrying through random proxies), then parse its houses."""
    pre_url = detail_url.replace('lp', 'presell')
    pre_res = requests.get(pre_url, headers=self.headers)
    pre_html = etree.HTML(pre_res.text)
    for bu_pre in pre_html.xpath("//dt/strong/a"):
        bu_pre_url = bu_pre.xpath("./@href")[0]
        bu_pre_sale = bu_pre.xpath("./text()")[0]
        bu_url = 'http://www.zstmsf.com' + bu_pre_url
        # retry until one of the 10 random proxies succeeds
        while True:
            try:
                proxy = self.proxies[random.randint(0, 9)]
                bu_res = requests.get(bu_url, headers=self.headers,
                                      proxies=proxy, timeout=10)
                break
            except:
                continue
        bu_html = etree.HTML(bu_res.text)
        for bo_url in bu_html.xpath("//tr//strong/a/@href"):
            ho_url = "http://www.zstmsf.com" + bo_url
            while True:
                try:
                    proxy = self.proxies[random.randint(0, 9)]
                    ho_res = requests.get(ho_url, headers=self.headers,
                                          proxies=proxy, timeout=10)
                    break
                except:
                    continue
            build = Building(co_index)
            build.co_id = co_id
            build.bu_id = re.search('zid=.*?(\d+)', ho_url).group(1)
            build.bu_num = re.search('幢名称:<strong>(.*?)<', ho_res.text).group(1)
            build.bu_all_house = re.search("幢总套数.*?'>(.*?)</", ho_res.text).group(1)
            build.bu_all_size = re.findall("面积.*?'>(.*?)</", ho_res.text)[0]
            build.bu_pre_sale = bu_pre_sale
            build.insert_db()
            self.ho_parse(co_id, build.bu_id, ho_res)
def get_comm_info(self, comm_url, comm):
    """Fill the developer field of *comm*, insert it, then walk the
    project's pre-sale building pages inserting Building rows and crawling
    their houses.

    Fix: both loops reused ``i`` (the second shadowed the (PreSell, Start)
    tuple of the first); renamed to ``ids`` and ``row``.
    """
    co_url = 'http://www.fangdi.com.cn/' + comm_url
    html = requests.get(co_url, headers=self.headers).content.decode('gbk')
    comm.co_develops = re.search('企业名称:.*?<a.*?>(.*?)<', html, re.S | re.M).group(1)
    comm.insert_db()
    add_build_url = 'http://www.fangdi.com.cn/Presell.asp?projectID=' + comm.co_id
    html_str = requests.get(add_build_url, headers=self.headers).content.decode('gbk')
    build_detail_tuple_list = re.findall(
        "javascript:SetSelect\(.*?,.*?,.*?,.*?,.*?,'(.*?)','(.*?)'\)",
        html_str, re.S | re.M)
    for ids in build_detail_tuple_list:
        PreSell_ID = ids[0]
        Start_ID = ids[1]
        # NOTE(review): ProjectID token is hard-coded — confirm it is
        # ignored by the server or derive it from comm.co_id.
        build_detail_url = ('http://www.fangdi.com.cn/building.asp?ProjectID='
                            'OTU4OHwyMDE4LTQtNHwxNw&PreSell_ID=' + PreSell_ID +
                            '&Start_ID=' + Start_ID)
        massage = requests.get(build_detail_url, headers=self.headers).content.decode('gbk')
        for row in re.findall('class="indextabletxt">.*?</tr>', massage, re.S | re.M):
            try:
                build = Building(co_index)
                build.bu_num = re.search('<a.*?>(.*?)</a>', row, re.S | re.M).group(1)
                build.bu_all_house = re.search(
                    '<a.*?<td.*?<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
                build.bu_build_size = re.search(
                    '<a.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
                build.bu_id = re.search('Param=(.*?)=', row, re.S | re.M).group(1)
                build.co_id = comm.co_id
                build.insert_db()
                house_url = re.search('href="(.*?)"', row, re.S | re.M).group(1)
                self.get_house_info(house_url, build.bu_id, build.co_id)
            except Exception as e:
                print('楼栋错误,co_index={},url={}'.format(co_index, build_detail_url), e)
def build_parse(self, co_id):
    """Scrape all buildings of one ccfdw.gov.cn project and insert them.

    co_id: project id appended to the building-list URL and stored on each row.

    Side effects: HTTP GETs, Building.insert_db() per building, and
    self.house_parse() per successfully parsed building page.
    """
    list_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/loulist.jsp?Id_xmxq=' + co_id
    res = requests.get(list_url, headers=self.headers)
    con = res.content.decode()
    build_id_list = re.findall(r"searchByLid\('(\d+)'\)", con)
    for build_id in build_id_list:
        try:
            bu_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/lpbxx_new.jsp?lid=' + build_id
            bu_res = requests.get(bu_url, headers=self.headers)
            bu_con = bu_res.content.decode('gbk')
            # Fresh Building per iteration: the original reused one instance,
            # so fields from a previous building leaked into the next row
            # whenever a regex below failed part-way through.
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_id = build_id
            bu.bu_num = re.search('楼栋名称.*?">(.*?)</td', bu_con, re.S | re.M).group(1)
            bu.bu_all_house = re.search('总套数.*?">总(.*?)套</td', bu_con, re.S | re.M).group(1)
            bu.bu_floor = re.search('地上层数.*?">共(.*?)层</td', bu_con, re.S | re.M).group(1)
            bu.bu_build_size = re.search('总建筑面积.*?">(.*?)</td', bu_con, re.S | re.M).group(1)
            bu.bu_pre_sale = re.search(r"searchysxk\('(.*?)'\)", bu_con, re.S | re.M).group(1)
            bu.bu_type = re.search('房屋用途.*?">(.*?)</td', bu_con, re.S | re.M).group(1)
            bu.insert_db()
            # Moved inside the try: the original called this after the except,
            # where bu_con could be stale (previous iteration) or unbound if
            # the very first request failed.
            self.house_parse(co_id, build_id, bu_con)
        except Exception as e:
            log.error('{}楼栋错误{}'.format(build_id, e))
def get_build_info(self, build_url_list):
    """Scrape one building detail page per (id,) tuple in *build_url_list*.

    build_url_list: iterable of sequences whose first element is the buildID
                    appended to the selectBuild.jsp URL.

    Side effects: HTTP GETs, Building.insert_db() per building, and
    self.get_house_info() per building.
    """
    for i in build_url_list:
        # Build the URL before anything that can raise, so the except
        # handler can always reference it (the original assigned it after
        # Building(co_index), risking UnboundLocalError in the handler).
        build_url = 'http://222.223.160.199:8088/website/buildquery/selectBuild.jsp?buildID=' + i[0]
        try:
            build = Building(co_index)
            response = requests.get(build_url, headers=self.headers)
            html = response.text
            build.bu_id = i[0]
            # NOTE(review): the two attribute names below use 'co_'/'bo_'
            # prefixes while the rest use 'bu_' — looks like typos, but they
            # must match the Building model's fields; confirm before renaming.
            build.co_build_structural = re.search('结构类型.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            build.bo_build_end_time = re.search('建成年份.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            build.bu_build_size = re.search('总建筑面积.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            build.bu_num = re.search('幢号.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            build.size = re.search('占地面积.*?<td>(.*?)<', html, re.S | re.M).group(1)
            build.bu_floor = re.search('房屋层数.*?<td>(.*?)<', html, re.S | re.M).group(1)
            build.bu_all_house = re.search('房屋套数.*?<td>(.*?)<', html, re.S | re.M).group(1)
            build.area = re.search('坐落区.*?<td>(.*?)<', html, re.S | re.M).group(1)
            build.insert_db()
            self.get_house_info(build.bu_id)
        except Exception as e:
            print('请求错误,url={}'.format(build_url), e)
def get_build_info(self, build_url):
    """Scrape every building row from one gzbjfc.com house-table page.

    build_url: full Table.aspx URL, e.g.
               http://www.gzbjfc.com/House/Table.aspx?xmmc=...&yszh=...&dongID=...
               (this example was a stray no-op string-literal statement in
               the original body).

    Side effects: HTTP GET, Building.insert_db() per row, and one
    self.get_house_info() call for the page's house URLs.
    """
    try:
        response = requests.get(build_url, headers=self.headers)
        html = response.text
        bu_id_list = re.findall('cph_hb1_dg1.*?center.*?center.*?<td>(.*?)<', html, re.S | re.M)
        # Page-level values, parsed once and shared by every row.
        co_id = re.findall('hdl1_hfYszh" value="(.*?)"', html, re.S | re.M)[0]
        bu_num = self.get_build_num(co_id)
        bu_all_house_list = re.findall('cph_hb1_dg1.*?center.*?center.*?<td>.*?<td>(.*?)<', html, re.S | re.M)
        house_url_list = re.findall('cph_hb1_dg1.*?<a.*?href="(.*?)"', html, re.S | re.M)
        for bu_id, bu_all_house in zip(bu_id_list, bu_all_house_list):
            # Fresh Building per row instead of mutating one shared instance.
            build = Building(co_index)
            build.co_id = co_id
            build.bu_num = bu_num
            build.bu_id = bu_id
            build.bu_all_house = bu_all_house
            build.insert_db()
        # Hoisted out of the loop: the original invoked this once per row
        # with the SAME full list, re-processing every house N times.
        if bu_id_list:
            self.get_house_info(house_url_list)
    except Exception as e:
        print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
def get_build_info(self, co_id):
    """Scrape every building row of one project from the ZHList.aspx page.

    co_id: project id interpolated into the listing URL and stored on each
           Building row.

    Side effects: HTTP GET, Building.insert_db() per row, and
    self.get_house_url() per building.
    """
    # Built before the try (and via format, which accepts non-str ids), so
    # the except handler can always reference build_url — the original's
    # '+' concatenation could raise first, leaving build_url unbound in
    # the handler.
    build_url = 'http://222.184.103.50:7700/WW/ZHList.aspx?projectID={}&projectname='.format(co_id)
    try:
        response = requests.get(build_url, headers=self.headers)
        html = response.text
        build_info_list = re.findall('<tr bgcolor="#f5f5f5">.*?</tr>', html, re.S | re.M)
        for row in build_info_list:
            try:
                build = Building(co_index)
                build.bu_num = re.search('<a id="LH".*?>(.*?)<', row, re.S | re.M).group(1).strip()
                build.bu_all_house = re.search('<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1).strip()
                build.bu_id = re.search('ZNo=(.*?)"', row, re.S | re.M).group(1).strip()
                build.co_id = co_id
                build.insert_db()
                self.get_house_url(build.bu_id, co_id)
            except Exception as e:
                # Per-row guard (matching the sibling scrapers): one
                # malformed row no longer skips all remaining rows.
                print('请求错误,co_index={},url={}'.format(co_index, build_url), e)
    except Exception as e:
        print('请求错误,co_index={},url={}'.format(co_index, build_url), e)