def get_build_info(self, build_info_list, co_id, comm_html, url):
    """Insert one Building record per row, then fetch its detail page.

    Args:
        build_info_list: Rows indexed positionally; this method reads
            index 1 (building name), 3 (total units), 5 (area) and
            9 (price).
        co_id: Community id stored on every building record.
        comm_html: Community page HTML from which the building detail URL
            and building id are extracted by regex.
        url: Only used in the error message.

    A failure on one row is printed and the loop moves on to the next row.
    """
    for row in build_info_list:
        try:
            record = Building(2)

            name = row[1]
            record.bu_name = name  # building name
            record.bu_num = name.split('#')[0]  # building number: text before '#'
            record.bu_all_house = row[3]  # total number of units
            record.bu_build_size = row[5]  # area
            record.bu_price = row[9]  # price
            record.co_id = co_id  # community id

            # NOTE(review): comm_html is the same on every iteration, so these
            # searches return the same URL/id for every row -- confirm intended.
            table_html = re.search(r'楼盘表(.*?)个楼栋信息', comm_html).group(1)
            detail_url = re.search(r'<ahref="(.*?)">查看信息<', table_html).group(1)
            record.bu_id = re.search('buildingId=(.*?)$', detail_url).group(1)  # building id

            record.insert_db()
            self.get_build_detail(detail_url, co_id)
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, url), e)
def get_build_info(self, url, co_id):
    """Scrape one building page and store the building plus its units.

    Downloads ``url``, reads the building fields from elements with fixed
    ids, inserts one ``Building`` record, then walks the ``<td>`` cells that
    follow ``<tr id="row3">`` and inserts one ``House`` record per unit name
    found in a cell.

    Args:
        url: Building page URL; its trailing ``?<digits>`` is the building id.
        co_id: Id of the community the building belongs to.

    Errors are printed instead of raised: a failure on a single unit skips
    only that unit, any other failure aborts the rest of the page.
    """
    flags = re.S | re.M
    try:
        building = Building(co_index)
        response = requests.get(url)
        html = response.text
        tree = etree.HTML(html)

        co_name = tree.xpath('//*[@id="PageB_Location"]/text()')[0]  # community name
        print(co_name)

        bu_id = re.search(r'\?(\d+)$', url).group(1)  # building id

        building.co_id = co_id
        # building name
        building.bu_name = tree.xpath('//*[@id="ItemName"]/text()')[0]
        # building number
        building.bu_num = tree.xpath('//*[@id="PageB_HouseNo"]/text()')[0]
        # total number of units
        building.bu_all_house = tree.xpath('//*[@id="lb_countbulidtaoshu"]/text()')[0]
        # floors; may be absent, so is_none() receives the raw xpath result list
        building.bu_floor = self.is_none(tree.xpath('//*[@id="cell3-1"]/text()'))
        # building area
        building.bu_build_size = tree.xpath('//*[@id="lb_countbulidarea"]/text()')[0]
        # residential area
        building.bu_live_size = tree.xpath('//*[@id="lb_buildarea"]/text()')[0]
        # residential price; may be absent, handled like bu_floor
        building.bu_price = self.is_none(tree.xpath('//*[@id="lb_buildavg"]/text()'))
        building.bu_id = bu_id
        building.insert_db()

        house_info_html = re.findall('<tr id="row3">(.*)$', html, flags)[0]
        for cell in re.findall('(<td.*?>.*?</td>)', house_info_html, flags):
            # Only cells containing <br> are parsed for units.
            if '<br>' not in cell:
                continue
            ho_name_list = re.findall('<td.*?>(.*?)<br>', cell, flags)
            ho_true_size_list = re.findall('<td.*?>.*?<br>(.*?)<br>', cell, flags)
            ho_type = re.findall('<td.*?>.*?<br>.*?<br>(.*?)<br>', cell, flags)[0]
            for idx, raw_name in enumerate(ho_name_list):
                try:
                    house = House(8)
                    # Drop any opening <font ...> tag wrapped around the name.
                    house.ho_name = re.sub('<font.*?>', '', raw_name)
                    # Indexed (not zipped) so a missing size is reported per unit.
                    house.ho_true_size = ho_true_size_list[idx]
                    house.co_id = co_id
                    house.bu_id = bu_id
                    house.ho_type = ho_type
                    house.insert_db()
                except Exception as e:
                    print(e)
    except Exception as e:
        # Exception rather than BaseException, so Ctrl-C and SystemExit
        # still stop the crawler.
        print(e)
def get_comm_info(self, comm_url_list):
    """Scrape each community page, store it and the buildings it lists.

    For every relative URL in ``comm_url_list`` the page is downloaded from
    ``http://old.newhouse.cnnbfdc.com/``, the community fields are pulled
    out by regex and inserted, the module-level ``count`` is incremented and
    printed, one ``Building`` is inserted per ``window.open(...)`` link, and
    the building URLs are handed to ``self.get_house_info``.

    Args:
        comm_url_list: Relative community page URLs.

    Errors are printed instead of raised: a failure on one building skips
    that building, a failure on one community skips that community.
    """
    global count
    flags = re.S | re.M

    def first(pattern, text):
        # First capture of `pattern`; IndexError when the page lacks it.
        return re.findall(pattern, text, flags)[0]

    for i in comm_url_list:
        try:
            comm = Comm(co_index)
            comm_url = 'http://old.newhouse.cnnbfdc.com/' + i
            response = requests.get(comm_url, headers=self.headers)
            html = response.text

            comm.co_name = first('项目名称:.*?<span.*?>(.*?)<', html).strip()
            comm.co_address = first('项目地址:.*?<td.*?>(.*?)<', html).strip()
            comm.co_develops = first('开发公司:.*?<td.*?>(.*?)<', html).strip()
            comm.co_pre_sale = first(r'预\(现\)售证名称:.*?<td.*?>(.*?)<', html).strip()
            comm.co_build_size = first(
                '纳入网上可售面积:.*?<img.*?>(.*?)<', html).replace('m²', '').strip()
            comm.co_all_house = first(
                '纳入网上可售套数:.*?<img.*?>(.*?)<', html).replace('套', '').strip()
            comm.area = first('所在区县:.*?<td.*?>(.*?)<', html).strip()
            comm.co_id = first(r'mobanshow.aspx\?projectid=(.*?)"', html).strip()
            comm.insert_db()

            count += 1
            print(count)

            build_url_list = re.findall(r"window.open\('(.*?)'", html, flags)
            bu_name_list = re.findall("window.open.*?<font.*?>(.*?)<", html, flags)
            bu_all_house_list = re.findall("window.open.*?<td.*?>(.*?)<", html, flags)
            qrykey = re.findall("qrykey=(.*?)&", html, flags)

            # The four lists are matched up by position; a list that is
            # shorter than build_url_list makes only that building fail.
            for index in range(len(build_url_list)):
                try:
                    build = Building(co_index)
                    build.bu_name = bu_name_list[index].strip()
                    build.bu_all_house = bu_all_house_list[index].strip()
                    build.co_id = comm.co_id
                    build.bu_id = qrykey[index].strip()
                    build.insert_db()
                except Exception as e:
                    print(e)

            self.get_house_info(build_url_list)
        except Exception as e:
            print(e)
def get_comm_info(self, url, response, comm):
    """Parse a community page, store its buildings, then the community.

    Args:
        url: Community page URL; the text after ``id=`` is the community id.
        response: Already-fetched response for ``url``; only ``.text`` is read.
        comm: Community record that is filled in and inserted at the end.

    For every building link on the page the floor-count endpoint and the
    house page are requested. A failed floor request skips the building; a
    failed house request skips only the house parsing, and the building is
    still inserted. Parsing errors on the community page itself propagate to
    the caller.
    """
    html = response.text
    tree = etree.HTML(html)
    info_table = '//*[@id="content"]/div[2]/div[1]/div[2]/table'

    # district
    co_area = tree.xpath(info_table + '/tr[3]/td[2]/text()')[0]
    # community name
    co_name = tree.xpath(info_table + '/tr[1]/td/strong/span/text()')[0]
    # community address
    co_address = tree.xpath(info_table + '/tr[2]/td/span/text()')[0]
    # NOTE(review): the original first read the developer from
    # info_table + '/tr[3]/td[1]/span/@title' and then overwrote it with the
    # value below, which was commented as "property management company".
    # Only the stored value is kept here; confirm which field was intended.
    co_develops = tree.xpath('//div[@class="wzjs-box"]//tr[3]//span/text()')[0]
    # plot ratio
    co_volumetric = tree.xpath(info_table + '/tr[5]/td[2]/span/text()')[0]
    # pre-sale certificate
    co_pre_sale = tree.xpath(info_table + '/tr[6]/td[1]/text()')[0]
    # building area
    co_build_size = tree.xpath(info_table + '/tr[5]/td[1]')[0].text
    # community id
    co_id = re.search('id=(.*?)$', url).group(1)

    # All whitespace is removed first, which is why the patterns below
    # contain '<pclass=' and '<aid=' without a space.
    html_ = html.replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', '')
    bu_url_info = re.search('<pclass="bot-a">(.*?)</p>', html_).group(1)
    building_url_list = re.findall('<td><aid="(.*?)"(.*?)>(.*?)</a>', bu_url_info)

    for i in building_url_list:
        build = Building(co_index)
        value = i[0]
        bu_name = i[2]
        house_url = 'http://fsfc.fsjw.gov.cn/hpms_project/room.jhtml?id=' + value
        floor_url = "http://fsfc.fsjw.gov.cn/hpms_project/roomtj.jhtml?id=" + value

        try:
            res = requests.get(floor_url, headers=self.headers)
        except Exception as e:
            print("co_index={},楼栋详情页{}访问失败".format(co_index, floor_url))
            print(e)
            continue

        try:
            build.bu_floor = json.loads(res.text)["zcs"]
        except (ValueError, KeyError, TypeError):
            # Body is not JSON, or has no "zcs" entry: floor count unknown.
            build.bu_floor = None

        try:
            house_response = requests.get(house_url, headers=self.headers)
        except Exception as e:
            print("co_index={},房屋详情页{}请求失败".format(co_index, house_url))
            print(e)
        else:
            # Only parse houses when this building's page was fetched;
            # otherwise a response from a different page would be parsed
            # under this building's id.
            self.get_build_info(house_url, house_response, co_id, value)

        build.co_id = co_id
        build.bu_id = value
        build.bu_name = bu_name
        build.insert_db()

    comm.co_name = co_name
    comm.co_id = co_id
    comm.co_address = co_address
    comm.co_develops = co_develops
    comm.co_volumetric = co_volumetric
    comm.co_pre_sale = co_pre_sale
    comm.co_build_size = co_build_size
    comm.area = co_area
    comm.insert_db()
def get_comm_info(self, url, comm):
    """Fetch a community page, store the community and each listed building.

    Args:
        url: Community page URL; the text between ``home/`` and ``.html``
            is used as the community id.
        comm: Community record that is filled in and inserted.

    After the community is inserted, one ``Building`` is inserted per table
    row with class ``hobuild`` and ``self.get_house_info`` is called for it.
    Errors are printed instead of raised: a failure on one building skips
    that building, any other failure aborts the rest of the page.
    """
    try:
        response = requests.get(url=url, headers=self.headers)
        html = response.text
        tree = etree.HTML(html)

        def cell(row, col):
            # Stripped text of one cell of the community info table.
            path = '//*[@id="ctl00_CPH_M_sm_spfBox3"]/div/table/tr[{}]/td[{}]/text()'
            return tree.xpath(path.format(row, col))[0].strip()

        co_name = cell(1, 2)  # community name
        co_address = cell(2, 2)  # community address
        co_build_start_time = cell(3, 2)  # construction start date
        co_build_end_time = cell(3, 4)  # completion date
        co_build_structural = cell(4, 2)  # building structure
        co_volumetric = cell(6, 4)  # plot ratio
        co_green = cell(6, 2)  # greening rate
        co_size = cell(5, 2)  # site area
        co_id = re.search('home/(.*?).html', url).group(1)

        comm.co_name = co_name
        comm.co_address = co_address
        comm.co_build_start_time = co_build_start_time
        comm.co_build_end_time = co_build_end_time
        comm.co_build_structural = co_build_structural
        comm.co_volumetric = co_volumetric
        comm.co_green = co_green
        comm.co_size = co_size
        comm.co_id = co_id
        comm.insert_db()

        build_info_list = tree.xpath(
            '//*[@id="ctl00_CPH_M_sm_spfBox1"]/div/table/tr[@class="hobuild"]'
        )
        for i in build_info_list:
            try:
                build = Building(11)
                # Building name. XPath string() returns a single string, so
                # it is used whole; indexing it with [0] kept only the first
                # character.
                bu_name = i.xpath('string(td[1])').strip()
                bu_all_house = i.xpath('td[2]/text()')[0]
                # Building id: text after 'building_id=' in the link target.
                href = i.xpath('td[1]/strong/a/@href')[0]
                bu_id = re.search('building_id=(.*?)$', href).group(1)
                # Building area.
                # NOTE(review): '�O' looks like a mis-decoded unit symbol;
                # kept unchanged -- confirm against the page's real text.
                bu_build_size = i.xpath('string(td[3])').replace('�O', '')

                build.co_id = co_id
                build.bu_id = bu_id
                build.bu_all_house = bu_all_house
                build.bu_name = bu_name
                build.bu_build_size = bu_build_size
                build.insert_db()
                self.get_house_info(bu_id, co_id)
            except Exception as e:
                print('楼栋错误,co_index={},url={}'.format(co_index, url), e)
    except Exception as e:
        # Exception rather than BaseException, so Ctrl-C and SystemExit
        # still stop the crawler.
        print('楼栋错误,co_index={},url={}'.format(co_index, url), e)