def get_comm_detail(self, comm_list):
    """Scrape each community page in *comm_list*, persist one Comm row per
    house entry found on it and one Building row per building anchor, then
    descend into the building pages.

    Failures on a single community/house are printed and processing continues.
    """
    for path in comm_list:
        try:
            # comm is deliberately reused: its fields are re-assigned for
            # every parallel index below and inserted each time.
            comm = Comm(co_index)  # co_index: module-level site index — TODO confirm
            comm_url = 'http://www.lpsfdc.cn/Templets/LPS/aspx/' + path
            content = requests.get(comm_url)
            html = content.text
            co_name_list = re.findall('项目名称:.*?>(.*?)<', html, re.S | re.M)
            co_id_list = re.findall('hdProjectCode" value="(.*?)"', html, re.S | re.M)
            co_develops_list = re.findall('开发企业:.*?>(.*?)<', html, re.S | re.M)
            co_build_size_list = re.findall('TJ_ZMJ">(.*?)<', html, re.S | re.M)
            co_address_list = re.findall('Pro_XMDZ">(.*?)<', html, re.S | re.M)
            co_owner_list = re.findall('Pro_ZZZSBH">(.*?)<', html, re.S | re.M)
            co_pre_sale_list = re.findall('Pro_XKZH">(.*?)<', html, re.S | re.M)
            co_all_house_list = re.findall('TJ_HZYSTS">(.*?)<', html, re.S | re.M)
            # BUG FIX: the original reused ``i`` (the outer loop's URL path)
            # as the index of both inner loops, clobbering it.
            for idx in range(len(co_name_list)):
                try:
                    comm.co_name = co_name_list[idx]
                    comm.co_id = co_id_list[idx]
                    comm.co_develops = co_develops_list[idx]
                    comm.co_build_size = co_build_size_list[idx]
                    comm.co_address = co_address_list[idx]
                    comm.co_owner = co_owner_list[idx]
                    comm.co_pre_sale = co_pre_sale_list[idx]
                    comm.co_all_house = co_all_house_list[idx]
                    comm.insert_db()
                except Exception as e:
                    # typo "commiunty" in the original message fixed
                    print('co_index={}, community error'.format(co_index), e)
            build_url_list = re.findall("radiobuild' id='build(.*?)'", html, re.S | re.M)
            build_name_list = re.findall("radiobuild.*?<span.*?>(.*?)<", html, re.S | re.M)
            for idx in range(len(build_url_list)):
                build = Building(co_index)
                build.bu_id = build_url_list[idx]
                build.bu_num = build_name_list[idx]
                build.co_id = co_id_list[0]
                build.insert_db()
            self.get_build_info(build_url_list)
        except Exception as e:
            print(e)
def get_build_info(self, co_id):
    """Fetch the building list page for project *co_id*, store one Building
    per table row, then descend into each building's house list."""
    try:
        build_url = ('http://222.184.103.50:7700/WW/ZHList.aspx?projectID='
                     + co_id + '&projectname=')
        html = requests.get(build_url, headers=self.headers).text
        for row in re.findall('<tr bgcolor="#f5f5f5">.*?</tr>', html, re.S | re.M):
            build = Building(co_index)
            build.bu_num = re.search('<a id="LH".*?>(.*?)<', row, re.S | re.M).group(1).strip()
            build.bu_all_house = re.search('<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1).strip()
            build.bu_id = re.search('ZNo=(.*?)"', row, re.S | re.M).group(1).strip()
            build.co_id = co_id
            build.insert_db()
            self.get_house_url(build.bu_id, co_id)
    except Exception as e:
        print('请求错误,co_index={},url={}'.format(co_index, build_url), e)
def get_build_info(self, build_url):
    """Scrape one building-table page: store a Building row per listed 幢号
    (building id) and then crawl the house links found on the page."""
    try:
        build = Building(co_index)
        response = requests.get(build_url, headers=self.headers)
        # BUG FIX: the original contained a bare sample-URL string literal
        # here — a no-op expression statement; removed as dead code.
        html = response.text
        bu_id_list = re.findall('cph_hb1_dg1.*?center.*?center.*?<td>(.*?)<', html, re.S | re.M)
        build.co_id = re.findall('hdl1_hfYszh" value="(.*?)"', html, re.S | re.M)[0]
        build.bu_num = self.get_build_num(build.co_id)
        bu_all_house_list = re.findall(
            'cph_hb1_dg1.*?center.*?center.*?<td>.*?<td>.*?<td>(.*?)<', html, re.S | re.M)
        house_url_list = re.findall('cph_hb1_dg1.*?<a.*?href="(.*?)"', html, re.S | re.M)
        # The same Building instance is reused: bu_id/bu_all_house are
        # re-assigned and inserted once per row.
        for i in range(len(bu_id_list)):
            build.bu_id = bu_id_list[i]
            build.bu_all_house = bu_all_house_list[i]
            build.insert_db()
        # Crawl the full house-link list once, after all buildings are stored.
        self.get_house_info(house_url_list)
    except Exception as e:
        print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
def get_build_info(self, build_url_list):
    """Fetch the detail page for each building id tuple in *build_url_list*,
    persist a Building row, then crawl that building's houses."""
    for i in build_url_list:
        # BUG FIX: assign build_url before anything that can raise, so the
        # except clause below can always report it (the original constructed
        # Building() first; a failure there left build_url unbound and the
        # handler itself raised NameError).
        build_url = 'http://222.223.160.199:8088/website/buildquery/selectBuild.jsp?buildID=' + i[0]
        try:
            build = Building(co_index)
            response = requests.get(build_url, headers=self.headers)
            html = response.text
            build.bu_id = i[0]
            build.co_build_structural = re.search('结构类型.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            build.bo_build_end_time = re.search('建成年份.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            build.bu_build_size = re.search('总建筑面积.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            build.bu_num = re.search('幢号.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
            build.size = re.search('占地面积.*?<td>(.*?)<', html, re.S | re.M).group(1)
            build.bu_floor = re.search('房屋层数.*?<td>(.*?)<', html, re.S | re.M).group(1)
            build.bu_all_house = re.search('房屋套数.*?<td>(.*?)<', html, re.S | re.M).group(1)
            build.area = re.search('坐落区.*?<td>(.*?)<', html, re.S | re.M).group(1)
            build.insert_db()
            self.get_house_info(build.bu_id)
        except Exception as e:
            print('请求错误,url={}'.format(build_url), e)
def get_build_info(self, co_id):
    """POST the jtsActBuildingInfo module for *co_id*, persist every building
    row in the JSON answer, and crawl each building's houses."""
    result = requests.post(url='http://www.yanjifc.com/jdi',
                           data="activityId=" + str(co_id) + "&module=jtsActBuildingInfo",
                           headers=self.headers)
    data = result.json()
    for row in data['ROWS']['ROW']:
        build = Building(co_index)
        build.bu_all_size = self.dict_get(row, 'BUILDING_AREA')
        build.bu_address = self.dict_get(row, 'LOCATION')
        # NOTE(review): bu_num is also filled from 'LOCATION' in the
        # original — looks like a copy/paste slip, preserved as-is.
        build.bu_num = self.dict_get(row, 'LOCATION')
        build.bu_floor = self.dict_get(row, 'TOTAL_FLOORS')
        build.bu_all_house = self.dict_get(row, 'TOTAL_SET')
        build.co_build_structural = self.dict_get(row, 'STRUCTURE')
        build.bu_id = self.dict_get(row, 'RESOURCE_GUID')
        build.co_id = co_id
        build.insert_db()
        self.get_house_info(co_id, build.bu_id)
def get_comm_info(self, comm_url_list):
    """Scrape each community detail page: persist the Comm record, the first
    building entry on the page, then descend into that building."""
    span_fields = (
        ('co_name', 'id="Project_XMMC">(.*?)<'),
        ('co_address', 'id="Project_XMDZ">(.*?)<'),
        ('co_develops', 'id="Project_COMPANYNAME">(.*?)<'),
        ('area', 'id="Project_AREA_NAME">(.*?)<'),
        ('co_build_size', 'id="Project_GHZJZMJ">(.*?)<'),
        ('co_volumetric', 'id="Project_RJL">(.*?)<'),
        ('co_pre_sale', 'id="presellInfo".*?,,(.*?)"'),
        ('co_land_use', 'id="tdzInfo".*?,,(.*?)"'),
        ('co_work_pro', 'id="sgxkzInfo".*?,,(.*?)"'),
        ('co_plan_pro', 'id="ghxkzInfo".*?,,(.*?)"'),
    )
    for path in comm_url_list:
        comm_url = 'http://www.hbczfdc.com:4993/' + path.replace('../', '')
        try:
            comm = Comm(co_index)
            html = requests.get(comm_url, headers=self.headers).text
            # A missing field raises AttributeError and falls through to the
            # per-community handler below, same as the original flow.
            for attr, pattern in span_fields:
                setattr(comm, attr, re.search(pattern, html, re.S | re.M).group(1))
            comm.co_id = re.search('code=(.*?)$', comm_url, re.S | re.M).group(1)
            comm.insert_db()
            build = Building(co_index)
            build.bu_id = re.search("name='radiobuild'.*? bid=(.*?) ", html, re.S | re.M).group(1)
            build.bu_num = re.search("name='radiobuild'.*?<span.*?>(.*?)<", html, re.S | re.M).group(1)
            build.co_id = comm.co_id
            build.insert_db()
            self.get_build_info(build.bu_id, comm.co_id)
        except Exception as e:
            print('小区页面错误,co_index={},url={}'.format(co_index, comm_url), e)
def bu_parse(self, co_id, bulist):
    """Persist one Building per relative path in *bulist*, then forward the
    house anchors found on each page to ho_parse."""
    for path in bulist:
        page = requests.get("http://110.89.45.7:8082" + path, headers=self.headers).text
        bu = Building(co_index)
        bu.co_id = co_id
        bu.bu_id = re.search('buildingInfoID=(.*?)&', path).group(1)
        bu.bu_num = re.search('幢号.*?">(.*?)</', page, re.S | re.M).group(1)
        bu.bu_floor = re.search('总 层 数.*?">(.*?)</', page, re.S | re.M).group(1)
        bu.bu_live_size = re.search('批准销售.*?">.*?</td.*?">(.*?)</td', page, re.S | re.M).group(1)
        bu.bu_all_size = re.search('总面积.*?">(.*?)</', page, re.S | re.M).group(1)
        bu.bu_type = re.search('设计用途.*?">(.*?)</', page, re.S | re.M).group(1)
        bu.insert_db()
        doc = etree.HTML(page)
        self.ho_parse(co_id, bu.bu_id, doc.xpath("//td[@style]/a"))
def get_build_url(self, build_url_list, co_id):
    """Store a Building per relative path in *build_url_list*, then follow
    the page's RoomLoad link into the house table."""
    for path in build_url_list:
        try:
            bu = Building(co_index)
            bu.co_id = co_id
            page = self.s.get('http://www.nhfg.cn/webhouseinfo/ItemList/' + path).text
            bu.bu_num = re.findall(
                '<TD style="WIDTH: 471px" colSpan="11"><FONT style="COLOR: white" face="宋体">(.*?)<',
                page, re.S | re.M)[0].strip()
            bu.bu_all_house = re.findall(
                '商业</FONT></TD>.*?center">(.*?)<', page, re.S | re.M)[0].strip()
            bu.insert_db()
            house_path = re.findall('(RoomLoad\.aspx\?.*?)"', page, re.S | re.M)[0]
            self.get_house_info(
                'http://www.nhfg.cn/webhouseinfo/ItemList/HouseList/' + house_path,
                bu.bu_num, co_id)
        except Exception as e:
            print(e)
def get_build_detail(self, build_url, co_id):
    """Parse one building page: sum the sold/recorded row areas into
    bu_build_size, persist the Building, then crawl its houses."""
    bu_url = 'http://www.yzfdc.cn/' + build_url
    html = self.s.get(bu_url, headers=self.headers).text
    build = Building(co_index)
    build.bu_num = re.search('查询幢号:.*?<span.*?<span.*?>(.*?)<', html, re.S | re.M).group(1)
    table = re.search('<div align="center">已售已备案.*?</table>', html, re.S | re.M).group()
    sizes = []
    for row in re.findall('<tr.*?</tr>', table, re.S | re.M):
        cell = re.search(
            '<div.*?<div.*?<div.*?<div.*?<div.*?<div.*?>(.*?)<', row, re.S | re.M).group(1)
        if cell:  # skip empty cells, as the original truthiness check did
            sizes.append(float(cell))
    build.bu_build_size = sum(sizes)
    build.co_id = co_id
    build.bu_id = re.search('GCZHId=(.*?)$', bu_url).group(1)
    build.insert_db()
    self.get_house_info(co_id, build.bu_id)
def analyzer_comm_url(self, comm_url_list):
    """Scrape each community URL, persist Comm/Building rows, and return the
    accumulated list of house-table URLs found across all pages.

    BUG FIX: the except clauses formatted a bare ``co_index``, which this
    class does not define (it uses ``self.co_index``, as ``Comm(self.co_index)``
    shows) — so any scrape error raised NameError inside the handler.
    """
    all_url = []
    for i in comm_url_list:
        try:
            res = requests.get(i)
            html = res.content.decode('gbk')
            c = Comm(self.co_index)
            c.co_name = re.search('项目名称:.*?">.*?<span.*?>(.*?)</span>', html, re.S | re.M).group(1)  # project name
            c.co_address = re.search('项目地址:.*?">.*?<span.*?>(.*?)</span>', html, re.S | re.M).group(1)  # address
            c.co_develops = re.search('开发商:.*?">.*?<span.*?>(.*?)</span>', html, re.S | re.M).group(1)  # developer
            c.co_build_size = re.search('总建筑面积:.*?">.*?<span.*?>(.*?)</span>', html, re.S | re.M).group(1)  # built area
            c.co_land_type = re.search('用地依据:.*?">.*?<span.*?>(.*?)</span>', html, re.S | re.M).group(1)  # land-use basis
            c.co_all_house = re.search('>总套数:.*?">.*?<span.*?>(.*?)</span>', html, re.S | re.M).group(1)  # total units
            c.area = re.search('所在区域:.*?">.*?<span.*?>(.*?)</span>', html, re.S | re.M).group(1)  # district
            c.co_work_pro = re.search('施工许可证:.*?">.*?<span.*?>(.*?)</span>', html, re.S | re.M).group(1)  # construction permit
            c.co_plan_pro = re.search('建设工程规划许可证:.*?">.*?<span.*?>(.*?)</span>', html, re.S | re.M).group(1)  # planning permit
            c.insert_db()
            buildlist = re.findall('onmouseover.*?</TR>', html, re.S | re.M)
            url_list = []
            for k in buildlist:
                try:
                    b = Building(self.co_index)
                    build_list = re.findall('<TD.*?>(.*?)</TD>', k, re.S | re.M)
                    b.co_name = build_list[1]
                    b.bu_num = build_list[2]
                    b.bu_type = build_list[4]
                    b.insert_db()
                    house_url = re.findall('href="(.*?)"', k, re.S | re.M)
                    for j in house_url:
                        url_list.append('http://www.stfcj.gov.cn/stsite/ProjectList/' + j)
                except Exception as e:
                    print('楼栋错误,co_index={},url={}'.format(self.co_index, i), e)
            all_url = all_url + url_list
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(self.co_index, i), e)
    return all_url
def build_info(self, bu_list, co_id):
    """For each pre-sale row node in *bu_list*, open the licence page, then
    store one Building per 幢 (dong) row it lists and crawl its houses."""
    for row in bu_list:
        link = row.xpath("./td[4]/a/@href")[0]
        res = requests.get(self.start_url + '/' + link, headers=self.headers)
        res.encoding = 'gbk'
        page = res.text
        pre_sale = re.search('预售许可证编号.*?blank">(.*?)</a', page, re.S | re.M).group(1)
        pre_sale_date = re.search('预售证有效日期.*?">(.*?)</td', page, re.S | re.M).group(1)
        doc = etree.HTML(page)
        for dong in doc.xpath("//table[@id='donglist']/tr"):
            dong_url = dong.xpath("./td/a/@href")[0]
            b = Building(co_index)
            b.co_id = co_id
            b.bu_id = re.search('ID={(.*?)}', dong_url).group(1)
            b.bu_num = dong.xpath("./td[3]/text()")[0]
            b.bu_floor = dong.xpath("./td[4]/text()")[0]
            b.bu_pre_sale = pre_sale
            b.bu_pre_sale_date = pre_sale_date
            b.insert_db()
            self.house_info(co_id, b.bu_id, dong_url)
def build_info(self, co_id, bu_id):
    """Fetch the 幢 (dong) page for *bu_id*/*co_id*, persist the Building and
    every house (套) span on it.

    BUG FIX: the patterns for 总套数/总面积 contained unescaped parentheses,
    making ``(销)`` a capture group — ``.group(1)`` returned the literal
    "销" instead of the value (or, if the page text really contains the
    parentheses, the pattern never matched at all). The parentheses are now
    escaped so group(1) is the intended capture.
    TODO(review): confirm the page renders ASCII parens, not full-width ones.
    """
    bu_url = 'http://www.lsjs.gov.cn/WebLSZFGB/ZNInfo.aspx?YSZID=' + bu_id + "&YSXMID=" + co_id
    bu_res = requests.get(bu_url, headers=self.headers)
    con = bu_res.text
    bu = Building(co_index)
    bu.co_id = co_id
    bu.bu_id = bu_id
    bu.bu_num = re.search('znxx">(.*?)</span', con).group(1)
    bu.bu_all_house = re.search(r'纳入网上预\(销\)售总套数.*?">(.*?)</', con, re.S | re.M).group(1)
    bu.bu_build_size = re.search(r'纳入网上预\(销\)售总面积.*?">(.*?)</', con, re.S | re.M).group(1)
    bu.insert_db()
    html = etree.HTML(con)
    for tag in html.xpath("//span[@class='syt-span']"):
        ho = House(co_index)
        ho.bu_id = bu_id
        ho.co_id = co_id
        ho.ho_name = tag.xpath(".//p[@class='ewb-num']/text()")[0]
        ho.ho_build_size = tag.xpath(".//p[@class='ewb-con']/text()")[0]
        ho.insert_db()
def comm_list(self, html):
    """Walk the project table in *html*, persist each Comm and its buildings,
    then parse the houses of every building."""
    field_patterns = (
        ('co_name', '项目名称.*?XMMC">(.*?)</span'),
        ('co_develops', '开发公司.*?NAME">(.*?)</span'),
        ('co_address', '项目地址.*?XMDZ">(.*?)</span'),
        ('area', '所在区域.*?SZQY">(.*?)</span'),
        ('co_volumetric', '容积率.*?RJL">(.*?)</span'),
        ('co_pre_sale', '预售证号.*?ZH">(.*?)</span'),
        ('co_build_size', '总建筑面积.*?JZMJ">(.*?)</span'),
    )
    for com_temp in html.xpath("//table[@id='data_table_2']//tr/td[3]/a/@href"):
        com_url = 'http://www.hbsfdc.com' + com_temp.replace("../../..", '')
        try:
            com_res = requests.get(com_url, headers=self.headers)
        except Exception as e:
            log.error("{}小区访问失败".format(com_url))
            continue
        com_con = com_res.content.decode()
        co = Comm(co_index)
        co.co_id = re.search('lcode=(\d+)', com_temp).group(1)
        for attr, pattern in field_patterns:
            setattr(co, attr, re.search(pattern, com_con, re.S | re.M).group(1))
        co.insert_db()
        for bu in re.findall("input name='radiobuild'.*?</td>", com_con):
            bid = re.search('bid=(\d+)', bu).group(1)
            bo = Building(co_index)
            bo.co_id = co.co_id
            bo.bu_id = bid
            bo.bu_num = re.search('/>(.*?)</td>', bu).group(1)
            bo.insert_db()
            self.ho_parse(bid, co.co_id)
def get_build_info(self, build_url_list, co_id):
    """Persist one Building per relative link in *build_url_list*, then crawl
    each building's house links."""
    detail_fields = (
        ('bo_develops', '开发商:.*?<td.*?>(.*?)<'),
        ('co_name', '项目名称:.*?<td.*?>(.*?)<'),
        ('bu_address', '坐落位置:.*?<td.*?>(.*?)<'),
        ('bu_num', '幢号:.*?<td.*?>(.*?)<'),
        ('co_build_structural', '建筑结构:.*?<td.*?>(.*?)<'),
        ('bu_type', '设计用途:.*?<td.*?>(.*?)<'),
        ('bu_floor', '总 层 数:.*?<td.*?>(.*?)<'),
        ('co_all_size', '总面积:.*?<td.*?>(.*?)<'),
        ('bo_build_start_time', '开工日期:.*?<td.*?>(.*?)<'),
    )
    for link in build_url_list:
        build_url = 'http://www.fjlyfdc.com.cn/' + link
        try:
            build = Building(co_index)
            html = requests.get(build_url, headers=self.headers).text
            build.bu_id = re.search('buildingInfoID=(.*?)&', build_url).group(1)
            build.co_id = co_id
            for attr, pattern in detail_fields:
                setattr(build, attr, re.search(pattern, html, re.S | re.M).group(1))
            build.insert_db()
            house_url_list = re.findall(
                'href="(/House/HouseInfo\?HouseCenterID=.*?)"', html, re.S | re.M)
            self.get_house_info(house_url_list, build.bu_id, co_id)
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
def get_build_info(self, build_url_list):
    """Persist a Building per link in *build_url_list* and recurse into each
    building's HouseInfo pages."""
    field_patterns = (
        ('co_name', "项目名称:.*?<td.*?>(.*?)<"),
        ('bu_num', "幢 号:.*?<td.*?>(.*?)<"),
        ('co_use', "设计用途:.*?<td.*?>(.*?)<"),
        ('co_build_structural', "建筑结构:.*?<td.*?>(.*?)<"),
        ('bu_floor', "总 层 数:.*?<td.*?>(.*?)<"),
        ('bu_build_size', "总 面 积:.*?<td.*?>(.*?)<"),
        ('co_build_end_time', "竣工日期:.*?<td.*?>(.*?)<"),
    )
    for link in build_url_list:
        try:
            build = Building(co_index)
            build_url = 'http://www.fjnpfdc.com/House/' + link
            con = requests.get(build_url, headers=self.headers).content.decode('gbk')
            for attr, pattern in field_patterns:
                setattr(build, attr, re.search(pattern, con, re.S | re.M).group(1))
            house_url_list = re.findall('<a href="(HouseInfo.*?)"', con)
            # Both ids come from the link itself, not from the page body.
            build.co_id = re.search('ProjectId=(.*?)&', link).group(1)
            build.bu_id = re.search('BuildingId=(.*?)&P', link).group(1)
            build.insert_db()
            self.get_house_info(house_url_list, build.bu_id, build.co_id)
        except Exception as e:
            print("co_index={},楼栋{}错误".format(co_index, link), e)
def get_build_info(self, build_all_url):
    """From the first URL fragment in *build_all_url*, enumerate the page's
    presell ids and persist every building anchor under each presell tab."""
    build_url = 'http://www.tmsf.com/' + build_all_url[0]
    try:
        response = requests.get(build_url, headers=self.headers)
    except Exception as e:
        print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
        return
    html = response.text
    build_code_list = re.findall("javascript:doPresell\('(.*?)'\)", html)
    sid = re.findall('id="sid" value="(.*?)"', html)[0]
    propertyid = re.findall('id="propertyid" value="(.*?)"', html)[0]
    co_id = sid + '_' + propertyid
    for presellid in build_code_list:
        build_detail_url = build_url + '?presellid=' + presellid
        try:
            result = requests.get(build_detail_url, headers=self.headers, timeout=10).text
        except Exception as e:
            print("楼栋错误,co_index={},url={}".format(co_index, build_detail_url), e)
            continue
        anchors_html = re.search("幢 号.*?面 积:", result, re.S | re.M).group()
        for anchor in re.findall('<a.*?</a>', anchors_html, re.S | re.M):
            build = Building(co_index)
            bu_code = re.search("doBuilding\('(.*?)'\)", anchor, re.S | re.M).group(1)
            build.bu_num = re.search("doBuilding.*?>(.*?)<", anchor, re.S | re.M).group(1)
            build.bu_id = bu_code
            build.co_id = co_id
            build.insert_db()
            self.get_house_info(bu_code, sid)
def get_build_info(self, bu_pre_sale, bo_develops, bu_co_name, bu_con):
    """Parse one building HTML fragment *bu_con* into a Building row and
    persist it; licence/developer/name come from the caller.

    BUG FIX: the area patterns used an unescaped dot (``\d+.\d+``), which
    matches *any* character between the digit runs (e.g. "12x3"). The dot is
    now escaped and made optional, so both decimals and plain integers are
    captured correctly.
    """
    build = Building(co_index)
    build.bu_id = re.search(r'编号.*?>(\d+)<', bu_con, re.S | re.M).group(1)
    build.bu_num = re.search(r'幢号.*?>(\d+)<', bu_con, re.S | re.M).group(1)
    build.bu_floor = re.search(r'总层数.*?>(\d+)<', bu_con, re.S | re.M).group(1)
    build.bu_build_size = re.search(r'预售建筑面积.*?>(\d+(?:\.\d+)?)<', bu_con, re.S | re.M).group(1)
    build.bu_address = re.search(r'楼房坐落.*?;">(.*?)</span', bu_con, re.S | re.M).group(1)
    build.bu_live_size = re.search(r'住宅建筑面积.*?>(\d+(?:\.\d+)?)<', bu_con, re.S | re.M).group(1)
    build.bu_not_live_size = re.search(r'非住宅建筑面积.*?;">(.*?)</span', bu_con, re.S | re.M).group(1)
    build.bo_build_start_time = re.search(r'开工日期.*?;">(.*?)</span', bu_con, re.S | re.M).group(1)
    build.bu_all_house = re.search(r'总套数.*?>(\d+)<', bu_con, re.S | re.M).group(1)
    build.bu_pre_sale = bu_pre_sale
    build.bo_develops = bo_develops
    build.co_name = bu_co_name
    build.insert_db()
def bu_info(self, bu_list, co_id):
    """Persist a Building per link in *bu_list*; collect the page's house
    spans and forward them to ho_info."""
    for link in bu_list:
        try:
            res = requests.get('http://www.fxfdcw.com/' + link, headers=self.headers)
            con = res.content.decode('gbk')
            doc = etree.HTML(con)
            build = Building(co_index)
            build.co_id = co_id
            build.bu_id = re.search('bdid=(\d+)', link).group(1)
            build.bu_num = re.search('楼号.*?">(.*?)</', con, re.S | re.M).group(1)
            build.bu_address = re.search('坐落.*?">(.*?)</', con, re.S | re.M).group(1)
            build.bu_floor = re.search('地上层数.*?">(.*?)</', con, re.S | re.M).group(1)
            build.bu_build_size = re.search('建筑面积.*?wrap">(.*?)</', con, re.S | re.M).group(1)
            build.bu_all_house = re.search('套 数.*?">(.*?)</', con, re.S | re.M).group(1)
            build.bu_type = re.search('用 途.*?wrap">(.*?)</', con, re.S | re.M).group(1)
            build.insert_db()
            ho_list = doc.xpath("//span[@title]")
        except Exception as e:
            print("楼栋信息错误{}".format(e))
            continue
        self.ho_info(ho_list, co_id, build.bu_id)
def build_info(self, build_detail, co_id):
    """Enumerate the buildings linked from *build_detail* and store each one
    together with the houses drawn on its floor view."""
    res = requests.get('http://as.gzfcxx.cn' + build_detail, headers=self.headers)
    listing = etree.HTML(res.text)
    for build_url in listing.xpath("//div[@class='box']//font/a/@href"):
        try:
            page = requests.get('http://as.gzfcxx.cn' + build_url, headers=self.headers)
            page_doc = etree.HTML(page.text)
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_id = re.search('dongID=(\d+)', build_url).group(1)
            bu.bu_num = page_doc.xpath("//option[@selected='selected']/text()")[0]
            bu.insert_db()
            params = re.search("\?(.*?dongID=\d+)", build_url).group(1)
            floor_res = requests.get(
                'http://as.gzfcxx.cn/Controls/HouseControls/FloorView.aspx?' + params,
                headers=self.headers)
            floor_doc = etree.HTML(floor_res.text)
            info = floor_doc.xpath("//table[@class='C1 T0 F0']/..")
        except Exception as e:
            log.error('楼栋信息错误', e)
            continue
        for cell in info:
            try:
                ho = House(co_index)
                title = cell.xpath("./@title")[0]
                # NOTE(review): group(1) keeps only the integer part of the
                # area in the title — preserved as the original behaved.
                ho.ho_build_size = re.search('(\d+).(\d+)', title, re.S | re.M).group(1)
                ho.ho_name = cell.xpath(".//span/text()")[0]
                ho.bu_id = bu.bu_id
                ho.co_id = co_id
                ho.insert_db()
            except Exception as e:
                log.error('房间信息错误', e)
def get_comm_detail(self, href, comm):
    """Fill *comm* from the community detail page at *href*, persist one
    Building per building link on the page, then insert the Comm row."""
    comm_detail_url = self.URL_FRONT + href
    response = requests.get(url=comm_detail_url, headers=self.headers)
    co_id = response.url
    co_id = int(co_id.split('=')[1])  # community id taken from the redirected URL's query string
    html = response.content.decode('gbk')
    co_name = self.regex_common(r'项目名称.*?<td.*?>(.*?)</td>', html)  # community name
    co_owner = self.regex_common(r'房屋所有权证号.*?<td.*?>(.*?)</td>', html)  # ownership certificate no.
    co_use = self.regex_common(r'用 途.*?<td.*?>(.*?)</td>', html)  # designated use
    co_develops = self.regex_common(r'开 发 商.*?<td.*?>(.*?)</td>', html)  # developer
    co_address = self.regex_common(r'项目位置.*?<td.*?>(.*?)</td>', html)  # location
    co_pre_sale = self.regex_common(r'预售证号.*?<td.*?>(.*?)</td>', html)  # pre-sale permit no.
    co_land_use = self.regex_common(r'土地使用权证.*?<td.*?>(.*?)</td>', html)  # land-use certificate
    co_land_type = self.regex_common(r'土地权证类型.*?<td.*?>(.*?)</td>', html)  # land certificate type
    co_handed_time = self.regex_common(r'终止日期.*?<td.*?>(.*?)</td>', html)  # end date
    co_plan_pro = self.regex_common(r'规划许可证.*?<td.*?>(.*?)</td>', html)  # planning permit
    co_work_pro = self.regex_common(r'施工许可证.*?<td.*?>(.*?)</td>', html)  # construction permit
    co_type = self.regex_common(r'项目类型.*?<td.*?>(.*?)</td>', html)  # community type
    co_size = self.regex_common(r'批准面积.*?<td.*?>(.*?)</td>', html)  # approved (land) area
    comm.co_id = co_id
    comm.co_name = co_name
    comm.co_type = co_type
    comm.co_size = co_size
    comm.co_owner = co_owner
    comm.co_use = co_use
    comm.co_develops = co_develops
    comm.co_address = co_address
    comm.co_pre_sale = co_pre_sale
    comm.co_land_use = co_land_use
    comm.co_land_type = co_land_type
    comm.co_handed_time = co_handed_time
    comm.co_plan_pro = co_plan_pro
    comm.co_work_pro = co_work_pro
    # Collect the building URL list
    build_url_list = re.findall(r"<td><a href='(.*?)'", html, re.M | re.S)
    if not build_url_list:
        return
    else:
        for build_url in build_url_list:
            try:
                building = Building(self.CO_INDEX)
                # NOTE(review): these three searches run against the *whole*
                # page html, so every iteration extracts the same first match
                # rather than the current row's values — looks wrong; confirm
                # against the live page before changing.
                build_id = re.search(r'<td>(\d{2,6})</td>', html, re.M | re.S).group(1)  # building id
                bu_all_house = re.search(r'<td>(\d{1,3})</td>', html, re.M | re.S).group(1)  # total units
                bu_price_demo = re.findall('<td>[\.\d]+</td>', html, re.M | re.S)[4]
                bu_price = re.search('\d+', bu_price_demo).group()
                data_dict = self.get_build_detail(build_url)
                bu_num = data_dict['bu_num']  # building number
                bu_build_size = data_dict['bu_build_size']  # built area
                co_address = data_dict['co_address']  # community address
                co_build_end_time = data_dict['co_build_end_time']  # completion date
                co_build_type = data_dict['co_build_type']  # build type
                if not co_build_end_time:
                    building.co_is_build = '1'  # flag: still under construction — TODO confirm semantics
                # Back-fill the Comm with detail-page values
                comm.co_address = co_address
                comm.co_build_end_time = co_build_end_time
                comm.bu_build_size = bu_build_size
                comm.co_build_type = co_build_type
                # Building fields
                building.bu_num = bu_num
                building.bu_build_size = bu_build_size
                building.bu_all_house = bu_all_house
                building.bu_id = build_id
                building.co_id = co_id
                building.bu_price = bu_price
                # Persist
                building.insert_db()
            except Exception as e:
                build_detail_url = self.URL_FRONT + build_url
                print('楼栋错误:', build_detail_url)
    comm.insert_db()
def get_comm_info(self, comm_info):
    """Parse one community row fragment *comm_info*, persist the Comm, then
    walk the community page's building rows: 现售 (current-sale) rows are
    stored directly, 预售 (pre-sale) rows are expanded into per-dong
    Building rows and their house pages crawled."""
    co = Comm(co_index)
    co.co_name = re.search('_blank">(.*?)</a', comm_info).group(1)
    try:
        co.co_address = re.findall('px">(.*?)</td', comm_info)[1]
    except:
        co.co_address = None  # address cell missing on some rows
    co.area = re.search('center">(.*?)</td>', comm_info).group(1)
    co_detail_url = re.search("href='(.*?)'", comm_info).group(1)
    co_url = "http://www.qyfgj.cn/newys/" + co_detail_url
    try:
        res = requests.get(co_url, headers=self.headers)
    except Exception as e:
        # NOTE(review): on request failure ``res`` stays unbound and the
        # next line raises NameError — confirm intent before changing.
        print("co_index={}小区未请求到".format(co_index), e)
    con = res.content.decode('gbk')
    try:
        co.co_develops = re.search('开发商名称.*?px;">(.*?)</a', con, re.S | re.M).group(1)
        co.co_all_house = re.search('总套数.*?">(\d+) ', con, re.S | re.M).group(1)
        co.co_all_size = re.search('总面积.*?">(\d+.\d+) m', con, re.S | re.M).group(1)
    except:
        print("小区无开发商等信息")
    co.insert_db()
    try:
        build = re.findall('<tr bgcolor="white">(.*?)</tr>', con, re.S | re.M)
    except:
        print("小区没有楼栋信息")
    # Session cookie is hard-coded — presumably copied from a browser session;
    # it will expire. TODO confirm.
    build_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
        'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
        'Referer': co_url
    }
    for build_info in build:
        if "进入" in build_info:
            build_url = re.search('href="(.*?)"><font', build_info).group(1)
            build_url = "http://www.qyfgj.cn/newys/" + build_url
            ho_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
                'Referer': build_url
            }
            build_res = requests.get(build_url, headers=build_headers)
            build_con = build_res.content.decode('gbk')
            if re.search('ID=(\d+)', build_url):
                # current-sale (现售) branch: the URL itself carries the id
                bu = Building(co_index)
                bu_id = re.search('ID=(\d+)', build_url).group(1)
                bu.bu_id = bu_id
                bu.co_name = co.co_name
                bu.insert_db()
                self.get_house_info(headers=ho_headers, bu_id=bu_id, url=build_url)
            else:
                # pre-sale (预售) branch: one Building per row of the permit table;
                # the same ``bu`` instance is re-assigned and inserted per row.
                bu = Building(co_index)
                bu.co_name = co.co_name
                bu.bu_type = re.search('用途.*?">(.*?)</td>', build_con,
                                       re.S | re.M).group(1)
                bu.bu_pre_sale = re.search('许可证编号.*?_blank">(.*?)</a>', build_con, re.S | re.M).group(1)
                bu.bu_pre_sale_date = re.search('有效日期.*?">(.*?)</td>', build_con, re.S | re.M).group(1)
                bu.bu_address = re.search('项目座落.*?">(.*?)</td>', build_con, re.S | re.M).group(1)
                ret = re.findall('<tr onmouseover(.*?)</tr', build_con, re.S | re.M)
                for i in ret:
                    house_url = re.search('href="(.*?)"', i).group(1)
                    house_url = "http://www.qyfgj.cn/newys/" + house_url
                    bu.bu_id = re.search('dbh=(.*?)&', i).group(1)
                    bu.bu_num = re.search('<td width="89.*?">(.*?)</', i).group(1)
                    bu.bu_floor = re.search('<td width="84.*?">(\d+)</td', i).group(1)
                    bu.insert_db()
                    ho_res = requests.get(house_url, headers=ho_headers)
                    ho_con = ho_res.content.decode('gbk')
                    new_headers = {
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                        'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
                        'Referer': house_url
                    }
                    self.get_house_info(ho_con=ho_con, headers=new_headers, bu_id=bu.bu_id)
        else:
            print("楼栋无链接地址")
def get_build_url_list(self, url_list):
    """Crawl each listing page in *url_list* three levels deep:
    community block -> building rows -> per-building room XML, persisting
    Comm, Building and House rows along the way. Any failure at any level
    silently skips that item (bare ``continue``)."""
    for i in url_list:
        try:
            res = requests.get(i)
            html = res.content.decode('gbk')
            # Each 项目名称…</dl> span is one community block
            for k in re.findall('项目名称.*?</dl>', html, re.S | re.M):
                try:
                    c = Comm(self.co_index)
                    c.co_name = re.search('html">(.*?)</a>', k, re.S | re.M).group(1)
                    c.co_address = re.search('class="address"(.*?)</dd>', k, re.S | re.M).group(1)
                    c.area = re.search('"city">(.*?)</dd>', k, re.S | re.M).group(1)
                    c.co_develops = re.search('"average">(.*?)</dd>', k, re.S | re.M).group(1)
                    c.insert_db()
                    # Module-level progress counter — assumed defined at file
                    # scope; TODO confirm.
                    global count
                    count += 1
                    print(count)
                    url = re.search('a href="(.*?)">', k, re.S | re.M).group(1)
                    complete_url = self.url_source + url
                    # NOTE: res/html are re-bound here for the community page
                    res = requests.get(complete_url)
                    html = res.content.decode('gbk')
                    build_info_str = re.search('楼盘表</td>(.*?)合 计', html, re.S | re.M).group(1)
                    for j in re.findall('<tr.*?</tr>', build_info_str, re.S | re.M):
                        try:
                            b = Building(self.co_index)
                            b.co_name = re.search('html">(.*?)</a>', k, re.S | re.M).group(1)
                            b.bu_all_house = re.search(
                                'absmiddle" />(.*?)</a>', j, re.S | re.M).group(1)
                            b.bu_num = re.search(
                                '="absmiddle" />(.*?)</a></strong></', j, re.S | re.M).group(1)
                            b.bu_build_size = re.search(
                                'td class="t_c">.*?td class="t_c">(.*?㎡)</td>', j, re.S | re.M).group(1)
                            b.insert_db()
                            url = re.search('a href="(.*?)"', j, re.S | re.M).group(1)
                            complete_url = self.url_source + url
                            res = requests.get(complete_url)
                            html = res.content.decode('gbk')
                            # Parse the html for the iframe form data
                            house_url = self.url_source + re.search(
                                '<iframe.*?"(.*?)"', html, re.S | re.M).group(1)
                            # Swap the page endpoint for the data endpoint
                            logic_house_url = house_url.replace(
                                'Default', 'GetData')
                            logic_house_html = requests.get(
                                url=logic_house_url).content.decode()
                            logic_id = re.search(
                                '<LOGICBUILDING_ID>(.*?)<', logic_house_html, re.S | re.M).group(1)
                            final_url = 'http://www.yingtanfdc.com/website/presale/home/HouseTableControl/GetData.aspx?LogicBuilding_ID=' + logic_id
                            final_html = requests.get(
                                url=final_url).content.decode('gbk')
                            # One House row per ROOM_NUMBER element in the XML
                            for l in re.findall(
                                    '<ROOM_NUMBER>(.*?)</ROOM_NUMBER>', final_html, re.S | re.M):
                                try:
                                    h = House(self.co_index)
                                    h.info = final_html
                                    h.ho_name = l
                                    h.co_name = re.search(
                                        'html">(.*?)</a>', k, re.S | re.M).group(1)
                                    h.bu_num = re.search(
                                        '="absmiddle" />(.*?)</a></strong></', j, re.S | re.M).group(1)
                                    h.insert_db()
                                except Exception as e:
                                    continue
                        except Exception as e:
                            continue
                except Exception as e:
                    continue
        except Exception as e:
            continue
def get_comm_info(self, url, response, comm):
    """Extract the community fields from *response*, persist each building
    (with floor count from the roomtj JSON endpoint), then fill and insert
    *comm*."""
    html = response.text
    tree = etree.HTML(html)
    # district
    co_area = tree.xpath('//*[@id="content"]/div[2]/div[1]/div[2]/table/tr[3]/td[2]/text()')[0]
    # community name
    co_name = tree.xpath('//*[@id="content"]/div[2]/div[1]/div[2]/table/tr[1]/td/strong/span/text()')[0]
    # community address
    co_address = tree.xpath('//*[@id="content"]/div[2]/div[1]/div[2]/table/tr[2]/td/span/text()')[0]
    # developer
    co_develops = tree.xpath('//*[@id="content"]/div[2]/div[1]/div[2]/table/tr[3]/td[1]/span/@title')[0]
    # property-management company
    # NOTE(review): this immediately overwrites the developer value above —
    # only the second xpath result ends up in co_develops; confirm intent.
    co_develops = tree.xpath('//div[@class="wzjs-box"]//tr[3]//span/text()')[0]
    # plot ratio
    co_volumetric = tree.xpath('//*[@id="content"]/div[2]/div[1]/div[2]/table/tr[5]/td[2]/span/text()')[0]
    # pre-sale certificate
    co_pre_sale = tree.xpath('//*[@id="content"]/div[2]/div[1]/div[2]/table/tr[6]/td[1]/text()')[0]
    # built area
    co_build_size = tree.xpath('//*[@id="content"]/div[2]/div[1]/div[2]/table/tr[5]/td[1]')[0].text
    # community id, taken from the page URL's query string
    co_id = re.search('id=(.*?)$', url).group(1)
    # Strip all whitespace so the regexes below match the minified markup
    html_ = html.replace('\t', '').replace('\r', '').replace('\n', '').replace(' ', '')
    bu_url_info = re.search('<pclass="bot-a">(.*?)</p>', html_).group(1)
    building_url_list = re.findall('<td><aid="(.*?)"(.*?)>(.*?)</a>', bu_url_info)
    for i in building_url_list:
        build = Building(co_index)
        value = i[0]    # building id (anchor id attribute)
        bu_name = i[2]  # building display name (anchor text)
        house_url = 'http://fsfc.fsjw.gov.cn/hpms_project/room.jhtml?id=' + value
        floor_url = "http://fsfc.fsjw.gov.cn/hpms_project/roomtj.jhtml?id=" + value
        try:
            res = requests.get(floor_url, headers=self.headers)
        except Exception as e:
            print("co_index={},楼栋详情页{}访问失败".format(co_index, floor_url))
            print(e)
            continue
        try:
            # roomtj endpoint returns JSON; "zcs" is the total floor count
            bu_floor = json.loads(res.text)
            build.bu_floor = bu_floor["zcs"]
        except:
            build.bu_floor = None
        try:
            # NOTE: re-binds the *parameter* ``response`` to the house page
            response = requests.get(house_url, headers=self.headers)
        except Exception as e:
            print("co_index={},房屋详情页{}请求失败".format(co_index, house_url))
            print(e)
        self.get_build_info(house_url, response, co_id, value)
        build.co_id = co_id
        build.bu_id = value
        build.bu_name = bu_name
        build.insert_db()
    comm.co_name = co_name
    comm.co_id = co_id
    comm.co_address = co_address
    comm.co_develops = co_develops
    comm.co_volumetric = co_volumetric
    comm.co_pre_sale = co_pre_sale
    comm.co_build_size = co_build_size
    comm.area = co_area
    comm.insert_db()
def comm_crawler(self, comm_url, co_develops, co_pre_sale, co_name, co_pre_sale_date):
    """Crawl one community: store its buildings, then every house's type,
    floor, sizes and price from the paginated price tables.

    Note: the same House/Building instances are reused and re-assigned per
    row, matching the original behaviour.
    """
    ho = House(co_index)
    comm_res = requests.get(comm_url, headers=self.headers)
    comm_html = etree.HTML(comm_res.text)
    value = comm_html.xpath("//input[@id='propertyid']/@value")[0]
    sid = comm_html.xpath("//input[@id='sid']/@value")[0]
    bu = Building(co_index)
    bu_num = comm_html.xpath("//div[@id='building_dd']//a")[1:]  # skip the first ("all") anchor
    self.comm_info(co_develops, co_pre_sale, co_name, co_pre_sale_date, value)
    for bu_ in bu_num:
        bu.bu_num = bu_.xpath("./text()")[0]
        bu_id = bu_.xpath("./@id")[0]
        bu.bu_id = re.search('\d+', bu_id).group(0)
        bu.co_id = value
        bu.insert_db()
        # BUG FIX: the original appended "?page=i" onto the *previous*
        # iteration's URL, producing "...?page=1?page=2...", and used "?"
        # in a URL that already has a query string. Build each page URL
        # from a fixed base with "&".
        base_url = ("http://hu.tmsf.com/newhouse/property_" + str(sid) + "_"
                    + str(value) + "_price.htm?buildingid=" + str(bu.bu_id))
        page_html = requests.get(base_url, headers=self.headers)
        page = re.search('页数 \d+/(\d+)', page_html.text).group(1)
        for i in range(1, int(page) + 1):
            detail_url = base_url + "&page=" + str(i)
            detail_res = requests.get(detail_url, headers=self.headers)
            house_doc = etree.HTML(detail_res.text)
            house_url_list = house_doc.xpath("//td[@width='100']/a/@href")
            house_bu_num = house_doc.xpath("//td[@width='100']/a/text()")
            house_name = house_doc.xpath("//td[@width='101'][1]/a/div/text()")
            # BUG FIX: the original used range(1, len+1), which skipped the
            # first house on every page and raised (then swallowed) an
            # IndexError on the last one.
            for index in range(len(house_url_list)):
                try:
                    ho.bu_num = house_bu_num[index]  # building number
                    house_url = "http://hu.tmsf.com" + house_url_list[index]
                    house_res = requests.get(house_url, headers=self.headers)
                    house_html = house_res.text
                    ho.bu_id = bu.bu_id
                    ho.co_id = re.search('楼盘主页.*?_\d+_(\d+)_info', house_html).group(1)  # community id
                    ho.ho_name = house_name[index]  # house label, e.g. unit 3 room 403
                    ho.ho_type = re.search('房屋用途:.*?>(.*?)<', house_html).group(1)  # usage type
                    ho.ho_floor = re.search('第(.*?)层', house_html).group(1)
                    # Sizes/price are rendered as CSS sprite digits: collect the
                    # class names and let self.number() decode them.
                    build_text = re.search('建筑面积:(.*?)平方米', house_html).group(1)
                    ho.ho_build_size = self.number(re.findall('class="(.*?)"', build_text))
                    size_text = re.search('套内面积:(.*?)平方米', house_html).group(1)
                    ho.ho_true_size = self.number(re.findall('class="(.*?)"', size_text))
                    price_text = re.search('总 价:(.*?)万元', house_html).group(1)
                    ho.ho_price = self.number(re.findall('class="(.*?)"', price_text))
                    ho.insert_db()
                except:
                    continue
def start_crawler(self):
    """Page through the pre-sale-permit AJAX listing (pages 1..20), persist a
    Building per permit and a House per row of its 楼盘表 (house tag) data.
    Each of the three request/parse stages logs and skips on failure."""
    url = 'http://zzx.zzfc.com/ajaxpro/xy_ysxk_more,App_Web_mjeeodb-.ashx'
    # Page count and rowcount are hard-coded — presumably sized to the site's
    # listing at the time of writing; TODO confirm.
    for i in range(1, 21):
        payload = "{\"pageNo\":" + str(
            i) + ",\"pageSize\":30,\"rowcount\":589}"
        try:
            response = requests.post(url, data=payload, headers=self.headers)
            con = response.content.decode()
        except Exception as e:
            log.error('楼栋请求失败{}'.format(e))
            continue
        # Each community record is serialized as a bracketed tuple in the body
        co_list = re.findall('\[\d+,.*?\d+\]', con)
        for comm in co_list:
            try:
                sid = re.search('\[(\d+),', comm).group(1)  # permit/building id
                pid = re.search('",(\d+),', comm).group(1)  # project id
                bu_url = 'http://zzx.zzfc.com/xy_bldg.aspx?pid=' + pid + '&sid=' + sid
                bu_res = requests.get(bu_url, headers=self.headers)
                bu_con = bu_res.content.decode()
                bu = Building(co_index)
                bu.bu_id = sid
                bu.bu_address = re.search('楼栋座落.*?">(.*?) ', bu_con, re.S | re.M).group(1)
                bu.bu_pre_sale = re.search('预售证号.*?">(.*?) ', bu_con, re.S | re.M).group(1)
                bu.bu_pre_sale_date = re.search('预售日期.*?">(.*?) ', bu_con, re.S | re.M).group(1)
                bu.bu_all_house = re.search('套数.*?">(.*?) ', bu_con, re.S | re.M).group(1)
                bu.insert_db()
            except Exception as e:
                log.error("{}楼栋解析失败{}".format(comm, e))
                continue
            # Second AJAX endpoint: the per-building house table
            ho_url = 'http://zzx.zzfc.com/ajaxpro/xy_housetag,App_Web_xg4ulr9n.ashx'
            data = "{\"m_key\":\"WWW_LPB_001\",\"m_param\":\"" + sid + "\"}"
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                'X-AjaxPro-Method': 'GETLPBDS'
            }
            try:
                ho_res = requests.post(ho_url, data=data, headers=headers)
                ho_con = ho_res.content.decode()
            except Exception as e:
                log.error("房屋请求失败{}".format(e))
                continue
            ho_list = re.findall('\["\d+.*?\d+\]', ho_con)
            for house in ho_list:
                try:
                    ho = House(co_index)
                    ho.bu_id = sid
                    # Row fields are comma-separated; positions assumed from
                    # observed payloads — TODO confirm indices 4/-3/-2.
                    info_list = house.split(",")
                    ho.ho_name = info_list[4]
                    ho.ho_floor = re.search('(\d+)层', house).group(1)
                    ho.ho_build_size = info_list[-3]
                    ho.ho_true_size = info_list[-2]
                    ho.insert_db()
                except Exception as e:
                    log.error("{}房屋解析错误{}".format(house, e))
                    continue