def get_comm_detail(self, comm_detail_url, co_id):
    """Scrape one community detail page, persist it, then hand its building
    rows off to ``get_build_info``.  Any failure is logged and swallowed."""
    comm = Comm(co_index)
    try:
        page = requests.get(comm_detail_url, headers=self.headers).text

        def grab(pattern):
            # Shared "<label>...<td>value<" extraction for this page layout.
            return re.search(pattern, page, re.S | re.M).group(1)

        comm.co_name = grab('项目名称:.*?<td.*?>(.*?)<')
        comm.co_type = grab('项目主体性质:.*?<td.*?>(.*?)<')
        comm.co_develops = grab('主开发商:.*?<td.*?>(.*?)<')
        comm.co_address = grab('项目建设地址:.*?<td.*?>(.*?)<')
        comm.co_all_size = grab('项目总规划面积(㎡):.*?<td.*?>(.*?)<')
        comm.co_build_start_time = grab('计划开工日期:.*?<td.*?>(.*?)<')
        comm.co_build_end_time = grab('计划竣工日期:.*?<td.*?>(.*?)<')
        comm.co_id = co_id
        comm.insert_db()
        build_info_list = re.findall('id="lpan".*?</tr>', page, re.S | re.M)
        self.get_build_info(build_info_list, co_id)
    except Exception as e:
        print('小区错误,co_index={},url={}'.format(co_index, comm_detail_url), e)
def get_comm_info(self, comm_url_list):
    """Visit each community link, persist the scraped record, then recurse
    into its building pages.  Failures are logged per URL and skipped."""
    for suffix in comm_url_list:
        comm_url = 'http://www.fjlyfdc.com.cn/' + suffix
        try:
            comm = Comm(co_index)
            page = requests.get(comm_url, headers=self.headers).text

            def pick(pattern):
                # Shared "<label>...<td>value<" extraction for this layout.
                return re.search(pattern, page, re.S | re.M).group(1)

            comm.co_develops = pick('公司名称:.*?<td.*?>(.*?)<')
            comm.co_name = pick('项目名称:.*?<td.*?>(.*?)<')
            comm.co_pre_sale = pick('预售许可证:.*?<td.*?>(.*?)<')
            comm.co_address = pick('项目坐落:.*?<td.*?>(.*?)<')
            comm.co_type = pick('规划用途:.*?<td.*?>(.*?)<')
            comm.co_build_size = pick('建筑面积:.*?<td.*?>(.*?)<')
            comm.co_volumetric = pick('容积率:.*?<td.*?>(.*?)<')
            comm.co_green = pick('绿地率:.*?<td.*?>(.*?)<')
            comm.co_open_time = pick('开工日期:.*?<td.*?>(.*?)<')
            comm.co_build_end_time = pick('竣工日期:.*?<td.*?>(.*?)<')
            comm.co_all_house = pick('批准销售:.*?<td.*?>(.*?)<')
            # Second <td> after the same label holds the approved floor area.
            comm.co_all_size = pick('批准销售:.*?<td.*?<td.*?>(.*?)<')
            comm.co_id = re.search('CaseId=(.*?)$', comm_url).group(1)
            comm.insert_db()
            build_url_list = re.findall(
                'href="(/House/BuildingInfo\?buildingInfoID=.*?&caseID=.*?)"',
                page, re.S | re.M)
            self.get_build_info(build_url_list, comm.co_id)
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, comm_url), e)
def get_comm_info(self, comm_url):
    """Scrape one community page (gbk-encoded) via the proxy helper, save
    it, then fetch its buildings.  No error handling here: regex misses
    propagate to the caller, as in the original."""
    comm = Comm(co_index)
    # The detail data lives under /buildinfo rather than /buildingdetail.
    comm_url = comm_url.replace('buildingdetail', 'buildinfo')
    page = self.request_proxy(comm_url, headers=self.headers).content.decode('gbk')

    def field(pattern):
        # Every value on this layout is stripped of surrounding whitespace.
        return re.search(pattern, page, re.S | re.M).group(1).strip()

    comm.co_name = field('class="sf_xq_xmmc">(.*?)<')
    comm.area = field('id="Label_CityArea">(.*?)<')
    comm.co_pre_sale_date = field('class="sf_xq_jfsj">(.*?)<')
    comm.co_build_type = field('id="lbl_JZJG".*?>(.*?)<')
    comm.co_address = field('id="Label_ProjectAdress">(.*?)<')
    comm.co_pre_sale = field('id="Label_SallPreDocuments">(.*?)<')
    comm.co_all_house = field('id="lbl_ZTS".*?>(.*?)<')
    comm.co_build_size = field('id="lbl_JZMJ".*?>(.*?)<')
    comm.co_all_size = field('id="lbl_ZDMJ".*?>(.*?)<')
    comm.co_develops = field('id="Label_DevName">.*?>(.*?)<')
    comm.co_id = field('action=.*?buildingid=(.*?)"')
    comm.insert_db()
    buildingid = re.search('buildingid=(.*?)$', comm_url, re.S | re.M).group(1)
    self.get_build_info(buildingid, comm.co_id)
def start_crawler(self):
    """Read the total page count from the first listing page, then walk
    every page of the certification table and persist one record per row."""
    first = requests.get(url, headers=self.headers).text
    total_pages = int(re.search('页数:1/(.*?) ', first, re.S | re.M).group(1))
    for page_no in range(1, total_pages + 1):
        page_url = 'http://newhouse.ntfdc.net/house_certification.aspx?p=' + str(
            page_no)
        html = requests.get(page_url, headers=self.headers).text
        table = re.search('class="layer-bd tb-style1">.*?</table>', html,
                          re.S | re.M).group()
        rows = re.findall('<tr>.*?</tr>', table, re.S | re.M)[1:]  # skip header row
        for row in rows:
            try:
                comm = Comm(co_index)
                # Positional scraping: each extra '<td.*?' skips one column.
                comm.co_pre_sale = re.search('<td.*?>(.*?)<', row, re.S | re.M).group(1)
                comm.co_name = re.search('<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
                comm.co_all_size = re.search('<td.*?<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
                comm.co_type = re.search(
                    '<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
                comm.co_pre_sale_date = re.search(
                    '<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
                comm.co_develops = re.search(
                    '<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', row, re.S | re.M).group(1)
                comm.insert_db()
            except Exception as e:
                print('小区错误,co_index={},url={}'.format(co_index, page_url), e)
def comm_info(self, comm_url_list):
    """Scrape each community detail page, save it, then fetch its pre-sale
    page and dispatch the building links.

    Bug fix: the pre-sale follow-up (``sale_url`` ...) used to run *after*
    the ``except`` block, so a caught parse failure left ``project_name``
    and ``co``'s fields unbound/stale and the next lines raised a
    NameError that killed the whole loop.  The follow-up now runs inside
    the ``try``-protected success path; a bad page is skipped cleanly.
    """
    for comm_url in comm_url_list:
        try:
            co_url = 'http://222.77.178.63:7002/' + comm_url
            co_res = requests.get(co_url, headers=self.headers)
            con = co_res.content.decode('gbk')
            co = Comm(co_index)
            co.co_id = re.search('projectID=(.*)', comm_url).group(1)
            co.co_name = re.search('项目名称:.*?">(.*?)</', con, re.S | re.M).group(1)
            co.area = re.search('所在区县:.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_address = re.search('项目地址:.*?">(.*?)</', con, re.S | re.M).group(1)
            co.co_develops = re.search('企业名称:.*?blank">(.*?)</', con, re.S | re.M).group(1)
            co.co_all_house = re.search('>总套数.*?">(\d+)<', con, re.S | re.M).group(1)
            co.co_all_size = re.search('>总面积.*?">(.*?)<', con, re.S | re.M).group(1)
            # Name goes into the follow-up query string, so it must be URL-quoted.
            project_name = parse.quote(co.co_name)
            co.insert_db()
        except Exception as e:
            print('小区信息错误{}'.format(e))
            continue  # skip the pre-sale lookup for a page we failed to parse
        sale_url = ("http://222.77.178.63:7002/Presell.asp?projectID="
                    + co.co_id + "&projectname=" + project_name)
        res = requests.get(sale_url, headers=self.headers)
        html = etree.HTML(res.content.decode('gbk'))
        temp_url_list = html.xpath("//a/@href")
        self.build_info(co.co_id, temp_url_list)
def comm_parse(self, co_name, co_addr, co_area, co_url):
    """Scrape one community page, save it, then page through its pre-sale
    table and its current-sale table, dispatching each linked row.

    Bug fix: the page counter ``num`` was not reset between the two
    paging loops, so the current-sale loop started at whatever page the
    pre-sale loop ended on and silently skipped its first pages.
    """
    co_res = requests.get(co_url, headers=self.headers)
    co_res.encoding = 'gbk'
    con = co_res.text
    co = Comm(co_index)
    # Developer name is optional on some pages.
    if re.search('开发商名称.*?;">(.*?)</', con, re.S | re.M):
        co.co_develops = re.search('开发商名称.*?;">(.*?)</', con, re.S | re.M).group(1)
    else:
        co.co_develops = None
    kfsid = re.search('kfsid=(\d+)', co_url).group(1)
    # No stable id on the page itself: synthesize one from name + kfsid.
    co.co_id = co_name + kfsid
    co.co_name = co_name
    co.co_address = co_addr
    co.area = co_area
    co.co_all_house = re.search('总套数.*?">(\d+) ', con, re.S | re.M).group(1)
    co.co_all_size = re.search('总面积.*?">(.*?) ', con, re.S | re.M).group(1)
    co.co_residential_size = re.search('住宅面积.*?">(.*?) ', con, re.S | re.M).group(1)
    co.insert_db()
    num = 1
    while True:
        pre_url = co_url + "&ypage=" + str(num)  # pre-sale paging
        pre_res = requests.get(pre_url, headers=self.headers)
        pre_html = etree.HTML(pre_res.content.decode('gbk'))
        pre_list = pre_html.xpath("//table[@id='preselltable1']//tr[@bgcolor='white']")
        if not pre_list:
            break  # an empty page marks the end of the pre-sale table
        num += 1
        for pre in pre_list:
            bu_url = pre.xpath("./td[4]/a/@href")[0]
            if 'user_Presell' in bu_url:
                self.bu_parse(bu_url, co.co_id, co_url)
    num = 1  # FIX: restart paging for the current-sale table
    while True:
        sell_url = co_url + "&page=" + str(num)  # current-sale paging
        sell_res = requests.get(sell_url, headers=self.headers)
        sell_html = etree.HTML(sell_res.content.decode('gbk'))
        sell_list = sell_html.xpath("//table[@id='selltable1']//tr[@bgcolor='white']")
        if not sell_list:
            break  # an empty page marks the end of the current-sale table
        num += 1
        for sell in sell_list:
            ho_url = sell.xpath("./td/a/@href")[0]
            if 'user_sell' in ho_url:
                bu_id = re.search('ID=(.*?)&', ho_url).group(1)
                self.house_parse(ho_url, co.co_id, bu_id)
def get_comm_info(self, comm_info_list):
    """Persist name / unit count / floor area scraped from each HTML row;
    a row that fails to parse is logged and skipped."""
    for row in comm_info_list:
        try:
            record = Comm(co_index)
            # Positional scraping: each extra '<td.*?' skips one column.
            record.co_name = re.search('<td>(.*?)</td>', row, re.S | re.M).group(1)
            record.co_all_house = re.search('<td.*?<td>(.*?)</td>', row, re.S | re.M).group(1)
            record.co_all_size = re.search('<td.*?<td.*?<td>(.*?)</td>', row, re.S | re.M).group(1)
            record.insert_db()
        except Exception as e:
            print('小区错误,co_index={},html_str={}'.format(co_index, row), e)
def get_comm_info(self, comm_url_list):
    """Scrape each community detail page, store the record, then dispatch
    its building links; pages that cannot be fetched are skipped."""
    for comm_url in comm_url_list:
        comm_detail = "http://xx.yyfdcw.com" + comm_url
        try:
            comm_res = requests.get(comm_detail, headers=self.headers)
        except Exception as e:
            print("co_index={},小区详情页无法访问".format(co_index), e)
            continue
        con = comm_res.text
        comm = Comm(co_index)
        comm.co_id = re.search('ID=(\d+)', con).group(1)

        def take(pattern):
            # Shared "<label>...<span>value<" extraction for this layout.
            return re.search(pattern, con, re.S | re.M).group(1)

        comm.co_name = take('lpname">.*?<h2>(.*?)</h2')
        comm.co_develops = take('开发商:.*?Kfs">(.*?)</span')
        comm.co_green = take('绿化率:.*?Lhl">(.*?)</span')
        comm.area = take('区域:.*?Name">(.*?)</span')
        comm.co_address = take('位置:</b>(.*?)</li')
        comm.co_build_size = take('建筑面积:.*?l5">(.*?)</span')
        comm.co_all_house = take('总户数:.*?hs">(.*?)</span')
        comm.co_plan_useland = take('用地.*?l4">(.*?)</span')
        comm.co_plan_project = take('工程.*?l3">(.*?)</span')
        comm.co_build_type = take('楼盘类型.*?Type">(.*?)</span')
        comm.co_all_size = take('占地面积.*?mianji">(.*?)</span')
        comm.co_land_use = take('使用权证.*?l1">(.*?)</span')
        comm.insert_db()
        try:
            build_list = re.findall(
                '<td align="center">.*?<a href="(.*?)"', con, re.S | re.M)
            if build_list:
                self.get_build_info(build_list, comm.co_id)
            else:
                print("co_index={},小区co_id={}没有楼栋".format(
                    co_index, comm.co_id))
                continue
        except:
            print("co_index={},小区co_id={}没有楼栋".format(
                co_index, comm.co_id))
            continue
def comm_info(self, co_id):
    """Scrape one community's detail page, save it, then page through its
    pre-sale buildings via the YSXM.ashx JSON endpoint (5 rows per page).

    Bug fix: the stop condition was ``num < count * 5``, which requested
    one extra, empty page whenever the total row count was an exact
    multiple of the page size; ``<=`` stops as soon as every row is seen.
    """
    comm_url = "http://www.lsjs.gov.cn/WebLSZFGB/LPDetail.aspx?RowGuid=" + co_id
    co_res = requests.get(comm_url, headers=self.headers)
    con = co_res.text
    co = Comm(co_index)
    co.co_name = re.search('楼 盘 名 称:(.*?)<br', con).group(1)
    co.co_id = co_id
    co.area = re.search('所 属 城 区:.*?">(.*?)</span', con).group(1)
    co.co_address = re.search('楼 盘 坐 落:.*?">(.*?)</span', con).group(1)
    co.co_develops = re.search('项 目 公 司:.*?mc">(.*?)</span', con, re.S | re.M).group(1)
    co.co_pre_sale = re.search('预销售证号.*?">(.*?)</span', con, re.S | re.M).group(1)
    co.co_all_house = re.search('预售总套数.*?td>(.*?)</td', con, re.S | re.M).group(1)
    co.co_all_size = re.search('预售总面积.*?td>(.*?)</td', con, re.S | re.M).group(1)
    co.co_pre_sale_date = re.search('时间.*?">(.*?)</span', con, re.S | re.M).group(1)
    co.insert_db()
    url = 'http://www.lsjs.gov.cn/WebLSZFGB/Ashx/YSXM.ashx'
    count = 1
    while True:
        data = {
            "method": "getzxl",
            "PageSize": 5,
            "CurrentPageIndex": str(count),
            "YSXMID": co_id,
        }
        res = requests.post(url, data=data, headers=self.headers)
        con_dict = json.loads(res.text)
        # First element repeats the total row count; the rest are rows.
        num = con_dict["data"][0]['TotalNum']
        for info in con_dict["data"][1:]:
            self.build_info(co_id, info["YSZID"])
        if int(num) <= count * 5:  # FIX: was '<', which fetched an extra empty page
            break
        count += 1
def start_crawler(self):
    """Crawl every district in ``self.area_list``: POST for its complex
    listing, follow each returned paging link, and persist one Comm
    record per table row (detail scrape continues in ``get_comm_info``)."""
    for i in self.area_list:
        data = {'districtID': i}
        res = requests.post(url='http://www.fangdi.com.cn/complexPro.asp', data=data)
        html_str = res.content.decode('gbk')
        # Collect the per-district paging links from the returned markup.
        url_list = re.findall('value="(/complexpro.*?)"', html_str, re.S | re.M)
        for k in url_list:
            response = requests.get('http://www.fangdi.com.cn' + k, headers=self.headers)
            html = response.content.decode('gbk')
            # Narrow to the listing table (between the "位置" header and the pager text).
            comm_html = re.search('位置<.*?页/共', html, re.S | re.M).group()
            comm_info_list = re.findall('<tr valign=.*?</tr>', comm_html, re.S | re.M)
            for info in comm_info_list:
                try:
                    comm = Comm(co_index)
                    # Positional <td> scraping: each extra '<td.*?' skips one column.
                    comm_url = re.search('<a href=(.*?)>', info, re.S | re.M).group(1)
                    comm.co_name = re.search('<a.*?>(.*?)<', info, re.S | re.M).group(1)
                    comm.co_address = re.search('<a.*?<td.*?>(.*?)<', info, re.S | re.M).group(1)
                    comm.co_all_house = re.search(
                        '<a.*?<td.*?<td.*?>(.*?)<', info, re.S | re.M).group(1)
                    comm.co_all_size = re.search(
                        '<a.*?<td.*?<td.*?<td.*?>(.*?)<', info, re.S | re.M).group(1)
                    comm.area = re.search(
                        '<a.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', info, re.S | re.M).group(1)
                    comm.co_id = re.search('projectID=(.*?)==', info, re.S | re.M).group(1)
                    # Hand the partially-filled record plus its detail URL onward.
                    self.get_comm_info(comm_url, comm)
                except Exception as e:
                    print(
                        '小区错误,co_index={},url={}'.format(
                            co_index, 'http://www.fangdi.com.cn' + k), e)
def comm_info(self, url):
    """Scrape one community page (gbk-encoded), save it, then hand the
    pre-sale table rows to ``build_info``.

    Bug fix: the fallback branch assigned the misspelled attribute
    ``co_us`` instead of ``co_use``, so ``co_use`` was never cleared when
    the optional permit fields were missing.  The bare ``except`` is also
    narrowed to ``AttributeError`` (the only failure mode here is
    ``re.search`` returning None and ``.group`` raising it).
    """
    comm_url = self.start_url + "/" + url
    res = requests.get(comm_url, headers=self.headers)
    res.encoding = 'gbk'
    con = res.text
    co = Comm(co_index)
    co.co_id = re.search('kfsid=(\d+)', url).group(1)
    co.co_name = re.search('itemname.*?">(.*?)</font', con).group(1)
    co.co_develops = re.search('开发商名称:.*?px;">(.*?)</a', con, re.S | re.M).group(1)
    co.co_all_house = re.search('总套数:.*?">(.*?) ', con, re.S | re.M).group(1)
    co.co_all_size = re.search('总面积:.*?">(.*?) ', con, re.S | re.M).group(1)
    co.co_residential_size = re.search('>住宅面积:.*?">(.*?) ', con, re.S | re.M).group(1)
    co.co_address = re.search('项目座落.*?;">(.*?)</', con, re.S | re.M).group(1)
    co.area = re.search('所在地区.*?">(.*?)</td', con, re.S | re.M).group(1)
    # Permit / planning fields are optional on some pages.
    try:
        co.co_build_size = re.search('建筑面积.*?">(.*?) ', con, re.S | re.M).group(1)
        co.co_plan_project = re.search('建设工程规划许可证号.*?">(.*?)<br', con, re.S | re.M).group(1)
        co.co_land_use = re.search('土地证号.*?">(.*?)<br', con, re.S | re.M).group(1)
        co.co_work_pro = re.search('建筑工程施工许可证号.*?">(.*?)<br', con, re.S | re.M).group(1)
        co.co_use = re.search('用途.*?">(.*?)<br', con, re.S | re.M).group(1)
    except AttributeError:
        co.co_build_size = None
        co.co_plan_project = None
        co.co_land_use = None
        co.co_work_pro = None
        co.co_use = None  # FIX: was the typo ``co_us``
    co.insert_db()
    co_html = etree.HTML(con)
    bu_list = co_html.xpath(
        "//table[@id='preselltable1']/tr[@bgcolor='white']")
    self.build_info(bu_list, co.co_id)
def get_comm_info(self, comm_res, co_id):
    """Parse an already-fetched community response, persist the record,
    then pull the building ids out of the page's doview() handlers."""
    record = Comm(co_index)
    body = comm_res.text

    def cell(pattern):
        # Shared "<label>...<td>value" extraction for this page layout.
        return re.search(pattern, body, re.S | re.M).group(1)

    record.co_name = cell('项目名称.*?">(.*?)<')
    record.co_id = co_id
    record.co_address = cell('项目地址.*?<td>(.*?)<')
    record.co_develops = cell('开 发 商:.*?<td.*?>(.*?)<')
    record.co_all_size = cell('建设用地面积.*?<td>(.*?)</td>')
    record.co_size = cell('占地面积.*?<td>(.*?)</td>')
    record.co_build_size = cell('项目总建筑面积:.*?<td>(.*?)</td>')
    record.co_land_use = cell('土地使用证号.*?<td>(.*?)<')
    record.co_plan_pro = cell('规划许可证号.*?<td>(.*?)<')
    record.insert_db()
    build_id_list = re.findall("onclick=.doview\('(\d+)'\)", body, re.S | re.M)
    self.get_build_info(build_id_list, co_id)
def get_comm_info(self, comm_info):
    """Scrape one community row plus its detail page, then walk every
    building link — current-sale buildings go straight to
    ``get_house_info``; pre-sale buildings are scraped from their permit
    page and each house row is saved and dispatched.

    Bug fix: when the detail-page request failed, the old code only
    printed the error and then dereferenced the unbound ``res``, raising
    a NameError; it now returns early instead.
    """
    co = Comm(co_index)
    co.co_name = re.search('_blank">(.*?)</a', comm_info).group(1)
    try:
        co.co_address = re.findall('px">(.*?)</td', comm_info)[1]
    except:
        co.co_address = None  # some rows carry no address cell
    co.area = re.search('center">(.*?)</td>', comm_info).group(1)
    co_detail_url = re.search("href='(.*?)'", comm_info).group(1)
    co_url = "http://www.qyfgj.cn/newys/" + co_detail_url
    try:
        res = requests.get(co_url, headers=self.headers)
    except Exception as e:
        print("co_index={}小区未请求到".format(co_index), e)
        return  # FIX: without this the code below hit an unbound `res`
    con = res.content.decode('gbk')
    # Developer / totals are optional; keep whatever parsed.
    try:
        co.co_develops = re.search('开发商名称.*?px;">(.*?)</a', con, re.S | re.M).group(1)
        co.co_all_house = re.search('总套数.*?">(\d+) ', con, re.S | re.M).group(1)
        co.co_all_size = re.search('总面积.*?">(\d+.\d+) m', con, re.S | re.M).group(1)
    except:
        print("小区无开发商等信息")
    co.insert_db()
    try:
        build = re.findall('<tr bgcolor="white">(.*?)</tr>', con, re.S | re.M)
    except:
        print("小区没有楼栋信息")
    # The site requires a session cookie and a matching Referer per request.
    build_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
        'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
        'Referer': co_url
    }
    for build_info in build:
        if "进入" in build_info:
            build_url = re.search('href="(.*?)"><font', build_info).group(1)
            build_url = "http://www.qyfgj.cn/newys/" + build_url
            ho_headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
                'Referer': build_url
            }
            build_res = requests.get(build_url, headers=build_headers)
            build_con = build_res.content.decode('gbk')
            if re.search('ID=(\d+)', build_url):
                # Current-sale building: the id comes straight from the URL.
                bu = Building(co_index)
                bu_id = re.search('ID=(\d+)', build_url).group(1)
                bu.bu_id = bu_id
                bu.co_name = co.co_name
                bu.insert_db()
                self.get_house_info(headers=ho_headers, bu_id=bu_id, url=build_url)
            else:
                # Pre-sale building: details are scraped from the permit page.
                bu = Building(co_index)
                bu.co_name = co.co_name
                bu.bu_type = re.search('用途.*?">(.*?)</td>', build_con, re.S | re.M).group(1)
                bu.bu_pre_sale = re.search('许可证编号.*?_blank">(.*?)</a>', build_con, re.S | re.M).group(1)
                bu.bu_pre_sale_date = re.search('有效日期.*?">(.*?)</td>', build_con, re.S | re.M).group(1)
                bu.bu_address = re.search('项目座落.*?">(.*?)</td>', build_con, re.S | re.M).group(1)
                ret = re.findall('<tr onmouseover(.*?)</tr', build_con, re.S | re.M)
                for i in ret:
                    house_url = re.search('href="(.*?)"', i).group(1)
                    house_url = "http://www.qyfgj.cn/newys/" + house_url
                    bu.bu_id = re.search('dbh=(.*?)&', i).group(1)
                    bu.bu_num = re.search('<td width="89.*?">(.*?)</', i).group(1)
                    bu.bu_floor = re.search('<td width="84.*?">(\d+)</td', i).group(1)
                    bu.insert_db()
                    ho_res = requests.get(house_url, headers=ho_headers)
                    ho_con = ho_res.content.decode('gbk')
                    new_headers = {
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                        'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
                        'Referer': house_url
                    }
                    self.get_house_info(ho_con=ho_con, headers=new_headers, bu_id=bu.bu_id)
        else:
            print("楼栋无链接地址")