def get_build_info(self, build_url_list):
    """Crawl each building page, then the houses it links to.

    For every entry the ``xqbm`` code is extracted, the ``donginfo.aspx``
    URL is rebuilt from it, and a ``Building`` carrying the regex rules below
    is handed to ``ProducerListUrl``. Whatever ``get_details()`` returns is
    forwarded to ``self.get_house_info``.

    Args:
        build_url_list: iterable of URL strings expected to end with
            ``xqbm=<code>``.

    A failure on one entry is printed and the loop continues with the next.
    """
    for i in build_url_list:
        # Pre-bind the name used by the error message. If the entry has no
        # "xqbm=" part, re.search() returns None and the failure happens
        # before the real URL exists; without this the handler itself would
        # raise UnboundLocalError or report the previous iteration's URL.
        build_url = i
        try:
            build = Building(co_index)
            build_code = re.search('xqbm=(.*?)$', i).group(1)
            build_url = 'http://zjjg.0557fdc.com:9555/xiaoqu/donginfo.aspx?xqbm=' + build_code
            build.bu_num = 'Labeldongmc">(.*?)<'
            build.bu_pre_sale = 'Labelyszheng">(.*?)<'
            build.bu_floor = 'Labelsceng">(.*?)<'
            build.bu_address = 'Label1zuoluo">(.*?)<'
            build.bo_build_start_time = 'Label1kaigong">(.*?)<'
            build.co_build_structural = 'Labeljiegou">(.*?)<'
            # Raw strings: "\?" and "\." are invalid escapes in normal literals.
            build.co_id = r'donginfo.aspx\?xqbm=(.*?)"'
            build.bu_id = 'id="DropDownList1".*?value="(.*?)"'
            p = ProducerListUrl(
                page_url=build_url,
                request_type='get',
                encode='utf-8',
                analyzer_rules_dict=build.to_dict(),
                current_url_rule=r'location\.href=(.*?)"',
                analyzer_type='regex',
                headers=self.headers,
            )
            house_url_list = p.get_details()
            self.get_house_info(house_url_list)
        except Exception as e:
            print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
def get_comm_info(self, comm_url_list):
    """Process every community page and hand its building links onward.

    Each URL in ``comm_url_list`` is requested (gbk) through
    ``ProducerListUrl`` together with the ``Comm`` regex rules listed below;
    the value returned by ``get_details()`` goes to ``self.get_build_info``.
    Any exception for one URL is printed and the next URL is tried.
    """
    field_patterns = (
        ('co_id', '楼盘首页.*?aid-(.*?)/'),
        ('co_name', 'class="ls">(.*?)<'),
        ('co_type', '物业类型</em>(.*?)<'),
        ('area', '区域所属:</em>(.*?)<'),
        ('co_green', '绿 化 率:</em>(.*?)<'),
        ('co_volumetric', '容 积 率:</em>(.*?)<'),
        ('co_build_type', '楼 层:</em>(.*?)<'),
        ('co_size', '占地面积:</em>(.*?)<'),
        ('co_build_size', '建筑面积:</em>(.*?)<'),
        ('co_develops', '开 发 商:</em><.*?target="_blank">(.*?)<'),
        ('co_address', '项目地址:</em>(.*?)<'),
    )
    for page in comm_url_list:
        try:
            community = Comm(co_index)
            # Same assignments, same order, as plain attribute writes.
            for attr, pattern in field_patterns:
                setattr(community, attr, pattern)
            rules = community.to_dict()
            producer = ProducerListUrl(
                page_url=page,
                request_type='get',
                encode='gbk',
                analyzer_rules_dict=rules,
                current_url_rule='colspan="3" align="right"><a href="(.*?)"',
                analyzer_type='regex',
                headers=self.headers,
            )
            self.get_build_info(producer.get_details())
        except Exception as err:
            print(err)
def get_comm_info(self, comm_url_list):
    """Scrape the detail page of every community in ``comm_url_list``.

    The detail URL is the list entry with ``'view'`` replaced by
    ``'detail'``. The page is requested (gbk) via ``ProducerListUrl`` with
    the ``Comm`` regex rules below; the result of ``get_details()`` is not
    used here. Exceptions are printed per entry and do not stop the loop.
    """
    field_patterns = (
        ('co_type', '物业类型:.*?<dd>(.*?)<'),
        ('area', '区域所属:.*?<dd>(.*?)<'),
        ('co_build_size', '建筑面积:.*?<dd>(.*?)<'),
        ('co_size', '占地面积:.*?<dd>(.*?)<'),
        ('co_green', '绿化率:.*?<dd><.*?>(.*?)<'),
        ('co_build_type', '楼 层:.*?<dd>(.*?)<'),
        ('co_volumetric', '容积率:.*?<dd><.*?>(.*?)<'),
        ('co_id', '楼盘首页.*?newhouse/.*?/(.*?)/'),
        ('co_name', '<h1 class="title">(.*?)<'),
        ('co_address', '楼盘地址:.*?<dd>(.*?)<'),
        # NOTE(review): unlike its siblings this pattern has no '>' after
        # '<dd', so the capture starts inside the tag - confirm it is intended.
        ('co_develops', '开发商:.*?<dd(.*?)<'),
    )
    for entry in comm_url_list:
        try:
            community = Comm(co_index)
            detail_url = entry.replace('view', 'detail')
            for attr, pattern in field_patterns:
                setattr(community, attr, pattern)
            producer = ProducerListUrl(
                page_url=detail_url,
                request_type='get',
                encode='gbk',
                analyzer_rules_dict=community.to_dict(),
                analyzer_type='regex',
                headers=self.headers,
            )
            producer.get_details()
        except Exception as err:
            print(err)
def get_comm_info(self, comm_url_list):
    """Scrape each community page on www.gzbjfc.com, then its building page.

    Args:
        comm_url_list: iterable of site-relative paths; each is appended to
            ``http://www.gzbjfc.com/``.

    For every entry the ``Comm`` regex rules below are passed to
    ``ProducerListUrl`` and ``get_details()`` is called. The building page
    URL is derived by replacing ``'Info'`` with ``'Building'`` and is passed
    to ``self.get_build_info`` as a single string, not a list.
    A failure on one entry is printed and the loop continues.
    """
    for i in comm_url_list:
        # Pre-bind the name the error message uses so a failure before the
        # URL is built cannot raise UnboundLocalError in the handler or
        # report the previous iteration's URL.
        comm_url = i
        try:
            comm = Comm(co_index)
            comm_url = 'http://www.gzbjfc.com/' + i
            comm.co_name = 'cph_hif1_xmmc.*?<.*?>(.*?)<'
            comm.co_pre_sale = 'cph_hif1_xsxkz.*?<.*?>(.*?)<'
            comm.co_address = 'cph_hif1_zl.*?<.*?>(.*?)<'
            comm.co_develops = 'cph_hif1_kfs.*?<.*?>(.*?)<'
            comm.co_handed_time = 'cph_hif1_jfsj.*?<.*?>(.*?)<'
            comm.co_build_size = 'cph_hif1_jzmj.*?>(.*?)<'
            comm.co_all_house = 'cph_hif1_fwts.*?>(.*?)<'
            comm.co_id = 'hdl1_hfYszh" value="(.*?)"'
            p = ProducerListUrl(
                page_url=comm_url,
                request_type='get',
                encode='utf-8',
                analyzer_rules_dict=comm.to_dict(),
                analyzer_type='regex',
                headers=self.headers,
            )
            p.get_details()
            # Building info
            build_url = comm_url.replace('Info', 'Building')
            self.get_build_info(build_url)
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
def get_comm_info(self, all_url_list):
    """Scrape one community page and bump the module-level ``count``.

    Args:
        all_url_list: despite the name, this value is passed unchanged as
            ``page_url`` to ``ProducerListUrl``.

    On success the global ``count`` is incremented and printed. Any
    exception is printed together with ``co_index`` and the URL.
    """
    global count
    try:
        c = Comm(co_index)
        c.co_name = "class='newtopleft font-k'>(.*?)</li>"
        # Raw string: "\." and "\?" are invalid escapes in a normal literal.
        c.co_id = r'form1" method="post" action="house_base\.aspx\?id=(.*?)"'
        c.co_address = "项目位置:</li><li class='DetaimidR font-f'>(.*?)</li></ul>"
        c.area = "地区/商圈:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_develops = "开发商:</li><li class='DetaimidR font-f'>(.*?)</li>"
        c.co_volumetric = "容积率:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_green = "绿化率:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_all_house = "总户数:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_open_time = "开盘时间:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_land_use = "国土使用证:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_plan_pro = "规划许可证:</li><li class='DetaimidR font-f'>(.*?)<"
        c.co_build_size = "建筑面积:</li><li class='DetaimidR font-f'>(.*?)<"
        data_list = c.to_dict()
        p = ProducerListUrl(
            page_url=all_url_list,
            request_type='get',
            encode='utf-8',
            analyzer_rules_dict=data_list,
            analyzer_type='regex',
            headers=self.headers,
        )
        p.get_details()
        count += 1
        print(count)
    except Exception as e:
        print('小区错误,co_index={},url={}'.format(co_index, all_url_list), e)
def get_house_info(self, house_url_list):
    """Scrape the room page behind every entry of ``house_url_list``.

    Args:
        house_url_list: iterable of URL strings expected to contain
            ``dongid=<id>&`` and ``roomid=<id>&``.

    The two ids are extracted, the ``roominfo.aspx`` URL is rebuilt from
    them and requested through ``ProducerListUrl`` with the ``House`` regex
    rules below. A failure on one entry is printed and the loop continues.
    """
    for i in house_url_list:
        # Pre-bind the name used by the error message. If either id is
        # missing, re.search() returns None and the failure happens before
        # the real URL exists; without this the handler would raise
        # UnboundLocalError or report the previous iteration's URL.
        house_url = i
        try:
            dongid = re.search('dongid=(.*?)&', i).group(1)
            roomid = re.search('roomid=(.*?)&', i).group(1)
            house_url = ('http://zjjg.0557fdc.com:9555/xiaoqu/roominfo.aspx?dongid='
                         + dongid + '&roomid=' + roomid)
            house = House(co_index)
            house.co_name = 'Labelxqmc">(.*?)<'
            house.area = 'Labelxzq">(.*?)<'
            house.bu_num = 'Labeldongmc">(.*?)<'
            house.ho_type = 'Labelyxyongtu">(.*?)<'
            house.ho_name = '<span id="Labelroommc".*?>(.*?)</span>'
            house.ho_build_size = 'Labeljzmianji">(.*?)<'
            house.ho_true_size = 'Labeltaonei">(.*?)<'
            house.ho_share_size = 'Labelgongtan">(.*?)<'
            house.ho_room_type = 'Labelhuxing">(.*?)<'
            house.bu_id = 'dongid=(.*?)&'
            p = ProducerListUrl(
                page_url=house_url,
                request_type='get',
                encode='utf-8',
                analyzer_rules_dict=house.to_dict(),
                analyzer_type='regex',
                headers=self.headers,
            )
            p.get_details()
        except Exception as e:
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_comm_info(self, comm_url_list):
    """Scrape each tmsf.com community page and hand its links onward.

    Args:
        comm_url_list: iterable of strings of the form ``"<a>,<b>"``; the
            two comma-separated parts are inserted into the
            ``property_<a>_<b>_info.htm`` URL.

    For every entry the ``Comm`` regex rules below are passed to
    ``ProducerListUrl``; the module-level ``count`` is incremented and
    printed, and the value returned by ``get_details()`` is forwarded to
    ``self.get_build_info``. A failure on one entry is printed and the loop
    continues.
    """
    global count
    for i in comm_url_list:
        # Pre-bind the name used by the error message. An entry without a
        # comma makes code[1] raise IndexError before the URL exists; without
        # this the handler would raise UnboundLocalError or report the
        # previous iteration's URL.
        comm_url = i
        try:
            code = i.split(',')
            comm_url = ('http://www.tmsf.com/newhouse/property_'
                        + code[0] + '_' + code[1] + '_info.htm')
            comm = Comm(co_index)
            comm.co_name = 'buidname.*?>(.*?)<'
            comm.co_address = '--位置行--.*?<span.*?title="(.*?)"'
            comm.co_build_type = '建筑形式:<.*?>(.*?)<'
            comm.co_develops = '项目公司:<.*?>(.*?)<'
            comm.co_volumetric = '容 积 率:</span>(.*?)<'
            comm.co_green = '绿 化 率:</span>(.*?)<'
            comm.co_size = '占地面积:</span>(.*?)<'
            comm.co_build_size = '总建筑面积:</span>(.*?)<'
            comm.co_all_house = '总户数:</span>(.*?)<'
            comm.co_id = 'info" href="/newhouse/property_(.*?)_info'
            p = ProducerListUrl(
                page_url=comm_url,
                request_type='get',
                encode='utf-8',
                analyzer_rules_dict=comm.to_dict(),
                current_url_rule='一房一价<.*?href="(.*?)"',
                analyzer_type='regex',
                headers=self.headers,
            )
            build_all_url = p.get_details()
            count += 1
            print('comm:', count)
            self.get_build_info(build_all_url)
        except Exception as e:
            print('小区页面,co_index={},url={}'.format(co_index, comm_url), e)
def get_comm_url(self, all_url_list):
    """Return what ``ProducerListUrl.get_current_page_url()`` yields for a page.

    Args:
        all_url_list: passed unchanged as ``page_url``.

    The page is requested with utf-8 encoding and the regex rule
    ``<a class='anone' href='(.*?)'``; no analyzer rules are supplied.
    """
    options = dict(
        page_url=all_url_list,
        request_type='get',
        encode='utf-8',
        current_url_rule="<a class='anone' href='(.*?)'",
        analyzer_rules_dict=None,
        analyzer_type='regex',
        headers=self.headers,
    )
    return ProducerListUrl(**options).get_current_page_url()
def start_crawler(self):
    """Walk listing pages 1..``self.page`` and process the links on each.

    Every page URL is ``self.url + "More_xm.aspx?page=" + <n>``. The links
    selected by the XPath rule are obtained from
    ``ProducerListUrl.get_current_page_url()`` and passed to
    ``self.get_comm_info``.
    """
    for page_no in range(1, self.page + 1):
        listing_url = self.url + "More_xm.aspx?page=" + str(page_no)
        options = dict(
            page_url=listing_url,
            request_type='get',
            encode='utf-8',
            analyzer_rules_dict=None,
            current_url_rule="//td[@align='left']/a/@href",
            analyzer_type='xpath',
            headers=self.headers,
        )
        links = ProducerListUrl(**options).get_current_page_url()
        self.get_comm_info(links)
def get_build_info(self, all_build_url_list):
    """Scrape building data from one page and return the extracted links.

    Args:
        all_build_url_list: passed unchanged as ``page_url``.

    Returns:
        Whatever ``ProducerListUrl.get_details()`` returns for the
        ``GetData('<a>','<b>')`` rule (named ``house_url_list`` here).
    """
    b = Building(co_index)
    # Raw strings: "\(" and "\)" are invalid escapes in normal literals.
    b.co_id = r"onclick=GetData\('(.*?)',"
    b.bu_id = r"onclick=GetData\('.*?','(.*?)'"
    b.bu_num = "font12yellow-leftA'>.*?</span>套</td><td>.*?</td><td>(.*?)<"
    b.bu_all_house = "font12yellow-leftA'>(.*?)<"
    data_list = b.to_dict()
    p = ProducerListUrl(
        page_url=all_build_url_list,
        request_type='get',
        encode='utf-8',
        analyzer_rules_dict=data_list,
        current_url_rule=r"onclick=GetData\('(.*?)','(.*?)'\)",
        analyzer_type='regex',
        headers=self.headers,
    )
    house_url_list = p.get_details()
    return house_url_list
def get_house_info(self, build_num, sid):
    """Scrape the house data for one building from tmsf.com.

    Args:
        build_num: building id placed in the ``buildingid`` query parameter.
        sid: value placed in the ``sid`` query parameter.

    The ``House`` regex rules below are passed to ``ProducerListUrl`` and
    ``get_details()`` is called. Any exception is printed with ``co_index``
    and the URL.
    """
    # Built outside the try and with format() so the URL always exists for
    # the error message; the previous "+" concatenation inside the try left
    # the name unbound when an id was not a str. For str ids the resulting
    # URL is identical.
    house_url = ('http://www.tmsf.com/newhouse/NewPropertyHz_showbox.jspx'
                 '?buildingid={}&sid={}').format(build_num, sid)
    try:
        house = House(co_index)
        house.bu_id = 'buildingid":(.*?),'
        # NOTE(review): other House blocks in this file use ho_build_size;
        # confirm co_build_size is the intended attribute here.
        house.co_build_size = 'builtuparea":(.*?),'
        house.ho_price = 'declarationofroughprice":(.*?),'
        house.ho_name = 'houseno":(.*?),'
        house.ho_true_size = 'setinsidefloorarea":(.*?),'
        house.ho_share_size = 'poolconstructionarea":(.*?),'
        house.ho_type = 'houseusage":(.*?),'
        p_2 = ProducerListUrl(
            page_url=house_url,
            request_type='get',
            encode='utf-8',
            analyzer_rules_dict=house.to_dict(),
            analyzer_type='regex',
            headers=self.headers,
        )
        p_2.get_details()
    except Exception as e:
        print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def start_crawler(self):
    """Iterate over all listing pages and process the links on each one.

    The page count comes from ``AllListUrl.get_page_count()`` using the
    module-level ``url`` as first page. Each page is ``url + '&Page=<n>'``;
    the values captured by the ``openBldg(...)`` rule are passed to
    ``self.get_build_info``.
    """
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        page_count_rule='共(.*?)页',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        all_page_url = url + '&Page=' + str(i)
        p = ProducerListUrl(
            page_url=all_page_url,
            request_type='get',
            encode='gbk',
            analyzer_rules_dict=None,
            # Raw string: "\(" and "\)" are invalid escapes otherwise.
            current_url_rule=r"eval\('openBldg\((.*?)\)",
            analyzer_type='regex',
            headers=self.headers,
        )
        comm_url_list = p.get_current_page_url()
        self.get_build_info(comm_url_list)
def get_house_info(self, house_url_list):
    """Scrape every house page listed in ``house_url_list``.

    Args:
        house_url_list: iterable of paths appended to
            ``http://www.ndjsj.gov.cn/House/``.

    Each page is requested through ``ProducerListUrl`` with the ``House``
    regex rules below. A failure on one entry is printed and the loop
    continues.
    """
    for i in house_url_list:
        # Pre-bind the name the error message uses so a failure before the
        # URL is built cannot raise UnboundLocalError in the handler or
        # report the previous iteration's URL.
        house_url = i
        try:
            house = House(co_index)
            house_url = 'http://www.ndjsj.gov.cn/House/' + i
            house.bu_num = '幢 号:.*?<td.*?>(.*?)<'
            house.ho_name = '房 号:.*?<td.*?>(.*?)<'
            house.co_name = '项目名称:.*?<td.*?>(.*?)<'
            house.ho_build_size = '建筑面积:.*?<td.*?>(.*?)<'
            house.ho_true_size = '套内面积:.*?<td.*?>(.*?)<'
            house.ho_share_size = '分摊面积:.*?<td.*?>(.*?)<'
            house.ho_type = '房屋用途:.*?<td.*?>(.*?)<'
            house.ho_floor = '所 在 层:.*?<td.*?>(.*?)<'
            house.ho_room_type = '房屋户型:.*?<td.*?>(.*?)<'
            p = ProducerListUrl(
                page_url=house_url,
                request_type='get',
                encode='utf-8',
                analyzer_rules_dict=house.to_dict(),
                analyzer_type='regex',
                headers=self.headers,
            )
            p.get_details()
        except Exception as e:
            print('宁德房号错误,url={}'.format(house_url), e)
def get_comm_info(self, comm_detail_url_list):
    """Scrape each community page on www.ndjsj.gov.cn and follow its buildings.

    Args:
        comm_detail_url_list: iterable of paths appended to
            ``http://www.ndjsj.gov.cn``.

    For every entry the ``Comm`` regex rules below are passed to
    ``ProducerListUrl``; the value returned by ``get_details()`` is
    forwarded to ``self.get_build_info``. A failure on one entry is printed
    and the loop continues.
    """
    for i in comm_detail_url_list:
        # Pre-bind the name the error message uses so a failure before the
        # URL is built cannot raise UnboundLocalError in the handler or
        # report the previous iteration's URL.
        comm_url = i
        try:
            comm = Comm(co_index)
            comm_url = 'http://www.ndjsj.gov.cn' + i
            comm.co_develops = '公司名称:.*?<td.*?>(.*?)<'
            comm.co_name = '项目名称:.*?<td.*?>(.*?)<'
            comm.co_pre_sale = '预售许可证:.*?<td.*?>(.*?)<'
            comm.co_address = '项目坐落:.*?<td.*?>(.*?)<'
            comm.co_use = '规划用途:.*?<td.*?>(.*?)<'
            comm.co_size = '占地面积:.*?<td.*?>(.*?)<'
            comm.co_build_size = '建筑面积:.*?<td.*?>(.*?)<'
            p = ProducerListUrl(
                page_url=comm_url,
                request_type='get',
                encode='utf-8',
                analyzer_rules_dict=comm.to_dict(),
                # Raw string: "\?" is an invalid escape in a normal literal.
                current_url_rule=r"(BuildingInfo\?BuildingId=.*?)'",
                analyzer_type='regex',
                headers=self.headers,
            )
            build_url_list = p.get_details()
            self.get_build_info(build_url_list)
        except Exception as e:
            print('宁德小区错误,url={}'.format(comm_url), e)
def start_crawler(self):
    """Iterate over all jmfc.com.cn listing pages and process their links.

    The page count comes from ``AllListUrl.get_page_count()`` using the
    module-level ``url`` as first page. For each page number the listing URL
    is built, the links selected by the XPath rule are fetched with
    ``get_current_page_url()`` and passed to ``self.get_comm_info``.
    """
    b = AllListUrl(
        first_page_url=url,
        request_method='get',
        analyzer_type='regex',
        encode='gbk',
        # Raw string: "\." is an invalid escape in a normal literal.
        page_count_rule=r'>>></a>.*?href=".*?page-(.*?)\.html',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        all_url = 'http://www.jmfc.com.cn/index/caid-2/addno-1/page-' + str(i) + '.html'
        p = ProducerListUrl(
            page_url=all_url,
            request_type='get',
            encode='gbk',
            analyzer_rules_dict=None,
            current_url_rule="/html/body/div[5]/div[6]/div/div[2]/h3/a/@href",
            analyzer_type='xpath',
            headers=self.headers,
        )
        comm_url_list = p.get_current_page_url()
        self.get_comm_info(comm_url_list)
def start_crawler(self):
    """Iterate over all f0795.cn listing pages and process their links.

    The page count comes from ``AllListUrl.get_page_count()`` using
    ``self.start_url`` as first page. For each page number the listing URL
    is built, the links selected by the XPath rule are fetched with
    ``get_current_page_url()`` and passed to ``self.get_comm_info``.
    """
    b = AllListUrl(
        first_page_url=self.start_url,
        request_method='get',
        analyzer_type='regex',
        encode='utf-8',
        # Raw string: "\d" is an invalid escape in a normal literal.
        page_count_rule=r'<cite>共.*?/(\d+)页',
    )
    page = b.get_page_count()
    for i in range(1, int(page) + 1):
        url = "http://www.f0795.cn/house/index-htm-page-" + str(i) + ".html"
        p = ProducerListUrl(
            page_url=url,
            request_type='get',
            encode='utf-8',
            analyzer_rules_dict=None,
            current_url_rule="//ul[@class='list']//div[@class='text']/h3/a/@href",
            analyzer_type='xpath',
            headers=self.headers,
        )
        comm_url_list = p.get_current_page_url()
        self.get_comm_info(comm_url_list)
def get_build_info(self, build_url_list):
    """Scrape each building page on www.ndjsj.gov.cn and follow its houses.

    Args:
        build_url_list: iterable of paths appended to
            ``http://www.ndjsj.gov.cn/House/``.

    For every entry the ``Building`` regex rules below are passed to
    ``ProducerListUrl``; the value returned by ``get_details()`` is
    forwarded to ``self.get_house_info``. A failure on one entry is printed
    and the loop continues.
    """
    for i in build_url_list:
        # Pre-bind the name the error message uses so a failure before the
        # URL is built cannot raise UnboundLocalError in the handler or
        # report the previous iteration's URL.
        build_url = i
        try:
            build = Building(co_index)
            build_url = 'http://www.ndjsj.gov.cn/House/' + i
            build.co_name = '项目名称:.*?<td.*?>(.*?)<'
            build.bu_num = '幢 号:.*?<td.*?>(.*?)<'
            build.bu_address = '坐落位置:.*?<td.*?>(.*?)<'
            build.co_build_structural = '建筑结构:.*?<td.*?>(.*?)<'
            build.bu_floor = '总 层 数:.*?<td.*?>(.*?)<'
            build.bu_build_size = '总 面 积:.*?<td.*?>(.*?)<'
            # A bu_type rule for the "设计用途" row was commented out in the
            # original and is still not set here.
            build.bu_all_house = '批准销售:.*?<td.*?>(.*?)<'
            p = ProducerListUrl(
                page_url=build_url,
                request_type='get',
                encode='utf-8',
                analyzer_rules_dict=build.to_dict(),
                current_url_rule='javascript:ShowTitle.*?href="(.*?)"',
                analyzer_type='regex',
                headers=self.headers,
            )
            house_url_list = p.get_details()
            self.get_house_info(house_url_list)
        except Exception as e:
            print('宁德楼栋错误,url={}'.format(build_url), e)
def get_comm_info(self, comm_url_list):
    """Scrape each community page on www.fjnpfdc.com and follow its buildings.

    Every entry is appended to ``http://www.fjnpfdc.com/House/`` and
    requested (gbk) through ``ProducerListUrl`` with the ``Comm`` regex rules
    below; the value returned by ``get_details()`` goes to
    ``self.get_build_info``. A failure is printed with ``co_index`` and the
    raw entry, and the loop continues.
    """
    field_patterns = (
        ('co_develops', '公司名称:.*?<td.*?>(.*?)<'),
        ('co_pre_sale', '预售许可证:.*?<td.*?>(.*?)<'),
        ('co_name', '项目名称:.*?<td.*?>(.*?)<'),
        ('co_address', '项目坐落:.*?<td.*?>(.*?)<'),
        ('co_use', '规划用途:.*?<td.*?>(.*?)<'),
        ('co_build_size', '建筑面积:.*?<td.*?>(.*?)<'),
        ('co_id', 'ProjectId=(.*?)&'),
    )
    for entry in comm_url_list:
        try:
            community = Comm(co_index)
            page = 'http://www.fjnpfdc.com/House/' + entry
            for attr, pattern in field_patterns:
                setattr(community, attr, pattern)
            producer = ProducerListUrl(
                page_url=page,
                request_type='get',
                encode='gbk',
                analyzer_rules_dict=community.to_dict(),
                current_url_rule="<a href='(BuildingInfo.*?)'",
                analyzer_type='regex',
                headers=self.headers,
            )
            self.get_build_info(producer.get_details())
        except Exception as err:
            print("co_index={},小区{}错误".format(co_index, entry), err)
def get_build_info(self, more_build_url):
    """Scrape each building page on www.jmfc.com.cn and follow its houses.

    Every entry is appended to ``http://www.jmfc.com.cn/`` and requested
    (gbk) through ``ProducerListUrl`` with the ``Building`` regex rules
    below; the value returned by ``get_details()`` goes to
    ``self.get_house_info``. Exceptions are printed per entry and do not
    stop the loop.
    """
    field_patterns = (
        ('bu_num', '<tr bgcolor="#FFFFFF">.*?<td.*?>(.*?)<'),
        ('co_id', '楼盘首页.*?aid-(.*?)/'),
        ('bu_id', '&addno=12&action=loupantable&lzbm=(.*?)&ql_xh='),
        ('bu_pre_sale',
         '<tr bgcolor="#FFFFFF">.*?<td.*?>.*?<.*?<td.*?>(.*?)<'),
        ('bu_floor',
         '<tr bgcolor="#FFFFFF">.*?<td.*?>.*?<.*?<td.*?>.*?<.*?<td.*?>(.*?)<'),
        ('bu_all_house',
         '<tr bgcolor="#FFFFFF">.*?<td.*?>.*?<.*?<td.*?>.*?<.*?<td.*?>.*?<.*?<td.*?>(.*?)<'),
    )
    for entry in more_build_url:
        try:
            building = Building(co_index)
            page = 'http://www.jmfc.com.cn/' + entry
            for attr, pattern in field_patterns:
                setattr(building, attr, pattern)
            producer = ProducerListUrl(
                page_url=page,
                request_type='get',
                encode='gbk',
                analyzer_rules_dict=building.to_dict(),
                current_url_rule='<tr bgcolor="#FFFFFF">.*?align="left".*?href="(.*?)"',
                analyzer_type='regex',
                headers=self.headers,
            )
            self.get_house_info(producer.get_details())
        except Exception as err:
            print(err)
def get_comm_info(self, comm_url_list):
    """Scrape each community page on zjjg.0557fdc.com and follow its buildings.

    Args:
        comm_url_list: iterable of paths appended to
            ``http://zjjg.0557fdc.com:9555/``.

    For every entry the ``Comm`` regex rules below are passed to
    ``ProducerListUrl``; the value returned by ``get_details()`` is
    forwarded to ``self.get_build_info``. A failure on one entry is printed
    and the loop continues.
    """
    for i in comm_url_list:
        # Pre-bind the name the error message uses so a failure before the
        # URL is built cannot raise UnboundLocalError in the handler or
        # report the previous iteration's URL.
        comm_url = i
        try:
            comm = Comm(co_index)
            comm_url = 'http://zjjg.0557fdc.com:9555/' + i
            comm.co_name = '小区名称:.*?<td.*?>(.*?)<'
            comm.area = '所属区域:.*?<td.*?>(.*?)<'
            comm.co_address = '座落:.*?<td.*?>(.*?)<'
            comm.co_develops = '开发商名称:.*?<td.*?>(.*?)<'
            comm.co_pre_sale = '开发企业营业执照号.*?<td.*?>(.*?)<'
            comm.co_all_house = 'Label1">(.*?)<'
            comm.co_build_size = 'Label2">(.*?)<'
            comm.co_id = 'action=.*?xqbm=(.*?)"'
            p = ProducerListUrl(
                page_url=comm_url,
                request_type='get',
                encode='utf-8',
                analyzer_rules_dict=comm.to_dict(),
                current_url_rule='action="(.*?)"',
                analyzer_type='regex',
                headers=self.headers,
            )
            build_url_list = p.get_details()
            self.get_build_info(build_url_list)
        except Exception as e:
            print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
def start_crawler(self):
    """Crawl every community on www.lzfc.com.cn and then its building pages.

    The start page (module-level ``url``) is fetched with ``requests`` and
    the ``href`` of every ``<a class="a_name">`` is collected; ``'#'`` links
    are skipped. Each community page is scraped with the ``Comm`` rules, and
    every link returned by ``get_details()`` is then scraped with the
    ``Building`` rules.
    """
    response = requests.get(url)
    html = response.text
    tree = etree.HTML(html)
    all_url = tree.xpath('//a[@class="a_name"]/@href')
    for i in all_url:
        comm = Comm(co_index)
        if i == '#':
            continue
        comm_url = 'http://www.lzfc.com.cn:8080' + i
        comm.co_name = "cc0.innerHTML='(.*?)'"
        comm.co_address = "cc1.innerHTML='(.*?)'"
        comm.area = "cc2.innerHTML='(.*?)'"
        comm.co_use = "cc4.innerHTML='(.*?)'"
        comm.co_develops = "cc5.innerHTML='(.*?)'"
        comm.co_open_time = "cc6.innerHTML='(.*?)'"
        comm.co_all_house = "cc9.innerHTML='(.*?)'"
        comm.co_build_size = "cc11.innerHTML='(.*?)'"
        comm.co_id = "BaseCode=(.*?)'"
        p = ProducerListUrl(
            page_url=comm_url,
            request_type='get',
            encode='gbk',
            analyzer_rules_dict=comm.to_dict(),
            current_url_rule="queryBuildHerf1.href='(.*?)'",
            analyzer_type='regex',
        )
        build_url = p.get_details()
        # Separate loop variable: the original reused "i" and rebound
        # "build_url" while iterating over it.
        for build_path in build_url:
            build = Building(co_index)
            build_detail_url = 'http://www.lzfc.com.cn:8080' + build_path
            build.bu_num = 'onclick=comInfoView.*?center">(.*?)<'
            build.co_use = 'onclick=comInfoView.*?center.*?center">(.*?)<'
            build.bu_pre_sale = 'onclick=comInfoView.*?center.*?center.*?center"><.*?>(.*?)<'
            build.bu_all_house = 'onclick=comInfoView.*?center.*?center.*?center.*?center">(.*?)<'
            build.co_name = 'fontbg_red">(.*?)<'
            # Raw string: "\(" and "\)" are invalid escapes otherwise.
            build.bu_id = r"onclick=comInfoView\('(.*?)'\)"
            # Fix: request the building page with the building rules. The
            # original passed comm_url and comm.to_dict() again, so the
            # Building rules and build_detail_url were never used.
            # NOTE(review): current_url_rule is kept from the original and
            # looks copied from the community request - confirm the rule
            # wanted for links on the building page.
            build_producer = ProducerListUrl(
                page_url=build_detail_url,
                request_type='get',
                encode='gbk',
                analyzer_rules_dict=build.to_dict(),
                current_url_rule="queryBuildHerf1.href='(.*?)'",
                analyzer_type='regex',
            )
            build_producer.get_details()