def start_crawler(self):
    """Crawl the community list page and, for each community, its building list.

    Flow: fetch the index page (module-level ``url``), extract community
    links, register regex extraction rules on a ``Comm`` record, hand it to
    ``ProducerListUrl`` to parse/store the community page and collect the
    building-page URLs, then repeat the same pattern per building.

    NOTE(review): depends on module-level names ``url``, ``co_index``,
    ``requests``, ``etree``, ``Comm``, ``Building`` and ``ProducerListUrl`` —
    confirm they are defined/imported at file level.
    """
    response = requests.get(url)
    tree = etree.HTML(response.text)
    all_url = tree.xpath('//a[@class="a_name"]/@href')
    for comm_href in all_url:
        # Skip placeholder anchors before doing any work.
        if comm_href == '#':
            continue
        comm = Comm(co_index)
        comm_url = 'http://www.lzfc.com.cn:8080' + comm_href
        # Each attribute holds a regex rule that ProducerListUrl applies to
        # the fetched page (values come from ccN.innerHTML assignments in
        # the page's inline JavaScript).
        comm.co_name = "cc0.innerHTML='(.*?)'"
        comm.co_address = "cc1.innerHTML='(.*?)'"
        comm.area = "cc2.innerHTML='(.*?)'"
        comm.co_use = "cc4.innerHTML='(.*?)'"
        comm.co_develops = "cc5.innerHTML='(.*?)'"
        comm.co_open_time = "cc6.innerHTML='(.*?)'"
        comm.co_all_house = "cc9.innerHTML='(.*?)'"
        comm.co_build_size = "cc11.innerHTML='(.*?)'"
        comm.co_id = "BaseCode=(.*?)'"
        comm_producer = ProducerListUrl(page_url=comm_url,
                                        request_type='get', encode='gbk',
                                        analyzer_rules_dict=comm.to_dict(),
                                        current_url_rule="queryBuildHerf1.href='(.*?)'",
                                        analyzer_type='regex')
        build_url = comm_producer.get_details()
        for build_href in build_url:  # renamed: original shadowed outer loop var `i`
            build = Building(co_index)
            build_detail_url = 'http://www.lzfc.com.cn:8080' + build_href
            build.bu_num = 'onclick=comInfoView.*?center">(.*?)<'
            build.co_use = 'onclick=comInfoView.*?center.*?center">(.*?)<'
            build.bu_pre_sale = 'onclick=comInfoView.*?center.*?center.*?center"><.*?>(.*?)<'
            build.bu_all_house = 'onclick=comInfoView.*?center.*?center.*?center.*?center">(.*?)<'
            build.co_name = 'fontbg_red">(.*?)<'
            build.bu_id = "onclick=comInfoView\('(.*?)'\)"
            # BUG FIX: the original duplicated the community producer here
            # verbatim (same page_url, same comm rules), so build_detail_url
            # and the Building rules above were never used and building data
            # was never parsed. Point the producer at the building page with
            # the building rules instead.
            # TODO(review): confirm ProducerListUrl's expected arguments for
            # a leaf page (no current_url_rule) against its definition.
            build_producer = ProducerListUrl(page_url=build_detail_url,
                                             request_type='get', encode='gbk',
                                             analyzer_rules_dict=build.to_dict(),
                                             analyzer_type='regex')
            build_producer.get_details()
def get_build_info(self, build_url_list):
    """Fetch each building detail page, extract its fields, persist, and recurse into houses.

    :param build_url_list: relative URLs (e.g. ``BuildingInfo?ProjectId=..&BuildingId=..&P..``)
        appended to ``http://www.fjnpfdc.com/House/``.

    Pages are GBK-encoded; fields are scraped with regexes. Each parsed
    ``Building`` is stored via ``insert_db()`` and its house links are handed
    to ``self.get_house_info``. Failures for one building are logged and do
    not abort the remaining ones.

    NOTE(review): depends on module-level ``co_index``, ``re``, ``requests``
    and ``Building`` — confirm they are in scope at file level.
    """
    for relative_url in build_url_list:
        try:
            build = Building(co_index)
            build_url = 'http://www.fjnpfdc.com/House/' + relative_url
            res = requests.get(build_url, headers=self.headers)
            con = res.content.decode('gbk')
            # Hoisted: every field pattern spans HTML line breaks.
            flags = re.S | re.M
            build.co_name = re.search("项目名称:.*?<td.*?>(.*?)<", con, flags).group(1)
            build.bu_num = re.search("幢 号:.*?<td.*?>(.*?)<", con, flags).group(1)
            build.co_use = re.search("设计用途:.*?<td.*?>(.*?)<", con, flags).group(1)
            build.co_build_structural = re.search("建筑结构:.*?<td.*?>(.*?)<", con, flags).group(1)
            build.bu_floor = re.search("总 层 数:.*?<td.*?>(.*?)<", con, flags).group(1)
            build.bu_build_size = re.search("总 面 积:.*?<td.*?>(.*?)<", con, flags).group(1)
            build.co_build_end_time = re.search("竣工日期:.*?<td.*?>(.*?)<", con, flags).group(1)
            house_url_list = re.findall('<a href="(HouseInfo.*?)"', con)
            # IDs come from the relative URL's query string, not the page body.
            build.co_id = re.search('ProjectId=(.*?)&', relative_url).group(1)
            build.bu_id = re.search('BuildingId=(.*?)&P', relative_url).group(1)
            build.insert_db()
            self.get_house_info(house_url_list, build.bu_id, build.co_id)
        except Exception as e:
            # Best-effort per building: log and continue with the next URL.
            print("co_index={},楼栋{}错误".format(co_index, relative_url), e)