def build_info(self, co_id, temp_url_list):
    """Store every building row found on the given pages and crawl its houses.

    Each entry of ``temp_url_list`` is appended to the fixed site root,
    fetched, decoded as GBK and searched for ``<tr class='indextabletxt'>``
    rows.  Every row becomes one ``Building`` record; once the record has
    been inserted, ``self.house_info`` is called with that row's link.

    Args:
        co_id: Community identifier copied onto each building record.
        temp_url_list: Iterable of URL paths relative to the site root.
    """

    def first_or_none(node, path):
        # Optional cells: an absent cell yields an empty xpath result.
        found = node.xpath(path)
        return found[0] if found else None

    for temp_url in temp_url_list:
        try:
            build_url = "http://222.77.178.63:7002/" + temp_url
            res = requests.get(build_url, headers=self.headers)
            html = etree.HTML(res.content.decode('gbk'))
            build_info_list = html.xpath("//tr[@class='indextabletxt']")
        except Exception as e:
            # Best-effort crawl: report the failed page and move on.
            print('楼栋信息错误{}'.format(e))
            continue

        for build_info in build_info_list:
            try:
                bu = Building(co_index)
                ho_url = build_info.xpath("./td/a/@href")[0]
                bu.co_id = co_id
                bu.bu_id = re.search('Param=(.*)', ho_url).group(1)
                bu.bu_num = build_info.xpath("./td/a/text()")[0]
                bu.bu_all_house = build_info.xpath("./td[2]/text()")[0]
                bu.bu_all_size = first_or_none(build_info, "./td[3]/text()")
                bu.bu_live_size = first_or_none(build_info, "./td[5]/text()")
                bu.insert_db()
            except Exception as e:
                # A malformed row must not abort the remaining rows.
                print('楼栋信息错误{}'.format(e))
                continue
            # Crawl houses for this building, not just the last one on the
            # page; the call stays outside the try so its errors surface.
            self.house_info(ho_url, co_id, bu.bu_id)
def get_build_info(self, build_info_list, co_id):
    """Parse building rows from HTML fragments, store them and crawl houses.

    Each element of ``build_info_list`` is a string that is searched with
    regular expressions: the first three ``<td>`` cells supply the building
    number, house count and total size, the ``?id=`` query value supplies the
    building id, and the first ``href`` is passed to ``self.get_house_info``.

    Any exception raised while handling one fragment is printed and that
    fragment is skipped, so the remaining fragments are still processed.

    Args:
        build_info_list: Iterable of HTML fragment strings, one per building.
        co_id: Community identifier copied onto each building record.
    """
    flags = re.S | re.M  # loop-invariant; '.' must span newlines
    for i in build_info_list:
        try:
            build = Building(co_index)
            build.bu_num = re.search(r'<td>(.*?)</td>', i, flags).group(1)
            build.bu_all_house = re.search(r'<td>.*?<td>(.*?)</td>', i, flags).group(1)
            build.bu_all_size = re.search(r'<td>.*?<td>.*?<td>(.*?)</td>', i, flags).group(1)
            # Raw string: '\?' in a plain literal is an invalid escape sequence.
            build.bu_id = re.search(r'\?id=(.*?)"', i, flags).group(1)
            build.co_id = co_id
            build.insert_db()
            house_url = re.search(r'href="(.*?)"', i, flags).group(1)
            self.get_house_info(house_url, co_id, build.bu_id)
        except Exception as e:
            print('楼栋错误,co_index={},str={}'.format(co_index, i), e)
def bu_parse(self, detail_url, co_id):
    """Walk a community's pre-sale pages and store every building found.

    The pre-sale listing URL is derived from ``detail_url`` by replacing
    ``'lp'`` with ``'presell'``.  Each ``//dt/strong/a`` link on that page
    leads to a pre-sale page whose ``//tr//strong/a`` links are building
    pages.  For every building page a ``Building`` record is filled from the
    page text, inserted, and the response is passed to ``self.ho_parse``.

    Args:
        detail_url: URL of the community detail page.
        co_id: Community identifier copied onto each building record.

    Raises:
        IndexError: If ``self.proxies`` is empty (raised by ``random.choice``).
        AttributeError: If an expected pattern is missing from a page.
    """

    def fetch_via_proxy(url):
        # Keep retrying with a randomly chosen proxy until one answers.
        # Only request-level failures are retried, so Ctrl-C and programming
        # errors propagate instead of spinning forever.
        # NOTE(review): assumes self.proxies is a sequence of requests-style
        # proxy mappings - confirm against the class initialiser.
        while True:
            proxy = random.choice(self.proxies)
            try:
                return requests.get(url, headers=self.headers, proxies=proxy, timeout=10)
            except requests.RequestException:
                continue

    pre_url = detail_url.replace('lp', 'presell')
    pre_res = requests.get(pre_url, headers=self.headers)
    pre_html = etree.HTML(pre_res.text)
    bu_pre_list = pre_html.xpath("//dt/strong/a")
    for bu_pre in bu_pre_list:
        bu_pre_url = bu_pre.xpath("./@href")[0]
        bu_pre_sale = bu_pre.xpath("./text()")[0]
        bu_url = 'http://www.zstmsf.com' + bu_pre_url
        bu_res = fetch_via_proxy(bu_url)
        bu_html = etree.HTML(bu_res.text)
        bu_list = bu_html.xpath("//tr//strong/a/@href")
        for bo_url in bu_list:
            ho_url = "http://www.zstmsf.com" + bo_url
            ho_res = fetch_via_proxy(ho_url)
            build = Building(co_index)
            build.co_id = co_id
            # Raw string: '\d' in a plain literal is an invalid escape sequence.
            build.bu_id = re.search(r'zid=.*?(\d+)', ho_url).group(1)
            build.bu_num = re.search('幢名称:<strong>(.*?)<', ho_res.text).group(1)
            build.bu_all_house = re.search("幢总套数.*?'>(.*?)</", ho_res.text).group(1)
            build.bu_all_size = re.findall("面积.*?'>(.*?)</", ho_res.text)[0]
            build.bu_pre_sale = bu_pre_sale
            build.insert_db()
            self.ho_parse(co_id, build.bu_id, ho_res)
def get_build_info(self, co_id):
    """Fetch the building list for ``co_id`` and store each building.

    A form-encoded POST is sent to the ``/jdi`` endpoint and the JSON reply's
    ``ROWS -> ROW`` entries are iterated.  Each entry is copied onto a
    ``Building`` record through ``self.dict_get``, inserted, and then
    ``self.get_house_info`` is called for that building.

    Args:
        co_id: Community identifier; sent as ``activityId`` and copied onto
            each building record.
    """
    # (Building attribute, response key) pairs, applied in this order.
    field_map = (
        ('bu_all_size', 'BUILDING_AREA'),
        ('bu_address', 'LOCATION'),
        ('bu_num', 'LOCATION'),
        ('bu_floor', 'TOTAL_FLOORS'),
        ('bu_all_house', 'TOTAL_SET'),
        ('co_build_structural', 'STRUCTURE'),
        ('bu_id', 'RESOURCE_GUID'),
    )

    form_body = "activityId=" + str(co_id) + "&module=jtsActBuildingInfo"
    response = requests.post(
        url='http://www.yanjifc.com/jdi',
        data=form_body,
        headers=self.headers,
    )
    rows = response.json()['ROWS']['ROW']

    for row in rows:
        record = Building(co_index)
        for attr_name, row_key in field_map:
            setattr(record, attr_name, self.dict_get(row, row_key))
        record.co_id = co_id
        record.insert_db()
        self.get_house_info(co_id, record.bu_id)
def bu_parse(self, co_id, bulist):
    """Fetch each building page, store the building and parse its houses.

    Every entry of ``bulist`` is a path appended to the fixed site root.
    The page text is searched with regular expressions to fill a
    ``Building`` record, which is inserted before the page's
    ``//td[@style]/a`` elements are handed to ``self.ho_parse``.

    Args:
        co_id: Community identifier copied onto each building record.
        bulist: Iterable of building page paths relative to the site root.

    Raises:
        AttributeError: If any expected pattern is not found.
    """
    span_lines = re.S | re.M

    def capture(pattern, text):
        # First capture group of the first match, with '.' spanning newlines.
        return re.search(pattern, text, span_lines).group(1)

    for rel_path in bulist:
        response = requests.get("http://110.89.45.7:8082" + rel_path, headers=self.headers)
        page = response.text

        record = Building(co_index)
        record.co_id = co_id
        record.bu_id = re.search('buildingInfoID=(.*?)&', rel_path).group(1)
        record.bu_num = capture('幢号.*?">(.*?)</', page)
        record.bu_floor = capture('总 层 数.*?">(.*?)</', page)
        record.bu_live_size = capture('批准销售.*?">.*?</td.*?">(.*?)</td', page)
        record.bu_all_size = capture('总面积.*?">(.*?)</', page)
        record.bu_type = capture('设计用途.*?">(.*?)</', page)
        record.insert_db()

        house_links = etree.HTML(page).xpath("//td[@style]/a")
        self.ho_parse(co_id, record.bu_id, house_links)