def get_house_info(self, house_url_list):
    """Build a room-detail URL for every entry and hand it to ProducerListUrl.

    For each entry the ``dongid`` and ``roomid`` query values are extracted
    and used to build the ``roominfo.aspx`` detail URL. A ``House`` object is
    filled with regex patterns (one per field) and its ``to_dict()`` result
    is passed to ``ProducerListUrl`` as ``analyzer_rules_dict`` with
    ``analyzer_type='regex'``.
    NOTE(review): presumably ``get_details()`` fetches the page and applies
    those patterns -- confirm against ProducerListUrl.

    Args:
        house_url_list: iterable of URL strings, each expected to contain
            ``dongid=...&`` and ``roomid=...&``.

    Returns:
        None. A failure on one entry is printed and the loop continues.
    """
    for raw_url in house_url_list:
        # Seed the name used by the error message with the raw entry. If the
        # regex extraction below fails, the handler still has a URL to
        # report, instead of raising UnboundLocalError (first iteration) or
        # printing the previous iteration's URL (later iterations).
        house_url = raw_url
        try:
            dongid = re.search('dongid=(.*?)&', raw_url).group(1)
            roomid = re.search('roomid=(.*?)&', raw_url).group(1)
            house_url = 'http://zjjg.0557fdc.com:9555/xiaoqu/roominfo.aspx?dongid=' + dongid + '&roomid=' + roomid

            # Each attribute holds a regex pattern, not a scraped value.
            house = House(co_index)
            house.co_name = 'Labelxqmc">(.*?)<'
            house.area = 'Labelxzq">(.*?)<'
            house.bu_num = 'Labeldongmc">(.*?)<'
            house.ho_type = 'Labelyxyongtu">(.*?)<'
            house.ho_name = '<span id="Labelroommc".*?>(.*?)</span>'
            house.ho_build_size = 'Labeljzmianji">(.*?)<'
            house.ho_true_size = 'Labeltaonei">(.*?)<'
            house.ho_share_size = 'Labelgongtan">(.*?)<'
            house.ho_room_type = 'Labelhuxing">(.*?)<'
            house.bu_id = 'dongid=(.*?)&'

            p = ProducerListUrl(page_url=house_url,
                                request_type='get',
                                encode='utf-8',
                                analyzer_rules_dict=house.to_dict(),
                                analyzer_type='regex',
                                headers=self.headers)
            p.get_details()
        except Exception as e:
            # Best-effort per room: report the failure and move on.
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
def get_house_info(self, house_url, bu_id):
    """Fetch a building page from thfdc.net and store one House per table row.

    The page at ``'http://thfdc.net/' + house_url`` is downloaded and every
    ``<tr onClick=...>...</tr>`` fragment is treated as one room. Fields are
    read positionally from the row's ``<td>`` cells: 2nd cell -> ``ho_name``,
    3rd -> ``area``, 5th -> ``ho_build_size``, 7th -> ``ho_type``. Each
    populated ``House`` is then passed to ``insert_db()``.
    NOTE(review): presumably ``insert_db()`` persists the record -- confirm
    against the House class.

    Args:
        house_url: path appended to ``http://thfdc.net/``.
        bu_id: building identifier copied onto every House created here.

    Returns:
        None. A row that fails to parse or insert is printed and skipped.
        Errors from the HTTP request itself are not caught here.
    """
    response = requests.get('http://thfdc.net/' + house_url, headers=self.headers)
    html = response.text
    flags = re.S | re.M
    house_info_list = re.findall('<tr onClick=.*?</tr>', html, flags)
    for row in house_info_list:
        try:
            house = House(co_index)
            house.ho_name = re.search('<td.*?<td.*?>(.*?)<', row, flags).group(1)
            house.area = re.search('<td.*?<td.*?<td.*?>(.*?)<', row, flags).group(1)
            house.ho_build_size = re.search(
                '<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', row, flags).group(1)
            house.ho_type = re.search(
                '<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', row, flags).group(1)
            house.bu_id = bu_id
            house.insert_db()
        except Exception as e:
            # The original called .format() with no arguments, which raises
            # IndexError inside the handler and hides the real failure.
            # Supply both placeholders and include the caught exception.
            print('房号错误,co_index={},url={}'.format(co_index, house_url), e)