def city_info(self, index_url, city):
    """Crawl one city's paginated deal listing and store every parsed record.

    Up to 100 pages are requested. Page 1 is ``index_url`` itself; page ``n``
    is ``index_url + 'i3<n>/'``. Paging stops early when the ``city = "..."``
    value embedded in a page differs from ``city`` or when the page reports a
    house count of ``'0'``.

    Args:
        index_url: URL of the first listing page; also the prefix of every
            later page URL.
        city: Expected city name; stored on each record.

    Returns:
        None. Each record is persisted through ``Comm.insert_db()``. Request
        and parse errors are logged with ``log.error`` instead of being
        raised; a parse error abandons the rest of that page only.
    """
    flags = re.S | re.M
    for page_no in range(1, 101):
        # The first page lives at the bare index URL; later pages add 'i3<n>/'.
        if page_no == 1:
            index_url_ = index_url
        else:
            index_url_ = index_url + 'i3' + str(page_no) + '/'
        try:
            response = requests.get(index_url_, headers=self.headers)
            html = response.text
            try:
                city_real = re.search(r'city = "(.*?)"', html, flags).group(1)
                if city != city_real:
                    # The page reports another city, so stop paging.
                    break
                house_num = re.search(
                    r'class="org">(.*?)</b>', html, flags).group(1)
                if house_num == '0':
                    break
                comm_info_paper_list = re.findall(
                    r'class="info rel floatr".*?</dd>', html, flags)
                for comm_info_paper in comm_info_paper_list:
                    comm = Comm('房天下')
                    comm.city = city
                    comm.district_name = re.search(
                        r'<a.*?>(.*?)<', comm_info_paper, flags
                    ).group(1).strip()
                    if '�' in comm.district_name:
                        # Title contains an undecodable character; skip it.
                        log.error('网页出现繁体字, url={}'.format(index_url_))
                        continue
                    comm.direction = re.search(
                        r'class="mt18">(.*?)<', comm_info_paper, flags
                    ).group(1)
                    try:
                        comm.height = int(re.search(
                            r'共(.*?)层', comm_info_paper, flags).group(1))
                    except (AttributeError, ValueError):
                        comm.height = None
                    comm.region = re.search(
                        r'class="mt15">.*?<a.*?chengjiao.*?>(.*?)<',
                        comm_info_paper, flags).group(1)
                    total_price = re.search(
                        r'class="price">(.*?)<', comm_info_paper, flags
                    ).group(1)
                    if '*' in total_price:
                        # Masked price: nothing usable to store.
                        continue
                    # Scraped figure is scaled by 10000 (presumably quoted in
                    # units of 10,000 yuan; confirm against the site).
                    comm.total_price = int(total_price) * 10000
                    # Room/hall counts and area are all read from the title.
                    comm.room = int(re.search(
                        r'(\d+)室', comm.district_name, flags).group(1))
                    comm.hall = int(re.search(
                        r'(\d+)厅', comm.district_name, flags).group(1))
                    # Accept whole-number areas ("89平米") as well as
                    # fractional ones ("89.5平米").
                    area_match = re.search(
                        r'(\d+(?:\.\d+)?)平米', comm.district_name, flags)
                    comm.area = (
                        float(area_match.group(1)) if area_match else None)
                    trade_date = re.search(
                        r'class="time".*?>(.*?)<', comm_info_paper, flags
                    ).group(1)
                    comm.trade_date = datetime.datetime.strptime(
                        trade_date, "%Y-%m-%d")
                    # No average price without a usable (non-zero) area.
                    if comm.area:
                        comm.avg_price = int(comm.total_price / comm.area)
                    else:
                        comm.avg_price = None
                    comm.insert_db()
            except Exception as e:
                log.error('解析错误,source="{}",html="{}",e="{}"'.format(
                    '房天下', html, e))
        except Exception as e:
            log.error('请求错误,source="{}",url="{}",e="{}"'.format(
                '房天下', index_url_, e))
def room(self, co_list, city_name):
    """Crawl the deal records of every community in ``co_list`` and store them.

    For each community node the detail page is fetched, the last link of its
    tab toolbar is followed to the deal listing, and each listing page is
    parsed into ``Comm`` records.

    Args:
        co_list: Iterable of nodes exposing ``.xpath`` (one per community).
        city_name: City name stored on every record.

    Returns:
        None. Records are persisted through ``Comm.insert_db()``. A community
        whose basic fields or pages cannot be fetched is skipped silently; a
        record that fails to parse is logged and skipped.
    """
    flags = re.S | re.M
    for co in co_list:
        try:
            co_name = co.xpath("./div[1]/a/text()")[0]
            co_url = "http:" + co.xpath("./div[1]/a/@href")[0]
            region = co.xpath("./div[3]/span[1]/a[1]/text()")[0]
            # The address is not stored, but a community without one is
            # still skipped, exactly as before.
            co.xpath("./div[3]/span[3]/@title")[0]
            detail = requests.get(co_url, headers=self.headers)
            html = etree.HTML(detail.text)
            room_url = "http:" + html.xpath(
                "//div[@class='tab-toolbar pr']//li/a/@href")[-1]
            page_index = requests.get(room_url, headers=self.headers)
        except Exception:
            continue

        page_match = re.search(r'共(\d+)页', page_index.text)
        if not page_match:
            log.info('小区无相关数据')
            continue
        page_num = int(page_match.group(1))

        # Everything from '#' onwards is replaced by 'n<page number>'.
        page_base = re.sub(r'#.*', 'n', room_url)
        for page_no in range(1, page_num + 1):
            url = page_base + str(page_no)
            # Retry until the request succeeds. Only Exception is caught so
            # that Ctrl-C / SystemExit can still stop the crawl.
            while True:
                try:
                    res = requests.get(url, headers=self.headers)
                    break
                except Exception:
                    continue
            room_html = etree.HTML(res.text)
            room_list = room_html.xpath("//div[@class='right-information']")
            for item in room_list:
                try:
                    # `source` is not defined in this method; presumably a
                    # module-level constant naming the site.
                    record = Comm(source)
                    record.district_name = co_name
                    record.city = city_name
                    record.region = region

                    room_type = item.xpath("./h3/span[2]/text()")[0]
                    # \d+ so that counts of 10 or more are not truncated.
                    room_match = re.search(r'(\d+)室', room_type, flags)
                    record.room = (
                        int(room_match.group(1)) if room_match else None)
                    hall_match = re.search(r'(\d+)厅', room_type, flags)
                    record.hall = (
                        int(hall_match.group(1)) if hall_match else None)

                    size = item.xpath("./h3/span[3]/text()")[0]
                    area = size.replace('平米', '')
                    if area:
                        record.area = round(float(area), 2)

                    total_price = item.xpath(
                        ".//div[@class='price fs14 ']/em/text()")[0]
                    record.total_price = int(re.search(
                        r'(\d+)', total_price, flags).group(1)) * 10000
                    avg_price = item.xpath(
                        ".//div[@class='size fs14']/text()")[0]
                    record.avg_price = int(re.search(
                        r'(\d+)', avg_price, flags).group(1))

                    # Fitment and direction are all-or-nothing: if either
                    # text node is missing, both are stored as None.
                    try:
                        record.fitment = item.xpath(
                            ".//div[@class='t1 fs14']/text()[3]")[0]
                        record.direction = item.xpath(
                            ".//div[@class='t1 fs14']/text()[2]")[0]
                    except Exception:
                        record.fitment = None
                        record.direction = None

                    # floor_info is split on '/': floor before, height after.
                    floor_info = item.xpath(
                        ".//div[@class='fs14']/text()[1]")[0]
                    try:
                        floor = re.search(r'(.*?)/', floor_info).group(1)
                        record.floor = int(re.search(r'\d+', floor).group(0))
                    except Exception:
                        record.floor = None
                    try:
                        record.height = int(re.search(
                            r'.*?/(\d+)层', floor_info).group(1))
                    except Exception:
                        record.height = None

                    trade_date = item.xpath(".//div[@class='date']/text()")[0]
                    if trade_date:
                        record.trade_date = datetime.datetime.strptime(
                            trade_date, "%Y-%m-%d")
                    record.insert_db()
                except Exception as e:
                    log.error('房屋信息提取失败{}'.format(e))
def comm_info(self, comm_url_list, city_url):
    """Crawl the deal history of each community URL and store every record.

    For each entry the price page URL is derived from ``city_url`` (replacing
    ``'/esf/'`` with the community path, then ``'xq'`` with ``'fangjia'``),
    the number of pages is read from the "尾页" link (default 1), and every
    page ``<base>/?n=<i>`` is parsed into ``Comm`` records.

    Args:
        comm_url_list: Iterable of community path fragments substituted for
            ``'/esf/'`` in ``city_url``.
        city_url: City-level URL containing ``'/esf/'``.

    Returns:
        None. Records are persisted through ``Comm.insert_db()``. Failed
        requests are logged and skipped. A field that cannot be parsed is
        stored as ``None``.
    """
    flags = re.S | re.M
    html_suffix = '.html'
    for comm_url in comm_url_list:
        url = city_url.replace('/esf/', comm_url)
        re_url = url.replace('xq', 'fangjia')
        try:
            # `p` is not defined in this method; presumably a module-level
            # iterator of proxy dicts.
            res = requests.get(url=re_url, headers=self.headers,
                               proxies=next(p))
        except Exception as e:
            log.error('请求失败, source={}, url={}, e={}'.format(
                '乐有家', re_url, e))
            continue
        con = res.text
        co_name = re.search(
            r'wrap-head-name">(.*?)</div', con, flags).group(1).strip()

        last_page = re.search(r'(\d+)">尾页', con)
        page = int(last_page.group(1)) if last_page else 1

        # Remove the literal '.html' suffix. str.rstrip('.html') would strip
        # any trailing run of the characters '.', 'h', 't', 'm', 'l' and
        # could eat into the community id itself.
        if re_url.endswith(html_suffix):
            page_base = re_url[:-len(html_suffix)]
        else:
            page_base = re_url

        for i in range(1, page + 1):
            page_url = page_base + "/?n=" + str(i)
            print(page_url)
            try:
                co_res = requests.get(url=page_url, headers=self.headers,
                                      proxies=next(p))
            except Exception as e:
                log.error('请求失败, source={}, url={}, e={}'.format(
                    '乐有家', page_url, e))
                continue
            co_html = etree.HTML(co_res.text)
            city = co_html.xpath(
                "//span[@class='change-city']/text()")[0].replace(
                    '\t', '').replace('[', '')
            romm_info_list = co_html.xpath("//div[@class='list-cont']/div")
            for room_info in romm_info_list:
                # `source` is not defined in this method; presumably a
                # module-level constant naming the site.
                room = Comm(source)
                # City
                room.city = city
                # Community name
                room.district_name = co_name
                try:
                    # Floor the unit is on (text before the '/')
                    floor = room_info.xpath(
                        ".//div[@class='text']/p[2]/span[1]/text()")[0]
                    floor = re.search(r'(.*?)/', floor).group(1)
                    room.floor = int(re.search(r'\d+', floor).group(0))
                except Exception:
                    room.floor = None
                try:
                    # Total number of floors (digits after the '/')
                    height = room_info.xpath(
                        ".//div[@class='text']/p[2]/span[1]/text()")[0]
                    room.height = int(
                        re.search(r'/(\d+)层', height).group(1))
                except Exception:
                    room.height = None
                try:
                    # Trade date
                    trade_date = room_info.xpath(
                        ".//span[@class='cj-data-num']/text()")[0]
                    room.trade_date = datetime.datetime.strptime(
                        trade_date, "%Y-%m-%d")
                except Exception:
                    room.trade_date = None
                try:
                    # Total price; a masked ('*') price skips the record
                    total_price = room_info.xpath(
                        ".//span[@class='cj-data-num c4a4a4a']/em/text()"
                    )[0]
                    if '*' in total_price:
                        log.error('source={}, 总价有问题 带*号'.format('乐有家'))
                        continue
                    room.total_price = int(re.search(
                        r'(\d+)', total_price, flags).group(1)) * 10000
                except Exception:
                    room.total_price = None
                try:
                    # Average price; a masked ('*') price skips the record
                    avg_price = room_info.xpath(
                        ".//span[@class='cj-data-num']/em/text()")[0]
                    if '*' in avg_price:
                        log.error('source={}, 均价有问题 带*号'.format('乐有家'))
                        continue
                    room.avg_price = int(re.search(
                        r'(\d+)', avg_price, flags).group(1))
                except Exception:
                    room.avg_price = None
                try:
                    # Orientation
                    room.direction = room_info.xpath(
                        ".//div[@class='text']/p[2]/span[2]/text()"
                    )[0].replace('朝', '')
                except Exception:
                    room.direction = None
                try:
                    region_area_info = room_info.xpath(
                        "./div[@class='text']/p[1]/text()")[1]
                except Exception:
                    # NOTE(review): this aborts the whole method, including
                    # all remaining communities, as the original did.
                    # Confirm whether `continue` was intended.
                    return
                try:
                    # Region
                    room.region = region_area_info.split(' ')[1]
                except Exception:
                    room.region = None
                try:
                    # Floor area
                    size = re.search(
                        r'建筑面积(.*?)平', region_area_info).group(1)
                    if size:
                        room.area = round(float(size), 2)
                except Exception:
                    room.area = None
                room.insert_db()
def get_page_url(self, page_url, city, area_):
    """Parse one listing page and store every deal record found on it.

    The page is fetched with ``self.headers`` and ``self.proxy``; each
    ``<li class=" clearfix">`` fragment is parsed into one ``Comm`` record.

    Args:
        page_url: URL of the listing page to fetch.
        city: City name; stripped and stored on each record.
        area_: Region name; stripped and stored on each record.

    Returns:
        None. Records are persisted through ``Comm.insert_db()``. A fragment
        that fails to parse is logged and skipped.

    Raises:
        Whatever ``requests.get`` raises: the page request is not guarded.
    """
    flags = re.S | re.M
    response = requests.get(page_url, headers=self.headers,
                            proxies=self.proxy)
    html = response.text
    comm_html_list = re.findall(
        r'<li class=" clearfix">.*?</li>', html, flags)
    for fragment in comm_html_list:
        try:
            comm = Comm('Q房网')
            comm.city = city.strip()
            comm.region = area_.strip()
            comm.district_name = re.search(
                r'house-title">.*?<a.*?>(.*?)<', fragment, flags
            ).group(1).strip()
            comm.direction = re.search(
                r'class="house-about clearfix".*?showKeyword">(.*?)<',
                fragment, flags).group(1).strip()
            try:
                comm.height = int(re.search(
                    r'class="house-about clearfix".*?showKeyword">.*?'
                    r'<span.*?<span>.*?/(.*?)<',
                    fragment, flags).group(1).strip())
            except (AttributeError, ValueError):
                comm.height = None

            total_price = re.search(
                r'class="show-price".*?span.*?>(.*?)<', fragment, flags
            ).group(1).strip()
            comm.total_price = int(total_price) * 10000
            avg_price = re.search(
                r'class="show-price".*?<p.*?>(.*?)<', fragment, flags
            ).group(1).strip()
            comm.avg_price = int(re.search(r'(\d+)', avg_price).group(1))

            trade_date = re.search(
                r'class="show-price concluded".*?span.*?>(.*?)<',
                fragment, flags).group(1).strip()
            if trade_date:
                comm.trade_date = datetime.datetime.strptime(
                    trade_date, "%Y.%m.%d")

            # The room layout is the second space-separated token of the
            # title link text.
            room_type = re.search(
                r'house-title">.*?<a.*?>.*? (.*?) ', fragment, flags
            ).group(1).strip()
            # \d+ so that counts of 10 or more are not truncated.
            room_match = re.search(r'(\d+)室', room_type, flags)
            comm.room = int(room_match.group(1)) if room_match else None
            hall_match = re.search(r'(\d+)厅', room_type, flags)
            comm.hall = int(hall_match.group(1)) if hall_match else None

            area = re.search(
                r'house-title">.*?<a.*?>.*? .*? (.*?平米)', fragment, flags
            ).group(1).strip()
            area = area.replace('㎡', '').replace('平米', '')
            if area:
                comm.area = round(float(area), 2)
            comm.insert_db()
        except Exception as e:
            log.error('解析错误,source="{}",html="{}",e="{}"'.format(
                'Q房网', fragment, e))
def get_comm_detail(self, comm_url, region, city):
    """Fetch one community page and store each of its deal rows.

    Every ``<li>`` inside ``<ul class="lscjlist">`` is one deal. Its fields
    are read positionally from the row's consecutive ``<span>`` elements:
    trade date, room layout, area, floor/height, fitment, direction, average
    price, total price.

    Args:
        comm_url: URL of the community page; stored on each record.
        region: Region name; stripped and stored on each record.
        city: City name stored on each record.

    Returns:
        None. Each row is persisted through ``Comm.insert_db()``. A failed
        request is logged and ends the call.

    Raises:
        AttributeError: If the page lacks the expected title/list markup or a
            row lacks a mandatory ``<span>`` (parsing is not guarded).
    """
    flags = re.S | re.M
    source_name = '购房网'

    def span_text(row, skipped, opening='<span>'):
        """Return the capture after ``skipped`` leading ``<span>`` openings."""
        pattern = '<span>.*?' * skipped + opening + '(.*?)</span>'
        return re.search(pattern, row, flags).group(1)

    region_name = region.strip()
    try:
        # `p` is not defined in this method; presumably a module-level
        # iterator of proxy dicts.
        response = requests.get(url=comm_url, headers=self.headers,
                                proxies=next(p))
    except Exception as e:
        log.error('请求错误,source="{}",url="{}",e="{}"'.format(
            source_name, comm_url, e))
        return
    html = response.text
    district_name = re.search(
        r'title fl.*?<h1>(.*?)</h1>', html, flags).group(1).strip()
    comm_info_html = re.search(
        r'<ul class="lscjlist">.*?</ul>', html, flags).group()
    comm_info_list = re.findall(r'<li>(.*?)</li>', comm_info_html, flags)
    if not comm_info_list:
        log.info('source={}, 此小区没有数据,url="{}"'.format(
            source_name, comm_url))

    for row in comm_info_list:
        # A fresh record per row: reusing one object would let optional
        # fields (trade date, area) of an earlier row leak into later rows.
        comm = Comm(source_name)
        comm.url = comm_url
        comm.region = region_name
        comm.city = city
        comm.district_name = district_name

        trade_date = span_text(row, 0).strip()
        if trade_date:
            comm.trade_date = datetime.datetime.strptime(
                trade_date, "%Y-%m-%d")

        room_type = span_text(row, 1).strip()
        # Room and hall are all-or-nothing; \d+ keeps counts >= 10 intact.
        room_match = re.search(r'(\d+)室', room_type, flags)
        hall_match = re.search(r'(\d+)厅', room_type, flags)
        if room_match and hall_match:
            comm.room = int(room_match.group(1))
            comm.hall = int(hall_match.group(1))
        else:
            comm.room = None
            comm.hall = None

        area = span_text(row, 2).strip().replace('㎡', '').replace('平', '')
        if area:
            comm.area = round(float(area), 2)

        try:
            # Fourth span is split on '/'; height is the number after it.
            height = span_text(row, 3, opening='<span>.*?/').strip()
            comm.height = int(re.search(r'(\d+)', height).group(1))
        except AttributeError:
            comm.height = None

        comm.fitment = span_text(row, 4).strip()
        comm.direction = span_text(row, 5).strip()
        avg_price = span_text(row, 6)
        comm.avg_price = int(re.search(r'(\d+)', avg_price, flags).group(1))
        # The last span may carry attributes, hence the looser opening tag.
        total_price = span_text(row, 7, opening='<span.*?>')
        comm.total_price = int(
            re.search(r'(\d+)', total_price, flags).group(1)) * 10000
        comm.insert_db()
def get_city_info(self, city_dict):
    """Crawl the deal listings of every city/region and store each record.

    For each city the ``chengjiao/`` index page is fetched and the region
    links are read from the block between ``data-role="ershoufang"`` and
    ``地铁``. Pages ``pg1`` .. ``pg100`` of each region are then requested
    and parsed into ``Comm`` records.

    Args:
        city_dict: Mapping of city name to that city's base URL
            (``'chengjiao/'`` is appended to it).

    Returns:
        None. Records are persisted through ``Comm.insert_db()``. Errors
        while fetching/parsing a region page are logged and that page is
        abandoned; errors at city level are logged and the city is abandoned.
    """
    flags = re.S | re.M
    for city, base_url in city_dict.items():
        city_url = base_url + 'chengjiao/'
        try:
            response = requests.get(city_url, headers=self.headers)
            html = response.text
            area_html = re.search(
                r'data-role="ershoufang".*?地铁', html, flags).group()
            area_list_str = re.findall(r'<a.*?</a>', area_html, flags)
            region_base = city_url.replace('/chengjiao/', '')
            for area_i in area_list_str:
                if 'ershoufang' in area_i:
                    continue
                area_url = re.search(
                    r'href="(.*?)"', area_i, flags).group(1)
                area = re.search(r'<a.*?>(.*?)<', area_i, flags).group(1)
                for page_no in range(1, 101):
                    city_url_ = region_base + area_url + 'pg' + str(page_no)
                    try:
                        result = requests.get(city_url_, headers=self.headers)
                        content = result.text
                        comm_str_list = re.findall(
                            r'class="info".*?</div></div></li>',
                            content, flags)
                        for item in comm_str_list:
                            comm = Comm('链家在线')
                            comm.region = area.strip()
                            comm.city = city.strip()
                            comm.district_name = re.search(
                                r'target="_blank">(.*?)<', item, flags
                            ).group(1).strip()
                            comm.direction = re.search(
                                r'class="houseIcon"></span>(.*?) \|',
                                item, flags).group(1).strip()
                            try:
                                comm.fitment = re.search(
                                    r'class="houseIcon"></span>.*? \|(.*?)\| ',
                                    item, flags).group(1).strip()
                            except AttributeError:
                                comm.fitment = None
                            try:
                                height = re.search(
                                    r'class="positionIcon"></span>.*?\((.*?)\)',
                                    item, flags).group(1).strip()
                                comm.height = int(re.search(
                                    r'(\d+)', height, flags).group(1))
                            except AttributeError:
                                comm.height = None

                            total_price = re.search(
                                r"class='number'>(.*?)<", item, flags
                            ).group(1).strip()
                            if "*" in total_price:
                                # Masked price: nothing usable to store.
                                continue
                            comm.total_price = int(re.search(
                                r'(\d+)', total_price, flags
                            ).group(1)) * 10000

                            room_type = re.search(
                                r'arget="_blank">.*? (.*?) ', item, flags
                            ).group(1).strip()
                            # \d+ keeps counts >= 10 intact. A missing room
                            # count is stored as 0 but a missing hall count
                            # as None, exactly as before.
                            room_match = re.search(
                                r'(\d+)室', room_type, flags)
                            comm.room = (
                                int(room_match.group(1)) if room_match else 0)
                            hall_match = re.search(
                                r'(\d+)厅', room_type, flags)
                            comm.hall = (
                                int(hall_match.group(1))
                                if hall_match else None)

                            # The capture always ends with '平米', so it is
                            # never empty before the unit is removed.
                            area_text = re.search(
                                r'target="_blank">.*? .*? (.*?平米)',
                                item, flags).group(1).strip()
                            area_text = area_text.replace(
                                '㎡', '').replace('平米', '')
                            try:
                                comm.area = round(float(area_text), 2)
                            except ValueError:
                                comm.area = None

                            trade_date = re.search(
                                r'dealDate">(.*?)<', item, flags
                            ).group(1).strip()
                            if trade_date:
                                comm.trade_date = datetime.datetime.strptime(
                                    trade_date, "%Y.%m.%d")

                            # Derive the average from the parsed fields.
                            # The old code indexed the HTML string
                            # (i['total_price']), which always raised and
                            # left avg_price as None.
                            if comm.area:
                                comm.avg_price = int(
                                    comm.total_price / comm.area)
                            else:
                                comm.avg_price = None
                            comm.insert_db()
                    except Exception as e:
                        # NOTE(review): `html` is the city index page, not
                        # the region page that failed; kept as in original.
                        log.error(
                            '解析错误,source="{}",html="{}",e="{}"'.format(
                                '链家在线', html, e))
        except Exception as e:
            log.error('请求错误,source="{}",url="{}",e="{}"'.format(
                '链家在线', city_url, e))