def parse(self, response, region):
    """Store every deal record found in a JSON response.

    The response body is decoded as JSON, the list under its ``'data'``
    key is walked, and each entry is copied into a new ``Base`` record
    that is then written with ``insert_db()``.

    Args:
        response: Object exposing ``.json()``; presumably a ``requests``
            response -- confirm against the caller.
        region: Value stored unchanged as the ``region`` of every record.

    Returns:
        None. When the body cannot be decoded, the error is logged and
        the method returns without inserting anything.

    Raises:
        KeyError, ValueError, TypeError, AttributeError: A missing or
            malformed field in an entry is not handled here and
            propagates to the caller.
    """
    source = '太屋网'
    city = '上海'
    try:
        result_json = response.json()
    except Exception as e:
        log.error('无法序列化,source="{}",e="{}"'.format(source, e))
        return

    for j in result_json['data']:
        c = Base(source)
        # City
        c.city = city
        # Region
        c.region = region
        # Number of rooms
        c.room = int(j['RoomCount'])
        # Number of halls ('HollCount' is the key spelling used by the feed)
        c.hall = int(j['HollCount'])
        # Residential community name
        c.district_name = j['BuildingName']
        # Floor area
        c.area = round(float(j['BldArea']), 2)
        # Orientation
        c.direction = j['Directed']
        # Floor the unit is on
        c.floor = int(j['Floor'])
        # Total number of floors
        c.height = int(j['FloorCount'])

        # Trade date: the first run of digits in 'ExDate' is divided by
        # 1000 and handed to time.localtime, i.e. it is treated as a
        # millisecond timestamp. Only the calendar day is kept.
        millis = int(re.search(r'(\d+)', j['ExDate']).group(1))
        t = time.localtime(millis // 1000)
        c.trade_date = c.local2utc(
            datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))

        # Total price
        c.total_price = int(j['ExPrice'])
        # Average price, derived from total price and area; left as None
        # when it cannot be computed (for example a zero area).
        try:
            c.avg_price = int(round(c.total_price / c.area, 2))
        except (ZeroDivisionError, OverflowError, TypeError, ValueError):
            c.avg_price = None

        c.insert_db()
def parse(self, room_url, co_name, region, city_name):
    """Scrape every result page of one community and store each listing.

    The first page is fetched to read the page count from the text
    ``共N页``. Each page is then requested, every
    ``div[@class='right-information']`` element is turned into a ``Base``
    record, and the record is written with ``insert_db()``.

    ``source`` is not defined in this function; it is resolved from the
    enclosing scope.

    Args:
        room_url: URL of the community's listing page. Everything from
            the first ``#`` onward is replaced by ``'n'`` and the page
            number is appended to build each page URL.
        co_name: Stored as the ``district_name`` of every record.
        region: Stored as the ``region`` of every record.
        city_name: Stored as the ``city`` of every record.

    Returns:
        None. If the first request fails or no page count is found, a
        message is logged and nothing is inserted.

    Raises:
        IndexError, AttributeError, ValueError: A listing that lacks the
            layout/area/price/floor/date nodes, or whose text does not
            parse, is not handled here and propagates to the caller.
    """
    # Upper bound on attempts per page, so that a permanently failing URL
    # cannot keep this method spinning forever.
    max_attempts = 5

    try:
        page_index = requests.get(url=room_url, headers=self.headers,
                                  proxies=self.proxies)
    except Exception as e:
        log.error('请求错误, source="{}",url="{}",e="{}"'.format(
            '新浪乐居', room_url, e))
        return

    page_match = re.search(r'共(\d+)页', page_index.text)
    if not page_match:
        log.info('source={}, url={}, 小区无相关数据'.format('新浪乐居', room_url))
        return

    page_url_prefix = re.sub('#.*', 'n', room_url)
    for i in range(1, int(page_match.group(1)) + 1):
        url = page_url_prefix + str(i)

        res = None
        for _ in range(max_attempts):
            try:
                res = requests.get(url=url, headers=self.headers,
                                   proxies=self.proxies)
                break
            except Exception as e:
                log.error('请求错误, source="{}",url="{}",e="{}"'.format(
                    '新浪乐居', url, e))
        if res is None:
            # Every attempt failed (each one is logged above); skip the page.
            continue

        room_html = etree.HTML(res.text)
        for item in room_html.xpath("//div[@class='right-information']"):
            room = Base(source)
            room.url = url
            # Residential community name
            room.district_name = co_name
            # City
            room.city = city_name
            # Region
            room.region = region

            room_type = item.xpath("./h3/span[2]/text()")[0]
            # Number of rooms
            try:
                room.room = int(re.search(r'(\d)室', room_type).group(1))
            except (AttributeError, TypeError, ValueError):
                room.room = None
            # Number of halls
            try:
                room.hall = int(re.search(r'(\d)厅', room_type).group(1))
            except (AttributeError, TypeError, ValueError):
                room.hall = None

            # Floor area; an empty string leaves room.area untouched.
            area = item.xpath("./h3/span[3]/text()")[0].replace('平米', '')
            if area:
                room.area = round(float(area), 2)

            # Average price
            avg_price = item.xpath(".//div[@class='size fs14']/text()")[0]
            room.avg_price = int(re.search(r'(\d+)', avg_price).group(1))

            # Total price is derived from average price and area.
            try:
                room.total_price = int(
                    int(room.avg_price) * float(room.area))
            except (TypeError, ValueError, OverflowError):
                room.total_price = None

            # Decoration and orientation come from one '|'-separated text:
            # with two parts they are (direction, fitment); with three
            # parts the first part is skipped.
            try:
                parts = item.xpath(".//div[@class='t1 fs14']")[0]
                parts = parts.xpath('string(.)').split('|')
                if len(parts) == 2:
                    room.fitment = parts[1]
                    room.direction = parts[0]
                elif len(parts) == 3:
                    room.fitment = parts[2]
                    room.direction = parts[1]
            except (IndexError, AttributeError, TypeError):
                room.fitment = None
                room.direction = None

            floor_info = item.xpath(".//div[@class='fs14']/text()[1]")[0]
            # Floor of the unit: first number before the '/'.
            try:
                floor = re.search('(.*?)/', floor_info).group(1)
                room.floor = int(re.search(r'\d+', floor).group(0))
            except (AttributeError, TypeError, ValueError):
                room.floor = None
            # Total number of floors: the number after the '/'.
            try:
                room.height = int(
                    re.search(r'.*?/(\d+)层', floor_info).group(1))
            except (AttributeError, TypeError, ValueError):
                room.height = None

            # Trade date, kept at day resolution.
            trade_date = item.xpath(".//div[@class='date']/text()")[0]
            if trade_date:
                t = time.strptime(trade_date, "%Y-%m-%d")
                room.trade_date = room.local2utc(
                    datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))

            room.insert_db()
def crawler(self, city_url, city):
    """Crawl all listing pages of a city and store every deal row.

    The page count is read from the second ``a.down_page`` link on
    ``city_url``. Each ``city_url + "/PG<n>"`` page is fetched, every
    ``//h1/a/@href`` link on it is opened as a community page, and each
    ``//table/tbody/tr`` row on that page becomes one ``Base`` record
    written with ``insert_db()``.

    ``source`` is not defined in this function; it is resolved from the
    enclosing scope.

    Args:
        city_url: URL of the city's listing index.
        city: String prepended to each community href to build its URL.

    Returns:
        None. Request failures and missing page/community data are logged
        and the affected city, page or community is skipped.

    Raises:
        IndexError, AttributeError, ValueError: A table row that lacks
            the expected cells, or whose text does not parse, is not
            handled here and propagates to the caller.
    """
    print(city_url)
    try:
        res = requests.get(url=city_url, headers=self.headers,
                           proxies=self.proxies)
    except Exception as e:
        log.error('请求错误,source="{}",url="{}",e="{}"'.format('麦田', city_url, e))
        return

    con = etree.HTML(res.text)
    try:
        last_page = con.xpath("//a[@class='down_page']/@href")[1]
        page_num = re.search(r'\d+', last_page).group(0)
    except Exception as e:
        log.error('获取页码失败,source="{}",url="{}",e="{}"'.format('麦田', city_url, e))
        return

    for i in range(1, int(page_num) + 1):
        page_url = city_url + "/PG" + str(i)
        try:
            page_res = requests.get(url=page_url, headers=self.headers,
                                    proxies=self.proxies)
        except Exception as e:
            log.error('请求错误,source="{}",url="{}",e="{}"'.format('麦田', page_url, e))
            continue

        page_con = etree.HTML(page_res.text)
        for temp_url in page_con.xpath("//h1/a/@href"):
            comm_url = city + temp_url
            try:
                co_res = requests.get(url=comm_url, headers=self.headers,
                                      proxies=self.proxies)
            except Exception as e:
                log.error('请求错误,source="{}",url="{}",e="{}"'.format('麦田', comm_url, e))
                continue

            co_con = etree.HTML(co_res.text)
            try:
                # City
                city_name = co_con.xpath("//div/a[@class='show']/text()")[0]
                # Region: the text between square brackets
                region_text = co_con.xpath(
                    "//section[@class='fl home_main']/p[3]/a/text()")[-1]
                region = re.search(
                    r"\[(.*)\]", region_text, re.S | re.M).group(1)
                # Residential community name
                district_name = co_con.xpath("//cite/span/text()")[0]
                info = co_con.xpath("//table/tbody/tr")
            except Exception as e:
                log.error('获取城市区域小区名失败, source="{}",url="{}",e="{}"'.format('麦田', comm_url, e))
                continue

            for tag in info:
                # One fresh record per row, so that a field a row does not
                # set (e.g. an empty trade date) cannot inherit the value
                # of the previous row.
                com = Base(source)
                com.url = comm_url
                com.city = city_name
                com.region = region
                com.district_name = district_name

                # Floor area
                size = tag.xpath("./td[2]/text()")[0]
                com.area = round(float(size.replace('㎡', '')), 2)

                # Average price
                avg_price = tag.xpath("./td[3]/text()")[0]
                com.avg_price = int(re.search(r'(\d+)', avg_price).group(1))

                # Total price is derived from average price and area.
                try:
                    com.total_price = int(
                        int(com.avg_price) * float(com.area))
                except (TypeError, ValueError, OverflowError):
                    com.total_price = None

                # Trade date, kept at day resolution.
                trade_date = tag.xpath("./td/text()")[-2]
                if trade_date:
                    t = time.strptime(trade_date, "%Y-%m-%d")
                    com.trade_date = com.local2utc(
                        datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))

                room_type = tag.xpath("./td//p/a/text()")[0]
                # Number of rooms
                try:
                    com.room = int(re.search(r'(\d)室', room_type).group(1))
                except (AttributeError, TypeError, ValueError):
                    com.room = None
                # Number of halls
                try:
                    com.hall = int(re.search(r'(\d)厅', room_type).group(1))
                except (AttributeError, TypeError, ValueError):
                    com.hall = None

                # NOTE(review): the original comment calls this value the
                # total number of floors, yet it is stored in `floor`
                # rather than `height` -- confirm which field is intended.
                floor = tag.xpath("./td//p/span/text()")[0]
                com.floor = int(re.search(r'(\d+)层', floor).group(1))
                # Orientation: second space-separated token of the same text
                com.direction = floor.split(' ')[1]

                com.insert_db()