def parse(self, response, region):
    """Store every deal record contained in a JSON response.

    Each entry of the payload's ``data`` list is mapped onto a fresh
    ``Base`` record and saved with ``insert_db``. An entry with a missing
    key or an unparsable value is logged and skipped, so the remaining
    entries are still stored.

    Args:
        response: Object whose ``json()`` method returns the decoded payload.
        region: Value stored unchanged in each record's ``region`` field.

    Returns:
        None.
    """
    source = '太屋网'
    city = '上海'
    # Exception types raised by a missing key or an unparsable field value.
    bad_record = (KeyError, TypeError, ValueError, OverflowError, OSError)
    try:
        result_json = response.json()
    except Exception as e:
        log.error('无法序列化,source="{}",e="{}"'.format(source, e))
        return
    try:
        data_list = result_json['data']
    except (KeyError, TypeError) as e:
        log.error('缺少data字段,source="{}",e="{}"'.format(source, e))
        return
    # A null/empty "data" value simply means there is nothing to store.
    for j in data_list or []:
        c = Base(source)
        try:
            # City
            c.city = city
            # Region
            c.region = region
            # Number of rooms
            c.room = int(j['RoomCount'])
            # Number of halls (key spelled as it is read from the payload)
            c.hall = int(j['HollCount'])
            # Community name
            c.district_name = j['BuildingName']
            # Area, rounded to two decimals
            c.area = round(float(j['BldArea']), 2)
            # Orientation
            c.direction = j['Directed']
            # Floor the unit is on
            c.floor = int(j['Floor'])
            # Total number of floors
            c.height = int(j['FloorCount'])
            # Trade date: the first run of digits in ExDate is divided by
            # 1000 and read as a local epoch time, i.e. it is handled as
            # milliseconds; only the calendar day is kept.
            stamp = re.search(r'(\d+)', j['ExDate'])
            if stamp is None:
                raise ValueError(
                    'no digits in ExDate {!r}'.format(j['ExDate']))
            t = time.localtime(int(stamp.group(1)) // 1000)
            c.trade_date = c.local2utc(
                datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
            # Total price
            c.total_price = int(j['ExPrice'])
        except bad_record as e:
            log.error('解析记录失败,source="{}",e="{}"'.format(source, e))
            continue
        # Average price; left empty when it cannot be computed (e.g. the
        # area is zero).
        try:
            c.avg_price = int(round(c.total_price / c.area, 2))
        except (ZeroDivisionError, TypeError, ValueError, OverflowError):
            c.avg_price = None
        c.insert_db()
Example #2
0
    def parse(self, room_url, co_name, region, city_name):
        """Crawl all result pages of one community and store each deal row.

        The response for ``room_url`` is only used to read the page count
        from its ``共N页`` marker; when the marker is absent the community is
        logged as having no data. Each page URL is ``room_url`` with
        everything from ``#`` onwards (if present) replaced by ``n``,
        followed by the page number. A page request is tried a bounded
        number of times and the page is skipped if every try fails. A row
        that cannot be parsed is logged and skipped.

        Args:
            room_url: URL of the community's deal listing.
            co_name: Community name stored on each record.
            region: Region stored on each record.
            city_name: City stored on each record.

        Returns:
            None.
        """
        max_attempts = 3  # upper bound on tries per page request
        try:
            page_index = requests.get(url=room_url,
                                      headers=self.headers,
                                      proxies=self.proxies)
        except Exception as e:
            log.error('请求错误, source="{}",url="{}",e="{}"'.format(
                '新浪乐居', room_url, e))
            return
        page_match = re.search(r'共(\d+)页', page_index.text)
        if page_match is None:
            log.info('source={}, url={}, 小区无相关数据'.format('新浪乐居', room_url))
            return
        page_prefix = re.sub('#.*', 'n', room_url)
        for i in range(1, int(page_match.group(1)) + 1):
            url = page_prefix + str(i)
            res = None
            for _ in range(max_attempts):
                try:
                    res = requests.get(url=url,
                                       headers=self.headers,
                                       proxies=self.proxies)
                    break
                except Exception as e:
                    log.error('请求错误, source="{}",url="{}",e="{}"'.format(
                        '新浪乐居', url, e))
            if res is None:
                # Every attempt failed: give up on this page only.
                continue
            room_html = etree.HTML(res.text)
            room_list = room_html.xpath(
                "//div[@class='right-information']")
            for item in room_list:
                try:
                    room = self._parse_room(item, url, co_name, region,
                                            city_name)
                except (IndexError, ValueError) as e:
                    log.error('解析错误, source="{}",url="{}",e="{}"'.format(
                        '新浪乐居', url, e))
                    continue
                room.insert_db()

    def _parse_room(self, node, url, co_name, region, city_name):
        """Build one ``Base`` record from a listing node.

        Raises:
            IndexError: A mandatory element is missing from the node.
            ValueError: A mandatory value (area, average price, trade date)
                cannot be parsed.
        """
        room = Base(source)
        room.url = url
        # Community name
        room.district_name = co_name
        # City
        room.city = city_name
        # Region
        room.region = region

        # Layout text: rooms / halls are optional single digits.
        room_type = node.xpath("./h3/span[2]/text()")[0]
        room_match = re.search(r'(\d)室', room_type, re.S | re.M)
        room.room = int(room_match.group(1)) if room_match else None
        hall_match = re.search(r'(\d)厅', room_type, re.S | re.M)
        room.hall = int(hall_match.group(1)) if hall_match else None

        # Area, with the unit text stripped; left unset when the text is empty.
        area = node.xpath("./h3/span[3]/text()")[0].replace('平米', '')
        if area:
            room.area = round(float(area), 2)

        # Average price
        avg_text = node.xpath(".//div[@class='size  fs14']/text()")[0]
        price_match = re.search(r'(\d+)', avg_text, re.S | re.M)
        if price_match is None:
            raise ValueError(
                'no digits in average price {!r}'.format(avg_text))
        room.avg_price = int(price_match.group(1))

        # Total price is derived from average price and area when both exist.
        try:
            room.total_price = int(int(room.avg_price) * float(room.area))
        except (TypeError, ValueError, AttributeError, OverflowError):
            room.total_price = None

        # Fitment / orientation come from a '|'-separated text; with two
        # parts they are (direction, fitment), with three parts the last two.
        info_nodes = node.xpath(".//div[@class='t1 fs14']")
        if info_nodes:
            parts = info_nodes[0].xpath('string(.)').split('|')
            if len(parts) == 2:
                room.fitment = parts[1]
                room.direction = parts[0]
            elif len(parts) == 3:
                room.fitment = parts[2]
                room.direction = parts[1]
        else:
            room.fitment = None
            room.direction = None

        # Floor text: digits before '/' are the floor, '/N层' the height.
        floor_info = node.xpath(".//div[@class='fs14']/text()[1]")[0]
        before_slash = re.search(r'(.*?)/', floor_info)
        floor_digits = (re.search(r'\d+', before_slash.group(1))
                        if before_slash else None)
        room.floor = int(floor_digits.group(0)) if floor_digits else None
        height_match = re.search(r'.*?/(\d+)层', floor_info)
        room.height = int(height_match.group(1)) if height_match else None

        # Trade date
        trade_date = node.xpath(".//div[@class='date']/text()")[0]
        if trade_date:
            t = time.strptime(trade_date, "%Y-%m-%d")
            room.trade_date = room.local2utc(
                datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
        return room
Example #3
0
    def crawler(self, city_url, city):
        """Crawl a city's listing pages and store every deal row found.

        The page count is read from the second ``down_page`` link on
        ``city_url``. For each listing page (``city_url + "/PG" + n``) every
        community link is followed, and each row of the community's table is
        stored as its own ``Base`` record. Failed requests and unparsable
        communities or rows are logged and skipped.

        Args:
            city_url: URL of the city's listing; also the base for page URLs.
            city: Prefix prepended to each community link to form its URL.

        Returns:
            None.
        """
        print(city_url)
        try:
            res = requests.get(url=city_url, headers=self.headers, proxies=self.proxies)
        except Exception as e:
            log.error('请求错误,source="{}",url="{}",e="{}"'.format('麦田', city_url, e))
            return
        con = etree.HTML(res.text)
        try:
            last_page = con.xpath("//a[@class='down_page']/@href")[1]
            page_num = re.search(r'\d+', last_page).group(0)
        except Exception as e:
            log.error('获取页码失败,source="{}",url="{}",e="{}"'.format('麦田', city_url, e))
            return
        for i in range(1, int(page_num) + 1):
            page_url = city_url + "/PG" + str(i)
            try:
                page_res = requests.get(url=page_url, headers=self.headers, proxies=self.proxies)
            except Exception as e:
                log.error('请求错误,source="{}",url="{}",e="{}"'.format('麦田', page_url, e))
                continue
            page_con = etree.HTML(page_res.text)
            for temp_url in page_con.xpath("//h1/a/@href"):
                comm_url = city + temp_url
                try:
                    co_res = requests.get(url=comm_url, headers=self.headers, proxies=self.proxies)
                except Exception as e:
                    log.error('请求错误,source="{}",url="{}",e="{}"'.format('麦田', comm_url, e))
                    continue

                co_con = etree.HTML(co_res.text)
                try:
                    # City
                    city_name = co_con.xpath("//div/a[@class='show']/text()")[0]
                    # Region: the text inside the square brackets
                    region_text = co_con.xpath("//section[@class='fl home_main']/p[3]/a/text()")[-1]
                    region = re.search(r"\[(.*)\]", region_text, re.S | re.M).group(1)
                    # Community name
                    district_name = co_con.xpath("//cite/span/text()")[0]
                    info = co_con.xpath("//table/tbody/tr")
                except Exception as e:
                    log.error('获取城市区域小区名失败, source="{}",url="{}",e="{}"'.format('麦田', comm_url, e))
                    continue
                for tag in info:
                    # One record per row, so no field can carry over from
                    # the previous row.
                    try:
                        com = self._parse_deal(tag, comm_url, city_name, region, district_name)
                    except (IndexError, ValueError) as e:
                        log.error('解析错误,source="{}",url="{}",e="{}"'.format('麦田', comm_url, e))
                        continue
                    com.insert_db()

    def _parse_deal(self, tag, comm_url, city_name, region, district_name):
        """Build one ``Base`` record from a row of a community's deal table.

        Raises:
            IndexError: A mandatory cell or text part is missing from the row.
            ValueError: A mandatory value (area, average price, trade date,
                floor) cannot be parsed.
        """
        com = Base(source)
        com.url = comm_url
        com.city = city_name
        com.region = region
        com.district_name = district_name

        # Area, with the unit sign stripped
        area_text = tag.xpath("./td[2]/text()")[0].replace('㎡', '')
        com.area = round(float(area_text), 2)

        # Average price
        avg_text = tag.xpath("./td[3]/text()")[0]
        price_match = re.search(r'(\d+)', avg_text, re.S | re.M)
        if price_match is None:
            raise ValueError('no digits in average price {!r}'.format(avg_text))
        com.avg_price = int(price_match.group(1))

        # Total price is derived from average price and area.
        try:
            com.total_price = int(int(com.avg_price) * float(com.area))
        except (TypeError, ValueError, OverflowError):
            com.total_price = None

        # Trade date: second-to-last text node among the row's cells
        trade_date = tag.xpath("./td/text()")[-2]
        if trade_date:
            t = time.strptime(trade_date, "%Y-%m-%d")
            com.trade_date = com.local2utc(
                datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))

        # Layout text: rooms / halls are optional single digits.
        room_type = tag.xpath("./td//p/a/text()")[0]
        room_match = re.search(r'(\d)室', room_type, re.S | re.M)
        com.room = int(room_match.group(1)) if room_match else None
        hall_match = re.search(r'(\d)厅', room_type, re.S | re.M)
        com.hall = int(hall_match.group(1)) if hall_match else None

        # Floor number taken from the 'N层' text.
        # NOTE(review): the previous comment called this the total number of
        # floors although it is stored in `floor` - confirm which one the
        # site reports.
        floor_text = tag.xpath("./td//p/span/text()")[0]
        floor_match = re.search(r'(\d+)层', floor_text, re.S | re.M)
        if floor_match is None:
            raise ValueError('no floor in {!r}'.format(floor_text))
        com.floor = int(floor_match.group(1))
        # Orientation: second space-separated part of the same text
        com.direction = floor_text.split(' ')[1]
        return com