コード例 #1
0
 def parse(self, response, region):
     """Store every deal record contained in a JSON listing response.

     Each entry of the payload's ``data`` list is copied onto a new
     ``Base`` record, which is then written with ``insert_db()``.

     Args:
         response: object exposing ``json()``; the decoded payload must
             provide a ``data`` list of dicts holding the keys read below.
         region: region name copied onto every record.

     Returns:
         None. When the body cannot be decoded as JSON the error is
         logged and nothing is stored.
     """
     source = '太屋网'
     city = '上海'
     try:
         result_json = response.json()
     except Exception as e:
         log.error('无法序列化,source="{}",e="{}"'.format(source, e))
         return
     data_list = result_json['data']
     for j in data_list:
         c = Base(source)
         # City
         c.city = city
         # Region
         c.region = region
         # Number of rooms
         c.room = int(j['RoomCount'])
         # Number of halls
         c.hall = int(j['HollCount'])
         # Community name
         c.district_name = j['BuildingName']
         # Floor area
         c.area = round(float(j['BldArea']), 2)
         # Orientation
         c.direction = j['Directed']
         # Floor the unit is on
         c.floor = int(j['Floor'])
         # Total number of floors
         c.height = int(j['FloorCount'])
         # Trade date: the first run of digits in ``ExDate`` is treated as
         # a millisecond timestamp; only its local calendar day is kept.
         trade_ms = int(re.search(r'(\d+)', j['ExDate']).group(1))
         # Integer division keeps the conversion to seconds exact.
         t = time.localtime(trade_ms // 1000)
         c.trade_date = c.local2utc(
             datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
         # Total price
         c.total_price = int(j['ExPrice'])
         # Average price, derived from total price and area; left empty
         # when it cannot be computed (e.g. a zero area).
         try:
             c.avg_price = int(round(c.total_price / c.area, 2))
         except Exception:
             c.avg_price = None
         c.insert_db()
コード例 #2
0
ファイル: leju.py プロジェクト: zjx15996163721/githubproject
    def parse(self, room_url, co_name, region, city_name):
        """Crawl every result page of one community and store its deals.

        The first request to ``room_url`` is only used to read the page
        count from the ``共N页`` marker. Each page is then fetched and every
        ``div.right-information`` element becomes one ``Base`` record that
        is written with ``insert_db()``.

        Args:
            room_url: URL of the community's deal listing.
            co_name: community name copied onto every record.
            region: region name copied onto every record.
            city_name: city name copied onto every record.

        Returns:
            None. A failed first request, or a page without the page-count
            marker, is logged and ends the call without storing anything.
        """
        try:
            # A timeout keeps a stalled connection from blocking forever;
            # the resulting exception follows the existing error path.
            page_index = requests.get(url=room_url,
                                      headers=self.headers,
                                      proxies=self.proxies,
                                      timeout=60)
        except Exception as e:
            log.error('请求错误, source="{}",url="{}",e="{}"'.format(
                '新浪乐居', room_url, e))
            return
        page_match = re.search(r'共(\d+)页', page_index.text)
        if not page_match:
            log.info('source={}, url={}, 小区无相关数据'.format('新浪乐居', room_url))
            return
        page_count = int(page_match.group(1))
        # Page URLs are the listing URL with everything from '#' onwards
        # replaced by 'n', followed by the 1-based page number.
        page_url_prefix = re.sub('#.*', 'n', room_url)
        for i in range(1, page_count + 1):
            url = page_url_prefix + str(i)
            # Retried without limit until the page is fetched.
            # NOTE(review): there is no retry cap or back-off here —
            # confirm that unbounded retrying is intended.
            while True:
                try:
                    res = requests.get(url=url,
                                       headers=self.headers,
                                       proxies=self.proxies,
                                       timeout=60)
                    break
                except Exception as e:
                    log.error('请求错误, source="{}",url="{}",e="{}"'.format(
                        '新浪乐居', url, e))
            room_html = etree.HTML(res.text)
            room_list = room_html.xpath("//div[@class='right-information']")
            for item in room_list:
                # NOTE(review): `source` is not defined inside this method;
                # presumably a module-level name — confirm.
                room = Base(source)
                room.url = url
                # Community name
                room.district_name = co_name
                # City
                room.city = city_name
                # Region
                room.region = region
                room_type = item.xpath("./h3/span[2]/text()")[0]
                try:
                    # Number of rooms
                    room.room = int(
                        re.search(r'(\d)室', room_type,
                                  re.S | re.M).group(1))
                except Exception:
                    room.room = None
                try:
                    # Number of halls
                    room.hall = int(
                        re.search(r'(\d)厅', room_type,
                                  re.S | re.M).group(1))
                except Exception:
                    room.hall = None
                # Floor area
                size = item.xpath("./h3/span[3]/text()")[0]
                area = size.replace('平米', '')
                if area:
                    room.area = round(float(area), 2)
                # Average price
                avg_price = item.xpath(
                    ".//div[@class='size  fs14']/text()")[0]
                room.avg_price = int(
                    re.search(r'(\d+)', avg_price, re.S | re.M).group(1))
                # Total price is derived from average price and area.
                try:
                    room.total_price = int(
                        int(room.avg_price) * float(room.area))
                except Exception:
                    room.total_price = None
                # Orientation and fitment come from one '|'-separated
                # string; which positions hold them depends on whether it
                # has two or three parts.
                try:
                    fitment_direction_info = item.xpath(
                        ".//div[@class='t1 fs14']")[0]
                    fitment_direction_info = fitment_direction_info.xpath(
                        'string(.)')
                    fitment_direction_info = fitment_direction_info.split(
                        '|')
                    if len(fitment_direction_info) == 2:
                        room.fitment = fitment_direction_info[1]
                        room.direction = fitment_direction_info[0]
                    elif len(fitment_direction_info) == 3:
                        room.fitment = fitment_direction_info[2]
                        room.direction = fitment_direction_info[1]
                except Exception:
                    room.fitment = None
                    room.direction = None

                # Floor information: the text before '/' yields the unit's
                # floor, the digits before '层' after '/' the total floors.
                floor_info = item.xpath(".//div[@class='fs14']/text()[1]")[0]
                try:
                    floor = re.search(r'(.*?)/', floor_info).group(1)
                    room.floor = int(re.search(r'\d+', floor).group(0))
                except Exception:
                    room.floor = None
                try:
                    room.height = int(
                        re.search(r'.*?/(\d+)层', floor_info).group(1))
                except Exception:
                    room.height = None
                # Trade date
                trade_date = item.xpath(".//div[@class='date']/text()")[0]
                if trade_date:
                    t = time.strptime(trade_date, "%Y-%m-%d")
                    room.trade_date = room.local2utc(
                        datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
                room.insert_db()
コード例 #3
0
 def parse(self, url, city):
     """Fetch one listing page and store each deal found on it.

     Every ``dl`` under ``div.houseList`` becomes one ``Base`` record that
     is written with ``insert_db()``. Fields that cannot be extracted are
     stored as ``None``.

     Args:
         url: listing page URL; also stored on every record.
         city: city name stored on every record.

     Returns:
         None. A failed request is logged and ends the call. Processing
         of the page stops at the first community name containing the
         marker character checked below.
     """
     try:
         # A timeout keeps a stalled connection from blocking forever; the
         # resulting exception follows the existing error path.
         response = requests.get(url=url,
                                 headers=self.headers,
                                 proxies=self.proxies,
                                 timeout=60)
     except Exception as e:
         log.error('请求失败,source="{}", url="{}",e="{}"'.format(
             '房天下', url, e))
         return
     tree = etree.HTML(response.text)
     info_list = tree.xpath("//div[@class='houseList']/dl")
     for info in info_list:
         district_name_info = info.xpath("./dd/p/a/text()")[0]
         district_name = district_name_info.split(' ')[0]
         if '�' in district_name:
             log.error('source={}, 网页出现繁体字, url={}'.format('房天下', url))
             break
         # One record object per deal, so nothing carries over from a
         # previously inserted record.
         comm = Base('房天下')
         comm.url = url
         comm.city = city
         # Community name
         comm.district_name = district_name
         # Number of rooms
         try:
             comm.room = int(
                 re.search(r'(\d+)室', district_name_info,
                           re.S | re.M).group(1))
         except Exception:
             comm.room = None
         # Number of halls
         try:
             comm.hall = int(
                 re.search(r'(\d+)厅', district_name_info,
                           re.S | re.M).group(1))
         except Exception:
             comm.hall = None
         # Floor area: integer part with an optional decimal part, so
         # single-digit values match as well.
         try:
             comm.area = float(
                 re.search(r'(\d+(?:\.\d+)?)平米', district_name_info,
                           re.S | re.M).group(1))
         except Exception:
             comm.area = None
         # Region: the part before the first '-'
         try:
             region_info = info.xpath("./dd/p[2]/text()")[0]
             comm.region = region_info.split('-')[0]
         except Exception:
             comm.region = None
         # Orientation and total floors; both are cleared if either fails
         try:
             direction_info = info.xpath("./dd/p[3]")[0]
             direction_info = direction_info.xpath('string(.)')
             comm.direction = direction_info.split('|')[0]
             comm.height = int(
                 re.search(r'\(共(.*?)层\)', direction_info,
                           re.S | re.M).group(1))
         except Exception:
             comm.direction = None
             comm.height = None
         # Trade date
         # NOTE(review): unlike the other parsers shown here, the date is
         # stored without passing through `local2utc` — confirm intended.
         try:
             trade_date = info.xpath("./dd/div[2]/p[1]/text()")[0]
             t = time.strptime(trade_date, "%Y-%m-%d")
             comm.trade_date = datetime.datetime(
                 t.tm_year, t.tm_mon, t.tm_mday)
         except Exception:
             comm.trade_date = None
         # Total price: the page value multiplied by 10000
         try:
             total_price = info.xpath("./dd/div[3]/p[1]/span[1]/text()")[0]
             comm.total_price = int(total_price) * 10000
         except Exception:
             comm.total_price = None
         # Average price
         try:
             avg_price_info = info.xpath("./dd/div[3]/p[2]/b[1]/text()")[0]
             comm.avg_price = int(
                 re.search(r"(\d+)元", avg_price_info,
                           re.S | re.M).group(1))
         except Exception:
             comm.avg_price = None
         comm.insert_db()
コード例 #4
0
 def final_parse(self, data):
     """Fetch one deal-listing page and store each deal found on it.

     Every ``li`` under ``ul.listContent`` becomes one ``Base`` record
     that is written with ``insert_db()``. Apart from the title element,
     which must be present, fields that cannot be extracted are stored as
     ``None``.

     Args:
         data: mapping with the keys ``link`` (page URL), ``city`` and
             ``region``.

     Returns:
         None. A failed request is logged and ends the call.
     """
     final_url = data['link']
     city = data['city']
     region = data['region']
     try:
         r = requests.get(url=final_url,
                          headers=self.headers,
                          proxies=self.proxies,
                          timeout=60)
     except Exception as e:
         log.error('请求失败, source={}, 没有更多小区成交 url={}, e={}'.format(
             '链家在线', final_url, e))
         return
     tree = etree.HTML(r.text)
     for info in tree.xpath("//ul[@class='listContent']/li"):
         comm = Base('链家在线')
         comm.url = final_url
         # Region
         comm.region = region.strip()
         # City
         comm.city = city.strip()
         # The title is split on spaces: community name, room/hall layout
         # and area are read from positions 0, 1 and 2.
         title = info.xpath("./div/div[@class='title']/a/text()")[0]
         title_parts = title.split(' ')
         # Community name
         comm.district_name = title_parts[0]
         # An absent layout part becomes '' so the searches below simply
         # find nothing.
         room_hall = title_parts[1] if len(title_parts) > 1 else ''
         try:
             # Number of rooms
             comm.room = int(
                 re.search(r'(\d+)室', room_hall, re.S | re.M).group(1))
         except Exception:
             comm.room = None
         try:
             # Number of halls
             comm.hall = int(
                 re.search(r'(\d+)厅', room_hall, re.S | re.M).group(1))
         except Exception:
             comm.hall = None
         try:
             # Floor area
             area = re.search(r"(.*?)平米", title_parts[2],
                              re.S | re.M).group(1)
             comm.area = round(float(area), 2)
         except Exception:
             comm.area = None
         # Orientation and fitment; both are cleared if either is missing
         try:
             direction_fitment = info.xpath(
                 "./div/div[@class='address']/div[1]/text()")[0].split('|')
             comm.direction = direction_fitment[0]
             comm.fitment = direction_fitment[1]
         except Exception:
             comm.direction = None
             comm.fitment = None
         # Total number of floors
         try:
             height = info.xpath(
                 "./div/div[@class='flood']/div[1]/text()")[0]
             comm.height = int(
                 re.search(r"共(\d+)层", height, re.S | re.M).group(1))
         except Exception:
             comm.height = None
         # Trade date
         try:
             trade_date = info.xpath(
                 "./div/div[@class='address']/div[2]/text()")[0]
             t = time.strptime(trade_date, "%Y.%m.%d")
             comm.trade_date = comm.local2utc(
                 datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
         except Exception:
             comm.trade_date = None
         # Average price
         try:
             avg_price = info.xpath(
                 "./div/div[@class='flood']/div[3]/span/text()")[0]
             comm.avg_price = int(avg_price)
         except Exception:
             comm.avg_price = None
         # Total price is derived from average price and area.
         try:
             comm.total_price = int(
                 int(comm.avg_price) * float(comm.area))
         except Exception:
             comm.total_price = None
         comm.insert_db()
コード例 #5
0
 def get_detail(self, response, city, region, url):
     """Store each deal listed in an already-fetched listing page.

     Every ``li`` under ``div.house-detail/ul`` becomes one ``Base``
     record that is written with ``insert_db()``. Apart from the title
     element, which must be present, fields that cannot be extracted are
     stored as ``None``.

     Args:
         response: object whose ``text`` attribute holds the page HTML.
         city: city name; stripped and stored on every record.
         region: region name; stripped and stored on every record.
         url: page URL stored on every record.

     Returns:
         None.
     """
     tree = etree.HTML(response.text)
     info_list = tree.xpath("//div[@class='house-detail']/ul/li")
     for info in info_list:
         comm = Base('Q房网')
         # Link
         comm.url = url
         # City
         comm.city = city.strip()
         # Region
         comm.region = region.strip()
         district_name_room_area = info.xpath("./div[1]/p[1]/a[1]/text()")[0]
         # Community name
         comm.district_name = district_name_room_area.split(' ')[0]
         # Number of rooms
         try:
             comm.room = int(
                 re.search(r"(\d+)室", district_name_room_area,
                           re.S | re.M).group(1))
         except Exception:
             comm.room = None
         # Number of halls
         try:
             comm.hall = int(
                 re.search(r"(\d+)厅", district_name_room_area,
                           re.S | re.M).group(1))
         except Exception:
             comm.hall = None
         # Floor area: integer part with an optional decimal part. The
         # decimal point is matched literally so that unrelated characters
         # between two digit runs are never captured as part of the number.
         try:
             area = re.search(r"(\d+(?:\.\d+)?)平米", district_name_room_area,
                              re.S | re.M).group(1)
             comm.area = round(float(area), 2)
         except Exception:
             comm.area = None
         # Orientation and total floors. When the fourth span already
         # holds the floor text there is no orientation, and the floor
         # count is read from that span instead of the sixth one.
         try:
             direction = info.xpath("./div[1]/p[2]/span[4]/text()")[0]
             if '层' not in direction:
                 comm.direction = direction
                 height = info.xpath("./div[1]/p[2]/span[6]/text()")[0]
                 comm.height = int(
                     re.search(r"(\d+)层", height, re.S | re.M).group(1))
             else:
                 comm.direction = None
                 comm.height = int(
                     re.search(r"(\d+)层", direction, re.S | re.M).group(1))
         except Exception:
             comm.direction = None
             comm.height = None
         # Average price
         try:
             avg_price = info.xpath("./div[2]/p[1]/text()")[0]
             comm.avg_price = int(
                 re.search(r"\d+", avg_price, re.S | re.M).group(0))
         except Exception:
             comm.avg_price = None
         # Total price is derived from average price and area.
         try:
             comm.total_price = int(int(comm.avg_price) * float(comm.area))
         except Exception:
             comm.total_price = None
         # Trade date
         try:
             trade_date = info.xpath("./div[3]/span[1]/text()")[0]
             t = time.strptime(trade_date, "%Y.%m.%d")
             comm.trade_date = comm.local2utc(
                 datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
         except Exception:
             comm.trade_date = None
         comm.insert_db()