Exemple #1
0
    def build_crawler(self, co_id, co_name, comm_con):
        """Store every building row of a community page, then crawl its houses.

        The rows are taken from the first table that follows the text
        '查看楼盘表' in ``comm_con``. For each row the building number, house
        count and size are read from the 2nd-4th ``<td>`` text nodes, and the
        building id is the first run of digits after a '-' in the row's link.

        Args:
            co_id: Community id, stored on the building record and passed on
                to ``self.house_crawler``.
            co_name: Community name, stored on the building record.
            comm_con: HTML text of the community page.

        Raises:
            AttributeError: If ``comm_con`` contains no building table
                (``re.search`` returns ``None``).
        """
        # One record object is reused for all rows; every per-row field is
        # overwritten before each insert.
        bu = Building(co_index, co_id=co_id, co_name=co_name)
        build_list = re.search('查看楼盘表.*?<tr>(.*?)</table>', comm_con,
                               re.S | re.M).group(1)
        build = re.findall('<tr>(.*?)</tr>', build_list, re.S | re.M)

        for bul in build:
            try:
                bul_html = etree.HTML(bul)
                buli = bul_html.xpath("//td/text()")

                bu.bu_num = bu_num = buli[1]
                bu.bu_all_house = buli[2]
                bu.size = buli[3]
                # First double-quoted value that is followed by " t"
                # (presumably the href before a `target` attribute -- verify
                # against the page markup).
                house_url = re.search(
                    r'"(.*?)" t',
                    bul,
                ).group(1)
                bu.bu_id = bu_id = re.search(r'-(\d+)', house_url).group(1)

                bu.insert_db()
            except Exception:
                # Best effort: a row that cannot be parsed or stored is
                # skipped. `Exception` (not a bare except) so that Ctrl-C and
                # SystemExit still stop the crawl.
                continue
            self.house_crawler(house_url, bu_num, co_id, bu_id)
Exemple #2
0
    def get_build_info(self, build_lis, co_id):
        """Fetch each building page, store its details and crawl its houses.

        Args:
            build_lis: Iterable of URL paths (strings) that are appended to
                ``http://xx.yyfdcw.com``; each must contain ``Bid=<digits>``.
            co_id: Community id stored on every building record and passed
                on to ``self.get_house_info``.

        Raises:
            AttributeError: If a fetched page (or a path) lacks one of the
                expected fields, because ``re.search`` then returns ``None``.
                Only the HTTP request itself is guarded; a failed request is
                printed and that building is skipped.
        """
        for build_ in build_lis:
            build_url = "http://xx.yyfdcw.com" + build_
            try:
                # NOTE(review): no timeout is passed, so a stalled server
                # blocks the crawl here.
                build_res = requests.get(build_url, headers=self.headers)
            except Exception as e:
                print("co_index={},楼栋信息错误".format(co_index), e)
                continue
            con = build_res.text
            bu = Building(co_index)
            bu.co_id = co_id
            bu.bu_id = re.search(r'Bid=(\d+)', build_).group(1)
            bu.bu_num = re.search('名称.*?">(.*?)</spa', con).group(1)
            bu.bu_pre_sale = re.search("编.*?red'>(.*?)</a", con).group(1)
            bu.bu_pre_sale_date = re.search('颁发日期.*?Date">(.*?)</span',
                                            con).group(1)
            bu.bo_build_start_time = re.search('开工日期.*?">(.*?)</span',
                                               con).group(1)
            bu.bo_build_end_time = re.search('竣工日期.*?">(.*?)</span',
                                             con).group(1)
            bu.bo_develops = re.search('单位.*?">(.*?)</span', con).group(1)
            bu.bu_floor = re.search('层数.*?">(.*?)</span', con).group(1)
            bu.bu_live_size = re.search('住宅面积.*?">(.*?)</span', con).group(1)
            bu.size = re.search('总面积.*?">(.*?)</span', con).group(1)

            bu.insert_db()

            # Value shown next to '测量号' on the page; the house crawler needs
            # it. Named so it does not shadow the builtin `id`.
            measure_id = re.search('测量号.*?">(.*?)</span', con).group(1)
            self.get_house_info(co_id, bu.bu_id, measure_id)
Exemple #3
0
    def comm_info(self, con):
        """Store the community found in ``con`` plus each of its buildings.

        ``con`` must support ``.xpath()``. The community fields are read from
        fixed ``<span>`` ids, the community id is the first run of digits in
        the ``aspnetForm`` action, and every ``<tr style='color:#000066;'>``
        row is stored as one building.

        Returns:
            list: The first link (``./td/a/@href``) of every building row, in
            document order.

        Raises:
            IndexError: If an expected element, cell or link is missing.
            AttributeError: If the form action contains no digits.
        """

        def first(node, query):
            # First result of an XPath query; IndexError when nothing matches.
            return node.xpath(query)[0]

        # Community and buildings
        community = Comm(co_index)

        # Community name
        community.co_name = first(
            con,
            "//span[@id='ctl00_ContentPlaceHolder2_web_item_retail1_lb_item_name']/text()",
        )
        # Community id
        form_action = first(con, "//form[@id='aspnetForm']/@action")
        community.co_id = re.search(r"\d+", form_action).group(0)

        # Remaining community attributes, assigned in this order.
        span_fields = (
            # address
            ("co_address",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_item_seat']/text()"),
            # developer
            ("co_develops",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_enter_name']/text()"),
            # total area
            ("co_size",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_area']/text()"),
            # building area
            ("co_build_size",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_item_area']/text()"),
            # completion time
            ("co_build_end_time",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_item_ew_date']/text()"),
            # land-use planning permit
            ("co_plan_pro",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_program_pcode']/text()"),
            # construction permit
            ("co_work_pro",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_jg']/text()"),
            # green-space percentage
            ("co_green",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_item_green_rate']/text()"),
            # land-use certificate
            ("co_land_use",
             "//span[@id='ctl00_ContentPlaceHolder2_lb_td']/text()"),
        )
        for attr, query in span_fields:
            setattr(community, attr, first(con, query))

        community.insert_db()

        # A single Building object is reused; each row overwrites its fields.
        building = Building(co_index)
        # Building attribute for each successive <td> text node of a row.
        cell_attrs = ("bu_id", "bu_num", "bu_all_house", "size", "bu_floor",
                      "bu_pre_sale")
        room_urls = []
        for row in con.xpath("//tr[@style='color:#000066;']"):
            building.co_id = community.co_id
            building.co_name = community.co_name
            cells = row.xpath("./td/text()")
            for position, attr in enumerate(cell_attrs):
                setattr(building, attr, cells[position])

            building.insert_db()

            room_urls.append(first(row, "./td/a/@href"))

        return room_urls
Exemple #4
0
 def build_info(self, bu_info_list, co_id):
     """Store one building record per element of ``bu_info_list``.

     Args:
         bu_info_list: Iterable of elements supporting ``.xpath()``. Each
             needs an ``onclick`` attribute containing ``dbh=<digits>``, a
             ``td`` with class ``org`` (building number), and 2nd/3rd ``td``
             cells (size / house count).
         co_id: Community id stored on every building record.

     A row that cannot be parsed or stored is logged through ``log.error``
     and skipped; the loop continues with the next row.
     """
     for bu_info in bu_info_list:
         try:
             bu = Building(co_index)
             url = bu_info.xpath("./@onclick")[0]
             bu.bu_id = re.search(r'dbh=(\d+)', url).group(1)
             bu.co_id = co_id
             bu.bu_num = bu_info.xpath("./td[@class='org']/text()")[0]
             bu.bu_all_house = bu_info.xpath("./td[3]/text()")[0]
             bu.size = bu_info.xpath("./td[2]/text()")[0]
             bu.insert_db()
         except Exception as e:
             # The exception is formatted into the message itself: passing it
             # as a separate argument without a placeholder makes a
             # logging-style logger fail while formatting and drop the text.
             log.error('楼栋信息错误: {}'.format(e))
Exemple #5
0
 def get_build_info(self, build_url_list):
     """Fetch each building page, store its details and crawl its houses.

     Args:
         build_url_list: Iterable of indexable items whose first element
             is the building id (a string appended to the query URL).

     Any failure for one building (bad item, request error, missing field,
     storage or house-crawl error) is printed together with the URL and
     the loop moves on to the next building.
     """
     # (Building attribute, regex whose group 1 is the value), applied in
     # this order to the page HTML.
     fields = (
         ('co_build_structural', '结构类型.*?<td.*?>(.*?)<'),
         ('bo_build_end_time', '建成年份.*?<td.*?>(.*?)<'),
         ('bu_build_size', '总建筑面积.*?<td.*?>(.*?)<'),
         ('bu_num', '幢号.*?<td.*?>(.*?)<'),
         ('size', '占地面积.*?<td>(.*?)<'),
         ('bu_floor', '房屋层数.*?<td>(.*?)<'),
         ('bu_all_house', '房屋套数.*?<td>(.*?)<'),
         ('area', '坐落区.*?<td>(.*?)<'),
     )
     for i in build_url_list:
         # Reset per iteration so the error message below never hits an
         # unbound name or reports the previous building's URL.
         build_url = None
         try:
             build = Building(co_index)
             build_url = 'http://222.223.160.199:8088/website/buildquery/selectBuild.jsp?buildID=' + i[0]
             response = requests.get(build_url, headers=self.headers)
             html = response.text
             build.bu_id = i[0]
             for attr, pattern in fields:
                 setattr(build, attr,
                         re.search(pattern, html, re.S | re.M).group(1))
             build.insert_db()
             self.get_house_info(build.bu_id)
         except Exception as e:
             print('请求错误,url={}'.format(build_url),e)