Beispiel #1
0
 def get_build_info(self, build_info_list, co_id, comm_html, url):
     """Store one ``Building`` per row of ``build_info_list`` and crawl its detail page.

     Rows are read positionally: 1 = building name, 3 = total units,
     5 = area, 9 = price.  The building number is the part of the name
     before the first ``#``.  The detail link and building id are taken
     from ``comm_html`` by regex.  Any failure for a row is printed
     (together with ``url``) and the loop carries on with the next row.
     """
     for row in build_info_list:
         try:
             building = Building(2)
             name = row[1]
             number = name.split('#')[0]
             total_units = row[3]
             area = row[5]
             price = row[9]

             building.bu_name = name
             building.bu_num = number
             building.bu_all_house = total_units
             building.bu_build_size = area
             building.bu_price = price
             building.co_id = co_id

             # Narrow the page down to the building table first, then pull
             # the "view info" link out of that fragment.
             table_match = re.search(r'楼盘表(.*?)个楼栋信息', comm_html)
             build_html = table_match.group(1)
             link_match = re.search(r'<ahref="(.*?)">查看信息<', build_html)
             build_url = link_match.group(1)
             id_match = re.search('buildingId=(.*?)$', build_url)
             building.bu_id = id_match.group(1)

             building.insert_db()
             self.get_build_detail(build_url, co_id)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, url), e)
    def bu_info(self, bu_list, co_id):
        """Crawl each building page, persist the building, then parse its units.

        Args:
            bu_list: relative building URLs; each must contain ``bdid=<digits>``,
                which becomes the building id.
            co_id: id of the owning community, copied onto every record.

        The page body is decoded as GBK.  A building whose page cannot be
        fetched or parsed is reported with ``print`` and skipped, so
        ``ho_info`` only runs for buildings that were stored.
        """
        flags = re.S | re.M
        for bu in bu_list:
            try:
                bu_url = 'http://www.fxfdcw.com/' + bu
                res = requests.get(bu_url, headers=self.headers)
                con = res.content.decode('gbk')
                html = etree.HTML(con)
                build = Building(co_index)
                build.co_id = co_id
                # Raw string: '\d' in a plain literal is an invalid escape
                # sequence and triggers a SyntaxWarning on modern Python.
                build.bu_id = re.search(r'bdid=(\d+)', bu).group(1)
                build.bu_num = re.search('楼号.*?">(.*?)</', con, flags).group(1)
                build.bu_address = re.search('坐落.*?">(.*?)</', con, flags).group(1)
                build.bu_floor = re.search('地上层数.*?">(.*?)</', con, flags).group(1)
                build.bu_build_size = re.search('建筑面积.*?wrap">(.*?)</', con,
                                                flags).group(1)
                build.bu_all_house = re.search('套 数.*?">(.*?)</', con,
                                               flags).group(1)
                build.bu_type = re.search('用  途.*?wrap">(.*?)</', con,
                                          flags).group(1)
                build.insert_db()

                # Unit cells are the <span> elements carrying a title attribute.
                ho_list = html.xpath("//span[@title]")
            except Exception as e:
                print("楼栋信息错误{}".format(e))
                continue
            self.ho_info(ho_list, co_id, build.bu_id)
Beispiel #3
0
    def get_build_info(self, url, co_id):
        """Scrape one building page, store the building, then store every unit on it.

        Args:
            url: building page URL; it must end in ``?<digits>``, which is
                used as the building id.
            co_id: id of the owning community, copied onto every record.

        Errors are printed instead of raised: a failure while reading the
        building or the unit table aborts the rest of the page, a failure on
        a single unit only skips that unit.  Only ``Exception`` is caught, so
        Ctrl-C / ``SystemExit`` still stop the crawler.
        """
        flags = re.S | re.M
        try:
            building = Building(co_index)
            response = requests.get(url)
            html = response.text
            tree = etree.HTML(html)
            co_name = tree.xpath('//*[@id="PageB_Location"]/text()')[0]  # community name
            print(co_name)
            bu_name = tree.xpath('//*[@id="ItemName"]/text()')[0]  # building name
            bu_num = tree.xpath('//*[@id="PageB_HouseNo"]/text()')[0]  # building number
            bu_all_house = tree.xpath(
                '//*[@id="lb_countbulidtaoshu"]/text()')[0]  # total units
            bu_floor = tree.xpath('//*[@id="cell3-1"]/text()')
            bu_floor = self.is_none(bu_floor)  # floors
            bu_build_size = tree.xpath(
                '//*[@id="lb_countbulidarea"]/text()')[0]  # building area
            bu_live_size = tree.xpath(
                '//*[@id="lb_buildarea"]/text()')[0]  # residential area
            bu_price = tree.xpath('//*[@id="lb_buildavg"]/text()')
            bu_price = self.is_none(bu_price)  # residential price
            bu_id = re.search(r'\?(\d+)$', url).group(1)  # building id

            building.co_id = co_id
            building.bu_name = bu_name
            building.bu_num = bu_num
            building.bu_all_house = bu_all_house
            building.bu_floor = bu_floor
            building.bu_build_size = bu_build_size
            building.bu_live_size = bu_live_size
            building.bu_price = bu_price
            building.bu_id = bu_id
            building.insert_db()

            # Everything from the "row3" table row to the end of the page
            # holds the unit cells.
            house_info_html = re.findall('<tr id="row3">(.*)$', html, flags)[0]
            for cell in re.findall('(<td.*?>.*?</td>)', house_info_html, flags):
                if '<br>' not in cell:
                    continue
                ho_name_list = re.findall('<td.*?>(.*?)<br>', cell, flags)
                ho_true_size_list = re.findall('<td.*?>.*?<br>(.*?)<br>', cell,
                                               flags)
                ho_type = re.findall('<td.*?>.*?<br>.*?<br>(.*?)<br>', cell,
                                     flags)[0]
                # A dedicated index name: the original reused the outer loop
                # variable here.
                for idx, raw_name in enumerate(ho_name_list):
                    try:
                        if 'font' in raw_name:
                            ho_name = re.sub('<font.*?>', '', raw_name)
                        else:
                            ho_name = raw_name
                        house = House(8)
                        house.ho_name = ho_name
                        house.ho_true_size = ho_true_size_list[idx]
                        house.co_id = co_id
                        house.bu_id = bu_id
                        house.ho_type = ho_type
                        house.insert_db()
                    except Exception as e:
                        print(e)
        except Exception as e:
            print(e)
Beispiel #4
0
 def get_build_info(self, build_url_list):
     """Fetch, parse and store one building per entry of ``build_url_list``.

     Each entry is indexed with ``[0]`` to obtain the building id, which is
     appended to the ``selectBuild.jsp`` query URL.  After the building is
     stored, ``get_house_info`` is called with its id.  A failure on one
     entry is printed and the loop continues with the next entry.
     """
     flags = re.S | re.M
     for i in build_url_list:
         # Reset per entry so the error message never reports a URL left
         # over from an earlier entry (or hits an unbound name on the first).
         build_url = None
         try:
             build = Building(co_index)
             build_url = 'http://222.223.160.199:8088/website/buildquery/selectBuild.jsp?buildID=' + i[
                 0]
             response = requests.get(build_url, headers=self.headers)
             html = response.text
             build.bu_id = i[0]
             build.co_build_structural = re.search('结构类型.*?<td.*?>(.*?)<',
                                                   html, flags).group(1)
             build.bo_build_end_time = re.search('建成年份.*?<td.*?>(.*?)<',
                                                 html, flags).group(1)
             build.bu_build_size = re.search('总建筑面积.*?<td.*?>(.*?)<', html,
                                             flags).group(1)
             build.bu_num = re.search('幢号.*?<td.*?>(.*?)<', html,
                                      flags).group(1)
             build.size = re.search('占地面积.*?<td>(.*?)<', html,
                                    flags).group(1)
             build.bu_floor = re.search('房屋层数.*?<td>(.*?)<', html,
                                        flags).group(1)
             build.bu_all_house = re.search('房屋套数.*?<td>(.*?)<', html,
                                            flags).group(1)
             build.area = re.search('坐落区.*?<td>(.*?)<', html,
                                    flags).group(1)
             build.insert_db()
             self.get_house_info(build.bu_id)
         except Exception as e:
             print('请求错误,url={}'.format(build_url), e)
    def get_build_info(self, build_id_list, co_id):
        """Query each building id, store the building, and parse its units.

        Args:
            build_id_list: building primary keys, sent as the ``pk`` form field.
            co_id: id of the owning community, copied onto every record.

        One ``Building`` object is reused for all ids; every field set here
        is overwritten on each pass.  If the POST for an id fails, the error
        is printed and that id is skipped.  Parsing errors in the response
        are not caught here and propagate to the caller.
        """
        bu = Building(co_index)
        # Identical for every request, so built once.
        header = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
            'Referer':
            'http://hkrealestate.haikou.gov.cn/wp_myself/housequery/projectBuildingList.php'
        }
        flags = re.S | re.M
        for build_id in build_id_list:
            formdata = {
                "action": "qeurySingleBuilding",
                'pk': str(build_id),
            }
            try:
                build_info = self.s.post(
                    'http://hkrealestate.haikou.gov.cn/wp_myself/housequery/projectBuildHouseAction.php',
                    data=formdata,
                    headers=header)
            except Exception as e:
                print("co_idnex={},楼栋错误".format(co_index), e)
                # Without a response there is nothing to parse; falling
                # through would use an unbound or previous response.
                continue

            build_con = build_info.text
            bu.bu_id = build_id
            bu.co_id = co_id
            bu.bu_num = re.search('幢名称.*?<td>(.*?)<', build_con,
                                  flags).group(1)
            bu.bu_floor = re.search('总层数.*?<td>(.*?)<', build_con,
                                    flags).group(1)
            bu.bu_build_size = re.search('>建筑面积.*?<td>(.*?)<', build_con,
                                         flags).group(1)
            bu.bo_develops = re.search('房地产企业.*?">(.*?)</td', build_con,
                                       flags).group(1)

            bu.insert_db()

            self.get_house_info(build_con, co_id, build_id)
Beispiel #6
0
 def build_parse(self, co_id):
     """List the buildings of community ``co_id``, store each one, then parse its units.

     Building ids are taken from ``searchByLid('<digits>')`` calls in the
     list page.  For each id the detail page is fetched and decoded as GBK.
     A failure while fetching or parsing a building is logged.
     ``house_parse`` still runs for that building as long as its page text
     was obtained, and is skipped only when there is no page text at all.
     """
     list_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/loulist.jsp?Id_xmxq=' + co_id
     res = requests.get(list_url, headers=self.headers)
     con = res.content.decode()
     build_id_list = re.findall(r"searchByLid\('(\d+)'\)", con)
     flags = re.S | re.M
     for build_id in build_id_list:
         # Reset per building so a failed request can never reuse the
         # previous building's page (or hit an unbound name).
         bu_con = None
         try:
             bu_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/lpbxx_new.jsp?lid=' + build_id
             bu_res = requests.get(bu_url, headers=self.headers)
             bu_con = bu_res.content.decode('gbk')
             bu = Building(co_index)
             bu.co_id = co_id
             bu.bu_id = build_id
             bu.bu_num = re.search('楼栋名称.*?">(.*?)</td', bu_con,
                                   flags).group(1)
             bu.bu_all_house = re.search('总套数.*?">总(.*?)套</td', bu_con,
                                         flags).group(1)
             bu.bu_floor = re.search('地上层数.*?">共(.*?)层</td', bu_con,
                                     flags).group(1)
             bu.bu_build_size = re.search('总建筑面积.*?">(.*?)</td', bu_con,
                                          flags).group(1)
             bu.bu_pre_sale = re.search(r"searchysxk\('(.*?)'\)", bu_con,
                                        flags).group(1)
             bu.bu_type = re.search('房屋用途.*?">(.*?)</td', bu_con,
                                    flags).group(1)
             bu.insert_db()
         except Exception as e:
             log.error('{}楼栋错误{}'.format(build_id, e))
         if bu_con is None:
             continue
         self.house_parse(co_id, build_id, bu_con)
    def parse(self, res):
        """Walk the building list in ``res``, store each building, then parse its units.

        ``res`` is decoded as GBK.  Every ``div.listCon`` entry supplies a
        detail link and a title; the title becomes the building number.
        The building id comes from a second request to ``BuildingList.aspx``.
        Entries that fail are logged and skipped.
        """
        tree = etree.HTML(res.content.decode('gbk'))
        entries = tree.xpath("//div[@class='listCon']")
        for entry in entries:
            href = entry.xpath("./a[@class='listCon2']/@href")[0]
            title = entry.xpath("./a[@class='listCon1']/@title")[0]
            url = "http://www.hyfc365.com" + href
            try:
                page = requests.get(url, headers=self.headers)
                content = page.content.decode('gbk')
                bu = Building(co_index)
                bu.bu_num = title
                project_id = re.search('ID=(.*)', href).group(1)

                flags = re.S | re.M
                bu.bu_pre_sale = re.search(
                    '预售证名称.*?NAME">(.*?)</span', content, flags).group(1)
                bu.bu_pre_sale_date = re.search(
                    '申领时间.*?">(.*?)</span', content, flags).group(1)
                bu.bo_develops = re.search(
                    '申领单位.*?">(.*?)</span', content, flags).group(1)
                bu.bu_build_size = re.search(
                    '"SALE_HOUSE_AREA">(.*?)<', content, flags).group(1)
                bu.bu_all_house = re.search(
                    '"SALE_HOUSE_COUNT">(.*?)<', content, flags).group(1)

                # The building id is only exposed on the project's building list.
                detail_url = 'http://www.hyfc365.com/RealEstate/Project/BuildingList.aspx?ID=' + project_id
                detail_page = requests.get(detail_url)
                bu_id = re.search("BUILDING_ID=(.*?)'", detail_page.text).group(1)
                bu.bu_id = bu_id
                bu.insert_db()
            except Exception as e:
                log.error("{}楼栋页面解析失败{}".format(url, e))
            else:
                self.house_parse(bu_id)
Beispiel #8
0
 def detail_parse(self, id, build_list):
     """Fetch each building linked from ``build_list``, store it, then parse its units.

     Every item of ``build_list`` is an HTML fragment whose first
     ``<a href="...">`` is appended to ``self.start_url``.  The pre-sale
     licence number, converted to ``int``, doubles as the building id.
     Buildings that fail are logged and skipped.  There is a two-second
     pause after each request.
     """
     for fragment in build_list:
         rel_link = re.search('<a href="(.*?)"', fragment).group(1)
         build_url = self.start_url + rel_link
         try:
             page = requests.get(build_url, headers=self.headers)
             time.sleep(2)
             bu_text = page.content.decode()
             bu = Building(co_index)
             bu.bu_num = re.search('幢号:(.*?) 许', bu_text).group(1)
             licence = re.search('许可证号:<span>(.*?)</span>', bu_text)
             bu.bu_pre_sale = licence.group(1)
             bu.bu_id = int(bu.bu_pre_sale)
             units = re.search('套数:<span>(.*?)</span', bu_text)
             bu.bu_all_house = units.group(1)
             floors = re.search('地上层数:<span>(.*?)</span', bu_text)
             bu.bu_floor = floors.group(1)
             finished = re.search('竣工日期:<span>(.*?)</span', bu_text)
             bu.bo_build_end_time = finished.group(1)
             area = re.search('预售许可面积:<span>(.*?)</span', bu_text)
             bu.bu_build_size = area.group(1)
             usage = re.search('用途:<span>(.*?)</span', bu_text)
             bu.bu_type = usage.group(1)
             bu.insert_db()
         except Exception as e:
             log.error("楼栋出错{}".format(e))
         else:
             self.house_detail(bu_text, id, bu.bu_id)
    def bu_parse(self, co_id, page, co_url, co_res, path_url):
        """Page through a community's building list, storing each building and its units.

        Args:
            co_id: id of the owning community, copied onto every building.
            page: number of result pages (anything ``int()`` accepts).
            co_url: URL the paging form is posted to; also set as ``Referer``
                on ``self.headers``.
            co_res: response of the first list page; its hidden ASP.NET form
                fields are replayed with every paging request.
            path_url: list-page URL; ``SaleInfoProListIndex.aspx`` is stripped
                from it to build the unit-page URLs.

        The pager value sent for the i-th pass is ``i - 1``.  The original
        built that form *after* posting, so it requested page 0 twice and
        never requested the last page.
        """
        html = etree.HTML(co_res.text)
        viewstate = html.xpath("//input[@id='__VIEWSTATE']/@value")[0]
        generator = html.xpath("//input[@id='__VIEWSTATEGENERATOR']/@value")[0]
        valid = html.xpath("//input[@id='__EVENTVALIDATION']/@value")[0]
        pager_field = 'ctl00$MainContent$OraclePager1$ctl11$PageList'
        self.headers['Referer'] = co_url

        for i in range(1, int(page) + 1):
            # The hidden fields always come from the first page, as in the
            # original, which re-read them from `html` on every pass.
            formdata = {
                "__VIEWSTATE": viewstate,
                "__EVENTTARGET": pager_field,
                "__VIEWSTATEGENERATOR": generator,
                "__EVENTVALIDATION": valid,
                pager_field: i - 1
            }
            page_res = requests.post(co_url,
                                     data=formdata,
                                     headers=self.headers)
            page_html = etree.HTML(page_res.text)

            bu_list = page_html.xpath(
                "//table[@id='ctl00_MainContent_OraclePager1']//tr")

            # The first row is skipped (presumably the table header).
            for bu in bu_list[1:]:
                build = Building(co_index)
                build.co_id = co_id
                build.bu_num = bu.xpath("./td/a/text()")[0]
                build.bu_build_size = bu.xpath("./td[2]/text()")[0]
                build.bu_floor = bu.xpath("./td[4]/text()")[0]
                build.bu_all_house = bu.xpath("./td[3]/text()")[0]
                tmp_url = bu.xpath("./td/a/@href")[0]
                build.bu_id = re.search('PBTAB_ID=(.*?)&', tmp_url).group(1)
                build.insert_db()
                house_url = path_url.replace('SaleInfoProListIndex.aspx',
                                             '') + tmp_url
                self.ho_parse(co_id, build.bu_id, house_url)
 def get_build_info(self, build_url_list, co_name):
     """Follow each project page to its building detail pages and store every building.

     Args:
         build_url_list: project-page paths relative to
             ``http://www.sxczfdc.com/pubinfo/``.
         co_name: community name, set on the building record and passed on
             to ``get_house_info``.

     One ``Building`` object is created per project page and reused for all
     detail pages found on it; every scraped field is overwritten per detail
     page.  Errors are printed and the loop continues with the next detail
     page or project page.
     """
     flags = re.S | re.M
     for i in build_url_list:
         try:
             build = Building(co_index)
             build.co_name = co_name
             build_url = 'http://www.sxczfdc.com/pubinfo/' + i
             response = requests.get(build_url, headers=self.headers)
             html = response.text
             # Raw strings: '\?' and '\(' in plain literals are invalid
             # escape sequences and trigger a SyntaxWarning on modern Python.
             for k in re.findall(r'(Pub_dtxx.aspx\?ProjectBuildingID=.*?)"',
                                 html, flags):
                 try:
                     build_url_detail = 'http://www.sxczfdc.com/pubinfo/' + k
                     result = requests.get(build_url_detail,
                                           headers=self.headers)
                     content = result.text
                     build.bu_num = re.findall(
                         'BuildingInfo1_lblBuildingName">(.*?)<', content,
                         flags)[0]
                     build.bu_all_house = re.findall(
                         'BuildingInfo1_lblZts">(.*?)<', content,
                         flags)[0]
                     build.bu_floor = re.findall(
                         'BuildingInfo1_lblZcs">(.*?)<', content,
                         flags)[0]
                     build.bu_build_size = re.findall(
                         'BuildingInfo1_lblJzmj">(.*?)<', content,
                         flags)[0]
                     build.bu_live_size = re.findall(
                         'BuildingInfo1_lblZzmj">(.*?)<', content,
                         flags)[0]
                     build.bu_pre_sale = re.findall(
                         'BuildingInfo1_lblYsxkzh">(.*?)<', content,
                         flags)[0]
                     build.bu_pre_sale_date = re.findall(
                         'BuildingInfo1_lblYsxkzfzrq">(.*?)<', content,
                         flags)[0]
                     build.insert_db()
                     house_url_list = re.findall(
                         r"onClick=.getMoreHouseInfo\('(.*?)'\)", content,
                         flags)
                     self.get_house_info(house_url_list, co_name,
                                         build.bu_num)
                 except Exception as e:
                     print(e)
         except Exception as e:
             print(e)
Beispiel #11
0
 def get_comm_info(self, comm_url, comm):
     """Complete and store a community, then store every building listed under its pre-sale permits.

     Args:
         comm_url: community page path relative to ``http://www.fangdi.com.cn/``.
         comm: community record; ``co_develops`` is filled in from the page
             before ``insert_db`` is called, and ``co_id`` is used to query
             the permit list.

     All pages are decoded as GBK.  Requests and the developer lookup are
     not guarded, so their errors propagate.  A failure on a single building
     row is printed and that row is skipped.
     """
     flags = re.S | re.M
     co_url = 'http://www.fangdi.com.cn/' + comm_url
     response = requests.get(co_url, headers=self.headers)
     html = response.content.decode('gbk')
     comm.co_develops = re.search('企业名称:.*?<a.*?>(.*?)<', html,
                                  flags).group(1)
     comm.insert_db()
     add_build_url = 'http://www.fangdi.com.cn/Presell.asp?projectID=' + comm.co_id
     result = requests.get(add_build_url, headers=self.headers)
     html_str = result.content.decode('gbk')
     # Raw string: '\(' in a plain literal is an invalid escape sequence.
     # The last two quoted arguments of SetSelect(...) are the permit id
     # and the start id.
     build_detail_tuple_list = re.findall(
         r"javascript:SetSelect\(.*?,.*?,.*?,.*?,.*?,'(.*?)','(.*?)'\)",
         html_str, flags)
     for presell_id, start_id in build_detail_tuple_list:
         build_detail_url = 'http://www.fangdi.com.cn/building.asp?ProjectID=OTU4OHwyMDE4LTQtNHwxNw&PreSell_ID=' + presell_id + '&Start_ID=' + start_id
         massage = requests.get(build_detail_url,
                                headers=self.headers).content.decode('gbk')
         build_row_list = re.findall('class="indextabletxt">.*?</tr>',
                                     massage, flags)
         # Distinct loop variable: the original reused `i` for both loops.
         for row_html in build_row_list:
             try:
                 build = Building(co_index)
                 build.bu_num = re.search('<a.*?>(.*?)</a>', row_html,
                                          flags).group(1)
                 build.bu_all_house = re.search(
                     '<a.*?<td.*?<td.*?<td.*?>(.*?)<', row_html,
                     flags).group(1)
                 build.bu_build_size = re.search(
                     '<a.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', row_html,
                     flags).group(1)
                 build.bu_id = re.search('Param=(.*?)=', row_html,
                                         flags).group(1)
                 build.co_id = comm.co_id
                 build.insert_db()
                 house_url = re.search('href="(.*?)"', row_html,
                                       flags).group(1)
                 self.get_house_info(house_url, build.bu_id, build.co_id)
             except Exception as e:
                 print(
                     '楼栋错误,co_index={},url={}'.format(
                         co_index, build_detail_url), e)
    def build_parse(self, co_id):
        """Store every building linked from a project's side menu and parse its units.

        Building links are the ``td[colspan=2]/a`` hrefs of the menu page with
        the first four and the last one dropped.  The unit-page link for the
        n-th building is taken from the n-th ``td[width=54%]`` cell.

        One ``Building`` object is reused for all links.  A building that
        fails to load or parse is printed and skipped.  A failure while
        locating or parsing its units is printed as well (the original
        discarded it silently) and does not stop the loop.
        """
        bu = Building(co_index)

        url = "http://spf.tlfdc.cn/prjleft.aspx?projectid=" + str(co_id)
        res = requests.get(url, headers=self.headers)
        con_html = etree.HTML(res.text)
        build_url_list = con_html.xpath("//td[@colspan='2']/a/@href")[4:-1]
        house_cells = con_html.xpath("//td[@width='54%']")
        flags = re.S | re.M

        for index, build_href in enumerate(build_url_list):
            try:
                build_info_url = "http://spf.tlfdc.cn/" + build_href
                res = requests.get(build_info_url, headers=self.headers)
                con = res.text
                bu.co_id = co_id
                bu.bu_pre_sale_date = re.search('发证日期.*?Date">(.*?)<', con,
                                                flags).group(1)
                bu.bu_num = re.search('幢.*?did">(.*?)<', con,
                                      flags).group(1)
                bu.bu_pre_sale = re.search('编号.*?no">(.*?)<', con,
                                           flags).group(1)
                bu.bu_address = re.search('位置.*?ss">(.*?)<', con,
                                          flags).group(1)
                bu.bu_build_size = re.search('面积.*?Area">(.*?)<', con,
                                             flags).group(1)
                bu.bu_type = re.search('性质.*?type">(.*?)<', con,
                                       flags).group(1)
                bu.bu_all_house = re.search('套数.*?number">(.*?)<', con,
                                            flags).group(1)
                # Raw string: '\d' in a plain literal is an invalid escape.
                bu.bu_id = re.search(r'id=(\d+)', build_href).group(1)

                bu.insert_db()
            except Exception as e:
                print(
                    '楼栋错误,co_index={},url={}'.format(co_index, build_info_url),
                    e)
                continue
            try:
                house_url = house_cells[index].xpath("./a/@href")[0]
                self.house_parse(house_url, co_id, bu.bu_id)
            except Exception as e:
                # Still best-effort, but no longer silent.
                print(
                    '房屋错误,co_index={},url={}'.format(co_index, build_info_url),
                    e)
                continue
Beispiel #13
0
 def get_build_detail(self, build_url, co_id):
     """Fetch a building page, sum the sizes in its table, store the building, then load its units.

     ``build_url`` is relative to ``http://www.yzfdc.cn/`` and must end in
     ``GCZHId=<id>``; that id becomes the building id.  The stored building
     size is the sum of the non-empty values captured from the sixth nested
     ``<div>`` of each row of the "已售已备案" table.  Nothing is caught
     here, so any request or parsing error propagates to the caller.
     """
     flags = re.S | re.M
     bu_url = 'http://www.yzfdc.cn/' + build_url
     html = self.s.get(bu_url, headers=self.headers).text

     build = Building(co_index)
     build.bu_num = re.search('查询幢号:.*?<span.*?<span.*?>(.*?)<', html,
                              flags).group(1)

     table_html = re.search('<div align="center">已售已备案.*?</table>', html,
                            flags).group()
     rows = re.findall('<tr.*?</tr>', table_html, flags)

     total = 0
     for row in rows:
         cell = re.search(
             '<div.*?<div.*?<div.*?<div.*?<div.*?<div.*?>(.*?)<', row,
             flags).group(1)
         if not cell:
             continue  # empty cells contribute nothing
         total += float(cell)

     build.bu_build_size = total
     build.co_id = co_id
     build.bu_id = re.search('GCZHId=(.*?)$', bu_url).group(1)
     build.insert_db()
     self.get_house_info(co_id, build.bu_id)
Beispiel #14
0
 def get_build_info(self, presell_url_list, co_id):
     """Walk each pre-sale page, store every building linked from it, then load its units.

     Args:
         presell_url_list: pre-sale page paths, appended to ``self.url``.
         co_id: id of the owning community, copied onto every building.

     Building links must contain ``ID=<digits>``, which becomes the building
     id.  A building that fails to load or parse is printed and skipped.
     The pre-sale page request itself is not guarded.
     """
     flags = re.S | re.M
     for presell_url in presell_url_list:
         pre_url = self.url + presell_url
         res = requests.get(pre_url, headers=self.headers)
         build_url_list = re.findall('【<a href="(.*?)" target="_self"',
                                     res.text, flags)
         for build_url in build_url_list:
             build_info_url = self.url + build_url
             try:
                 build_res = requests.get(build_info_url,
                                          headers=self.headers)
                 con = build_res.text
                 bu = Building(co_index)
                 bu.co_id = co_id
                 # Raw strings below: '\d', '\(' and '\)' in plain literals
                 # are invalid escape sequences on modern Python.
                 bu.bu_id = re.search(r'ID=(\d+)', build_url).group(1)
                 bu.bu_num = re.search('栋.*?号.*?BuildingName">(.*?)</span',
                                       con, flags).group(1)
                 bu.bu_floor = re.search(r'总 层 数.*?(\d+)</span', con,
                                         flags).group(1)
                 bu.bu_build_size = re.search('建筑面积.*?Jzmj">(.*?)</span',
                                              con, flags).group(1)
                 bu.bu_live_size = re.search('住宅面积.*?Zzmj">(.*?)</span',
                                             con, flags).group(1)
                 bu.bu_not_live_size = re.search(
                     '非住宅面积.*?Fzzmj">(.*?)</span', con,
                     flags).group(1)
                 bu.bu_pre_sale = re.search('预售许可证.*?xkzh">(.*?)</span',
                                            con, flags).group(1)
                 bu.bu_pre_sale_date = re.search('发证日期.*?fzrq">(.*?)</span',
                                                 con, flags).group(1)
                 bu.bu_type = re.search('项目类型.*?Type">(.*?)</span', con,
                                        flags).group(1)
                 bu.insert_db()
             except Exception as e:
                 print("co_index={},楼栋信息错误".format(co_index), e)
                 continue
             house_detail_list = re.findall(r"getMoreHouseInfo\('(.*?)'\)\"",
                                            con, flags)
             self.get_house_info(co_id, bu.bu_id, house_detail_list)
    def build_info(self, co_id, bu_id):
        """Fetch one building page, store the building, then store each unit shown on it.

        Args:
            co_id: community id; sent as ``YSXMID`` and copied onto every record.
            bu_id: building id; sent as ``YSZID`` and copied onto every record.

        Nothing is caught here, so request, regex and XPath failures
        propagate to the caller.
        """
        bu_url = 'http://www.lsjs.gov.cn/WebLSZFGB/ZNInfo.aspx?YSZID=' + bu_id + "&YSXMID=" + co_id
        bu_res = requests.get(bu_url, headers=self.headers)
        con = bu_res.text
        bu = Building(co_index)
        bu.co_id = co_id
        bu.bu_id = bu_id
        bu.bu_num = re.search('znxx">(.*?)</span', con).group(1)
        # The label contains "(销)".  Written as a bare "(销)" in the pattern
        # it was a capture group, so group(1) yielded "销" instead of the
        # value.  The parentheses are now matched literally (ASCII or
        # full-width, and optional), which leaves the value as group 1.
        bu.bu_all_house = re.search(r'纳入网上预[((]?销[))]?售总套数.*?">(.*?)</',
                                    con, re.S | re.M).group(1)
        bu.bu_build_size = re.search(r'纳入网上预[((]?销[))]?售总面积.*?">(.*?)</',
                                     con, re.S | re.M).group(1)
        bu.insert_db()

        html = etree.HTML(con)
        house_list = html.xpath("//span[@class='syt-span']")
        for tag in house_list:
            ho = House(co_index)
            ho.bu_id = bu_id
            ho.co_id = co_id
            ho.ho_name = tag.xpath(".//p[@class='ewb-num']/text()")[0]
            ho.ho_build_size = tag.xpath(".//p[@class='ewb-con']/text()")[0]
            ho.insert_db()
 def get_build_info(self, build_url_list):
     """Hand each building page to ``ProducerListUrl`` and pass the unit links it returns to ``get_house_info``.

     The ``Building`` object carries regex rules rather than values here:
     each attribute is set to the pattern for that field, and
     ``build.to_dict()`` is given to ``ProducerListUrl`` as its analyzer
     rules.  A failure on one entry is printed and the loop continues.
     """
     # (attribute, extraction rule) pairs, applied in this order.
     field_rules = (
         ('co_name', '项目名称:.*?<td.*?>(.*?)<'),
         ('bu_num', '幢  号:.*?<td.*?>(.*?)<'),
         ('bu_address', '坐落位置:.*?<td.*?>(.*?)<'),
         ('co_build_structural', '建筑结构:.*?<td.*?>(.*?)<'),
         ('bu_floor', '总 层 数:.*?<td.*?>(.*?)<'),
         ('bu_build_size', '总 面 积:.*?<td.*?>(.*?)<'),
         ('bu_all_house', '批准销售:.*?<td.*?>(.*?)<'),
     )
     for rel_url in build_url_list:
         try:
             build = Building(co_index)
             build_url = 'http://www.ndjsj.gov.cn/House/' + rel_url
             for attr_name, rule in field_rules:
                 setattr(build, attr_name, rule)
             producer = ProducerListUrl(
                 page_url=build_url,
                 request_type='get',
                 encode='utf-8',
                 analyzer_rules_dict=build.to_dict(),
                 current_url_rule='javascript:ShowTitle.*?href="(.*?)"',
                 analyzer_type='regex',
                 headers=self.headers)
             self.get_house_info(producer.get_details())
         except Exception as e:
             print('宁德楼栋错误,url={}'.format(build_url), e)
    def get_build_info(self, build_url_list):
        """Fetch each building page, store the building, then load the units linked from it.

        Entries of ``build_url_list`` are relative to
        ``http://www.fjnpfdc.com/House/`` and must contain ``ProjectId=...&``
        and ``BuildingId=...&P``; those become the community and building
        ids.  Pages are decoded as GBK.  A failure on one entry is printed
        and the loop continues.
        """
        flags = re.S | re.M
        for rel_url in build_url_list:
            try:
                build = Building(co_index)
                build_url = 'http://www.fjnpfdc.com/House/' + rel_url
                page = requests.get(build_url, headers=self.headers)
                con = page.content.decode('gbk')

                build.co_name = re.search(
                    "项目名称:.*?<td.*?>(.*?)<", con, flags).group(1)
                build.bu_num = re.search(
                    "幢  号:.*?<td.*?>(.*?)<", con, flags).group(1)
                build.co_use = re.search(
                    "设计用途:.*?<td.*?>(.*?)<", con, flags).group(1)
                build.co_build_structural = re.search(
                    "建筑结构:.*?<td.*?>(.*?)<", con, flags).group(1)
                build.bu_floor = re.search(
                    "总 层 数:.*?<td.*?>(.*?)<", con, flags).group(1)
                build.bu_build_size = re.search(
                    "总 面 积:.*?<td.*?>(.*?)<", con, flags).group(1)
                build.co_build_end_time = re.search(
                    "竣工日期:.*?<td.*?>(.*?)<", con, flags).group(1)

                # Unit links are collected before the ids are parsed.
                house_links = re.findall('<a href="(HouseInfo.*?)"', con)

                build.co_id = re.search('ProjectId=(.*?)&', rel_url).group(1)
                build.bu_id = re.search('BuildingId=(.*?)&P', rel_url).group(1)
                build.insert_db()
                self.get_house_info(house_links, build.bu_id, build.co_id)
            except Exception as e:
                print("co_index={},楼栋{}错误".format(co_index, rel_url), e)
    def get_build_info(self, bu_pre_sale, bo_develops, bu_co_name, bu_con):
        """Parse one building's fields out of ``bu_con`` and store the building.

        Args:
            bu_pre_sale: pre-sale value, stored as given.
            bo_develops: developer value, stored as given.
            bu_co_name: community name, stored as ``co_name``.
            bu_con: HTML text the building fields are extracted from.

        Raises:
            AttributeError: if any expected label/value pair is missing from
                ``bu_con`` (``re.search`` returns ``None``).
        """
        flags = re.S | re.M
        build = Building(co_index)

        # Raw strings: '\d' in a plain literal is an invalid escape sequence
        # and triggers a SyntaxWarning on modern Python.
        build.bu_id = re.search(r'编号.*?>(\d+)<', bu_con, flags).group(1)
        build.bu_num = re.search(r'幢号.*?>(\d+)<', bu_con, flags).group(1)
        build.bu_floor = re.search(r'总层数.*?>(\d+)<', bu_con,
                                   flags).group(1)
        # NOTE(review): the '.' between the digit runs is unescaped, so it
        # matches any character, not only a decimal point.  Left unchanged
        # to keep accepting the same values as before.
        build.bu_build_size = re.search(r'预售建筑面积.*?>(\d+.\d+)<', bu_con,
                                        flags).group(1)
        build.bu_address = re.search('楼房坐落.*?;">(.*?)</span', bu_con,
                                     flags).group(1)
        build.bu_live_size = re.search(r'住宅建筑面积.*?>(\d+.\d+)<', bu_con,
                                       flags).group(1)
        build.bu_not_live_size = re.search('非住宅建筑面积.*?;">(.*?)</span', bu_con,
                                           flags).group(1)
        build.bo_build_start_time = re.search('开工日期.*?;">(.*?)</span', bu_con,
                                              flags).group(1)
        build.bu_all_house = re.search(r'总套数.*?>(\d+)<', bu_con,
                                       flags).group(1)
        build.bu_pre_sale = bu_pre_sale
        build.bo_develops = bo_develops
        build.co_name = bu_co_name
        build.insert_db()
 def get_build_detail(self, all_building_url_list):
     """Store the buildings of every project page and collect room-table URLs.

     Args:
         all_building_url_list: Iterable of project page URLs.

     Returns:
         list: Absolute room-table URLs of the buildings that were stored.
         URLs gathered from later pages are placed before those of
         earlier pages.
     """
     flags = re.S | re.M
     # Text of the link in column N of the first data row of the table.
     first_row_cell = '//*[@id="houseTable_1"]/tr[2]/td[{}]/a/text()'
     house_url_list = []
     for i in all_building_url_list:
         try:
             html = requests.get(i, headers=self.headers).text
             tree = etree.HTML(html)
             bo_develops = tree.xpath(
                 '//*[@id="content_1"]/div[3]/text()[2]')[0]  # developer
             # Sale area and pre-sale permit are optional: when the cell is
             # missing, the empty xpath result list is kept as the value.
             bu_build_size = tree.xpath(first_row_cell.format(6))
             if bu_build_size:
                 bu_build_size = bu_build_size[0]
             bu_pre_sale = tree.xpath(first_row_cell.format(1))
             if bu_pre_sale:
                 bu_pre_sale = bu_pre_sale[0]
             # The remaining cells are mandatory; a miss raises IndexError
             # and the whole page is reported and skipped.
             bu_floor = tree.xpath(first_row_cell.format(3))[0]  # floors
             bu_all_house = tree.xpath(first_row_cell.format(4))[0]  # units
             bu_type = tree.xpath(first_row_cell.format(5))[0]  # usage
             build_html = re.search('houseTable_1.*?当前共有', html,
                                    flags).group()
             build_detail_html = re.findall(
                 'class.*?</a></td>.*?</a></td>.*?</a></td>', build_html,
                 flags)
             bu_num = re.findall('项目名称:</b>(.*?)</div>', html,
                                 flags)[0].strip()
             url_list = []
             # NOTE(review): the first-row values above are copied onto every
             # building of the page -- confirm that is intended.
             for bu in build_detail_html:
                 try:
                     build = Building(co_index)
                     build.bu_id = re.search(
                         r"href='roomTable.aspx\?id=(.*?)&", bu,
                         flags).group(1)
                     build.bu_address = re.search(
                         "_blank.*?_blank'>(.*?)</a></td><td>", bu,
                         flags).group(1).strip()
                     build.bo_develops = bo_develops
                     build.bu_build_size = bu_build_size
                     build.bu_pre_sale = bu_pre_sale
                     build.bu_num = bu_num
                     build.bu_floor = bu_floor
                     build.bu_all_house = bu_all_house
                     build.bu_type = bu_type
                     # Every keyword contained in the address is applied in
                     # turn, so the last match in self.area_list wins.
                     for k in self.area_list:
                         if k in build.bu_address:
                             build.area = k
                     build.insert_db()
                     house_url = re.search(
                         r"(roomTable.aspx\?id=.*?&vc=.*?)'", bu,
                         flags).group(1)
                     url_list.append(
                         'http://dgfc.dg.gov.cn/dgwebsite_v2/Vendition/' +
                         house_url)
                 except Exception as e:
                     print('楼栋错误,co_index={},url={}'.format(co_index, i), e)
             house_url_list = url_list + house_url_list
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, i), e)
     return house_url_list
Beispiel #20
0
    def get_build_url(self, all_list_url):
        """Crawl project pages and store their community, buildings and houses.

        For each entry the project page is fetched and stored as a ``Comm``;
        every row of its building table is stored as a ``Building``; every
        house link on the building's page is fetched and stored as a
        ``House``. A failure at one level is printed and only skips that
        community, building or house.

        Args:
            all_list_url: Iterable of dicts with a ``'url'`` key (project
                page URL) and an ``'area'`` key (copied onto the community).
        """
        flags = re.S | re.M
        # (House attribute, regex) pairs read from a house detail page.
        house_fields = (
            ('co_name', '项目名称:.*?<td>(.*?)</td>'),
            ('ho_name', '房  号:.*?<td>(.*?)</td>'),
            ('ho_build_size', '建筑面积:.*?<td>(.*?)</td>'),
            ('ho_type', '房屋用途:.*?<td>(.*?)</td>'),
            ('ho_floor', '所 在 层:.*?<td>(.*?)</td>'),
            ('ho_share_size', '分摊面积:.*?<td>(.*?)</td>'),
            ('ho_room_type', '房屋户型:.*?<td>(.*?)</td>'),
            ('ho_true_size', '套内面积:.*?<td>(.*?)</td>'),
        )
        for i in all_list_url:
            try:
                res = requests.get(url=i['url'], )
                html_str = res.content.decode()
                c = Comm(self.co_index)
                c.area = i['area']
                c.co_name = re.search('项目名称:.*?left">(.*?)</td>', html_str,
                                      flags).group(1)
                c.co_owner = re.search('所有权证号:.*?left">(.*?)</td>', html_str,
                                       flags).group(1)
                c.co_land_use = re.search('土地使用权证:.*?left">(.*?)</td>',
                                          html_str, flags).group(1)
                c.co_land_type = re.search('土地权证类型:.*?left">(.*?)</td>',
                                           html_str, flags).group(1)
                print(c.co_name)
                c.insert_db()

                # Locate the building table, then walk its rows.
                build_str = re.search('楼盘表<(.*?)/table>', html_str,
                                      flags).group(1)
                for k in re.findall('<tr>.*?</tr>', build_str, flags):
                    try:
                        b = Building(self.co_index)
                        # Same project name that was just parsed for the
                        # community record.
                        b.co_name = c.co_name
                        b.bu_num = re.search('buildingInfo.*?">(.*?)</a>', k,
                                             flags).group(1)
                        b.bu_build_size = re.findall('<td.*?>(.*?)</td>', k,
                                                     flags)[3]
                        house_url = re.search(r"this,'(.*?)'\);", k,
                                              flags).group(1)
                        b.insert_db()

                        complete_url = self.house_url + house_url
                        # Page listing every house of this building.
                        house_html_str = requests.get(
                            url=complete_url,
                            headers=self.headers).content.decode()
                        for j in re.findall('a href="(.*?)" target',
                                            house_html_str, flags):
                            # Bound before the try so the error handler can
                            # always report it, even when the first house
                            # fails before the request is made.
                            com_url = self.house_url + j
                            try:
                                h = House(self.co_index)
                                h.bu_num = re.search(
                                    '<h3 class="h3">(.*?)</h3>',
                                    house_html_str, flags).group(1)
                                house_detail_html = requests.get(
                                    url=com_url,
                                    headers=self.headers).content.decode()
                                for attr, pattern in house_fields:
                                    setattr(h, attr,
                                            re.search(pattern,
                                                      house_detail_html,
                                                      flags).group(1))
                                h.insert_db()
                            except Exception as e:
                                print(
                                    '房号错误,co_index={},url={}'.format(
                                        self.co_index, com_url), e)
                    except Exception as e:
                        print(
                            '楼栋错误,co_index={},url={}'.format(
                                self.co_index, i['url']), e)
            except Exception as e:
                print(
                    '小区错误,co_index={},url={}'.format(self.co_index, i['url']),
                    e)
    def start_crawler(self):
        """Crawl the pre-sale licence list and store communities, buildings, houses.

        The list page (module-level ``url``) yields one community per table
        row. Each community's detail page yields building serial numbers,
        and each building page yields house links. A failure at one level is
        printed and only skips that community, building or house.
        """
        base_url = 'http://www.jyfg.cn/HouseWebSetup/PublicReport/'
        row_cell = '//tr[@class="Row"]/td[{}]/text()'
        flags = re.S | re.M
        # (Building attribute, regex) pairs read from a building page.
        building_fields = (
            ('bu_num', '栋号.*?<span.*?">(.*?)</span'),
            ('bu_build_size', '总建筑面积.*?<span.*?">(.*?)</span'),
            ('bu_floor', '层数.*?<span.*?">(.*?)</span'),
            ('bu_all_house', '预售套数.*?<span.*?">(.*?)</span'),
            ('bu_pre_sale_date', '有效期.*?<span.*?">(.*?)</span'),
            ('bu_type', '土地用途.*?<span.*?">(.*?)</span'),
            ('bu_pre_sale', '许可证编号.*?<span.*?">(.*?)</span'),
        )
        # (House attribute, regex) pairs read from a house page.
        house_fields = (
            ('ho_name', '房号:.*?<span.*?>(.*?)<'),
            ('ho_build_size', '预测建筑面积:.*?<span.*?>(.*?)<'),
            ('ho_true_size', '预测套内面积:.*?<span.*?>(.*?)<'),
            ('ho_share_size', '预测分摊面积:.*?<span.*?>(.*?)<'),
            ('ho_type', '房屋用途:.*?<span.*?>(.*?)<'),
            ('ho_room_type', '户型结构:.*?<span.*?>(.*?)<'),
        )

        list_tree = etree.HTML(requests.get(url).text)
        comm_list = list_tree.xpath(row_cell.format(1))
        co_develops_list = list_tree.xpath(row_cell.format(3))
        co_name_list = list_tree.xpath(row_cell.format(4))
        co_pre_sale_list = list_tree.xpath(row_cell.format(5))
        co_address_list = list_tree.xpath(row_cell.format(8))
        co_open_time_list = list_tree.xpath(row_cell.format(9))
        co_build_size_list = list_tree.xpath(row_cell.format(10))
        co_all_house_list = list_tree.xpath(row_cell.format(11))

        for co, comm_id in enumerate(comm_list):
            # URLs are built before each try so the handlers can always
            # report them.
            comm_url = (base_url +
                        'PreSellLicenceDetailInfo.aspx?PreSellLicenceSN=' +
                        comm_id)
            try:
                comm = Comm(co_index)
                comm_tree = etree.HTML(requests.get(comm_url).text)
                build_list = comm_tree.xpath(row_cell.format(1))
                area = comm_tree.xpath('//*[@id="LabSCFW"]/text()')[0]
                comm.co_id = comm_id
                comm.area = area
                # The column lists are indexed in parallel; a row with an
                # empty cell raises IndexError and is reported below.
                comm.co_develops = co_develops_list[co]
                comm.co_address = co_address_list[co]
                comm.co_open_time = co_open_time_list[co]
                comm.co_pre_sale = co_pre_sale_list[co]
                comm.co_all_house = co_all_house_list[co]
                comm.co_build_size = co_build_size_list[co]
                comm.co_name = co_name_list[co]
                comm.insert_db()
                for bu_id in build_list:
                    build_url = (base_url +
                                 'PubRptHouseList.aspx?BuildingSN=' + bu_id)
                    try:
                        res = requests.get(build_url, headers=self.headers)
                        con = res.content.decode('gbk')
                        building = Building(co_index)

                        building.co_id = comm.co_id
                        building.bu_id = bu_id
                        for attr, pattern in building_fields:
                            setattr(building, attr,
                                    re.search(pattern, con, flags).group(1))
                        building.insert_db()

                        house_list = re.findall('房号:<a href="(.*?)"', con)
                        for ho in house_list:
                            house_url = base_url + ho
                            try:
                                house = House(co_index)
                                house_html = requests.get(house_url).text
                                house.co_id = comm.co_id
                                house.bu_id = building.bu_id
                                for attr, pattern in house_fields:
                                    setattr(house, attr,
                                            re.search(pattern, house_html,
                                                      flags).group(1))

                                house.insert_db()
                            except Exception as e:
                                print("co_index={},房屋{}信息提取失败".format(
                                    co_index, house_url))
                                print(e)
                    except Exception as e:
                        print(e)
                        print('co_index={},楼栋{}提取失败'.format(
                            co_index, build_url))
            except Exception as e:
                print('co_index={},小区{}提取失败'.format(co_index, comm_url))
                print(e)
Beispiel #22
0
 def get_comm_info(self, url, comm):
     """Fill and store a community from its page, then store its buildings.

     Args:
         url: Community page URL; the community id is the part between
             ``home/`` and ``.html``.
         comm: Community record to populate; ``insert_db()`` is called on it.

     Errors are printed, not raised: a failure while reading the community
     aborts the whole page, a failure on one building row skips that row.
     """

     def cell(row, col):
         # Stripped text of one cell of the community summary table.
         return tree.xpath(
             '//*[@id="ctl00_CPH_M_sm_spfBox3"]/div/table'
             '/tr[{}]/td[{}]/text()'.format(row, col))[0].strip()

     try:
         response = requests.get(url=url, headers=self.headers)
         tree = etree.HTML(response.text)
         co_id = re.search(r'home/(.*?)\.html', url).group(1)
         comm.co_name = cell(1, 2)  # community name
         comm.co_address = cell(2, 2)  # address
         comm.co_build_start_time = cell(3, 2)  # construction start
         comm.co_build_end_time = cell(3, 4)  # completion
         comm.co_build_structural = cell(4, 2)  # building structure
         comm.co_volumetric = cell(6, 4)  # plot ratio
         comm.co_green = cell(6, 2)  # greening rate
         comm.co_size = cell(5, 2)  # site area
         comm.co_id = co_id
         comm.insert_db()
         build_info_list = tree.xpath(
             '//*[@id="ctl00_CPH_M_sm_spfBox1"]/div/table/tr[@class="hobuild"]'
         )
         for i in build_info_list:
             try:
                 build = Building(11)
                 # string() yields a single str, so it must not be indexed:
                 # indexing kept only the first character of the name.
                 bu_name = i.xpath('string(td[1])').strip()
                 bu_all_house = i.xpath('td[2]/text()')[0]
                 # Building id is the trailing query value of the row link.
                 bu_id = i.xpath('td[1]/strong/a/@href')[0]
                 bu_id = re.search('building_id=(.*?)$', bu_id).group(1)
                 # NOTE(review): the replaced text looks like a mis-decoded
                 # unit suffix -- confirm against the page encoding.
                 bu_build_size = i.xpath('string(td[3])').replace('�O', '')
                 build.co_id = co_id
                 build.bu_id = bu_id
                 build.bu_all_house = bu_all_house
                 build.bu_name = bu_name
                 build.bu_build_size = bu_build_size
                 build.insert_db()
                 self.get_house_info(bu_id, co_id)
             except Exception as e:
                 print('楼栋错误,co_index={},url={}'.format(co_index, url), e)
     except Exception as e:
         # Exception, not BaseException, so Ctrl-C / SystemExit still
         # stop the crawler.
         print('楼栋错误,co_index={},url={}'.format(co_index, url), e)
Beispiel #23
0
    def get_build_url_list(self, url_list):
        """Crawl list pages and store their communities, buildings and houses.

        Each list page is split into community snippets; every community is
        stored, its detail page is fetched and each row of its building
        table is stored; for every building the room-number XML is fetched
        and one ``House`` is stored per ``<ROOM_NUMBER>`` element. The
        module-level ``count`` is incremented and printed per community.

        A failure at any level is printed and only skips that list page,
        community, building or house.

        Args:
            url_list: Iterable of list page URLs.
        """
        global count
        flags = re.S | re.M
        for i in url_list:
            try:
                list_html = requests.get(i).content.decode('gbk')
                for k in re.findall('项目名称.*?</dl>', list_html, flags):
                    try:
                        c = Comm(self.co_index)
                        c.co_name = re.search('html">(.*?)</a>', k,
                                              flags).group(1)
                        c.co_address = re.search('class="address"(.*?)</dd>',
                                                 k, flags).group(1)
                        c.area = re.search('"city">(.*?)</dd>', k,
                                           flags).group(1)
                        c.co_develops = re.search('"average">(.*?)</dd>', k,
                                                  flags).group(1)
                        c.insert_db()
                        count += 1
                        print(count)

                        comm_url = self.url_source + re.search(
                            'a href="(.*?)">', k, flags).group(1)
                        comm_html = requests.get(comm_url).content.decode(
                            'gbk')
                        build_info_str = re.search('楼盘表</td>(.*?)合  计',
                                                   comm_html, flags).group(1)
                        for j in re.findall('<tr.*?</tr>', build_info_str,
                                            flags):
                            try:
                                b = Building(self.co_index)
                                b.co_name = c.co_name
                                b.bu_all_house = re.search(
                                    'absmiddle"  />(.*?)</a>', j,
                                    flags).group(1)
                                b.bu_num = re.search(
                                    '="absmiddle"  />(.*?)</a></strong></', j,
                                    flags).group(1)
                                b.bu_build_size = re.search(
                                    'td class="t_c">.*?td class="t_c">(.*?㎡)</td>',
                                    j, flags).group(1)
                                b.insert_db()

                                build_url = self.url_source + re.search(
                                    'a href="(.*?)"', j, flags).group(1)
                                build_html = requests.get(
                                    build_url).content.decode('gbk')
                                # The room table lives in an iframe; its
                                # "Default" page has a "GetData" twin that
                                # exposes the logic-building id.
                                house_url = self.url_source + re.search(
                                    '<iframe.*?"(.*?)"', build_html,
                                    flags).group(1)
                                logic_house_url = house_url.replace(
                                    'Default', 'GetData')
                                logic_house_html = requests.get(
                                    url=logic_house_url).content.decode()
                                logic_id = re.search(
                                    '<LOGICBUILDING_ID>(.*?)<',
                                    logic_house_html, flags).group(1)
                                final_url = 'http://www.yingtanfdc.com/website/presale/home/HouseTableControl/GetData.aspx?LogicBuilding_ID=' + logic_id
                                final_html = requests.get(
                                    url=final_url).content.decode('gbk')
                                for l in re.findall(
                                        '<ROOM_NUMBER>(.*?)</ROOM_NUMBER>',
                                        final_html, flags):
                                    try:
                                        h = House(self.co_index)
                                        # The whole XML payload is kept on
                                        # every house record.
                                        h.info = final_html
                                        h.ho_name = l
                                        h.co_name = c.co_name
                                        h.bu_num = b.bu_num
                                        h.insert_db()
                                    except Exception as e:
                                        print(
                                            '房号错误,co_index={},url={}'.format(
                                                self.co_index, final_url), e)
                            except Exception as e:
                                print(
                                    '楼栋错误,co_index={},url={}'.format(
                                        self.co_index, comm_url), e)
                    except Exception as e:
                        print(
                            '小区错误,co_index={},url={}'.format(
                                self.co_index, i), e)
            except Exception as e:
                print(
                    '列表页错误,co_index={},url={}'.format(self.co_index, i), e)
    def get_comm_detail(self, href, comm):
        """Populate a community from its detail page and store its buildings.

        Args:
            href: Path appended to ``self.URL_FRONT`` to form the detail URL.
            comm: Community record to populate.

        The community id is the integer after the first ``=`` of the final
        response URL. When the page lists no building links the method
        returns before ``comm.insert_db()`` is reached.
        """
        comm_detail_url = self.URL_FRONT + href
        response = requests.get(url=comm_detail_url, headers=self.headers)
        co_id = response.url
        co_id = int(co_id.split('=')[1])  # community id
        html = response.content.decode('gbk')

        co_name = self.regex_common(r'项目名称.*?<td.*?>(.*?)</td>', html)  # community name
        co_owner = self.regex_common(r'房屋所有权证号.*?<td.*?>(.*?)</td>', html)
        co_use = self.regex_common(r'用  途.*?<td.*?>(.*?)</td>', html)
        co_develops = self.regex_common(r'开 发 商.*?<td.*?>(.*?)</td>', html)
        co_address = self.regex_common(r'项目位置.*?<td.*?>(.*?)</td>', html)
        co_pre_sale = self.regex_common(r'预售证号.*?<td.*?>(.*?)</td>', html)
        co_land_use = self.regex_common(r'土地使用权证.*?<td.*?>(.*?)</td>', html)
        co_land_type = self.regex_common(r'土地权证类型.*?<td.*?>(.*?)</td>', html)
        co_handed_time = self.regex_common(r'终止日期.*?<td.*?>(.*?)</td>', html)
        co_plan_pro = self.regex_common(r'规划许可证.*?<td.*?>(.*?)</td>', html)
        co_work_pro = self.regex_common(r'施工许可证.*?<td.*?>(.*?)</td>', html)
        co_type = self.regex_common(r'项目类型.*?<td.*?>(.*?)</td>', html)  # community type
        co_size = self.regex_common(r'批准面积.*?<td.*?>(.*?)</td>', html)  # site area
        comm.co_id = co_id
        comm.co_name = co_name
        comm.co_type = co_type
        comm.co_size = co_size
        comm.co_owner = co_owner
        comm.co_use = co_use
        comm.co_develops = co_develops
        comm.co_address = co_address
        comm.co_pre_sale = co_pre_sale
        comm.co_land_use = co_land_use
        comm.co_land_type = co_land_type
        comm.co_handed_time = co_handed_time
        comm.co_plan_pro = co_plan_pro
        comm.co_work_pro = co_work_pro
        # Collect the building detail links.
        build_url_list = re.findall(r"<td><a href='(.*?)'", html, re.M | re.S)
        if not build_url_list:
            # NOTE(review): the community is not inserted on this path --
            # confirm that skipping building-less communities is intended.
            return
        for build_url in build_url_list:
            try:
                building = Building(self.CO_INDEX)
                # NOTE(review): the next three values are searched in the
                # whole page, so they are identical for every building --
                # presumably they should be scoped to the building's row.
                build_id = re.search(r'<td>(\d{2,6})</td>', html,
                                     re.M | re.S).group(1)  # building id
                bu_all_house = re.search(r'<td>(\d{1,3})</td>', html,
                                         re.M | re.S).group(1)  # total units
                bu_price_demo = re.findall(r'<td>[\.\d]+</td>', html,
                                           re.M | re.S)[4]
                bu_price = re.search(r'\d+', bu_price_demo).group()
                data_dict = self.get_build_detail(build_url)
                bu_num = data_dict['bu_num']  # building number
                bu_build_size = data_dict['bu_build_size']  # floor area
                co_address = data_dict['co_address']  # community address
                co_build_end_time = data_dict['co_build_end_time']  # completion time
                co_build_type = data_dict['co_build_type']  # building type
                if not co_build_end_time:
                    building.co_is_build = '1'
                # Values from the building page overwrite the community's.
                comm.co_address = co_address
                comm.co_build_end_time = co_build_end_time
                comm.bu_build_size = bu_build_size
                comm.co_build_type = co_build_type
                # Building record.
                building.bu_num = bu_num
                building.bu_build_size = bu_build_size
                building.bu_all_house = bu_all_house
                building.bu_id = build_id
                building.co_id = co_id
                building.bu_price = bu_price
                building.insert_db()
            except Exception as e:
                build_detail_url = self.URL_FRONT + build_url
                # Include the exception so the failure reason is visible.
                print('楼栋错误:', build_detail_url, e)
        comm.insert_db()