Esempio n. 1
0
 def get_build_info(self, build_info_list, co_id, comm_html, url):
     """Insert a Building record per row, then fetch the building detail page.

     Args:
         build_info_list: rows read positionally; columns 1, 3, 5 and 9 are
             used as name, total units, area and price.
         co_id: community id stored on each record and forwarded to
             ``get_build_detail``.
         comm_html: community page HTML searched for the detail link.
             The patterns contain no spaces (``<ahref=``), so presumably
             the caller stripped whitespace first -- TODO confirm.
         url: only used in the error message.

     Any exception raised while handling a row is printed and the loop
     continues with the next row.
     """
     for row in build_info_list:
         try:
             record = Building(2)
             name = row[1]
             # Values are evaluated top to bottom, then copied onto the record.
             fields = {
                 'bu_name': name,                   # building name
                 'bu_num': name.split('#')[0],      # building number: text before '#'
                 'bu_all_house': row[3],            # total number of units
                 'bu_build_size': row[5],           # area
                 'bu_price': row[9],                # price
                 'co_id': co_id,                    # community id
             }
             for attr, value in fields.items():
                 setattr(record, attr, value)
             # NOTE(review): these searches depend only on comm_html, not on
             # the row, so every row receives the same first match -- confirm
             # this is intended.
             section_match = re.search(r'楼盘表(.*?)个楼栋信息', comm_html)
             build_html = section_match.group(1)
             link_match = re.search(r'<ahref="(.*?)">查看信息<', build_html)
             build_url = link_match.group(1)
             id_match = re.search('buildingId=(.*?)$', build_url)
             record.bu_id = id_match.group(1)  # building id
             record.insert_db()
             self.get_build_detail(build_url, co_id)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, url), e)
Esempio n. 2
0
    def get_build_info(self, url, co_id):
        """Scrape one building page: store the building, then each listed house.

        Args:
            url: building page URL; the building id is the run of digits
                after the final ``?``.
            co_id: community id stored on the building and on every house.

        Errors do not propagate to the caller: a failure while storing one
        house is printed and the next house is tried; any other
        ``Exception`` is printed and ends the scrape of this page.
        ``KeyboardInterrupt`` and ``SystemExit`` are deliberately not caught
        so the crawler can still be stopped.
        """
        try:
            building = Building(co_index)
            response = requests.get(url)
            html = response.text
            tree = etree.HTML(html)
            co_name = tree.xpath('//*[@id="PageB_Location"]/text()')[0]  # community name
            print(co_name)
            bu_name = tree.xpath('//*[@id="ItemName"]/text()')[0]  # building name
            bu_num = tree.xpath('//*[@id="PageB_HouseNo"]/text()')[0]  # building number
            bu_all_house = tree.xpath(
                '//*[@id="lb_countbulidtaoshu"]/text()')[0]  # total number of units
            bu_floor = tree.xpath('//*[@id="cell3-1"]/text()')
            bu_floor = self.is_none(bu_floor)  # floors
            bu_build_size = tree.xpath(
                '//*[@id="lb_countbulidarea"]/text()')[0]  # building area
            bu_live_size = tree.xpath(
                '//*[@id="lb_buildarea"]/text()')[0]  # residential area
            bu_price = tree.xpath('//*[@id="lb_buildavg"]/text()')
            bu_price = self.is_none(bu_price)  # residential price
            # Raw string: '\?' and '\d' are invalid escapes in a plain literal.
            bu_id = re.search(r'\?(\d+)$', url).group(1)  # building id
            building.co_id = co_id
            building.bu_name = bu_name
            building.bu_num = bu_num
            building.bu_all_house = bu_all_house
            building.bu_floor = bu_floor
            building.bu_build_size = bu_build_size
            building.bu_live_size = bu_live_size
            building.bu_price = bu_price
            building.bu_id = bu_id
            building.insert_db()

            flags = re.S | re.M
            # Everything from the row with id "row3" to the end of the page.
            house_info_html = re.findall('<tr id="row3">(.*)$', html, flags)[0]
            for cell in re.findall('(<td.*?>.*?</td>)', house_info_html, flags):
                # Only cells that contain <br> are parsed as houses.
                if '<br>' not in cell:
                    continue
                ho_name_list = re.findall('<td.*?>(.*?)<br>', cell, flags)
                ho_true_size_list = re.findall('<td.*?>.*?<br>(.*?)<br>', cell,
                                               flags)
                ho_type = re.findall('<td.*?>.*?<br>.*?<br>(.*?)<br>', cell,
                                     flags)[0]
                for idx, raw_name in enumerate(ho_name_list):
                    try:
                        # Drop any opening <font ...> tag; a no-op when the
                        # name contains none.
                        ho_name = re.sub('<font.*?>', '', raw_name)
                        house = House(8)
                        house.ho_name = ho_name
                        # May raise IndexError when fewer sizes than names
                        # were found; reported below and the house is skipped.
                        house.ho_true_size = ho_true_size_list[idx]
                        house.co_id = co_id
                        house.bu_id = bu_id
                        house.ho_type = ho_type
                        house.insert_db()
                    except Exception as e:
                        print(e)
        except Exception as e:
            print(e)
Esempio n. 3
0
 def get_comm_info(self, comm_url_list):
     """Scrape each community page, storing the community and its buildings.

     Args:
         comm_url_list: paths appended to
             ``http://old.newhouse.cnnbfdc.com/`` to form each page URL.

     For every page the community fields are extracted with regular
     expressions and inserted, the module-level ``count`` is incremented
     and printed, one Building is inserted per ``window.open`` link, and
     the list of building URLs is passed to ``get_house_info``.  Any
     exception for a page (or for a single building) is printed and
     processing continues with the next one.
     """
     global count
     flags = re.S | re.M

     def first(pattern, text):
         # First capture of ``pattern`` in ``text``; IndexError when absent.
         return re.findall(pattern, text, flags)[0]

     for i in comm_url_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://old.newhouse.cnnbfdc.com/' + i
             response = requests.get(comm_url, headers=self.headers)
             html = response.text
             comm.co_name = first('项目名称:.*?<span.*?>(.*?)<', html).strip()
             comm.co_address = first('项目地址:.*?<td.*?>(.*?)<', html).strip()
             comm.co_develops = first('开发公司:.*?<td.*?>(.*?)<', html).strip()
             # Raw strings below: '\(' and '\?' are invalid escapes in a
             # plain literal.
             comm.co_pre_sale = first(r'预\(现\)售证名称:.*?<td.*?>(.*?)<', html).strip()
             comm.co_build_size = first(
                 '纳入网上可售面积:.*?<img.*?>(.*?)<', html).replace('m&sup2;', '').strip()
             comm.co_all_house = first(
                 '纳入网上可售套数:.*?<img.*?>(.*?)<', html).replace('套', '').strip()
             comm.area = first('所在区县:.*?<td.*?>(.*?)<', html).strip()
             comm.co_id = first(r'mobanshow.aspx\?projectid=(.*?)"', html).strip()
             comm.insert_db()
             count += 1
             print(count)

             build_url_list = re.findall(r"window.open\('(.*?)'", html, flags)
             bu_name_list = re.findall("window.open.*?<font.*?>(.*?)<", html, flags)
             bu_all_house_list = re.findall("window.open.*?<td.*?>(.*?)<", html, flags)
             qrykey = re.findall("qrykey=(.*?)&", html, flags)
             # Indexed rather than zipped: the four lists come from separate
             # searches, and a missing entry is reported per building.
             for index in range(len(build_url_list)):
                 try:
                     build = Building(co_index)
                     build.bu_name = bu_name_list[index].strip()
                     build.bu_all_house = bu_all_house_list[index].strip()
                     build.co_id = comm.co_id
                     build.bu_id = qrykey[index].strip()
                     build.insert_db()
                 except Exception as e:
                     print(e)
             self.get_house_info(build_url_list)
         except Exception as e:
             print(e)
Esempio n. 4
0
    def get_comm_info(self, url, response, comm):
        """Parse a community page, store its buildings, then the community.

        Args:
            url: community page URL; the community id is everything after
                ``id=``.
            response: already-fetched response for ``url``; only ``.text``
                is read.
            comm: community record that is populated and inserted last.

        Raises:
            IndexError, AttributeError: when a field selector or the
                building-link block is not found on the page; these lookups
                are not guarded.

        For each building link the floor-count endpoint and the room page
        are requested.  A failed floor request skips the building.  A failed
        room request is reported and only the house scrape is skipped; the
        building itself is still inserted.
        """
        html = response.text
        tree = etree.HTML(html)
        info_table = '//*[@id="content"]/div[2]/div[1]/div[2]/table'

        # district
        co_area = tree.xpath(info_table + '/tr[3]/td[2]/text()')[0]
        # community name
        co_name = tree.xpath(info_table + '/tr[1]/td/strong/span/text()')[0]
        # community address
        co_address = tree.xpath(info_table + '/tr[2]/td/span/text()')[0]
        # developer
        co_develops = tree.xpath(info_table + '/tr[3]/td[1]/span/@title')[0]
        # property management company
        # NOTE(review): this overwrites the developer read just above, so
        # comm.co_develops ends up holding this value -- confirm which of the
        # two is intended.
        co_develops = tree.xpath(
            '//div[@class="wzjs-box"]//tr[3]//span/text()')[0]
        # plot ratio
        co_volumetric = tree.xpath(info_table + '/tr[5]/td[2]/span/text()')[0]
        # pre-sale certificate
        co_pre_sale = tree.xpath(info_table + '/tr[6]/td[1]/text()')[0]
        # building area
        co_build_size = tree.xpath(info_table + '/tr[5]/td[1]')[0].text
        # community id
        co_id = re.search('id=(.*?)$', url).group(1)

        # Strip tabs, newlines and spaces so the patterns below (written
        # without spaces, e.g. '<pclass=') can match.
        html_ = re.sub(r'[\t\r\n ]', '', html)
        bu_url_info = re.search('<pclass="bot-a">(.*?)</p>', html_).group(1)
        building_url_list = re.findall('<td><aid="(.*?)"(.*?)>(.*?)</a>',
                                       bu_url_info)

        for i in building_url_list:
            build = Building(co_index)
            value = i[0]
            bu_name = i[2]
            house_url = 'http://fsfc.fsjw.gov.cn/hpms_project/room.jhtml?id=' + value
            floor_url = "http://fsfc.fsjw.gov.cn/hpms_project/roomtj.jhtml?id=" + value

            try:
                res = requests.get(floor_url, headers=self.headers)
            except Exception as e:
                print("co_index={},楼栋详情页{}访问失败".format(co_index, floor_url))
                print(e)
                continue

            try:
                bu_floor = json.loads(res.text)
                build.bu_floor = bu_floor["zcs"]
            except Exception:
                # Unparseable body or missing "zcs": store no floor count.
                build.bu_floor = None

            try:
                house_response = requests.get(house_url, headers=self.headers)
            except Exception as e:
                print("co_index={},房屋详情页{}请求失败".format(co_index, house_url))
                print(e)
            else:
                # Only scrape houses from a response that was actually
                # received for this building.
                self.get_build_info(house_url, house_response, co_id, value)

            build.co_id = co_id
            build.bu_id = value
            build.bu_name = bu_name

            build.insert_db()

        comm.co_name = co_name
        comm.co_id = co_id
        comm.co_address = co_address
        comm.co_develops = co_develops
        comm.co_volumetric = co_volumetric
        comm.co_pre_sale = co_pre_sale
        comm.co_build_size = co_build_size
        comm.area = co_area
        comm.insert_db()
Esempio n. 5
0
 def get_comm_info(self, url, comm):
     """Fetch a community page, store the community, then each building row.

     Args:
         url: community page URL; the community id is the text between
             ``home/`` and ``.html``.
         comm: community record that is populated and inserted.

     Errors do not propagate to the caller: a failure on one building row
     is printed and the next row is tried; any other ``Exception`` is
     printed and ends processing of this page.  ``KeyboardInterrupt`` and
     ``SystemExit`` are deliberately not caught so the crawler can still
     be stopped.
     """
     try:
         response = requests.get(url=url, headers=self.headers)
         html = response.text
         tree = etree.HTML(html)

         def cell(row, col):
             # Stripped text of one cell of the community info table;
             # IndexError when the cell has no text node.
             path = ('//*[@id="ctl00_CPH_M_sm_spfBox3"]/div/table'
                     '/tr[{}]/td[{}]/text()'.format(row, col))
             return tree.xpath(path)[0].strip()

         co_name = cell(1, 2)               # community name
         co_address = cell(2, 2)            # community address
         co_build_start_time = cell(3, 2)   # construction start date
         co_build_end_time = cell(3, 4)     # completion date
         co_build_structural = cell(4, 2)   # building structure
         co_volumetric = cell(6, 4)         # plot ratio
         co_green = cell(6, 2)              # greening rate
         co_size = cell(5, 2)               # land area
         co_id = re.search('home/(.*?).html', url).group(1)
         comm.co_name = co_name
         comm.co_address = co_address
         comm.co_build_start_time = co_build_start_time
         comm.co_build_end_time = co_build_end_time
         comm.co_build_structural = co_build_structural
         comm.co_volumetric = co_volumetric
         comm.co_green = co_green
         comm.co_size = co_size
         comm.co_id = co_id
         comm.insert_db()
         build_info_list = tree.xpath(
             '//*[@id="ctl00_CPH_M_sm_spfBox1"]/div/table/tr[@class="hobuild"]'
         )
         for i in build_info_list:
             try:
                 build = Building(11)
                 # Building name.  XPath string() already returns a str, so
                 # indexing it with [0] would keep only its first character.
                 bu_name = i.xpath('string(td[1])').strip()
                 bu_all_house = i.xpath('td[2]/text()')[0]
                 # Building id: value of building_id in the link target.
                 bu_href = i.xpath('td[1]/strong/a/@href')[0]
                 bu_id = re.search('building_id=(.*?)$', bu_href).group(1)
                 # Building area with the unit suffix removed.  '\ufffdO' is
                 # the literal the original code stripped -- presumably a
                 # mis-decoded square-metre sign; both forms are removed.
                 bu_build_size = (i.xpath('string(td[3])')
                                  .replace('㎡', '')
                                  .replace('\ufffdO', ''))
                 build.co_id = co_id
                 build.bu_id = bu_id
                 build.bu_all_house = bu_all_house
                 build.bu_name = bu_name
                 build.bu_build_size = bu_build_size
                 build.insert_db()
                 self.get_house_info(bu_id, co_id)
             except Exception as e:
                 print('楼栋错误,co_index={},url={}'.format(co_index, url), e)
     except Exception as e:
         print('楼栋错误,co_index={},url={}'.format(co_index, url), e)