예제 #1
0
    def bu_info(self, bu_list, co_id):
        """Fetch every building page, save the building, then crawl its houses.

        Args:
            bu_list: relative building URLs appended to the site root; each
                is expected to contain ``bdid=<digits>``.
            co_id: community id copied onto each building and forwarded to
                ``ho_info``.

        A building whose page cannot be fetched or parsed is reported on
        stdout and skipped; the loop then continues with the next URL.
        """
        flags = re.S | re.M

        def field(pattern, text):
            # First capture group of the pattern. A missing label raises
            # AttributeError, which the caller's except turns into a skip.
            return re.search(pattern, text, flags).group(1)

        for bu in bu_list:
            try:
                bu_url = 'http://www.fxfdcw.com/' + bu
                res = requests.get(bu_url, headers=self.headers)
                con = res.content.decode('gbk')
                html = etree.HTML(con)
                build = Building(co_index)
                build.co_id = co_id
                # Raw string: '\d' inside a plain literal is an invalid escape
                # sequence and triggers a warning on current Python versions.
                build.bu_id = re.search(r'bdid=(\d+)', bu).group(1)
                build.bu_num = field('楼号.*?">(.*?)</', con)
                build.bu_address = field('坐落.*?">(.*?)</', con)
                build.bu_floor = field('地上层数.*?">(.*?)</', con)
                build.bu_build_size = field('建筑面积.*?wrap">(.*?)</', con)
                build.bu_all_house = field('套 数.*?">(.*?)</', con)
                build.bu_type = field('用  途.*?wrap">(.*?)</', con)
                build.insert_db()

                # Every <span> that carries a title attribute is handed on.
                ho_list = html.xpath("//span[@title]")
            except Exception as e:
                print("楼栋信息错误{}".format(e))
                continue
            self.ho_info(ho_list, co_id, build.bu_id)
예제 #2
0
    def get_build_info(self, url, co_id):
        """Scrape one building page and save the building plus its houses.

        Args:
            url: building page URL; it must end in ``?<digits>`` because
                those digits become the building id.
            co_id: community id copied onto the building and each house.

        Ordinary errors are printed, not raised: a failure on one house only
        skips that house, any other failure abandons the rest of the page.
        """
        flags = re.S | re.M
        try:
            building = Building(co_index)
            response = requests.get(url)
            html = response.text
            tree = etree.HTML(html)
            co_name = tree.xpath('//*[@id="PageB_Location"]/text()')[0]  # community name
            print(co_name)
            bu_name = tree.xpath('//*[@id="ItemName"]/text()')[0]  # building name
            bu_num = tree.xpath('//*[@id="PageB_HouseNo"]/text()')[0]  # building number
            bu_all_house = tree.xpath(
                '//*[@id="lb_countbulidtaoshu"]/text()')[0]  # total units
            bu_floor = tree.xpath('//*[@id="cell3-1"]/text()')
            bu_floor = self.is_none(bu_floor)  # floors
            bu_build_size = tree.xpath(
                '//*[@id="lb_countbulidarea"]/text()')[0]  # building area
            bu_live_size = tree.xpath(
                '//*[@id="lb_buildarea"]/text()')[0]  # residential area
            bu_price = tree.xpath('//*[@id="lb_buildavg"]/text()')
            bu_price = self.is_none(bu_price)  # residential price
            # Raw string keeps '\?' and '\d' as regex escapes, not str escapes.
            bu_id = re.search(r'\?(\d+)$', url).group(1)  # building id
            building.co_id = co_id
            building.bu_name = bu_name
            building.bu_num = bu_num
            building.bu_all_house = bu_all_house
            building.bu_floor = bu_floor
            building.bu_build_size = bu_build_size
            building.bu_live_size = bu_live_size
            building.bu_price = bu_price
            building.bu_id = bu_id
            building.insert_db()

            # Everything from the "row3" table row to the end of the document.
            house_info_html = re.findall('<tr id="row3">(.*)$', html, flags)[0]
            for cell in re.findall('(<td.*?>.*?</td>)', house_info_html, flags):
                # Only cells containing <br> are treated as house cells.
                if '<br>' not in cell:
                    continue
                ho_name_list = re.findall('<td.*?>(.*?)<br>', cell, flags)
                ho_true_size_list = re.findall('<td.*?>.*?<br>(.*?)<br>',
                                               cell, flags)
                ho_type = re.findall('<td.*?>.*?<br>.*?<br>(.*?)<br>', cell,
                                     flags)[0]
                # A distinct index name: the original reused the outer loop
                # variable here.
                for idx, raw_name in enumerate(ho_name_list):
                    try:
                        if 'font' in raw_name:
                            ho_name = re.sub('<font.*?>', '', raw_name)
                        else:
                            ho_name = raw_name
                        house = House(8)
                        house.ho_name = ho_name
                        house.ho_true_size = ho_true_size_list[idx]
                        house.co_id = co_id
                        house.bu_id = bu_id
                        house.ho_type = ho_type
                        house.insert_db()
                    except Exception as e:
                        print(e)
        except Exception as e:
            # Exception, not BaseException: Ctrl-C and SystemExit must still
            # be able to stop the crawler.
            print(e)
예제 #3
0
 def get_build_info(self, build_logo_list, preid):
     """Save the buildings listed for each pre-sale id and crawl their houses.

     Args:
         build_logo_list: pre-sale ids, each appended to the building-list
             URL.
         preid: value stored as ``co_id`` on every building.

     Errors are printed, never raised: a bad row skips that building, a bad
     page skips that pre-sale id.
     """
     flags = re.S | re.M
     for build_logo in build_logo_list:
         # Bound before the try so the handler below can always print it and
         # never shows the URL left over from a previous iteration.
         build_url = None
         try:
             build_url = 'https://www.qdfd.com.cn/qdweb/realweb/fh/FhBuildingList.jsp?preid=' + build_logo
             response = requests.get(build_url, headers=self.headers)
             html = response.text
             bu_num_list = re.findall(
                 'javascript:showHouseStatus.*?>(.*?)</a', html, flags)
             bu_all_house_list = re.findall(
                 'javascript:showHouseStatus.*?center.*?center.*?center.*?center.*?center.*?>(.*?)<',
                 html, flags)
             # Raw string: the pattern escapes literal parentheses.
             house_code_list = re.findall(
                 r"javascript:showHouseStatus\((.*?)\)'>", html, flags)
             for idx, bu_num in enumerate(bu_num_list):
                 try:
                     build = Building(co_index)
                     # Quoted arguments of the showHouseStatus(...) call.
                     bu_code_list = re.findall('"(.*?)"',
                                               house_code_list[idx])
                     build.bu_num = bu_num
                     build.bu_all_house = bu_all_house_list[idx]
                     build.co_id = preid
                     build.bu_id = bu_code_list[0]
                     build.insert_db()
                     co_id = bu_code_list[2]
                     house_id = bu_code_list[1]
                     self.get_house_info(build.bu_id, co_id, house_id)
                 except Exception as e:
                     print(e)
         except Exception as e:
             print('青岛楼栋问题,url:={}'.format(build_url), e)
예제 #4
0
 def build_parse(self, co_id):
     """Save every building of a project and pass each page to house_parse.

     Args:
         co_id: project id (string); appended to the building-list URL and
             stored on each building.

     A building that fails to parse is logged. Its page is still forwarded
     to ``house_parse`` when it was downloaded; when the download itself
     failed the building is skipped entirely.
     """
     flags = re.S | re.M

     def field(pattern, text):
         # First capture group; AttributeError when the pattern is absent.
         return re.search(pattern, text, flags).group(1)

     list_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/loulist.jsp?Id_xmxq=' + co_id
     res = requests.get(list_url, headers=self.headers)
     con = res.content.decode()
     # Raw strings: these patterns escape literal parentheses and use \d.
     build_id_list = re.findall(r"searchByLid\('(\d+)'\)", con)
     for build_id in build_id_list:
         # Reset per building so a failed download can be told apart from a
         # page left over from the previous iteration.
         bu_con = None
         try:
             bu_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/lpbxx_new.jsp?lid=' + build_id
             bu_res = requests.get(bu_url, headers=self.headers)
             bu_con = bu_res.content.decode('gbk')
             bu = Building(co_index)
             bu.co_id = co_id
             bu.bu_id = build_id
             bu.bu_num = field('楼栋名称.*?">(.*?)</td', bu_con)
             bu.bu_all_house = field('总套数.*?">总(.*?)套</td', bu_con)
             bu.bu_floor = field('地上层数.*?">共(.*?)层</td', bu_con)
             bu.bu_build_size = field('总建筑面积.*?">(.*?)</td', bu_con)
             bu.bu_pre_sale = field(r"searchysxk\('(.*?)'\)", bu_con)
             bu.bu_type = field('房屋用途.*?">(.*?)</td', bu_con)
             bu.insert_db()
         except Exception as e:
             log.error('{}楼栋错误{}'.format(build_id, e))
         if bu_con is None:
             # Nothing was downloaded for this building; calling house_parse
             # would raise NameError or reuse another building's page.
             continue
         self.house_parse(co_id, build_id, bu_con)
예제 #5
0
 def get_build_info(self, build_url_list, comm):
     """Fill in pre-sale data on *comm* and save the buildings of each page.

     Args:
         build_url_list: relative pre-sale page URLs appended to the host.
         comm: community object; its pre-sale fields are overwritten and
             ``insert_db()`` is called once per page.

     Any failure while handling a page is printed and the next page is
     tried.
     """
     def grab(pattern, text):
         # First capture group of `pattern` in `text`.
         return re.search(pattern, text, re.S | re.M).group(1)

     for rel_url in build_url_list:
         try:
             build_url = 'http://58.51.240.121:8503/' + rel_url
             response = requests.get(build_url, headers=self.headers)
             page = response.text
             comm.co_pre_sale = grab('id="PresellInfo1_lblXkzh">(.*?)<', page)
             comm.co_pre_sale_date = grab('id="PresellInfo1_lblFzrq">(.*?)<',
                                          page)
             comm.insert_db()
             rows = re.findall('<tr bgcolor="#FFFFFF">.*?</tr>', page,
                               re.S | re.M)
             for row in rows:
                 build = Building(co_index)
                 build.co_id = comm.co_id
                 # The first three <td> cells hold number, floors and units.
                 build.bu_num = grab('<td.*?>(.*?)<', row)
                 build.bu_floor = grab('<td.*?<td.*?>(.*?)<', row)
                 build.bu_all_house = grab('<td.*?<td.*?<td.*?>(.*?)<', row)
                 # The id comes from the page URL, so every building on the
                 # page shares it.
                 build.bu_id = re.search('PresellId=(.*?)$',
                                         build_url).group(1)
                 build.insert_db()
                 house_url = grab('a href="(.*?)"', row)
                 self.get_house_info(house_url, comm.co_id, build.bu_id)
         except Exception as e:
             print('请求错误,co_index={},url={}'.format(co_index, build_url), e)
예제 #6
0
 def get_build_info(self, build_url_list):
     """Save one building per entry and then crawl that building's houses.

     Args:
         build_url_list: sequence of indexable entries whose first item is
             the building id (a string appended to the query URL).

     Any failure for an entry is printed and the next entry is tried.
     """
     # (attribute, pattern) pairs, applied to the page in exactly this order.
     field_patterns = (
         ('co_build_structural', '结构类型.*?<td.*?>(.*?)<'),
         ('bo_build_end_time', '建成年份.*?<td.*?>(.*?)<'),
         ('bu_build_size', '总建筑面积.*?<td.*?>(.*?)<'),
         ('bu_num', '幢号.*?<td.*?>(.*?)<'),
         ('size', '占地面积.*?<td>(.*?)<'),
         ('bu_floor', '房屋层数.*?<td>(.*?)<'),
         ('bu_all_house', '房屋套数.*?<td>(.*?)<'),
         ('area', '坐落区.*?<td>(.*?)<'),
     )
     for entry in build_url_list:
         try:
             build = Building(co_index)
             build_url = 'http://222.223.160.199:8088/website/buildquery/selectBuild.jsp?buildID=' + entry[0]
             response = requests.get(build_url, headers=self.headers)
             page = response.text
             build.bu_id = entry[0]
             for attr, pattern in field_patterns:
                 match = re.search(pattern, page, re.S | re.M)
                 setattr(build, attr, match.group(1))
             build.insert_db()
             self.get_house_info(build.bu_id)
         except Exception as e:
             print('请求错误,url={}'.format(build_url), e)
예제 #7
0
 def build_info(self, co_id, temp_url_list):
     """Save every building on each listing page and crawl its houses.

     Args:
         co_id: community id stored on each building and passed on to
             ``house_info``.
         temp_url_list: relative listing-page URLs appended to the host.

     A page that fails while being fetched or parsed is printed and skipped
     as a whole (no houses are crawled for it). Otherwise ``house_info`` is
     called once for every building row of the page.
     """
     for temp_url in temp_url_list:
         # (house-list URL, building id) of every row saved from this page.
         # The original called house_info once after the loop with only the
         # last row's values, which were unbound or stale on an empty page.
         saved = []
         try:
             build_url = "http://222.77.178.63:7002/" + temp_url
             res = requests.get(build_url, headers=self.headers)
             html = etree.HTML(res.content.decode('gbk'))
             build_info_list = html.xpath("//tr[@class='indextabletxt']")
             for build_info in build_info_list:
                 bu = Building(co_index)
                 ho_url = build_info.xpath("./td/a/@href")[0]
                 bu.co_id = co_id
                 bu.bu_id = re.search('Param=(.*)', ho_url).group(1)
                 bu.bu_num = build_info.xpath("./td/a/text()")[0]
                 bu.bu_all_house = build_info.xpath("./td[2]/text()")[0]
                 # Columns 3 and 5 are optional: an empty cell yields None
                 # (replaces two bare ``except:`` clauses).
                 all_size = build_info.xpath("./td[3]/text()")
                 bu.bu_all_size = all_size[0] if all_size else None
                 live_size = build_info.xpath("./td[5]/text()")
                 bu.bu_live_size = live_size[0] if live_size else None
                 bu.insert_db()
                 saved.append((ho_url, bu.bu_id))
         except Exception as e:
             print('楼栋信息错误{}'.format(e))
             continue
         # Outside the try, as before, so house_info errors are not mistaken
         # for building errors.
         for ho_url, bu_id in saved:
             self.house_info(ho_url, co_id, bu_id)
예제 #8
0
 def build_info(self, build_detail, co_id):
     """Save every building on a detail page and crawl each one's houses.

     Args:
         build_detail: URL of the page, fetched through ``Proxy_contact``.
         co_id: community id stored on each building and passed on to
             ``house_info``.

     A row that cannot be parsed is logged and skipped. The method sleeps
     three seconds after each building whose houses were crawled.
     """
     proxy = Proxy_contact(app_name='wuhan',
                           method='get',
                           url=build_detail,
                           headers=self.headers)
     build_res = proxy.contact()
     html = etree.HTML(build_res.decode('gb18030'))
     info_list = html.xpath("//tr[@bgcolor='#FFFFFF']")
     for info in info_list:
         try:
             bu = Building(co_index)
             bu.co_id = co_id
             bu.bu_floor = info.xpath('./td[3]/text()')[0]
             bu.bu_all_house = info.xpath('./td[4]/text()')[0]
             bu.bu_num = info.xpath('./td//span/text()')[0]
             temp_url = info.xpath('./td/a/@href')[0]
             # Raw string for \d; kept in a local so it is searched once.
             house_dengjh = re.search(r'HouseDengjh=(.*?\d+)',
                                      temp_url).group(1)
             bu.bu_id = house_dengjh
             bu.insert_db()
         except Exception as e:
             log.error('楼栋错误{}'.format(e))
             continue
         dengjh = re.search(r'DengJh=(.*?\d+)&', temp_url)
         if dengjh is None:
             # Previously this raised AttributeError outside the try and
             # aborted the remaining rows.
             log.error('楼栋错误{}'.format('DengJh missing in ' + temp_url))
             continue
         # Both values are percent-encoded as GBK for the house-list URL.
         a = parse.quote(dengjh.group(1), encoding='gbk')
         b = parse.quote(house_dengjh, encoding='gbk')
         bu_url = 'http://scxx.fgj.wuhan.gov.cn/5.asp?DengJh=' + a + '&HouseDengjh=' + b
         self.house_info(bu.bu_id, bu_url, co_id)
         time.sleep(3)
예제 #9
0
 def detail_parse(self, id, build_list):
     """Save each building found in *build_list* and parse its houses.

     Args:
         id: forwarded unchanged to ``house_detail``.
         build_list: HTML fragments, each containing an ``<a href="...">``
             whose target is appended to ``self.start_url``.

     A building whose page cannot be fetched or parsed is logged and
     skipped. A fragment without a link raises AttributeError, because the
     link is extracted outside the try block.
     """
     # <span>-wrapped values copied onto the building after the id is set.
     span_fields = (
         ('bu_all_house', '套数:<span>(.*?)</span'),
         ('bu_floor', '地上层数:<span>(.*?)</span'),
         ('bo_build_end_time', '竣工日期:<span>(.*?)</span'),
         ('bu_build_size', '预售许可面积:<span>(.*?)</span'),
         ('bu_type', '用途:<span>(.*?)</span'),
     )
     for build in build_list:
         link = re.search('<a href="(.*?)"', build).group(1)
         build_url = self.start_url + link
         try:
             bu_res = requests.get(build_url, headers=self.headers)
             time.sleep(2)
             bu_text = bu_res.content.decode()
             bu = Building(co_index)
             bu.bu_num = re.search('幢号:(.*?) 许', bu_text).group(1)
             bu.bu_pre_sale = re.search('许可证号:<span>(.*?)</span>',
                                        bu_text).group(1)
             # The permit number doubles as the building id and must be
             # numeric, otherwise the building is skipped.
             bu.bu_id = int(bu.bu_pre_sale)
             for attr, pattern in span_fields:
                 setattr(bu, attr, re.search(pattern, bu_text).group(1))
             bu.insert_db()
         except Exception as e:
             log.error("楼栋出错{}".format(e))
             continue
         self.house_detail(bu_text, id, bu.bu_id)
예제 #10
0
    def parse(self, res):
        """Walk the listing in *res*, save each building, then parse its houses.

        Args:
            res: response object whose ``content`` is the GBK-encoded
                listing page.

        A building whose pages cannot be fetched or parsed is logged and
        skipped. A listing entry without the expected links raises
        IndexError, because those are read outside the try block.
        """
        flags = re.S | re.M
        # (attribute, pattern) pairs applied to the building page, in order.
        page_fields = (
            ('bu_pre_sale', '预售证名称.*?NAME">(.*?)</span'),
            ('bu_pre_sale_date', '申领时间.*?">(.*?)</span'),
            ('bo_develops', '申领单位.*?">(.*?)</span'),
            ('bu_build_size', '"SALE_HOUSE_AREA">(.*?)<'),
            ('bu_all_house', '"SALE_HOUSE_COUNT">(.*?)<'),
        )
        listing = etree.HTML(res.content.decode('gbk'))
        for entry in listing.xpath("//div[@class='listCon']"):
            temp = entry.xpath("./a[@class='listCon2']/@href")[0]
            name = entry.xpath("./a[@class='listCon1']/@title")[0]
            url = "http://www.hyfc365.com" + temp
            try:
                bu_res = requests.get(url, headers=self.headers)
                content = bu_res.content.decode('gbk')
                bu = Building(co_index)
                bu.bu_num = name
                project_id = re.search('ID=(.*)', temp).group(1)
                for attr, pattern in page_fields:
                    setattr(bu, attr,
                            re.search(pattern, content, flags).group(1))

                # The building id is only present on the building-list page.
                detail_url = 'http://www.hyfc365.com/RealEstate/Project/BuildingList.aspx?ID=' + project_id
                detail_res = requests.get(detail_url)
                bu_id = re.search("BUILDING_ID=(.*?)'",
                                  detail_res.text).group(1)
                bu.bu_id = bu_id
                bu.insert_db()
            except Exception as e:
                log.error("{}楼栋页面解析失败{}".format(url, e))
                continue
            self.house_parse(bu_id)
예제 #11
0
 def get_build_info(self, build_info_list, co_id, comm_html, url):
     """Save one building per record and request its detail page.

     Args:
         build_info_list: indexable records; items 1, 3, 5 and 9 are read as
             name, unit count, area and price.
         co_id: community id stored on each building.
         comm_html: community page source from which the detail link is
             taken.
         url: only used in the error message.

     Any failure for a record is printed and the next record is tried.
     """
     for record in build_info_list:
         try:
             building = Building(2)
             name = record[1]  # building name
             number = name.split('#')[0]  # building number: text before '#'
             unit_count = record[3]  # total units
             floor_area = record[5]  # area
             price = record[9]  # price
             # Copy the collected values onto the building, in this order.
             for attr, value in (
                     ('bu_name', name),
                     ('bu_num', number),
                     ('bu_all_house', unit_count),
                     ('bu_build_size', floor_area),
                     ('bu_price', price),
                     ('co_id', co_id),  # community id
             ):
                 setattr(building, attr, value)
             # NOTE(review): the link is looked up in comm_html, not in the
             # record, so every record gets the same bu_id - confirm intent.
             build_html = re.search(r'楼盘表(.*?)个楼栋信息', comm_html).group(1)
             build_url = re.search(r'<ahref="(.*?)">查看信息<',
                                   build_html).group(1)
             build_id = re.search('buildingId=(.*?)$', build_url).group(1)
             building.bu_id = build_id  # building id
             building.insert_db()
             self.get_build_detail(build_url, co_id)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, url), e)
예제 #12
0
    def comm_info(self, con):
        """Save the community on a detail page and every building in its table.

        Args:
            con: parsed page; only its ``xpath`` method is used.

        Returns:
            list: the first ``./td/a/@href`` of each building row, in page
            order.

        Raises:
            IndexError: a required span, the form action, a table cell or a
                row link is missing.
            AttributeError: the form action contains no digits.
        """
        prefix = 'ctl00_ContentPlaceHolder2_'

        def span_text(suffix):
            # Text of the <span> whose id is the common prefix plus `suffix`.
            return con.xpath(
                "//span[@id='{}{}']/text()".format(prefix, suffix))[0]

        # Community
        comm = Comm(co_index)
        comm.co_name = span_text('web_item_retail1_lb_item_name')  # name
        co_id_str = con.xpath("//form[@id='aspnetForm']/@action")[0]
        # The community id is the first run of digits in the form action.
        comm.co_id = re.search(r"\d+", co_id_str).group(0)
        comm.co_address = span_text('lb_item_seat')  # address
        comm.co_develops = span_text('lb_enter_name')  # developer
        comm.co_size = span_text('lb_area')  # total area
        comm.co_build_size = span_text('lb_item_area')  # building area
        comm.co_build_end_time = span_text('lb_item_ew_date')  # completion
        comm.co_plan_pro = span_text('lb_program_pcode')  # planning permit
        comm.co_work_pro = span_text('lb_jg')  # construction permit
        comm.co_green = span_text('lb_item_green_rate')  # green ratio
        comm.co_land_use = span_text('lb_td')  # land-use certificate
        comm.insert_db()

        # Buildings
        room_list = []
        for row in con.xpath("//tr[@style='color:#000066;']"):
            # A fresh object per row. The original mutated one shared
            # Building, so values could leak between rows and anything that
            # keeps a reference after insert_db() saw only the last row.
            build = Building(co_index)
            build.co_id = comm.co_id
            build.co_name = comm.co_name
            cells = row.xpath("./td/text()")
            build.bu_id = cells[0]
            build.bu_num = cells[1]
            build.bu_all_house = cells[2]
            build.size = cells[3]
            build.bu_floor = cells[4]
            build.bu_pre_sale = cells[5]
            build.insert_db()

            room_list.append(row.xpath("./td/a/@href")[0])

        return room_list
예제 #13
0
 def get_comm_info(self, comm_id_list):
     """Save each community and its buildings, then crawl each building.

     Args:
         comm_id_list: project ids (strings) appended to the project URL.

     Errors are printed, never raised: a bad building row skips that
     building, any other failure skips the community.
     """
     flags = re.S | re.M

     def cell_after(label, page):
         # Text of the first <td> that follows `label` in `page`.
         return re.search(label + '.*?<td.*?>(.*?)<', page, flags).group(1)

     for comm_id in comm_id_list:
         # Bound up front so the handler below can always print it.
         comm_url = None
         try:
             comm = Comm(co_index)
             comm_url = 'http://web.xxfdc.gov.cn/onlineQuery/projectInformation.do?xmId=' + comm_id
             response = requests.get(comm_url, headers=self.headers)
             html = response.text
             comm.co_name = cell_after('项目名称:', html)
             comm.co_address = cell_after('项目地址:', html)
             comm.co_develops = cell_after('开发商:', html)
             comm.co_all_house = cell_after('已售总套数:', html)
             comm.co_build_size = cell_after('已售总面积:', html)
             comm.area = cell_after('行政区别:', html)
             comm.co_volumetric = cell_after('容积率:', html)
             comm.co_id = comm_id
             comm.insert_db()
             bu_html = re.search(
                 '<table class="table table-bordered itemInfoDetail.*?</table>',
                 html, flags).group()
             # [1:] drops the first <tr> of the table.
             for row in re.findall('<tr>.*?</tr>', bu_html, flags)[1:]:
                 house_url = None
                 try:
                     build = Building(co_index)
                     build.bu_num = re.search('<td>(.*?)<', row,
                                              flags).group(1)
                     build.bu_all_house = re.search(
                         '<td>.*?<td>.*?<td>(.*?)<', row, flags).group(1)
                     build.bu_id = re.search('buildId=(.*?)&', row,
                                             flags).group(1)
                     build.co_id = comm.co_id
                     build.insert_db()
                     # Take the link from this row. Searching the whole
                     # table always returned its first link, so every
                     # building was crawled with that one page.
                     house_url = re.search('<a href="(.*?)"', row,
                                           flags).group(1)
                     house_page = requests.get(house_url,
                                               headers=self.headers).text
                     house_url_list = re.findall(
                         '<td width="110">.*?<a.*?href="(.*?)"', house_page,
                         flags)
                     self.get_house_info(house_url_list, build.bu_id,
                                         comm.co_id)
                 except Exception as e:
                     print(
                         '楼栋错误,co_index={},url={}'.format(
                             co_index, house_url), e)
         except Exception as e:
             print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
예제 #14
0
    def get_build_info(self, comm_url_list):
        """Save the building behind each entry and then all of its houses.

        Args:
            comm_url_list: strings containing at least two ``+<digits>+``
                groups; the first is used as ``sid``, the second as ``pid``.

        Errors are printed, never raised. When no building object could be
        created for an entry, its houses are skipped instead of being
        attached to the building of a previous entry.
        """
        flags = re.S | re.M

        def tag(name, text):
            # Content of <NAME>...</NAME> inside one <Result> element.
            return re.search('<{0}>(.*?)</{0}>'.format(name), text,
                             flags).group(1)

        for i in comm_url_list:
            # Reset per entry: after a failure these would otherwise be
            # unbound (first entry) or left over from the previous entry.
            sid = build = build_url = None
            try:
                ids = re.findall(r'\+(\d+)\+', i)
                sid, pid = ids[0], ids[1]
                build_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/bldg_query.aspx?pid=' + pid + '&sid=' + sid
                response = requests.get(build_url)
                html = response.text
                build = Building(co_index)
                build.bu_id = pid
                # NOTE(review): bu_num and bu_address use the same pattern,
                # so both receive the same text - confirm intent.
                build.bu_num = re.search('楼栋座落.*?<td.*?>(.*?)<', html,
                                         flags).group(1)
                build.bu_address = re.search('楼栋座落.*?<td.*?>(.*?)<', html,
                                             flags).group(1)
                build.bu_pre_sale = re.search('预售证号.*?">(.*?)&nbsp', html,
                                              flags).group(1)
                build.bu_pre_sale_date = re.search('时间.*?">(.*?)&nbsp',
                                                   html, flags).group(1)
                build.bu_all_house = re.search('dM.*?">(.*?)&nbsp', html,
                                               flags).group(1)
                build.insert_db()
            except Exception as e:
                print('co_index={}, 楼栋错误,url={}'.format(co_index, build_url),
                      e)

            if sid is None or build is None:
                continue

            house_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/proxp.aspx?key=WWW_LPB_001&params=' + sid
            result = requests.get(house_url)
            html_ = result.text

            for house_info in re.findall('<Result.*?</Result>', html_, flags):
                try:
                    house = House(co_index)
                    house.bu_id = build.bu_id
                    house.bu_num = build.bu_num
                    house.ho_name = tag('ONAME', house_info)
                    house.ho_num = tag('OSEQ', house_info)
                    house.ho_build_size = tag('BAREA', house_info)
                    house.ho_floor = tag('FORC', house_info)
                    house.ho_true_size = tag('PAREA', house_info)
                    house.insert_db()
                except Exception as e:
                    print('co_index={}, 房号错误'.format(co_index), e)
예제 #15
0
    def get_build_info(self, co_id, co_name):
        """Walk project -> pre-sale pages -> buildings -> houses and save them.

        Args:
            co_id: project id (string); used in the first URL and stored on
                each building.
            co_name: project name (string) used in the first URL.

        Building ids come from the module-level counter ``building_id``,
        which is incremented for every building row, including rows whose
        page later fails. Errors for a building or house are printed and
        the next one is tried.
        """
        global building_id
        base = 'http://www.czhome.com.cn/'

        url = 'http://www.czhome.com.cn/Presell.asp?projectID=' + co_id + '&projectname=' + co_name
        response = requests.get(url, headers=self.headers)
        project_tree = etree.HTML(response.content.decode('gbk'))
        # [1:] skips the first matching row of the table.
        for presale_row in project_tree.xpath('//tr[@class="indextabletxt"]')[1:]:
            presale_url = base + presale_row.xpath('td[2]/a/@href')[0]
            result = requests.get(presale_url, headers=self.headers)
            # '!=' instead of 'is not': identity comparison with an int
            # literal depends on interpreter caching and raises a
            # SyntaxWarning.
            if result.status_code != 200:
                print("co_index={},预售url:{}连接失败".format(co_index, presale_url))
                continue
            presale_tree = etree.HTML(result.content.decode('gbk'))
            bu_rows = presale_tree.xpath(
                '/html/body/table/tr/td/table/tr/td/table/tr')[1:]
            for bu_row in bu_rows:
                try:
                    building = Building(7)
                    building_id += 1
                    building.bu_id = building_id
                    bu_all_house = bu_row.xpath('td[7]/text()')[0]  # total units
                    bu_url = base + bu_row.xpath('td[1]/a/@href')[0]
                    bu_response = requests.get(bu_url, headers=self.headers)
                    if bu_response.status_code != 200:
                        print("co_index={},楼栋url:{}连接失败".format(co_index, bu_url))
                        continue
                    bu_html = bu_response.content.decode('gbk')
                    bu_tree = etree.HTML(bu_html)
                    # Floors: the last underlined entry of the first column.
                    bu_floor = bu_tree.xpath(
                        '//*[@id="Table4"]/tr[2]/td/table[3]/tr/td[1]/u/text()')[-1]
                    house_url_list = bu_tree.xpath(
                        '//*[@id="Table4"]/tr[2]/td/table[3]/tr/td/a/@href')
                    bu_address = re.search(
                        '<center><font color=.*?&nbsp;&nbsp;(.*?)<', bu_html,
                        re.S | re.M).group(1)
                    building.bu_all_house = bu_all_house
                    building.bu_address = bu_address
                    building.bu_floor = bu_floor
                    building.co_id = co_id
                    building.insert_db()
                    for house_href in house_url_list:
                        try:
                            house = House(7)
                            house_url = base + house_href
                            self.get_house_info(house_url, house, co_id,
                                                building_id, building)
                        except Exception as e:
                            print(e)
                except Exception as e:
                    print(e)
예제 #16
0
 def build_info(self, bu_info_list, co_id):
     """Save one building per table row.

     Args:
         bu_info_list: row elements; each needs an ``onclick`` attribute
             containing ``dbh=<digits>``, which becomes the building id.
         co_id: community id stored on each building.

     A row that cannot be parsed is logged and the next row is tried.
     """
     for bu_info in bu_info_list:
         try:
             bu = Building(co_index)
             url = bu_info.xpath("./@onclick")[0]
             # Raw string: '\d' in a plain literal is an invalid escape.
             bu.bu_id = re.search(r'dbh=(\d+)', url).group(1)
             bu.co_id = co_id
             bu.bu_num = bu_info.xpath("./td[@class='org']/text()")[0]
             bu.bu_all_house = bu_info.xpath("./td[3]/text()")[0]
             bu.size = bu_info.xpath("./td[2]/text()")[0]
             bu.insert_db()
         except Exception as e:
             # The exception goes into the message itself. Passing it as an
             # extra argument with no placeholder makes a standard logger
             # report a formatting error instead of the message.
             log.error('楼栋信息错误{}'.format(e))
예제 #17
0
    def bu_parse(self, co_id, page, co_url, co_res, path_url):
        """Post through the building pager and save every building listed.

        Args:
            co_id: community id stored on each building and passed on to
                ``ho_parse``.
            page: number of pager pages (anything ``int()`` accepts).
            co_url: URL the pager form is posted to; also set as Referer on
                ``self.headers``.
            co_res: response of the landing page, source of the hidden form
                fields.
            path_url: URL whose ``SaleInfoProListIndex.aspx`` part is
                removed to build the house URLs.

        Raises:
            IndexError: a hidden field or a table cell is missing.
            AttributeError: a row link contains no ``PBTAB_ID=...&``.
        """
        pager_field = 'ctl00$MainContent$OraclePager1$ctl11$PageList'
        html = etree.HTML(co_res.text)
        # The hidden fields are read once from the landing page and reused
        # for every post. The original re-read them from this same page on
        # each iteration, never from the response.
        # NOTE(review): confirm that reusing the landing-page tokens is
        # intended.
        viewstate = html.xpath("//input[@id='__VIEWSTATE']/@value")[0]
        generator = html.xpath("//input[@id='__VIEWSTATEGENERATOR']/@value")[0]
        valid = html.xpath("//input[@id='__EVENTVALIDATION']/@value")[0]
        formdata = {
            "__VIEWSTATE": viewstate,
            "__EVENTTARGET": pager_field,
            "__VIEWSTATEGENERATOR": generator,
            "__EVENTVALIDATION": valid,
            pager_field: 0
        }
        self.headers['Referer'] = co_url

        # Pager indices 0 .. page-1, each requested exactly once. The
        # original posted index 0 twice and never reached the last page.
        for page_index in range(int(page)):
            formdata[pager_field] = page_index
            page_res = requests.post(co_url,
                                     data=formdata,
                                     headers=self.headers)
            page_html = etree.HTML(page_res.text)

            bu_list = page_html.xpath(
                "//table[@id='ctl00_MainContent_OraclePager1']//tr")

            # [1:] skips the first row of the pager table.
            for bu in bu_list[1:]:
                build = Building(co_index)
                build.co_id = co_id
                build.bu_num = bu.xpath("./td/a/text()")[0]
                build.bu_build_size = bu.xpath("./td[2]/text()")[0]
                build.bu_floor = bu.xpath("./td[4]/text()")[0]
                build.bu_all_house = bu.xpath("./td[3]/text()")[0]
                tmp_url = bu.xpath("./td/a/@href")[0]
                build.bu_id = re.search('PBTAB_ID=(.*?)&', tmp_url).group(1)
                build.insert_db()
                house_url = path_url.replace('SaleInfoProListIndex.aspx',
                                             '') + tmp_url
                self.ho_parse(co_id, build.bu_id, house_url)
예제 #18
0
 def get_build_info(self, build_url_list, co_name):
     """Scrape building details for one community and store each building.

     For every entry of ``build_url_list`` the project page is fetched, all
     ``Pub_dtxx.aspx?ProjectBuildingID=...`` links on it are followed, and
     one ``Building`` per detail page is inserted.  The house links found on
     the detail page are then passed to ``self.get_house_info``.

     :param build_url_list: paths relative to
         ``http://www.sxczfdc.com/pubinfo/``.
     :param co_name: community name stored on every building.

     Any error is printed and the offending page is skipped.
     """
     base_url = 'http://www.sxczfdc.com/pubinfo/'
     flags = re.S | re.M
     # (Building attribute, id suffix of the label holding its value)
     label_fields = (
         ('bu_num', 'lblBuildingName'),
         ('bu_all_house', 'lblZts'),
         ('bu_floor', 'lblZcs'),
         ('bu_build_size', 'lblJzmj'),
         ('bu_live_size', 'lblZzmj'),
         ('bu_pre_sale', 'lblYsxkzh'),
         ('bu_pre_sale_date', 'lblYsxkzfzrq'),
     )

     for i in build_url_list:
         try:
             response = requests.get(base_url + i, headers=self.headers)
             detail_paths = re.findall(
                 r'(Pub_dtxx.aspx\?ProjectBuildingID=.*?)"', response.text,
                 flags)
         except Exception as e:
             print(e)
             continue

         for k in detail_paths:
             try:
                 # A fresh record per detail page so no state is shared
                 # between buildings.
                 build = Building(co_index)
                 build.co_name = co_name
                 result = requests.get(base_url + k, headers=self.headers)
                 content = result.text
                 for attr, label in label_fields:
                     pattern = 'BuildingInfo1_{}">(.*?)<'.format(label)
                     # [0] raises IndexError when the label is missing,
                     # which skips this building via the except below.
                     setattr(build, attr,
                             re.findall(pattern, content, flags)[0])
                 build.insert_db()
                 house_url_list = re.findall(
                     r"onClick=.getMoreHouseInfo\('(.*?)'\)", content, flags)
                 self.get_house_info(house_url_list, co_name, build.bu_num)
             except Exception as e:
                 print(e)
예제 #19
0
 def get_build_info(self, build_info_list, co_id):
     """Parse building rows and store one ``Building`` per row.

     :param build_info_list: HTML fragments, one per building row.  The
         first three ``<td>`` cells are read as building number, house
         count and total size; the row's link supplies the building id and
         the house-list URL.
     :param co_id: community id stored on every building.

     A row that fails to parse is reported with ``print`` and skipped.
     """
     flags = re.S | re.M
     for i in build_info_list:
         try:
             build = Building(co_index)
             build.bu_num = re.search('<td>(.*?)</td>', i, flags).group(1)
             build.bu_all_house = re.search('<td>.*?<td>(.*?)</td>', i,
                                            flags).group(1)
             build.bu_all_size = re.search('<td>.*?<td>.*?<td>(.*?)</td>',
                                           i, flags).group(1)
             # Raw string: '\?' is not a valid str escape sequence.
             build.bu_id = re.search(r'\?id=(.*?)"', i, flags).group(1)
             build.co_id = co_id
             build.insert_db()
             house_url = re.search('href="(.*?)"', i, flags).group(1)
             self.get_house_info(house_url, co_id, build.bu_id)
         except Exception as e:
             print('楼栋错误,co_index={},str={}'.format(co_index, i), e)
예제 #20
0
 def get_build_info(self, all_build_url_list):
     """Run the building extraction over the given page URL(s).

     The ``Building`` attributes are filled with regex *rules* rather than
     values; ``to_dict()`` turns them into the rule dict handed to
     ``ProducerListUrl``.

     :param all_build_url_list: passed through as ``page_url``.
     :return: whatever ``ProducerListUrl.get_details()`` returns for the
         ``GetData('..','..')`` link rule.
     """
     # Raw strings: '\(' and '\)' are regex escapes, not str escapes.
     b = Building(co_index)
     b.co_id = r"onclick=GetData\('(.*?)',"
     b.bu_id = r"onclick=GetData\('.*?','(.*?)'"
     b.bu_num = "font12yellow-leftA'>.*?</span>套</td><td>.*?</td><td>(.*?)<"
     b.bu_all_house = "font12yellow-leftA'>(.*?)<"
     p = ProducerListUrl(
         page_url=all_build_url_list,
         request_type='get',
         encode='utf-8',
         analyzer_rules_dict=b.to_dict(),
         current_url_rule=r"onclick=GetData\('(.*?)','(.*?)'\)",
         analyzer_type='regex',
         headers=self.headers)
     return p.get_details()
예제 #21
0
 def bu_parse(self, detail_url, co_id):
     """Follow a project's pre-sale permits down to its buildings.

     The pre-sale listing (``detail_url`` with ``lp`` replaced by
     ``presell``) is fetched directly; every permit page and building page
     below it is fetched through a randomly chosen proxy from
     ``self.proxies``.  One ``Building`` is inserted per building page and
     the page response is handed to ``self.ho_parse``.

     :param detail_url: project detail URL.
     :param co_id: community id stored on every building.
     :raises AttributeError: if a building page lacks an expected field.
     """
     max_attempts = 20

     def fetch(url):
         """GET ``url`` via a random proxy; ``None`` once attempts run out."""
         for _ in range(max_attempts):
             try:
                 # random.choice adapts to the real size of the proxy pool
                 # instead of assuming exactly ten entries.
                 return requests.get(url,
                                     headers=self.headers,
                                     proxies=random.choice(self.proxies),
                                     timeout=10)
             except requests.RequestException:
                 # Bad proxy or timeout: try again with another proxy.
                 continue
         print('请求失败,co_index={},url={}'.format(co_index, url))
         return None

     pre_url = detail_url.replace('lp', 'presell')
     pre_res = requests.get(pre_url, headers=self.headers)
     pre_html = etree.HTML(pre_res.text)
     for bu_pre in pre_html.xpath("//dt/strong/a"):
         bu_pre_url = bu_pre.xpath("./@href")[0]
         bu_pre_sale = bu_pre.xpath("./text()")[0]
         bu_res = fetch('http://www.zstmsf.com' + bu_pre_url)
         if bu_res is None:
             continue
         bu_html = etree.HTML(bu_res.text)
         for bo_url in bu_html.xpath("//tr//strong/a/@href"):
             ho_url = "http://www.zstmsf.com" + bo_url
             ho_res = fetch(ho_url)
             if ho_res is None:
                 continue
             build = Building(co_index)
             build.co_id = co_id
             build.bu_id = re.search(r'zid=.*?(\d+)', ho_url).group(1)
             build.bu_num = re.search('幢名称:<strong>(.*?)<',
                                      ho_res.text).group(1)
             build.bu_all_house = re.search("幢总套数.*?'>(.*?)</",
                                            ho_res.text).group(1)
             build.bu_all_size = re.findall("面积.*?'>(.*?)</",
                                            ho_res.text)[0]
             build.bu_pre_sale = bu_pre_sale
             build.insert_db()
             self.ho_parse(co_id, build.bu_id, ho_res)
예제 #22
0
 def start_crawler(self):
     """Crawl every community on the index page, then each of its buildings.

     The module-level ``url`` is fetched, every ``a.a_name`` link other
     than ``#`` is treated as a community page, and a ``ProducerListUrl``
     is run over it with regex rules stored on a ``Comm``.  Each URL it
     returns is then crawled with the rules stored on a ``Building``.
     """
     site = 'http://www.lzfc.com.cn:8080'
     response = requests.get(url)
     tree = etree.HTML(response.text)
     for comm_path in tree.xpath('//a[@class="a_name"]/@href'):
         if comm_path == '#':
             continue
         comm_url = site + comm_path
         # Attributes hold regex rules; to_dict() exports them.
         comm = Comm(co_index)
         comm.co_name = "cc0.innerHTML='(.*?)'"
         comm.co_address = "cc1.innerHTML='(.*?)'"
         comm.area = "cc2.innerHTML='(.*?)'"
         comm.co_use = "cc4.innerHTML='(.*?)'"
         comm.co_develops = "cc5.innerHTML='(.*?)'"
         comm.co_open_time = "cc6.innerHTML='(.*?)'"
         comm.co_all_house = "cc9.innerHTML='(.*?)'"
         comm.co_build_size = "cc11.innerHTML='(.*?)'"
         comm.co_id = "BaseCode=(.*?)'"
         p = ProducerListUrl(
             page_url=comm_url,
             request_type='get',
             encode='gbk',
             analyzer_rules_dict=comm.to_dict(),
             current_url_rule="queryBuildHerf1.href='(.*?)'",
             analyzer_type='regex')
         build_url = p.get_details()
         for build_path in build_url:
             build = Building(co_index)
             build_detail_url = site + build_path
             build.bu_num = 'onclick=comInfoView.*?center">(.*?)<'
             build.co_use = 'onclick=comInfoView.*?center.*?center">(.*?)<'
             build.bu_pre_sale = 'onclick=comInfoView.*?center.*?center.*?center"><.*?>(.*?)<'
             build.bu_all_house = 'onclick=comInfoView.*?center.*?center.*?center.*?center">(.*?)<'
             build.co_name = 'fontbg_red">(.*?)<'
             build.bu_id = r"onclick=comInfoView\('(.*?)'\)"
             # Crawl the building page with the building rules.  Previously
             # the community URL and community rules were submitted again,
             # leaving `build` and `build_detail_url` unused.
             # NOTE(review): current_url_rule is kept from the community
             # crawl and its result is not consumed here; confirm which
             # link rule the building page needs.
             p = ProducerListUrl(
                 page_url=build_detail_url,
                 request_type='get',
                 encode='gbk',
                 analyzer_rules_dict=build.to_dict(),
                 current_url_rule="queryBuildHerf1.href='(.*?)'",
                 analyzer_type='regex')
             p.get_details()
예제 #23
0
 def get_build_url(self, build_url_list, co_id):
     """Fetch each building page, store the building, then crawl its houses.

     :param build_url_list: paths relative to
         ``http://www.nhfg.cn/webhouseinfo/ItemList/``.
     :param co_id: community id stored on every building.

     Pages are requested through the session ``self.s``.  Any error on a
     building is printed and that building is skipped.
     """
     flags = re.S | re.M
     for i in build_url_list:
         try:
             build = Building(co_index)
             build.co_id = co_id
             bu_url = 'http://www.nhfg.cn/webhouseinfo/ItemList/' + i
             html = self.s.get(bu_url).text
             build.bu_num = re.findall(
                 '<TD style="WIDTH: 471px" colSpan="11"><FONT style="COLOR: white" face="宋体">(.*?)<',
                 html, flags)[0].strip()
             build.bu_all_house = re.findall(
                 '商业</FONT></TD>.*?center">(.*?)<', html, flags)[0].strip()
             build.insert_db()
             # Raw string: '\.' and '\?' are regex escapes, not str escapes.
             house_url = re.findall(r'(RoomLoad\.aspx\?.*?)"', html, flags)[0]
             zu_house_url = 'http://www.nhfg.cn/webhouseinfo/ItemList/HouseList/' + house_url
             self.get_house_info(zu_house_url, build.bu_num, co_id)
         except Exception as e:
             print(e)
예제 #24
0
 def get_comm_info(self, comm_url, comm):
     """Complete and store a community, then crawl its buildings.

     The developer name is read from the community page and ``comm`` is
     inserted.  The pre-sale listing for ``comm.co_id`` is then scanned
     for ``SetSelect(...)`` links; each yields a building table whose
     rows are stored as ``Building`` records and passed on to
     ``self.get_house_info``.

     :param comm_url: path relative to ``http://www.fangdi.com.cn/``.
     :param comm: community record with ``co_id`` already set (it is
         concatenated into a URL, so it must be a ``str``).
     :raises AttributeError: if the developer name is not on the page.
     """
     flags = re.S | re.M
     co_url = 'http://www.fangdi.com.cn/' + comm_url
     response = requests.get(co_url, headers=self.headers)
     html = response.content.decode('gbk')
     comm.co_develops = re.search('企业名称:.*?<a.*?>(.*?)<', html,
                                  flags).group(1)
     comm.insert_db()

     add_build_url = 'http://www.fangdi.com.cn/Presell.asp?projectID=' + comm.co_id
     result = requests.get(add_build_url, headers=self.headers)
     html_str = result.content.decode('gbk')
     # Raw string: '\(' and '\)' are regex escapes, not str escapes.
     presell_pairs = re.findall(
         r"javascript:SetSelect\(.*?,.*?,.*?,.*?,.*?,'(.*?)','(.*?)'\)",
         html_str, flags)
     for presell_id, start_id in presell_pairs:
         # NOTE(review): ProjectID is a fixed literal here rather than
         # derived from comm.co_id; confirm the site ignores it.
         build_detail_url = 'http://www.fangdi.com.cn/building.asp?ProjectID=OTU4OHwyMDE4LTQtNHwxNw&PreSell_ID=' + presell_id + '&Start_ID=' + start_id
         massage = requests.get(build_detail_url,
                                headers=self.headers).content.decode('gbk')
         build_rows = re.findall('class="indextabletxt">.*?</tr>',
                                 massage, flags)
         for row in build_rows:
             try:
                 build = Building(co_index)
                 build.bu_num = re.search('<a.*?>(.*?)</a>', row,
                                          flags).group(1)
                 build.bu_all_house = re.search(
                     '<a.*?<td.*?<td.*?<td.*?>(.*?)<', row,
                     flags).group(1)
                 build.bu_build_size = re.search(
                     '<a.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', row,
                     flags).group(1)
                 build.bu_id = re.search('Param=(.*?)=', row,
                                         flags).group(1)
                 build.co_id = comm.co_id
                 build.insert_db()
                 house_url = re.search('href="(.*?)"', row,
                                       flags).group(1)
                 self.get_house_info(house_url, build.bu_id, build.co_id)
             except Exception as e:
                 print(
                     '楼栋错误,co_index={},url={}'.format(
                         co_index, build_detail_url), e)
예제 #25
0
    def build_parse(self, co_id):
        """Store every building of a project and crawl its houses.

        The project's left-hand menu page is fetched; the building links
        are the ``td[colspan=2]`` anchors minus the first four and the
        last one.  The house link for the n-th building is read from the
        n-th ``td[width=54%]`` cell.

        :param co_id: project id; used in the URL and stored on each
            building.

        A building that fails to download or parse is reported and
        skipped; a failure while crawling its houses is reported as well.
        """
        flags = re.S | re.M
        url = "http://spf.tlfdc.cn/prjleft.aspx?projectid=" + str(co_id)
        res = requests.get(url, headers=self.headers)
        con_html = etree.HTML(res.text)
        build_url_list = con_html.xpath("//td[@colspan='2']/a/@href")[4:-1]
        a = con_html.xpath("//td[@width='54%']")

        for index, build_path in enumerate(build_url_list):
            build_info_url = "http://spf.tlfdc.cn/" + build_path
            try:
                con = requests.get(build_info_url, headers=self.headers).text
                # A fresh record per building so no state is shared
                # between rows.
                bu = Building(co_index)
                bu.co_id = co_id
                bu.bu_pre_sale_date = re.search('发证日期.*?Date">(.*?)<', con,
                                                flags).group(1)
                bu.bu_num = re.search('幢.*?did">(.*?)<', con,
                                      flags).group(1)
                bu.bu_pre_sale = re.search('编号.*?no">(.*?)<', con,
                                           flags).group(1)
                bu.bu_address = re.search('位置.*?ss">(.*?)<', con,
                                          flags).group(1)
                bu.bu_build_size = re.search('面积.*?Area">(.*?)<', con,
                                             flags).group(1)
                bu.bu_type = re.search('性质.*?type">(.*?)<', con,
                                       flags).group(1)
                bu.bu_all_house = re.search('套数.*?number">(.*?)<', con,
                                            flags).group(1)
                bu.bu_id = re.search(r'id=(\d+)', build_path).group(1)

                bu.insert_db()
            except Exception as e:
                print(
                    '楼栋错误,co_index={},url={}'.format(co_index, build_info_url),
                    e)
                continue
            try:
                house_url = a[index].xpath("./a/@href")[0]
                self.house_parse(house_url, co_id, bu.bu_id)
            except Exception as e:
                # Previously swallowed silently; report it so a missing
                # house link is visible.
                print(
                    '房屋错误,co_index={},url={}'.format(co_index, build_info_url),
                    e)
                continue
예제 #26
0
 def get_build_info(self, co_id):
     """Store every building listed for a project and crawl its houses.

     :param co_id: project id; used in the listing URL and stored on each
         building.

     A failed listing request is reported and ends the call.  A row that
     fails to parse is reported and skipped, leaving the remaining rows
     to be processed.
     """
     flags = re.S | re.M
     # Built outside the try so the error messages can always show it.
     build_url = ('http://222.184.103.50:7700/WW/ZHList.aspx?projectID='
                  + str(co_id) + '&projectname=')
     try:
         response = requests.get(build_url, headers=self.headers)
         build_info_list = re.findall('<tr bgcolor="#f5f5f5">.*?</tr>',
                                      response.text, flags)
     except Exception as e:
         print('请求错误,co_index={},url={}'.format(co_index, build_url), e)
         return

     for i in build_info_list:
         try:
             build = Building(co_index)
             build.bu_num = re.search('<a id="LH".*?>(.*?)<', i,
                                      flags).group(1).strip()
             build.bu_all_house = re.search('<td.*?<td.*?>(.*?)<', i,
                                            flags).group(1).strip()
             build.bu_id = re.search('ZNo=(.*?)"', i,
                                     flags).group(1).strip()
             build.co_id = co_id
             build.insert_db()
             self.get_house_url(build.bu_id, co_id)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
예제 #27
0
 def get_build_info(self, co_id):
     """Store every building the JSON endpoint returns for a project.

     Posts ``activityId=<co_id>&module=jtsActBuildingInfo`` to the ``jdi``
     endpoint, reads the rows under ``ROWS -> ROW`` and, for each row,
     inserts a ``Building`` and calls ``self.get_house_info``.

     :param co_id: project id; sent in the payload and stored on each
         building.
     """
     # (Building attribute, key in the JSON row), in assignment order.
     # bu_address and bu_num are both read from LOCATION.
     row_fields = (
         ('bu_all_size', 'BUILDING_AREA'),
         ('bu_address', 'LOCATION'),
         ('bu_num', 'LOCATION'),
         ('bu_floor', 'TOTAL_FLOORS'),
         ('bu_all_house', 'TOTAL_SET'),
         ('co_build_structural', 'STRUCTURE'),
         ('bu_id', 'RESOURCE_GUID'),
     )
     endpoint = 'http://www.yanjifc.com/jdi'
     body = "activityId={}&module=jtsActBuildingInfo".format(str(co_id))
     reply = requests.post(endpoint, data=body, headers=self.headers)
     rows = reply.json()['ROWS']['ROW']
     for row in rows:
         build = Building(co_index)
         for attr, key in row_fields:
             setattr(build, attr, self.dict_get(row, key))
         build.co_id = co_id
         build.insert_db()
         self.get_house_info(co_id, build.bu_id)
예제 #28
0
    def get_build_info(self, build_url_list, co_id):
        """Store the buildings listed on each project page, then crawl houses.

        For every link the project page is fetched, the building table
        section is cut out of it, and each table row containing an
        ``href`` becomes one ``Building``.  Afterwards all house-list
        links found on the page are passed to ``self.get_house_info``.

        :param build_url_list: paths relative to
            ``http://gold.ncfdc.com.cn/``; ``amp;`` is removed from each.
        :param co_id: community id stored on every building.
        :raises AttributeError: if the project name or the building table
            section is not found on a page.
        """
        flags = re.S | re.M
        for link in build_url_list:
            build_url = 'http://gold.ncfdc.com.cn/' + link.replace('amp;', '')
            res = requests.get(build_url)
            page = res.text

            co_name = re.search('ctl15_proname">(.*?)<', page,
                                flags).group(1)
            table_html = re.search('项目楼栋列表.*?ctl17_fLinks_pDataShow',
                                   page, flags).group()
            # Only rows that carry a link describe a building.
            rows = [
                row for row in re.findall('<tr>.*?</tr>', table_html, flags)
                if 'href' in row
            ]
            for info in rows:
                try:
                    build = Building(co_index)
                    build.co_name = co_name
                    build.bu_num = re.search(
                        '<tr>.*?<td>.*?<a href=.*?>(.*?)<', info,
                        flags).group(1)
                    build.bu_pre_sale = re.search(
                        'onclick="BinSHouseInfo.*?>(.*?)<', info,
                        flags).group(1)
                    build.bu_pre_sale_date = re.search(
                        'onclick="BinSHouseInfo.*?<td>(.*?)<', info,
                        flags).group(1)
                    build.bu_all_house = re.search('color:#ec5f00;">(.*?)<',
                                                   info, flags).group(1)
                    build.bu_id = re.search("DisplayB_ld&hrefID=(.*?)'", info,
                                            flags).group(1)
                    build.co_id = co_id
                    build.insert_db()
                except Exception as e:
                    print(
                        '楼栋错误,co_index={},url={}'.format(co_index, build_url),
                        e)

            house_url_list = re.findall(
                "</span>.*?</td><td>.*?<a href='(.*?xs.*?)' target=\"_blank\">.*?查看",
                page, flags)
            self.get_house_info(house_url_list)
예제 #29
0
 def build_parse(self, co_id):
     """Store every building of a project and crawl its houses.

     Posts ``itemRecord=<co_id>`` to ``GetBTable.ashx`` and treats every
     ``<tr id...>`` fragment of the reply as one building.

     :param co_id: project id; posted to the endpoint and stored on each
         building.

     A row without a building id is logged and skipped.  A row whose
     pre-sale fields cannot be parsed is logged but still inserted with
     whatever fields were read.
     """
     bu_url = "http://www.zyfgj.org/spf/GetBTable.ashx"
     bu_data = {"itemRecord": co_id, "houseCode": 0}
     res = requests.post(bu_url, data=bu_data, headers=self.headers)
     con = res.content.decode()
     for bo in re.findall('<tr id.*?</tr>', con):
         # Raw string: '\)' is a regex escape, not a str escape.
         id_match = re.search(r'GetData.*?,(.*?)\)', bo)
         if id_match is None:
             # One malformed row must not abort the remaining buildings.
             log.error("{}楼栋无id信息".format(bo))
             continue
         bu = Building(co_index)
         bu.co_id = co_id
         bu.bu_id = id_match.group(1).strip("'")
         try:
             bu.bu_num = re.search('预售证时间:.*?<td>(.*?)</td', bo).group(1)
             bu.bu_pre_sale = re.search('预售证号:(.*?)</td', bo).group(1)
             bu.bu_pre_sale_date = re.search('预售证时间:(.*?)</td', bo).group(1)
             bu.bu_all_house = re.search(r'预售证号:.*?<td>(\d+)</td',
                                         bo).group(1)
         except Exception as e:
             log.error("{}楼栋无预售号等信息{}".format(bo, e))
         bu.insert_db()
         self.house_parse(co_id, bu.bu_id)
예제 #30
0
    def build_crawler(self, co_id, co_name, comm_con):
        """Store the buildings listed in a community page and crawl houses.

        The table following the ``查看楼盘表`` marker is cut out of
        ``comm_con``; each ``<tr>`` in it yields one ``Building`` whose
        number, house count and size come from the 2nd, 3rd and 4th
        ``<td>`` text nodes.

        :param co_id: community id stored on every building.
        :param co_name: community name stored on every building.
        :param comm_con: HTML text of the community page.
        :raises AttributeError: if the building table is not found.

        A row that fails to parse is reported and skipped.
        """
        flags = re.S | re.M
        build_list = re.search('查看楼盘表.*?<tr>(.*?)</table>', comm_con,
                               flags).group(1)
        rows = re.findall('<tr>(.*?)</tr>', build_list, flags)

        for bul in rows:
            try:
                # A fresh record per row so no state is shared between
                # buildings.
                bu = Building(co_index, co_id=co_id, co_name=co_name)
                buli = etree.HTML(bul).xpath("//td/text()")

                bu_num = buli[1]
                bu.bu_num = bu_num
                bu.bu_all_house = buli[2]
                # NOTE(review): other crawlers in this file store sizes on
                # bu_build_size / bu_all_size; confirm `size` is intended.
                bu.size = buli[3]
                house_url = re.search(r'"(.*?)" t', bul).group(1)
                bu_id = re.search(r'-(\d+)', house_url).group(1)
                bu.bu_id = bu_id

                bu.insert_db()
            except Exception as e:
                # Was a bare `except:` that hid every failure, including
                # KeyboardInterrupt.
                print('楼栋错误,co_index={},co_id={}'.format(co_index, co_id), e)
                continue
            self.house_crawler(house_url, bu_num, co_id, bu_id)