Esempio n. 1
0
 def get_build_info(self, build_url_list):
     """Process building links and forward the results to ``get_house_info``.

     For each entry the ``xqbm`` code at the end of the link is extracted and
     used to build the ``donginfo.aspx`` page URL. A ``Building`` is loaded
     with regex rules and passed, via ``to_dict()``, to ``ProducerListUrl``;
     whatever ``get_details()`` returns is handed to ``self.get_house_info``.

     A failure on one entry is printed and does not stop the loop.

     :param build_url_list: iterable of link strings containing ``xqbm=<code>``.
     """
     for i in build_url_list:
         # Bind the URL before the try block: if the failure happens before
         # the real page URL is built (e.g. the link has no ``xqbm=`` part),
         # the handler would otherwise hit an unbound name or report a stale
         # URL left over from the previous iteration.
         build_url = i
         try:
             build = Building(co_index)
             build_code = re.search(r'xqbm=(.*?)$', i).group(1)
             build_url = 'http://zjjg.0557fdc.com:9555/xiaoqu/donginfo.aspx?xqbm=' + build_code
             # Raw strings keep regex escapes such as ``\?`` literal without
             # relying on Python's deprecated invalid-escape handling.
             build.bu_num = r'Labeldongmc">(.*?)<'
             build.bu_pre_sale = r'Labelyszheng">(.*?)<'
             build.bu_floor = r'Labelsceng">(.*?)<'
             build.bu_address = r'Label1zuoluo">(.*?)<'
             build.bo_build_start_time = r'Label1kaigong">(.*?)<'
             build.co_build_structural = r'Labeljiegou">(.*?)<'
             build.co_id = r'donginfo.aspx\?xqbm=(.*?)"'
             build.bu_id = r'id="DropDownList1".*?value="(.*?)"'
             p = ProducerListUrl(page_url=build_url,
                                 request_type='get',
                                 encode='utf-8',
                                 analyzer_rules_dict=build.to_dict(),
                                 current_url_rule=r'location\.href=(.*?)"',
                                 analyzer_type='regex',
                                 headers=self.headers)
             house_url_list = p.get_details()
             self.get_house_info(house_url_list)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
Esempio n. 2
0
 def get_comm_info(self, comm_url_list):
     """Process community pages and forward the results to ``get_build_info``.

     Each entry of ``comm_url_list`` is used directly as the page URL. A
     ``Comm`` is loaded with the regex rules below and passed, via
     ``to_dict()``, to ``ProducerListUrl``; the value returned by
     ``get_details()`` is handed to ``self.get_build_info``.

     Any exception raised for an entry is printed and the loop continues.

     :param comm_url_list: iterable of page URLs.
     """
     # (attribute name, regex) pairs, applied to the Comm in this order.
     field_patterns = (
         ('co_id', '楼盘首页.*?aid-(.*?)/'),
         ('co_name', 'class="ls">(.*?)<'),
         ('co_type', '物业类型</em>(.*?)<'),
         ('area', '区域所属:</em>(.*?)<'),
         ('co_green', '绿 化 率:</em>(.*?)<'),
         ('co_volumetric', '容 积 率:</em>(.*?)<'),
         ('co_build_type', '楼&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;层:</em>(.*?)<'),
         ('co_size', '占地面积:</em>(.*?)<'),
         ('co_build_size', '建筑面积:</em>(.*?)<'),
         ('co_develops', '开&nbsp;&nbsp;发&nbsp;&nbsp;商:</em><.*?target="_blank">(.*?)<'),
         ('co_address', '项目地址:</em>(.*?)<'),
     )
     for page_link in comm_url_list:
         try:
             community = Comm(co_index)
             for field, pattern in field_patterns:
                 setattr(community, field, pattern)
             rules = community.to_dict()
             producer = ProducerListUrl(
                 page_url=page_link,
                 request_type='get',
                 encode='gbk',
                 analyzer_rules_dict=rules,
                 current_url_rule='colspan="3" align="right"><a href="(.*?)"',
                 analyzer_type='regex',
                 headers=self.headers)
             self.get_build_info(producer.get_details())
         except Exception as e:
             print(e)
Esempio n. 3
0
 def get_comm_info(self, comm_url_list):
     """Process the detail page of every community link.

     For each entry, ``'view'`` in the link is replaced with ``'detail'`` to
     obtain the page URL. A ``Comm`` loaded with the regex rules below is
     passed, via ``to_dict()``, to ``ProducerListUrl`` and ``get_details()``
     is called; its return value is not used here.

     Any exception raised for an entry is printed and the loop continues.

     :param comm_url_list: iterable of link strings.
     """
     # (attribute name, regex) pairs, applied to the Comm in this order.
     field_patterns = (
         ('co_type', '物业类型:.*?<dd>(.*?)<'),
         ('area', '区域所属:.*?<dd>(.*?)<'),
         ('co_build_size', '建筑面积:.*?<dd>(.*?)<'),
         ('co_size', '占地面积:.*?<dd>(.*?)<'),
         ('co_green', '绿化率:.*?<dd><.*?>(.*?)<'),
         ('co_build_type', '楼  层:.*?<dd>(.*?)<'),
         ('co_volumetric', '容积率:.*?<dd><.*?>(.*?)<'),
         ('co_id', '楼盘首页.*?newhouse/.*?/(.*?)/'),
         ('co_name', '<h1 class="title">(.*?)<'),
         ('co_address', '楼盘地址:.*?<dd>(.*?)<'),
         # NOTE(review): unlike its siblings this rule has no '>' after
         # '<dd' - kept exactly as found; confirm against the live page.
         ('co_develops', '开发商:.*?<dd(.*?)<'),
     )
     for link in comm_url_list:
         try:
             community = Comm(co_index)
             detail_url = link.replace('view', 'detail')
             for field, pattern in field_patterns:
                 setattr(community, field, pattern)
             ProducerListUrl(page_url=detail_url,
                             request_type='get',
                             encode='gbk',
                             analyzer_rules_dict=community.to_dict(),
                             analyzer_type='regex',
                             headers=self.headers).get_details()
         except Exception as e:
             print(e)
Esempio n. 4
0
 def get_comm_info(self, comm_url_list):
     """Process community pages, then trigger the matching building page.

     Each entry is appended to ``http://www.gzbjfc.com/`` to form the page
     URL. A ``Comm`` loaded with regex rules is passed, via ``to_dict()``, to
     ``ProducerListUrl`` and ``get_details()`` is called. Afterwards the URL
     with ``'Info'`` replaced by ``'Building'`` is handed to
     ``self.get_build_info``.

     A failure on one entry is printed and does not stop the loop.

     :param comm_url_list: iterable of relative link strings.
     """
     for i in comm_url_list:
         # Bind the name before the try block so the error handler can always
         # format it, even when the failure happens before the full URL is
         # built; otherwise it would be unbound or carry the previous
         # iteration's URL.
         comm_url = i
         try:
             comm = Comm(co_index)
             comm_url = 'http://www.gzbjfc.com/' + i
             comm.co_name = r'cph_hif1_xmmc.*?<.*?>(.*?)<'
             comm.co_pre_sale = r'cph_hif1_xsxkz.*?<.*?>(.*?)<'
             comm.co_address = r'cph_hif1_zl.*?<.*?>(.*?)<'
             comm.co_develops = r'cph_hif1_kfs.*?<.*?>(.*?)<'
             comm.co_handed_time = r'cph_hif1_jfsj.*?<.*?>(.*?)<'
             comm.co_build_size = r'cph_hif1_jzmj.*?>(.*?)<'
             comm.co_all_house = r'cph_hif1_fwts.*?>(.*?)<'
             comm.co_id = r'hdl1_hfYszh" value="(.*?)"'
             p = ProducerListUrl(page_url=comm_url,
                                 request_type='get',
                                 encode='utf-8',
                                 analyzer_rules_dict=comm.to_dict(),
                                 analyzer_type='regex',
                                 headers=self.headers)
             p.get_details()
             # Building information: same URL with 'Info' swapped for
             # 'Building'. A single URL string (not a list) is passed on.
             build_url = comm_url.replace('Info', 'Building')
             self.get_build_info(build_url)
         except Exception as e:
             print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
Esempio n. 5
0
 def get_comm_info(self, all_url_list):
     """Process one community page and bump the module-level ``count``.

     ``all_url_list`` is passed unchanged as ``page_url`` to
     ``ProducerListUrl`` (presumably a single URL despite the name - confirm
     against the caller). A ``Comm`` loaded with regex rules supplies the
     analyzer rules via ``to_dict()``. After ``get_details()`` succeeds, the
     global ``count`` is incremented and printed.

     Any exception is printed together with ``co_index`` and the URL.

     :param all_url_list: value used as the page URL.
     """
     global count
     try:
         c = Comm(co_index)
         # Raw strings keep regex escapes (``\.``, ``\?``) literal instead of
         # relying on Python's deprecated handling of unknown escapes.
         c.co_name = r"class='newtopleft font-k'>(.*?)</li>"
         c.co_id = r'form1" method="post" action="house_base\.aspx\?id=(.*?)"'
         c.co_address = r"项目位置:</li><li class='DetaimidR font-f'>(.*?)</li></ul>"
         c.area = r"地区/商圈:</li><li class='DetaimidR font-f'>(.*?)<"
         c.co_develops = r"开发商:</li><li class='DetaimidR font-f'>(.*?)</li>"
         c.co_volumetric = r"容积率:</li><li class='DetaimidR font-f'>(.*?)<"
         c.co_green = r"绿化率:</li><li class='DetaimidR font-f'>(.*?)<"
         c.co_all_house = r"总户数:</li><li class='DetaimidR font-f'>(.*?)<"
         c.co_open_time = r"开盘时间:</li><li class='DetaimidR font-f'>(.*?)<"
         c.co_land_use = r"国土使用证:</li><li class='DetaimidR font-f'>(.*?)<"
         c.co_plan_pro = r"规划许可证:</li><li class='DetaimidR font-f'>(.*?)<"
         c.co_build_size = r"建筑面积:</li><li class='DetaimidR font-f'>(.*?)<"
         data_list = c.to_dict()
         p = ProducerListUrl(page_url=all_url_list,
                             request_type='get',
                             encode='utf-8',
                             analyzer_rules_dict=data_list,
                             analyzer_type='regex',
                             headers=self.headers)
         p.get_details()
         count += 1
         print(count)
     except Exception as e:
         print('小区错误,co_index={},url={}'.format(co_index, all_url_list), e)
Esempio n. 6
0
 def get_house_info(self, house_url_list):
     """Process the room page for every house link.

     ``dongid`` and ``roomid`` are extracted from each link and used to build
     the ``roominfo.aspx`` URL. A ``House`` loaded with regex rules is passed,
     via ``to_dict()``, to ``ProducerListUrl`` and ``get_details()`` is
     called; its return value is not used here.

     A failure on one entry is printed and does not stop the loop.

     :param house_url_list: iterable of link strings containing
         ``dongid=<id>&`` and ``roomid=<id>&``.
     """
     for i in house_url_list:
         # Bind the name before the try block: when a link lacks ``dongid=``
         # or ``roomid=`` the failure happens before the real URL exists, and
         # the handler would otherwise hit an unbound name or print the URL
         # of the previous iteration.
         house_url = i
         try:
             dongid = re.search(r'dongid=(.*?)&', i).group(1)
             roomid = re.search(r'roomid=(.*?)&', i).group(1)
             house_url = 'http://zjjg.0557fdc.com:9555/xiaoqu/roominfo.aspx?dongid=' + dongid + '&roomid=' + roomid
             house = House(co_index)
             house.co_name = r'Labelxqmc">(.*?)<'
             house.area = r'Labelxzq">(.*?)<'
             house.bu_num = r'Labeldongmc">(.*?)<'
             house.ho_type = r'Labelyxyongtu">(.*?)<'
             house.ho_name = r'<span id="Labelroommc".*?>(.*?)</span>'
             house.ho_build_size = r'Labeljzmianji">(.*?)<'
             house.ho_true_size = r'Labeltaonei">(.*?)<'
             house.ho_share_size = r'Labelgongtan">(.*?)<'
             house.ho_room_type = r'Labelhuxing">(.*?)<'
             house.bu_id = r'dongid=(.*?)&'
             p = ProducerListUrl(page_url=house_url,
                                 request_type='get',
                                 encode='utf-8',
                                 analyzer_rules_dict=house.to_dict(),
                                 analyzer_type='regex',
                                 headers=self.headers)
             p.get_details()
         except Exception as e:
             print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
Esempio n. 7
0
 def get_comm_info(self, comm_url_list):
     """Process community info pages and forward results to ``get_build_info``.

     Each entry is split on ``','``; the first two parts are used to build the
     ``property_<a>_<b>_info.htm`` URL. A ``Comm`` loaded with regex rules is
     passed, via ``to_dict()``, to ``ProducerListUrl``. After
     ``get_details()`` returns, the module-level ``count`` is incremented and
     printed, and the returned value is handed to ``self.get_build_info``.

     A failure on one entry is printed and does not stop the loop.

     :param comm_url_list: iterable of ``"<a>,<b>"`` strings.
     """
     global count
     for i in comm_url_list:
         # Bind the name before the try block: an entry without a comma fails
         # on ``code[1]`` before the URL is built, and the handler would
         # otherwise hit an unbound name or print the previous entry's URL.
         comm_url = i
         try:
             code = i.split(',')
             comm_url = 'http://www.tmsf.com/newhouse/property_' + code[
                 0] + '_' + code[1] + '_info.htm'
             comm = Comm(co_index)
             comm.co_name = r'buidname.*?>(.*?)<'
             comm.co_address = r'--位置行--.*?<span.*?title="(.*?)"'
             comm.co_build_type = r'建筑形式:<.*?>(.*?)<'
             comm.co_develops = r'项目公司:<.*?>(.*?)<'
             comm.co_volumetric = r'容 积 率:</span>(.*?)<'
             comm.co_green = r'绿 化 率:</span>(.*?)<'
             comm.co_size = r'占地面积:</span>(.*?)<'
             comm.co_build_size = r'总建筑面积:</span>(.*?)<'
             comm.co_all_house = r'总户数:</span>(.*?)<'
             comm.co_id = r'info" href="/newhouse/property_(.*?)_info'
             p = ProducerListUrl(page_url=comm_url,
                                 request_type='get',
                                 encode='utf-8',
                                 analyzer_rules_dict=comm.to_dict(),
                                 current_url_rule=r'一房一价<.*?href="(.*?)"',
                                 analyzer_type='regex',
                                 headers=self.headers)
             build_all_url = p.get_details()
             count += 1
             print('comm:', count)
             self.get_build_info(build_all_url)
         except Exception as e:
             print('小区页面,co_index={},url={}'.format(co_index, comm_url), e)
Esempio n. 8
0
 def get_comm_url(self, all_url_list):
     """Return what ``ProducerListUrl.get_current_page_url()`` yields for a page.

     ``all_url_list`` is passed unchanged as ``page_url``; no analyzer rules
     are supplied, only the regex used as ``current_url_rule``.

     :param all_url_list: value used as the page URL.
     :return: the result of ``get_current_page_url()``.
     """
     return ProducerListUrl(
         page_url=all_url_list,
         request_type='get',
         encode='utf-8',
         current_url_rule="<a class='anone' href='(.*?)'",
         analyzer_rules_dict=None,
         analyzer_type='regex',
         headers=self.headers,
     ).get_current_page_url()
Esempio n. 9
0
 def start_crawler(self):
     """Walk listing pages 1..``self.page`` and pass each result to ``get_comm_info``.

     For every page number the URL ``self.url + "More_xm.aspx?page=<n>"`` is
     requested through ``ProducerListUrl`` with an XPath rule, and the value
     returned by ``get_current_page_url()`` is handed to
     ``self.get_comm_info``.
     """
     last_page = self.page + 1
     for page_no in range(1, last_page):
         listing = ProducerListUrl(
             page_url=self.url + "More_xm.aspx?page=" + str(page_no),
             request_type='get',
             encode='utf-8',
             analyzer_rules_dict=None,
             current_url_rule="//td[@align='left']/a/@href",
             analyzer_type='xpath',
             headers=self.headers,
         )
         self.get_comm_info(listing.get_current_page_url())
Esempio n. 10
0
 def get_build_info(self, all_build_url_list):
     """Run the building rules against one page and return ``get_details()``.

     ``all_build_url_list`` is passed unchanged as ``page_url`` (presumably a
     single URL despite the name - confirm against the caller). A
     ``Building`` loaded with regex rules supplies the analyzer rules via
     ``to_dict()``.

     :param all_build_url_list: value used as the page URL.
     :return: whatever ``ProducerListUrl.get_details()`` returns.
     """
     b = Building(co_index)
     # Raw strings keep ``\(`` / ``\)`` as regex escapes; in a normal string
     # literal they are invalid escape sequences that Python only tolerates
     # with a warning.
     b.co_id = r"onclick=GetData\('(.*?)',"
     b.bu_id = r"onclick=GetData\('.*?','(.*?)'"
     b.bu_num = r"font12yellow-leftA'>.*?</span>套</td><td>.*?</td><td>(.*?)<"
     b.bu_all_house = r"font12yellow-leftA'>(.*?)<"
     data_list = b.to_dict()
     p = ProducerListUrl(
         page_url=all_build_url_list,
         request_type='get',
         encode='utf-8',
         analyzer_rules_dict=data_list,
         current_url_rule=r"onclick=GetData\('(.*?)','(.*?)'\)",
         analyzer_type='regex',
         headers=self.headers)
     house_url_list = p.get_details()
     return house_url_list
Esempio n. 11
0
 def get_house_info(self, build_num, sid):
     """Process the ``showbox`` endpoint for one building.

     The URL is built by concatenating ``build_num`` and ``sid`` into the
     ``NewPropertyHz_showbox.jspx`` query string. A ``House`` loaded with
     regex rules is passed, via ``to_dict()``, to ``ProducerListUrl`` and
     ``get_details()`` is called; its return value is not used here.

     Any exception is printed together with ``co_index`` and the URL.

     :param build_num: string used as the ``buildingid`` query value.
     :param sid: string used as the ``sid`` query value.
     """
     # Bind the name before the try block: if the concatenation itself fails
     # (e.g. a non-string argument) the handler would otherwise raise on the
     # unbound name and hide the original error.
     house_url = None
     try:
         house_url = 'http://www.tmsf.com/newhouse/NewPropertyHz_showbox.jspx?buildingid=' + build_num + '&sid=' + sid
         house = House(co_index)
         house.bu_id = r'buildingid":(.*?),'
         house.co_build_size = r'builtuparea":(.*?),'
         house.ho_price = r'declarationofroughprice":(.*?),'
         house.ho_name = r'houseno":(.*?),'
         house.ho_true_size = r'setinsidefloorarea":(.*?),'
         house.ho_share_size = r'poolconstructionarea":(.*?),'
         house.ho_type = r'houseusage":(.*?),'
         p_2 = ProducerListUrl(page_url=house_url,
                               request_type='get',
                               encode='utf-8',
                               analyzer_rules_dict=house.to_dict(),
                               analyzer_type='regex',
                               headers=self.headers)
         p_2.get_details()
     except Exception as e:
         print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
Esempio n. 12
0
    def start_crawler(self):
        """Walk every listing page and pass each page's result to ``get_build_info``.

        The page count is obtained from ``AllListUrl.get_page_count()`` on the
        module-level ``url`` and converted with ``int``. For each page number
        ``url + '&Page=<n>'`` is requested through ``ProducerListUrl`` and the
        value returned by ``get_current_page_url()`` is handed to
        ``self.get_build_info``.
        """
        b = AllListUrl(
            first_page_url=url,
            request_method='get',
            analyzer_type='regex',
            encode='gbk',
            page_count_rule=r'共(.*?)页',
        )
        page = b.get_page_count()

        for i in range(1, int(page) + 1):
            all_page_url = url + '&Page=' + str(i)
            p = ProducerListUrl(page_url=all_page_url,
                                request_type='get',
                                encode='gbk',
                                analyzer_rules_dict=None,
                                # Raw string: ``\(`` / ``\)`` are regex
                                # escapes, not valid Python string escapes.
                                current_url_rule=r"eval\('openBldg\((.*?)\)",
                                analyzer_type='regex',
                                headers=self.headers)
            comm_url_list = p.get_current_page_url()
            self.get_build_info(comm_url_list)
Esempio n. 13
0
 def get_house_info(self, house_url_list):
     """Process the house page for every relative link.

     Each entry is appended to ``http://www.ndjsj.gov.cn/House/``. A ``House``
     loaded with regex rules is passed, via ``to_dict()``, to
     ``ProducerListUrl`` and ``get_details()`` is called; its return value is
     not used here.

     A failure on one entry is printed and does not stop the loop.

     :param house_url_list: iterable of relative link strings.
     """
     for i in house_url_list:
         # Bind the name before the try block so the handler can always
         # format it; otherwise an early failure leaves it unbound or
         # pointing at the previous iteration's URL.
         house_url = i
         try:
             house = House(co_index)
             house_url = 'http://www.ndjsj.gov.cn/House/' + i
             house.bu_num = r'幢  号:.*?<td.*?>(.*?)<'
             house.ho_name = r'房  号:.*?<td.*?>(.*?)<'
             house.co_name = r'项目名称:.*?<td.*?>(.*?)<'
             house.ho_build_size = r'建筑面积:.*?<td.*?>(.*?)<'
             house.ho_true_size = r'套内面积:.*?<td.*?>(.*?)<'
             house.ho_share_size = r'分摊面积:.*?<td.*?>(.*?)<'
             house.ho_type = r'房屋用途:.*?<td.*?>(.*?)<'
             house.ho_floor = r'所 在 层:.*?<td.*?>(.*?)<'
             house.ho_room_type = r'房屋户型:.*?<td.*?>(.*?)<'
             p = ProducerListUrl(page_url=house_url,
                                 request_type='get', encode='utf-8',
                                 analyzer_rules_dict=house.to_dict(),
                                 analyzer_type='regex',
                                 headers=self.headers)
             p.get_details()
         except Exception as e:
             print('宁德房号错误,url={}'.format(house_url), e)
Esempio n. 14
0
 def get_comm_info(self, comm_detail_url_list):
     """Process community detail pages and forward results to ``get_build_info``.

     Each entry is appended to ``http://www.ndjsj.gov.cn``. A ``Comm`` loaded
     with regex rules is passed, via ``to_dict()``, to ``ProducerListUrl``;
     the value returned by ``get_details()`` is handed to
     ``self.get_build_info``.

     A failure on one entry is printed and does not stop the loop.

     :param comm_detail_url_list: iterable of relative link strings.
     """
     for i in comm_detail_url_list:
         # Bind the name before the try block so the handler can always
         # format it; otherwise an early failure leaves it unbound or
         # pointing at the previous iteration's URL.
         comm_url = i
         try:
             comm = Comm(co_index)
             comm_url = 'http://www.ndjsj.gov.cn' + i
             comm.co_develops = r'公司名称:.*?<td.*?>(.*?)<'
             comm.co_name = r'项目名称:.*?<td.*?>(.*?)<'
             comm.co_pre_sale = r'预售许可证:.*?<td.*?>(.*?)<'
             comm.co_address = r'项目坐落:.*?<td.*?>(.*?)<'
             comm.co_use = r'规划用途:.*?<td.*?>(.*?)<'
             comm.co_size = r'占地面积:.*?<td.*?>(.*?)<'
             comm.co_build_size = r'建筑面积:.*?<td.*?>(.*?)<'
             p = ProducerListUrl(page_url=comm_url,
                                 request_type='get', encode='utf-8',
                                 analyzer_rules_dict=comm.to_dict(),
                                 # Raw string: ``\?`` is a regex escape, not
                                 # a valid Python string escape.
                                 current_url_rule=r"(BuildingInfo\?BuildingId=.*?)'",
                                 analyzer_type='regex',
                                 headers=self.headers)
             build_url_list = p.get_details()
             self.get_build_info(build_url_list)
         except Exception as e:
             print('宁德小区错误,url={}'.format(comm_url), e)
Esempio n. 15
0
 def start_crawler(self):
     """Walk every listing page and pass each page's result to ``get_comm_info``.

     The page count is obtained from ``AllListUrl.get_page_count()`` on the
     module-level ``url`` and converted with ``int``. For each page number the
     ``page-<n>.html`` listing URL is requested through ``ProducerListUrl``
     with an XPath rule, and the value returned by ``get_current_page_url()``
     is handed to ``self.get_comm_info``.
     """
     b = AllListUrl(
         first_page_url=url,
         request_method='get',
         analyzer_type='regex',
         encode='gbk',
         # Raw string: ``\.`` is a regex escape, not a valid Python string
         # escape.
         page_count_rule=r'>>></a>.*?href=".*?page-(.*?)\.html',
     )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         all_url = 'http://www.jmfc.com.cn/index/caid-2/addno-1/page-' + str(
             i) + '.html'
         p = ProducerListUrl(
             page_url=all_url,
             request_type='get',
             encode='gbk',
             analyzer_rules_dict=None,
             current_url_rule="/html/body/div[5]/div[6]/div/div[2]/h3/a/@href",
             analyzer_type='xpath',
             headers=self.headers)
         comm_url_list = p.get_current_page_url()
         self.get_comm_info(comm_url_list)
Esempio n. 16
0
 def start_crawler(self):
     """Walk every listing page and pass each page's result to ``get_comm_info``.

     The page count is obtained from ``AllListUrl.get_page_count()`` on
     ``self.start_url`` and converted with ``int``. For each page number the
     ``index-htm-page-<n>.html`` URL is requested through ``ProducerListUrl``
     with an XPath rule, and the value returned by ``get_current_page_url()``
     is handed to ``self.get_comm_info``.
     """
     b = AllListUrl(
         first_page_url=self.start_url,
         request_method='get',
         analyzer_type='regex',
         encode='utf-8',
         # Raw string: ``\d`` is a regex class, not a valid Python string
         # escape.
         page_count_rule=r'<cite>共.*?/(\d+)页',
     )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         url = "http://www.f0795.cn/house/index-htm-page-" + str(
             i) + ".html"
         p = ProducerListUrl(
             page_url=url,
             request_type='get',
             encode='utf-8',
             analyzer_rules_dict=None,
             current_url_rule="//ul[@class='list']//div[@class='text']/h3/a/@href",
             analyzer_type='xpath',
             headers=self.headers)
         comm_url_list = p.get_current_page_url()
         self.get_comm_info(comm_url_list)
Esempio n. 17
0
 def get_build_info(self, build_url_list):
     """Process building pages and forward the results to ``get_house_info``.

     Each entry is appended to ``http://www.ndjsj.gov.cn/House/``. A
     ``Building`` loaded with regex rules is passed, via ``to_dict()``, to
     ``ProducerListUrl``; the value returned by ``get_details()`` is handed to
     ``self.get_house_info``.

     A failure on one entry is printed and does not stop the loop.

     :param build_url_list: iterable of relative link strings.
     """
     for i in build_url_list:
         # Bind the name before the try block so the handler can always
         # format it; otherwise an early failure leaves it unbound or
         # pointing at the previous iteration's URL.
         build_url = i
         try:
             build = Building(co_index)
             build_url = 'http://www.ndjsj.gov.cn/House/' + i
             build.co_name = r'项目名称:.*?<td.*?>(.*?)<'
             build.bu_num = r'幢  号:.*?<td.*?>(.*?)<'
             build.bu_address = r'坐落位置:.*?<td.*?>(.*?)<'
             build.co_build_structural = r'建筑结构:.*?<td.*?>(.*?)<'
             build.bu_floor = r'总 层 数:.*?<td.*?>(.*?)<'
             build.bu_build_size = r'总 面 积:.*?<td.*?>(.*?)<'
             # NOTE: no rule is set for ``bu_type``; the original carried a
             # disabled assignment for it.
             build.bu_all_house = r'批准销售:.*?<td.*?>(.*?)<'
             p = ProducerListUrl(page_url=build_url,
                                 request_type='get', encode='utf-8',
                                 analyzer_rules_dict=build.to_dict(),
                                 current_url_rule=r'javascript:ShowTitle.*?href="(.*?)"',
                                 analyzer_type='regex',
                                 headers=self.headers)
             house_url_list = p.get_details()
             self.get_house_info(house_url_list)
         except Exception as e:
             print('宁德楼栋错误,url={}'.format(build_url), e)
Esempio n. 18
0
 def get_comm_info(self, comm_url_list):
     """Process community pages and forward the results to ``get_build_info``.

     Each entry is appended to ``http://www.fjnpfdc.com/House/``. A ``Comm``
     loaded with the regex rules below is passed, via ``to_dict()``, to
     ``ProducerListUrl``; the value returned by ``get_details()`` is handed to
     ``self.get_build_info``.

     A failure on one entry is printed with ``co_index`` and the raw entry,
     and the loop continues.

     :param comm_url_list: iterable of relative link strings.
     """
     # (attribute name, regex) pairs, applied to the Comm in this order.
     field_patterns = (
         ('co_develops', '公司名称:.*?<td.*?>(.*?)<'),
         ('co_pre_sale', '预售许可证:.*?<td.*?>(.*?)<'),
         ('co_name', '项目名称:.*?<td.*?>(.*?)<'),
         ('co_address', '项目坐落:.*?<td.*?>(.*?)<'),
         ('co_use', '规划用途:.*?<td.*?>(.*?)<'),
         ('co_build_size', '建筑面积:.*?<td.*?>(.*?)<'),
         ('co_id', 'ProjectId=(.*?)&'),
     )
     for link in comm_url_list:
         try:
             community = Comm(co_index)
             page_address = 'http://www.fjnpfdc.com/House/' + link
             for field, pattern in field_patterns:
                 setattr(community, field, pattern)
             producer = ProducerListUrl(
                 page_url=page_address,
                 request_type='get',
                 encode='gbk',
                 analyzer_rules_dict=community.to_dict(),
                 current_url_rule="<a href='(BuildingInfo.*?)'",
                 analyzer_type='regex',
                 headers=self.headers)
             self.get_build_info(producer.get_details())
         except Exception as e:
             print("co_index={},小区{}错误".format(co_index, link), e)
Esempio n. 19
0
 def get_build_info(self, more_build_url):
     """Process building pages and forward the results to ``get_house_info``.

     Each entry is appended to ``http://www.jmfc.com.cn/``. A ``Building``
     loaded with the regex rules below is passed, via ``to_dict()``, to
     ``ProducerListUrl``; the value returned by ``get_details()`` is handed to
     ``self.get_house_info``.

     Any exception raised for an entry is printed and the loop continues.

     :param more_build_url: iterable of relative link strings.
     """
     # (attribute name, regex) pairs, applied to the Building in this order.
     field_patterns = (
         ('bu_num', '<tr bgcolor="#FFFFFF">.*?<td.*?>(.*?)<'),
         ('co_id', '楼盘首页.*?aid-(.*?)/'),
         ('bu_id', '&addno=12&action=loupantable&lzbm=(.*?)&ql_xh='),
         ('bu_pre_sale', '<tr bgcolor="#FFFFFF">.*?<td.*?>.*?<.*?<td.*?>(.*?)<'),
         ('bu_floor', '<tr bgcolor="#FFFFFF">.*?<td.*?>.*?<.*?<td.*?>.*?<.*?<td.*?>(.*?)<'),
         ('bu_all_house', '<tr bgcolor="#FFFFFF">.*?<td.*?>.*?<.*?<td.*?>.*?<.*?<td.*?>.*?<.*?<td.*?>(.*?)<'),
     )
     for link in more_build_url:
         try:
             building = Building(co_index)
             page_address = 'http://www.jmfc.com.cn/' + link
             for field, pattern in field_patterns:
                 setattr(building, field, pattern)
             producer = ProducerListUrl(
                 page_url=page_address,
                 request_type='get',
                 encode='gbk',
                 analyzer_rules_dict=building.to_dict(),
                 current_url_rule='<tr bgcolor="#FFFFFF">.*?align="left".*?href="(.*?)"',
                 analyzer_type='regex',
                 headers=self.headers)
             self.get_house_info(producer.get_details())
         except Exception as e:
             print(e)
Esempio n. 20
0
 def get_comm_info(self, comm_url_list):
     """Process community pages and forward the results to ``get_build_info``.

     Each entry is appended to ``http://zjjg.0557fdc.com:9555/``. A ``Comm``
     loaded with regex rules is passed, via ``to_dict()``, to
     ``ProducerListUrl``; the value returned by ``get_details()`` is handed to
     ``self.get_build_info``.

     A failure on one entry is printed and does not stop the loop.

     :param comm_url_list: iterable of relative link strings.
     """
     for i in comm_url_list:
         # Bind the name before the try block so the handler can always
         # format it; otherwise an early failure leaves it unbound or
         # pointing at the previous iteration's URL.
         comm_url = i
         try:
             comm = Comm(co_index)
             comm_url = 'http://zjjg.0557fdc.com:9555/' + i
             comm.co_name = r'小区名称:.*?<td.*?>(.*?)<'
             comm.area = r'所属区域:.*?<td.*?>(.*?)<'
             comm.co_address = r'座落:.*?<td.*?>(.*?)<'
             comm.co_develops = r'开发商名称:.*?<td.*?>(.*?)<'
             comm.co_pre_sale = r'开发企业营业执照号.*?<td.*?>(.*?)<'
             comm.co_all_house = r'Label1">(.*?)<'
             comm.co_build_size = r'Label2">(.*?)<'
             comm.co_id = r'action=.*?xqbm=(.*?)"'
             p = ProducerListUrl(page_url=comm_url,
                                 request_type='get',
                                 encode='utf-8',
                                 analyzer_rules_dict=comm.to_dict(),
                                 current_url_rule=r'action="(.*?)"',
                                 analyzer_type='regex',
                                 headers=self.headers)
             build_url_list = p.get_details()
             self.get_build_info(build_url_list)
         except Exception as e:
             print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
Esempio n. 21
0
 def start_crawler(self):
     """Fetch the index page and process every community link found on it.

     The module-level ``url`` is downloaded with ``requests``, links are
     selected with the XPath ``//a[@class="a_name"]/@href``, and each one
     (except ``'#'``) is requested through ``ProducerListUrl`` with rules
     taken from a ``Comm``. The value returned by ``get_details()`` is then
     iterated in an inner loop that prepares a ``Building`` per entry.
     """
     response = requests.get(url)
     html = response.text
     tree = etree.HTML(html)
     all_url = tree.xpath('//a[@class="a_name"]/@href')
     for i in all_url:
         comm = Comm(co_index)
         # Placeholder anchors carry no target page.
         if i == '#':
             continue
         comm_url = 'http://www.lzfc.com.cn:8080' + i
         comm.co_name = "cc0.innerHTML='(.*?)'"
         comm.co_address = "cc1.innerHTML='(.*?)'"
         comm.area = "cc2.innerHTML='(.*?)'"
         comm.co_use = "cc4.innerHTML='(.*?)'"
         comm.co_develops = "cc5.innerHTML='(.*?)'"
         comm.co_open_time = "cc6.innerHTML='(.*?)'"
         comm.co_all_house = "cc9.innerHTML='(.*?)'"
         comm.co_build_size = "cc11.innerHTML='(.*?)'"
         # NOTE(review): co_name is assigned a second time with the same rule.
         comm.co_name = "cc0.innerHTML='(.*?)'"
         comm.co_id = "BaseCode=(.*?)'"
         # No ``headers`` argument is passed here.
         p = ProducerListUrl(
             page_url=comm_url,
             request_type='get',
             encode='gbk',
             analyzer_rules_dict=comm.to_dict(),
             current_url_rule="queryBuildHerf1.href='(.*?)'",
             analyzer_type='regex')
         build_url = p.get_details()
         # NOTE(review): this loop reuses the name ``i`` of the outer loop.
         for i in build_url:
             build = Building(co_index)
             build_detail_url = 'http://www.lzfc.com.cn:8080' + i
             build.bu_num = 'onclick=comInfoView.*?center">(.*?)<'
             build.co_use = 'onclick=comInfoView.*?center.*?center">(.*?)<'
             build.bu_pre_sale = 'onclick=comInfoView.*?center.*?center.*?center"><.*?>(.*?)<'
             build.bu_all_house = 'onclick=comInfoView.*?center.*?center.*?center.*?center">(.*?)<'
             build.co_name = 'fontbg_red">(.*?)<'
             build.bu_id = "onclick=comInfoView\('(.*?)'\)"
             # NOTE(review): the request below uses ``comm_url`` and
             # ``comm.to_dict()``; ``build_detail_url`` and the rules set on
             # ``build`` above are never used. This looks like a copy-paste
             # slip - confirm whether the building page and rules were meant.
             p = ProducerListUrl(
                 page_url=comm_url,
                 request_type='get',
                 encode='gbk',
                 analyzer_rules_dict=comm.to_dict(),
                 current_url_rule="queryBuildHerf1.href='(.*?)'",
                 analyzer_type='regex')
             # Rebinds the name of the list currently being iterated.
             build_url = p.get_details()