Ejemplo n.º 1
0
    def get_build_info(self, url, co_id):
        """Scrape one building page and store the building plus its houses.

        Downloads ``url``, reads the building fields from elements looked up
        by id, inserts a ``Building`` record, then scans the ``<td>`` cells
        that follow ``<tr id="row3">`` and inserts one ``House`` per entry.

        :param url: building page URL; its trailing ``?<digits>`` is used as
            the building id.
        :param co_id: community id stored on the building and on every house.
        :return: None. Ordinary errors are printed instead of raised.
        """
        try:
            building = Building(co_index)
            response = requests.get(url)
            html = response.text
            tree = etree.HTML(html)
            co_name = tree.xpath('//*[@id="PageB_Location"]/text()')[0]  # community name
            print(co_name)
            bu_name = tree.xpath('//*[@id="ItemName"]/text()')[0]  # building name
            bu_num = tree.xpath('//*[@id="PageB_HouseNo"]/text()')[0]  # building number
            bu_all_house = tree.xpath('//*[@id="lb_countbulidtaoshu"]/text()')[
                0]  # total number of units
            bu_floor = tree.xpath('//*[@id="cell3-1"]/text()')
            bu_floor = self.is_none(bu_floor)  # floors
            bu_build_size = tree.xpath('//*[@id="lb_countbulidarea"]/text()')[
                0]  # building area
            bu_live_size = tree.xpath('//*[@id="lb_buildarea"]/text()')[
                0]  # residential area
            bu_price = tree.xpath('//*[@id="lb_buildavg"]/text()')
            bu_price = self.is_none(bu_price)  # residential price
            # Raw string: '\?' and '\d' are invalid escapes in a normal literal.
            bu_id = re.search(r'\?(\d+)$', url).group(1)  # building id
            building.co_id = co_id
            building.bu_name = bu_name
            building.bu_num = bu_num
            building.bu_all_house = bu_all_house
            building.bu_floor = bu_floor
            building.bu_build_size = bu_build_size
            building.bu_live_size = bu_live_size
            building.bu_price = bu_price
            building.bu_id = bu_id
            building.insert_db()
            house_info_html = re.findall('<tr id="row3">(.*)$', html,
                                         re.S | re.M)[0]
            for cell in re.findall('(<td.*?>.*?</td>)', house_info_html,
                                   re.S | re.M):
                # Only cells containing <br> hold house entries.
                if '<br>' not in cell:
                    continue
                ho_name_list = re.findall('<td.*?>(.*?)<br>', cell,
                                          re.S | re.M)
                ho_true_size_list = re.findall('<td.*?>.*?<br>(.*?)<br>', cell,
                                               re.S | re.M)
                ho_type = re.findall('<td.*?>.*?<br>.*?<br>(.*?)<br>', cell,
                                     re.S | re.M)[0]
                # Index is kept so a missing size raises IndexError for that
                # house only, handled by the per-house try below.
                for idx, raw_name in enumerate(ho_name_list):
                    try:
                        if 'font' in raw_name:
                            ho_name = re.sub('<font.*?>', '', raw_name)
                        else:
                            ho_name = raw_name
                        house = House(8)
                        house.ho_name = ho_name
                        house.ho_true_size = ho_true_size_list[idx]
                        house.co_id = co_id
                        house.bu_id = bu_id
                        house.ho_type = ho_type
                        house.insert_db()

                    except Exception as e:
                        print(e)
        except Exception as e:
            # Exception, not BaseException: Ctrl-C and SystemExit must still
            # be able to stop the crawler.
            print(e)
Ejemplo n.º 2
0
 def bu_parse(self, bu_url, co_id, co_url):
     """Parse a building-list page and store one Building per table row.

     Fetches the page at ``bu_url`` (relative to the site root) with
     ``co_url`` as Referer, reads the pre-sale licence number and its
     validity date from the page text, then walks the rows of the
     ``donglist`` table. Each row is inserted as a ``Building`` and handed
     to ``self.house_parse``.

     The request headers are stored in the module-level ``headers`` global.
     """
     global headers
     page_url = "http://61.143.241.154/" + bu_url
     headers = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
         'Referer': co_url
     }
     page = requests.get(page_url, headers=headers).content.decode('gbk')
     flags = re.S | re.M
     licence_no = re.search('预售许可证编号.*?blank">(.*?)</a', page,
                            flags).group(1)
     licence_date = re.search('预售证有效日期.*?">(.*?)</td', page,
                              flags).group(1)
     doc = etree.HTML(page)
     for row in doc.xpath("//table[@id='donglist']//tr"):
         record = Building(co_index)
         record.co_id = co_id
         href = row.xpath("./td/a/@href")[0]
         record.bu_id = re.search('dbh=(.*?)&', href).group(1)
         record.bu_num = row.xpath("./td[3]/text()")[0]
         record.bu_floor = row.xpath("./td[4]/text()")[0]
         record.bu_pre_sale = licence_no
         record.bu_pre_sale_date = licence_date
         record.insert_db()
         self.house_parse(href, co_id, record.bu_id)
Ejemplo n.º 3
0
 def get_build_info(self, build_url_list):
     """Run the building extraction for every entry of ``build_url_list``.

     For each entry the ``xqbm`` code is taken from its end and used to build
     the ``donginfo.aspx`` URL. A ``Building`` is filled with regex rules
     (not values) and its ``to_dict()`` is passed to ``ProducerListUrl``.
     The list returned by ``get_details()`` goes to ``self.get_house_info``.

     :param build_url_list: iterable of strings containing ``xqbm=<code>``.
     :return: None. Per-entry failures are printed and the loop continues.
     """
     for i in build_url_list:
         # Start from the raw entry so the error message always refers to the
         # current item, even when the URL could not be built.
         build_url = i
         try:
             build = Building(co_index)
             build_code = re.search(r'xqbm=(.*?)$', i).group(1)
             build_url = 'http://zjjg.0557fdc.com:9555/xiaoqu/donginfo.aspx?xqbm=' + build_code
             build.bu_num = 'Labeldongmc">(.*?)<'
             build.bu_pre_sale = 'Labelyszheng">(.*?)<'
             build.bu_floor = 'Labelsceng">(.*?)<'
             build.bu_address = 'Label1zuoluo">(.*?)<'
             build.bo_build_start_time = 'Label1kaigong">(.*?)<'
             build.co_build_structural = 'Labeljiegou">(.*?)<'
             # Raw strings: '\?' and '\.' are invalid escapes otherwise.
             build.co_id = r'donginfo.aspx\?xqbm=(.*?)"'
             build.bu_id = 'id="DropDownList1".*?value="(.*?)"'
             p = ProducerListUrl(page_url=build_url,
                                 request_type='get',
                                 encode='utf-8',
                                 analyzer_rules_dict=build.to_dict(),
                                 current_url_rule=r'location\.href=(.*?)"',
                                 analyzer_type='regex',
                                 headers=self.headers)
             house_url_list = p.get_details()
             self.get_house_info(house_url_list)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
Ejemplo n.º 4
0
 def get_comm_info(self, comm_url,comm):
     """Fill and store a community record, then store each of its buildings.

     Downloads ``comm_url``, extracts the community fields by regex into
     ``comm`` and inserts it. Every ``<tr>`` of the table that follows the
     "套房信息" marker is then inserted as a ``Building`` and passed to
     ``self.get_build_info``.

     Row failures are printed and skipped; a page failure is printed and ends
     the method.
     """
     flags = re.S | re.M

     def first_group(pattern, text):
         # First capture group of the first match; raises if nothing matches.
         return re.search(pattern, text, flags).group(1)

     try:
         page = requests.get(comm_url, headers=self.headers).text
         comm.co_id = first_group('jectcode=(.*?)"', page)
         comm.co_name = first_group("项目名称:.*?<td.*?>(.*?)<", page)
         comm.co_address = first_group('项目地址:.*?<td.*?>(.*?)<', page)
         comm.co_develops = first_group('开发企业:.*?<td.*?>(.*?)<', page)
         comm.co_owner = first_group('国土证书:.*?<td.*?>(.*?)<', page)
         comm.area = first_group('行政区划:</th>.*?<td.*?>(.*?)<', page)
         comm.insert_db()
         table_html = re.search('套房信息.*?</table>', page, flags).group()
         for row in re.findall('<tr.*?>.*?</tr>', table_html, flags):
             try:
                 record = Building(co_index)
                 record.co_id = comm.co_id
                 record.bu_num = first_group('<td.*?>(.*?)</td', row)
                 record.bu_id = first_group('buildingcode=(.*?)&', row)
                 record.co_build_structural = first_group('<td.*?<td.*?<td.*?>(.*?)<', row)
                 record.bu_all_house = first_group('<td.*?<td.*?<td.*?<td.*?>(.*?)<', row)
                 record.bu_floor = first_group('<td.*?<td.*?<td.*?<td.*?<td.*?>(.*?)<', row)
                 record.insert_db()
                 link = first_group('href="(.*?)"', row)
                 self.get_build_info(link, record.bu_id, comm.co_id)
             except Exception as e:
                 print('楼栋错误,co_index={},url={}'.format(co_index, comm_url), e)
     except Exception as e:
         print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
Ejemplo n.º 5
0
 def build_parse(self, co_id):
     """Store every building of community ``co_id`` and parse its houses.

     Loads the building list page, collects the ids passed to
     ``searchByLid('…')``, and for each id downloads the building page,
     inserts a ``Building`` and calls ``self.house_parse`` with the page text.

     :param co_id: community id (string); appended to the list URL.
     :return: None. Per-building errors are logged and do not stop the loop.
     """
     list_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/loulist.jsp?Id_xmxq=' + co_id
     res = requests.get(list_url, headers=self.headers)
     con = res.content.decode()
     build_id_list = re.findall(r"searchByLid\('(\d+)'\)", con)
     for build_id in build_id_list:
         # Reset per building so a failed download can never hand the
         # previous building's page (or an unbound name) to house_parse.
         bu_con = None
         try:
             bu_url = 'http://www.ccfdw.gov.cn/ecdomain/lpcs/xmxx/lpbxx_new.jsp?lid=' + build_id
             bu_res = requests.get(bu_url, headers=self.headers)
             bu_con = bu_res.content.decode('gbk')
             bu = Building(co_index)
             bu.co_id = co_id
             bu.bu_id = build_id
             bu.bu_num = re.search('楼栋名称.*?">(.*?)</td', bu_con,
                                   re.S | re.M).group(1)
             bu.bu_all_house = re.search('总套数.*?">总(.*?)套</td', bu_con,
                                         re.S | re.M).group(1)
             bu.bu_floor = re.search('地上层数.*?">共(.*?)层</td', bu_con,
                                     re.S | re.M).group(1)
             bu.bu_build_size = re.search('总建筑面积.*?">(.*?)</td', bu_con,
                                          re.S | re.M).group(1)
             bu.bu_pre_sale = re.search(r"searchysxk\('(.*?)'\)", bu_con,
                                        re.S | re.M).group(1)
             bu.bu_type = re.search('房屋用途.*?">(.*?)</td', bu_con,
                                    re.S | re.M).group(1)
             bu.insert_db()
         except Exception as e:
             log.error('{}楼栋错误{}'.format(build_id, e))
         if bu_con is None:
             # The page itself was not obtained; there is nothing to parse.
             continue
         # Houses are still parsed when only a building field failed to match.
         self.house_parse(co_id, build_id, bu_con)
Ejemplo n.º 6
0
 def build_info(self, build_detail, co_id):
     """Store the buildings listed on ``build_detail`` and crawl their houses.

     The page is fetched through ``Proxy_contact`` and decoded as gb18030.
     Every ``<tr bgcolor='#FFFFFF'>`` row is inserted as a ``Building``. The
     ``DengJh`` and ``HouseDengjh`` values of the row link are then
     GBK-percent-encoded into the ``5.asp`` URL given to ``self.house_info``.
     The loop sleeps 3 seconds after each building.

     :param build_detail: URL of the building list page.
     :param co_id: community id stored on each building.
     :return: None. Rows that fail are logged and skipped.
     """
     proxy = Proxy_contact(app_name='wuhan',
                           method='get',
                           url=build_detail,
                           headers=self.headers)
     build_res = proxy.contact()
     html = etree.HTML(build_res.decode('gb18030'))
     info_list = html.xpath("//tr[@bgcolor='#FFFFFF']")
     for info in info_list:
         try:
             bu = Building(co_index)
             bu.co_id = co_id
             bu.bu_floor = info.xpath('./td[3]/text()')[0]
             bu.bu_all_house = info.xpath('./td[4]/text()')[0]
             bu.bu_num = info.xpath('./td//span/text()')[0]
             temp_url = info.xpath('./td/a/@href')[0]
             # Raw strings: '\d' is an invalid escape in a normal literal.
             bu.bu_id = re.search(r'HouseDengjh=(.*?\d+)', temp_url).group(1)
             bu.insert_db()
         except Exception as e:
             log.error('楼栋错误{}'.format(e))
             continue
         a = parse.quote(re.search(r'DengJh=(.*?\d+)&', temp_url).group(1),
                         encoding='gbk')
         b = parse.quote(re.search(r'HouseDengjh=(.*?\d+)',
                                   temp_url).group(1),
                         encoding='gbk')
         bu_url = 'http://scxx.fgj.wuhan.gov.cn/5.asp?DengJh=' + a + '&HouseDengjh=' + b
         self.house_info(bu.bu_id, bu_url, co_id)
         time.sleep(3)
Ejemplo n.º 7
0
 def get_build_info(self,presell_url_list,co_id):
     """Store the buildings reachable from each pre-sale page.

     For every relative URL in ``presell_url_list`` the pre-sale page is
     fetched and its building links collected. Each building page is parsed
     by regex into a ``Building`` and inserted. The ``getMoreHouseInfo('…')``
     arguments found on the page go to ``self.get_house_info``.

     :param presell_url_list: URLs relative to ``self.url``.
     :param co_id: community id stored on each building.
     :return: None. A building that fails is printed and skipped.
     """
     for presell_url in presell_url_list:
         pre_url = self.url + presell_url
         res = requests.get(pre_url,headers=self.headers)
         build_url_list = re.findall('【<a href="(.*?)" target="_self"',res.text,re.S|re.M)
         for build_url in build_url_list:
             build_info_url = self.url+build_url
             try:
                 build_res = requests.get(build_info_url,headers=self.headers)
                 con = build_res.text
                 bu = Building(co_index)
                 bu.co_id = co_id
                 # Raw strings below: '\d', '\(' and '\)' are invalid escapes
                 # in normal string literals.
                 bu.bu_id = re.search(r'ID=(\d+)',build_url).group(1)
                 bu.bu_num = re.search('栋.*?号.*?BuildingName">(.*?)</span',con,re.S|re.M).group(1)
                 bu.bu_floor = re.search(r'总 层 数.*?(\d+)</span',con,re.S|re.M).group(1)
                 bu.bu_build_size = re.search('建筑面积.*?Jzmj">(.*?)</span',con,re.S|re.M).group(1)
                 bu.bu_live_size = re.search('住宅面积.*?Zzmj">(.*?)</span',con,re.S|re.M).group(1)
                 bu.bu_not_live_size = re.search('非住宅面积.*?Fzzmj">(.*?)</span',con,re.S|re.M).group(1)
                 bu.bu_pre_sale = re.search('预售许可证.*?xkzh">(.*?)</span',con,re.S|re.M).group(1)
                 bu.bu_pre_sale_date = re.search('发证日期.*?fzrq">(.*?)</span',con,re.S|re.M).group(1)
                 bu.bu_type = re.search('项目类型.*?Type">(.*?)</span',con,re.S|re.M).group(1)
                 bu.insert_db()
             except Exception as e:
                 print("co_index={},楼栋信息错误".format(co_index), e)
                 continue
             # Triple-quoted raw string so both quote characters stay literal
             # and the pattern text is unchanged.
             house_detail_list = re.findall(r'''getMoreHouseInfo\('(.*?)'\)"''',con,re.S|re.M)
             self.get_house_info(co_id,bu.bu_id,house_detail_list)
Ejemplo n.º 8
0
    def get_build_info(self, build_id_list, co_id):
        """Query each building by id, store it, and parse its houses.

        For every id a form POST is sent through the session ``self.s``. The
        response text is parsed by regex into the building record, which is
        inserted, and the same text is passed to ``self.get_house_info``.

        :param build_id_list: iterable of building ids (sent as ``pk``).
        :param co_id: community id stored on each building.
        :return: None. A failed request is printed and that id is skipped.
        """
        # One Building instance is reused for all ids; every field read below
        # is overwritten on each iteration.
        bu = Building(co_index)
        for build_id in build_id_list:
            formdata = {}
            formdata["action"] = "qeurySingleBuilding"
            formdata['pk'] = str(build_id)
            header = {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
                'Referer':
                'http://hkrealestate.haikou.gov.cn/wp_myself/housequery/projectBuildingList.php'
            }
            try:
                build_info = self.s.post(
                    'http://hkrealestate.haikou.gov.cn/wp_myself/housequery/projectBuildHouseAction.php',
                    data=formdata,
                    headers=header)
            except Exception as e:
                print("co_idnex={},楼栋错误".format(co_index), e)
                # Without a response there is nothing to parse; falling
                # through would hit an unbound name or the previous response.
                continue

            build_con = build_info.text
            bu.bu_id = build_id
            bu.co_id = co_id
            bu.bu_num = re.search('幢名称.*?<td>(.*?)<', build_con,
                                  re.S | re.M).group(1)
            bu.bu_floor = re.search('总层数.*?<td>(.*?)<', build_con,
                                    re.S | re.M).group(1)
            bu.bu_build_size = re.search('>建筑面积.*?<td>(.*?)<', build_con,
                                         re.S | re.M).group(1)
            bu.bo_develops = re.search('房地产企业.*?">(.*?)</td', build_con,
                                       re.S | re.M).group(1)

            bu.insert_db()

            self.get_house_info(build_con, co_id, build_id)
Ejemplo n.º 9
0
 def get_build_info(self, build_url_list):
     """Run the building extraction for every entry of ``build_url_list``.

     Each entry is appended to the site's ``House/`` base URL. A ``Building``
     is filled with regex rules (not values) and its ``to_dict()`` is handed
     to ``ProducerListUrl``. The list returned by ``get_details()`` is passed
     to ``self.get_house_info``. Failures are printed per entry.
     """
     # (attribute, regex rule) pairs, applied in this order.
     field_rules = (
         ('co_name', '项目名称:.*?<td.*?>(.*?)<'),
         ('bu_num', '幢  号:.*?<td.*?>(.*?)<'),
         ('bu_address', '坐落位置:.*?<td.*?>(.*?)<'),
         ('co_build_structural', '建筑结构:.*?<td.*?>(.*?)<'),
         ('bu_floor', '总 层 数:.*?<td.*?>(.*?)<'),
         ('bu_build_size', '总 面 积:.*?<td.*?>(.*?)<'),
         ('bu_all_house', '批准销售:.*?<td.*?>(.*?)<'),
     )
     for rel_url in build_url_list:
         try:
             rules = Building(co_index)
             build_url = 'http://www.ndjsj.gov.cn/House/' + rel_url
             for attr_name, pattern in field_rules:
                 setattr(rules, attr_name, pattern)
             producer = ProducerListUrl(
                 page_url=build_url,
                 request_type='get',
                 encode='utf-8',
                 analyzer_rules_dict=rules.to_dict(),
                 current_url_rule='javascript:ShowTitle.*?href="(.*?)"',
                 analyzer_type='regex',
                 headers=self.headers)
             self.get_house_info(producer.get_details())
         except Exception as e:
             print('宁德楼栋错误,url={}'.format(build_url), e)
Ejemplo n.º 10
0
 def get_build_info(self, co_id):
     """Query the building list of project ``co_id`` and store each building.

     Sends the pre-encoded ``xmlInfo`` payload, with ``co_id`` spliced in, by
     POST. The response is split into ``<tr>`` rows and the first row is
     skipped. Each remaining row is parsed by regex into a ``Building``,
     inserted, and followed by ``self.get_house_info``.

     :param co_id: project id (string) inserted into the payload.
     :return: None. Any failure is printed together with its cause.
     """
     build_url = "http://202.103.219.149:7000/ajax/LeadingMIS.CommonModel.CommonQuery.WebUI.AjaxManage.QueryDataParser,LeadingMIS.CommonModel.CommonQuery.WebUI.ashx"
     querystring = {"_method": "GetDataToDynamicInXml", "_session": "rw"}
     payload = "xmlInfo=%263Croot%2620QueryCode%263D%2622BuildingsInfo%2622%2620PageIndex%263D%26221%2622%2620PageSize%263D%262215%2622%2620SortField%263D%2622%2620ORDER%2620BY%2620Name%2622%2620QueryString%263D%2622QueryCode%263DBuildingsInfo%2626amp%263BProjectID%263D" + co_id + "%2622%2620BeginDate%263D%2622%262000%263A00%263A00%2622%2620EndDate%263D%2622%262023%263A59%263A59%2622%2620Flag%263D%2622TitleBody%2622%2620TitlesWidthInfo%263D%2622BuildNo%267C0%2624Name%267C0%2624FloorCount%267C0%2624RoomCount%267C0%2624YCJZArea%267C0%2624Structure%267C0%2624YSXKCer%267C0%2624ZJJG%267C0%2622%2620IsUseOCache%263D%26220%2622%2620IsUserID%263D%26220%2622%2620SiteId%263D%26228907bd13-1d14-4f9e-8c01-e482d9590d10%2622%2620LockedColumn%263D%26220%2622%2620IsLocked%263D%26220%2622%2620ClientWidth%263D%26221601%2622%2620ShowModeCode%263D%2622default%2622%2620Language%263D%2622chinese%2622/%263E"
     try:
         response = requests.request("POST",
                                     build_url,
                                     data=payload,
                                     params=querystring)
         html = response.text
         build_info_list = re.findall('<tr.*?>.*?</tr>', html,
                                      re.S | re.M)[1:]
         for i in build_info_list:
             build = Building(co_index)
             build.co_id = co_id
             build.bu_num = re.search(
                 '<span class="spanctfield".*?<span class="spanctfield".*?>.*?<a.*?>(.*?)<',
                 i, re.S | re.M).group(1)
             build.bu_floor = re.search(
                 '<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?>(.*?)<',
                 i, re.S | re.M).group(1)
             build.bu_pre_sale = re.search(
                 '<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?<span class="spanctfield".*?>(.*?)<',
                 i, re.S | re.M).group(1)
             build.bu_id = re.search('id="Tr_(.*?)"', i,
                                     re.S | re.M).group(1)
             build.insert_db()
             self.get_house_info(co_id, build.bu_id)
     except Exception as e:
         # This try also covers parsing and the house crawl, so print the
         # cause instead of discarding it.
         print('请求错误,url={},data={},params={}'.format(
             build_url, payload, querystring), e)
Ejemplo n.º 11
0
    def get_build_info(self, build_lis, co_id):
        """Store each building in ``build_lis`` and crawl its houses.

        Every entry is a path relative to the site root. Its page is fetched
        and parsed by regex into a ``Building``, which is inserted. The
        value following "测量号" on the page is passed, with the ids, to
        ``self.get_house_info``.

        :param build_lis: iterable of relative URLs containing ``Bid=<digits>``.
        :param co_id: community id stored on each building.
        :return: None. A failed request is printed and skipped; a field that
            does not match raises ``AttributeError`` to the caller.
        """
        for build_ in build_lis:
            build_url = "http://xx.yyfdcw.com" + build_
            try:
                build_res = requests.get(build_url, headers=self.headers)
            except Exception as e:
                print("co_index={},楼栋信息错误".format(co_index), e)
                continue
            con = build_res.text
            bu = Building(co_index)
            bu.co_id = co_id
            # Raw string: '\d' is an invalid escape in a normal literal.
            bu.bu_id = re.search(r'Bid=(\d+)', build_).group(1)
            bu.bu_num = re.search('名称.*?">(.*?)</spa', con).group(1)
            bu.bu_pre_sale = re.search("编.*?red'>(.*?)</a", con).group(1)
            bu.bu_pre_sale_date = re.search('颁发日期.*?Date">(.*?)</span',
                                            con).group(1)
            bu.bo_build_start_time = re.search('开工日期.*?">(.*?)</span',
                                               con).group(1)
            bu.bo_build_end_time = re.search('竣工日期.*?">(.*?)</span',
                                             con).group(1)
            bu.bo_develops = re.search('单位.*?">(.*?)</span', con).group(1)
            bu.bu_floor = re.search('层数.*?">(.*?)</span', con).group(1)
            bu.bu_live_size = re.search('住宅面积.*?">(.*?)</span', con).group(1)
            bu.size = re.search('总面积.*?">(.*?)</span', con).group(1)

            bu.insert_db()

            # Named survey_no rather than `id` to avoid shadowing the builtin.
            survey_no = re.search('测量号.*?">(.*?)</span', con).group(1)
            self.get_house_info(co_id, bu.bu_id, survey_no)
Ejemplo n.º 12
0
 def get_build_info(self, bu_address_list, bu_num_list, bu_floor_list,
                    bu_url_list, co_id):
     """Store one Building per URL and crawl the houses listed on its page.

     The four lists are parallel: position ``k`` of each describes the same
     building. After a one-second pause the building page is fetched through
     the session ``self.s`` and the building is inserted. The house links
     inside the ``var _table_html_ … </script>`` section are passed to
     ``self.get_house_info``, whose errors are printed.
     """
     site_root = 'http://183.63.60.194:8808/public/web/'
     flags = re.S | re.M
     for pos, rel_url in enumerate(bu_url_list):
         record = Building(co_index)
         record.bu_address = bu_address_list[pos]
         record.bu_num = bu_num_list[pos]
         record.bu_floor = bu_floor_list[pos]
         record.co_id = co_id
         time.sleep(1)
         page_resp = self.s.get(site_root + rel_url, headers=self.headers)
         record.bu_id = re.search('ljzid=(.*?)$', rel_url).group(1)
         record.insert_db()
         page = page_resp.text
         table_js = re.search('var _table_html_.*?</script>', page,
                              flags).group()
         house_links = re.findall('房屋号:<a.*?href="(.*?)"', table_js, flags)
         try:
             self.get_house_info(house_links, record.bu_id)
         except Exception as e:
             print(
                 '房号错误,co_index={},url={}'.format(co_index,
                                                  site_root + rel_url), e)
Ejemplo n.º 13
0
 def get_build_info(self, build_url_list, comm):
     """Update ``comm`` from each pre-sale page and store its buildings.

     For every relative URL the page is fetched. The pre-sale licence number
     and issue date are written to ``comm``, which is inserted. Each
     ``<tr bgcolor="#FFFFFF">`` row then becomes a ``Building`` and its link
     is passed to ``self.get_house_info``. Errors are printed per URL.
     """
     flags = re.S | re.M
     for rel_url in build_url_list:
         try:
             build_url = 'http://58.51.240.121:8503/' + rel_url
             page = requests.get(build_url, headers=self.headers).text
             comm.co_pre_sale = re.search(
                 'id="PresellInfo1_lblXkzh">(.*?)<', page, flags).group(1)
             comm.co_pre_sale_date = re.search(
                 'id="PresellInfo1_lblFzrq">(.*?)<', page, flags).group(1)
             comm.insert_db()
             rows = re.findall('<tr bgcolor="#FFFFFF">.*?</tr>', page, flags)
             for row in rows:
                 record = Building(co_index)
                 record.co_id = comm.co_id
                 record.bu_num = re.search('<td.*?>(.*?)<', row,
                                           flags).group(1)
                 record.bu_floor = re.search('<td.*?<td.*?>(.*?)<', row,
                                             flags).group(1)
                 record.bu_all_house = re.search('<td.*?<td.*?<td.*?>(.*?)<',
                                                 row, flags).group(1)
                 # The building id is the PresellId at the end of the page URL.
                 record.bu_id = re.search('PresellId=(.*?)$',
                                          build_url).group(1)
                 record.insert_db()
                 link = re.search('a href="(.*?)"', row, flags).group(1)
                 self.get_house_info(link, comm.co_id, record.bu_id)
         except Exception as e:
             print('请求错误,co_index={},url={}'.format(co_index, build_url), e)
Ejemplo n.º 14
0
    def get_build_info(self, co_id, co_name):
        """Walk pre-sale pages of a project, storing buildings and houses.

        Three levels are fetched: the project's pre-sale list, each pre-sale
        page, and each building page. For every building row the module-level
        counter ``building_id`` is incremented and used as ``bu_id``. The
        building is inserted and each house link on its page is passed to
        ``self.get_house_info``.

        :param co_id: project id (string) used in the list URL.
        :param co_name: project name (string) used in the list URL.
        :return: None. Non-200 responses are printed and skipped; other
            per-building and per-house errors are printed.
        """
        global building_id
        list_url = 'http://www.czhome.com.cn/Presell.asp?projectID=' + co_id + '&projectname=' + co_name
        list_res = requests.get(list_url, headers=self.headers)
        list_tree = etree.HTML(list_res.content.decode('gbk'))
        presell_rows = list_tree.xpath('//tr[@class="indextabletxt"]')
        for presell_row in presell_rows[1:]:
            presell_href = presell_row.xpath('td[2]/a/@href')[0]
            presell_url = 'http://www.czhome.com.cn/' + presell_href
            presell_res = requests.get(presell_url, headers=self.headers)
            # Compare by value: `is not 200` tests object identity and only
            # works by accident of small-int caching.
            if presell_res.status_code != 200:
                print("co_index={},预售url:{}连接失败".format(co_index, presell_url))
                continue
            presell_tree = etree.HTML(presell_res.content.decode('gbk'))
            # The first row is skipped.
            building_rows = presell_tree.xpath(
                '/html/body/table/tr/td/table/tr/td/table/tr')[1:]
            for building_row in building_rows:
                try:
                    building = Building(7)
                    building_id += 1
                    building.bu_id = building_id
                    bu_all_house = building_row.xpath('td[7]/text()')[0]  # total units
                    bu_href = building_row.xpath('td[1]/a/@href')[0]
                    bu_url = 'http://www.czhome.com.cn/' + bu_href
                    bu_res = requests.get(bu_url, headers=self.headers)
                    if bu_res.status_code != 200:
                        print("co_index={},楼栋url:{}连接失败".format(co_index, bu_url))
                        continue
                    bu_html = bu_res.content.decode('gbk')
                    bu_tree = etree.HTML(bu_html)
                    # Floors: the last <u> text in the first column.
                    bu_floor = bu_tree.xpath(
                        '//*[@id="Table4"]/tr[2]/td/table[3]/tr/td[1]/u/text()'
                    )[-1]
                    house_url_list = bu_tree.xpath(
                        '//*[@id="Table4"]/tr[2]/td/table[3]/tr/td/a/@href')
                    bu_address = re.search(
                        '<center><font color=.*?&nbsp;&nbsp;(.*?)<', bu_html,
                        re.S | re.M).group(1)
                    building.bu_all_house = bu_all_house
                    building.bu_address = bu_address
                    building.bu_floor = bu_floor
                    building.co_id = co_id
                    building.insert_db()
                    for house_href in house_url_list:
                        try:
                            house = House(7)
                            house_url = 'http://www.czhome.com.cn/' + house_href
                            self.get_house_info(house_url, house, co_id,
                                                building_id, building)
                        except Exception as e:
                            print(e)

                except Exception as e:
                    print(e)
Ejemplo n.º 15
0
    def comm_info(self, con):
        """Store the community found in ``con`` and each of its buildings.

        ``con`` is a parsed document exposing ``xpath``. The community fields
        are read from ``<span>`` elements by id and inserted. Every
        ``<tr style='color:#000066;'>`` row is then inserted as a building.

        :return: list with the first link (``./td/a/@href``) of every row.
        """

        def first(expr):
            # First node matched by the XPath expression; IndexError if none.
            return con.xpath(expr)[0]

        # Community record.
        comm = Comm(co_index)

        comm.co_name = first(
            "//span[@id='ctl00_ContentPlaceHolder2_web_item_retail1_lb_item_name']/text()"
        )  # community name
        action = first("//form[@id='aspnetForm']/@action")
        comm.co_id = re.search(r"\d+", action).group(0)  # community id
        comm.co_address = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_item_seat']/text()"
        )  # address
        comm.co_develops = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_enter_name']/text()"
        )  # developer
        comm.co_size = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_area']/text()"
        )  # total area
        comm.co_build_size = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_item_area']/text()"
        )  # building area
        comm.co_build_end_time = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_item_ew_date']/text()"
        )  # completion date
        comm.co_plan_pro = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_program_pcode']/text()"
        )  # land-use planning permit
        comm.co_work_pro = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_jg']/text()"
        )  # construction permit
        comm.co_green = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_item_green_rate']/text()"
        )  # green-space percentage
        comm.co_land_use = first(
            "//span[@id='ctl00_ContentPlaceHolder2_lb_td']/text()"
        )  # land-use certificate

        comm.insert_db()

        # A single Building instance is refilled and inserted for every row.
        build = Building(co_index)
        rows = con.xpath("//tr[@style='color:#000066;']")
        # Attribute names in the order of the row's <td> texts.
        column_attrs = ('bu_id', 'bu_num', 'bu_all_house', 'size', 'bu_floor',
                        'bu_pre_sale')
        room_urls = []
        for row in rows:
            build.co_id = comm.co_id
            build.co_name = comm.co_name
            cells = row.xpath("./td/text()")
            for col, attr_name in enumerate(column_attrs):
                setattr(build, attr_name, cells[col])

            build.insert_db()

            room_urls.append(row.xpath("./td/a/@href")[0])

        return room_urls
Ejemplo n.º 16
0
 def build_info(self, bu_list, co_id):
     """Store one Building per link element and crawl its houses.

     Each element of ``bu_list`` exposes ``xpath``. Its ``href`` carries the
     building number (``zh=``) and id (``n=``), its second ``<p>`` holds the
     floor count and its third ``<p>`` the pre-sale value. The building is
     inserted and ``self.ho_info`` is called with the absolute house URL.

     :param bu_list: iterable of parsed link elements.
     :param co_id: community id stored on each building.
     :return: None. A missing node or field raises to the caller.
     """
     for bo in bu_list:
         ho_url = bo.xpath("./@href")[0]
         floor = bo.xpath(".//p[2]/text()")[0]
         bu = Building(co_index)
         bu.bu_pre_sale = bo.xpath(".//p[3]/text()")[0]
         # 'zh=(.*?)' ended in a lazy group and so always captured ''.
         # Capture the value up to the next '&' (or end of string) instead.
         bu.bu_num = re.search(r'zh=([^&]*)', ho_url).group(1)
         bu.bu_id = re.search(r'n=(\d+)', ho_url).group(1)
         bu.co_id = co_id
         bu.bu_floor = re.search(r'总层数.*?(\d+)', floor).group(1)
         bu.insert_db()
         house_url = "http://www.ggsfcw.com/" + ho_url
         self.ho_info(house_url, co_id, bu.bu_id)
Ejemplo n.º 17
0
 def get_comm_detail(self, detail_url, area):
     """Store the community at ``detail_url`` and every building it lists.

     The detail page is fetched, the community fields are extracted by regex
     from elements identified by id, and the community is inserted with
     ``area``. Each ``<tr>`` of the ``donglist`` table is then inserted as a
     ``Building`` and its link is passed to ``self.get_house_info``.

     Any failure is printed and ends the method.
     """
     flags = re.S | re.M

     def grab(pattern, text):
         # First capture group of the first match; raises if nothing matches.
         return re.search(pattern, text, flags).group(1)

     try:
         comm = Comm(co_index)
         comm_detail_url = 'http://www.yfci.gov.cn:8080/HousePresell/' + detail_url
         page = requests.get(comm_detail_url, headers=self.headers).text
         comm.co_develops = grab('id="kfsmc".*?<a.*?>(.*?)<', page)
         comm.co_name = grab('id="PresellName".*?<a.*?>(.*?)<', page)
         comm.co_address = grab('id="HouseRepose".*?>(.*?)<', page)
         comm.co_build_size = grab('id="PresellArea".*?>(.*?)<', page)
         comm.co_all_house = grab('id="djrqtd".*?>(.*?)<', page)
         comm.co_land_use = grab('id="landinfo".*?>(.*?)<', page)
         comm.co_type = grab('id="zczjtd".*?>(.*?)<', page)
         comm.co_pre_sale = grab('id="bookid".*?<a.*?>(.*?)<', page)
         comm.co_pre_sale_date = grab('id="FZDatebegin".*?>(.*?)<', page)
         comm.co_open_time = grab('id="kpdate".*?>(.*?)<', page)
         # The community id comes from the relative URL, not from the page.
         comm.co_id = grab('FD=(.*?)&', detail_url)
         comm.area = area
         comm.insert_db()
         table_html = re.search('id="donglist".*?</table>', page,
                                flags).group()
         for row in re.findall('<tr.*?</tr>', table_html, flags):
             record = Building(co_index)
             record.co_id = comm.co_id
             record.bu_address = grab('<td.*?<td.*?>(.*?)<', row)
             record.bu_num = grab('<td.*?<td.*?<td.*?>(.*?)<', row)
             record.bu_floor = grab('<td.*?<td.*?<td.*?<td.*?>(.*?)<', row)
             link = grab('href="(.*?)"', row)
             record.bu_id = grab("LID=(.*?)$", link)
             record.insert_db()
             self.get_house_info(link, comm.co_id, record.bu_id)
     except Exception as e:
         print('小区错误,co_index={},url={}'.format(co_index, comm_detail_url),
               e)
Ejemplo n.º 18
0
 def get_build_detail(self, all_building_url_list):
     """Store the buildings of each project page and collect house URLs.

     For every URL the page is fetched. The page-level values (developer,
     sale area, pre-sale certificate, floors, total units, usage, project
     name) are read once. Each entry of the ``houseTable_1`` section is then
     inserted as a ``Building`` carrying those shared values plus its own id
     and address.

     :param all_building_url_list: iterable of absolute project page URLs.
     :return: list of absolute ``roomTable.aspx`` URLs; the URLs of later
         pages are placed before those of earlier pages.
     """
     house_url_list = []
     for i in all_building_url_list:
         try:
             response = requests.get(i, headers=self.headers)
             html = response.text
             tree = etree.HTML(html)
             bo_develops = tree.xpath('//*[@id="content_1"]/div[3]/text()[2]')[0]  # developer
             bu_build_size = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[6]/a/text()')  # sale area
             if bu_build_size:
                 bu_build_size = bu_build_size[0]
             bu_pre_sale = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[1]/a/text()')  # pre-sale certificate
             if bu_pre_sale:
                 bu_pre_sale = bu_pre_sale[0]
             bu_floor = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[3]/a/text()')[0]  # total floors
             bu_all_house = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[4]/a/text()')[0]  # total units
             bu_type = tree.xpath('//*[@id="houseTable_1"]/tr[2]/td[5]/a/text()')[0]  # usage
             build_html = re.search('houseTable_1.*?当前共有', html, re.S | re.M).group()
             build_detail_html = re.findall('class.*?</a></td>.*?</a></td>.*?</a></td>', build_html, re.S | re.M)
             bu_num = re.findall('项目名称:</b>(.*?)</div>', html, re.S | re.M)[0].strip()
             url_list = []
             for bu in build_detail_html:
                 try:
                     build = Building(co_index)
                     # Raw strings: '\?' is an invalid escape otherwise.
                     build.bu_id = re.search(r"href='roomTable.aspx\?id=(.*?)&", bu, re.S | re.M).group(1)
                     build.bu_address = re.search("_blank.*?_blank'>(.*?)</a></td><td>", bu, re.S | re.M).group(
                         1).strip()
                     build.bo_develops = bo_develops
                     build.bu_build_size = bu_build_size
                     build.bu_pre_sale = bu_pre_sale
                     build.bu_num = bu_num
                     build.bu_floor = bu_floor
                     build.bu_all_house = bu_all_house
                     build.bu_type = bu_type
                     # NOTE(review): every matching area is visited, so the
                     # last match wins; a `break` may have been intended.
                     for k in self.area_list:
                         if k in build.bu_address:
                             build.area = k
                     build.insert_db()
                     house_url = re.search(r"(roomTable.aspx\?id=.*?&vc=.*?)'", bu, re.S | re.M).group(1)
                     url_list.append('http://dgfc.dg.gov.cn/dgwebsite_v2/Vendition/' + house_url)
                 except Exception as e:
                     print('楼栋错误,co_index={},url={}'.format(co_index, i), e)
             # Prepend, so URLs from the most recent page come first.
             house_url_list = url_list + house_url_list
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, i), e)
     return house_url_list
Ejemplo n.º 19
0
 def get_build_info(self, build_url_list, co_name):
     """Store one Building per detail page linked from each list page.

     Each entry of ``build_url_list`` is appended to
     ``http://www.sxczfdc.com/pubinfo/`` and fetched.  Every
     ``Pub_dtxx.aspx?ProjectBuildingID=...`` link found on that page is
     fetched in turn, its ``BuildingInfo1_*`` fields are copied onto a new
     ``Building``, the record is saved with ``insert_db()`` and the
     ``getMoreHouseInfo`` arguments found on the page are passed to
     ``self.get_house_info`` together with ``co_name`` and the building
     number.

     Errors are printed and the failing page is skipped.
     """
     base_url = 'http://www.sxczfdc.com/pubinfo/'
     # (Building attribute, suffix of the BuildingInfo1_ element id)
     detail_fields = (
         ('bu_num', 'lblBuildingName'),
         ('bu_all_house', 'lblZts'),
         ('bu_floor', 'lblZcs'),
         ('bu_build_size', 'lblJzmj'),
         ('bu_live_size', 'lblZzmj'),
         ('bu_pre_sale', 'lblYsxkzh'),
         ('bu_pre_sale_date', 'lblYsxkzfzrq'),
     )
     for i in build_url_list:
         try:
             response = requests.get(base_url + i, headers=self.headers)
             html = response.text
             # Raw string: "\?" is an invalid escape in a plain literal.
             detail_links = re.findall(
                 r'(Pub_dtxx.aspx\?ProjectBuildingID=.*?)"', html,
                 re.S | re.M)
             for k in detail_links:
                 try:
                     result = requests.get(base_url + k,
                                           headers=self.headers)
                     content = result.text
                     # A fresh record per detail page, so nothing carries
                     # over from the previous building.
                     build = Building(co_index)
                     build.co_name = co_name
                     for attr, label in detail_fields:
                         value = re.findall(
                             'BuildingInfo1_' + label + '">(.*?)<',
                             content, re.S | re.M)[0]
                         setattr(build, attr, value)
                     build.insert_db()
                     house_url_list = re.findall(
                         r"onClick=.getMoreHouseInfo\('(.*?)'\)", content,
                         re.S | re.M)
                     self.get_house_info(house_url_list, co_name,
                                         build.bu_num)
                 except Exception as e:
                     print(e)
         except Exception as e:
             print(e)
Ejemplo n.º 20
0
 def build_parse(self, co_id):
     """Parse the building list of one community and store every building.

     Posts ``co_id`` as ``pid`` (page 1, page size 50) to the
     ``ProNBList.do`` endpoint, then reads one building from each
     ``<tr objid=...>`` row of the response.  Each building is saved with
     ``insert_db()`` and its id is passed to ``self.house_parse``.

     Nothing is caught here: a failed request or a row missing an expected
     field raises to the caller.
     """
     build_info_url = "http://ys.tyfdc.gov.cn/Firsthand/tyfc/publish/ProNBList.do"
     formdata = {"pid": co_id, "pageNo": "1", "pageSize": "50"}
     res = requests.post(build_info_url,
                         data=formdata,
                         headers=self.headers)
     con = res.text
     info = re.findall('<tr objid.*?</tr>', con, re.S | re.M)
     for i in info:
         # One record per row, so no field can leak from the previous row.
         bu = Building(co_index)
         bu.co_id = co_id
         # Raw strings: "\d" and "\(" are invalid escapes in plain literals.
         bu.bu_id = re.search(r'objid="(\d+)"', i).group(1)
         bu.bu_num = re.findall('<span>(.*?)<', i)[1]
         bu.bu_floor = re.search(r'<td>(\d+)\(', i).group(1)
         bu.bu_address = re.findall('<td>(.*?)</td>', i)[-1]
         bu.insert_db()
         self.house_parse(bu.bu_id, co_id)
Ejemplo n.º 21
0
 def get_build_info(self, build_url_list):
     """Fetch each building page, store its Building and scrape its houses.

     Args:
         build_url_list: iterable of indexable items; element ``[0]`` of
             each is the building id appended to the ``selectBuild.jsp``
             URL and stored as ``bu_id``.

     Fields are read from the page by their Chinese labels (structure
     type, year built, total floor area, building number, land area,
     floors, unit count, district).  Errors are printed together with the
     URL and the item is skipped.
     """
     for i in build_url_list:
         # Bound before the try so the handler never hits an unbound name
         # or reports the previous iteration's URL.
         build_url = None
         try:
             build = Building(co_index)
             build_url = 'http://222.223.160.199:8088/website/buildquery/selectBuild.jsp?buildID=' + i[0]
             response = requests.get(build_url, headers=self.headers)
             html = response.text
             build.bu_id = i[0]
             build.co_build_structural = re.search('结构类型.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             build.bo_build_end_time = re.search('建成年份.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             build.bu_build_size = re.search('总建筑面积.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             build.bu_num = re.search('幢号.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             build.size = re.search('占地面积.*?<td>(.*?)<', html, re.S | re.M).group(1)
             build.bu_floor = re.search('房屋层数.*?<td>(.*?)<', html, re.S | re.M).group(1)
             build.bu_all_house = re.search('房屋套数.*?<td>(.*?)<', html, re.S | re.M).group(1)
             build.area = re.search('坐落区.*?<td>(.*?)<', html, re.S | re.M).group(1)
             build.insert_db()
             self.get_house_info(build.bu_id)
         except Exception as e:
             print('请求错误,url={}'.format(build_url),e)
Ejemplo n.º 22
0
 def get_build_info(self, co_id):
     """Store every building the ``jdi`` endpoint returns for ``co_id``.

     Posts ``activityId=<co_id>&module=jtsActBuildingInfo`` and iterates
     ``ROWS -> ROW`` of the JSON reply.  Each row becomes a ``Building``
     whose fields are read through ``self.dict_get``; the record is saved
     with ``insert_db()`` and then ``self.get_house_info`` is called with
     the community id and the building id.

     Nothing is caught here; request, JSON and key errors propagate.
     """
     form_body = "".join(
         ("activityId=", str(co_id), "&module=jtsActBuildingInfo"))
     reply = requests.post(url='http://www.yanjifc.com/jdi',
                           data=form_body,
                           headers=self.headers)
     rows = reply.json()['ROWS']['ROW']

     # (Building attribute, key in the JSON row); LOCATION feeds both the
     # address and the building number.
     column_for = (
         ('bu_all_size', 'BUILDING_AREA'),
         ('bu_address', 'LOCATION'),
         ('bu_num', 'LOCATION'),
         ('bu_floor', 'TOTAL_FLOORS'),
         ('bu_all_house', 'TOTAL_SET'),
         ('co_build_structural', 'STRUCTURE'),
         ('bu_id', 'RESOURCE_GUID'),
     )
     for row in rows:
         record = Building(co_index)
         for attr, column in column_for:
             setattr(record, attr, self.dict_get(row, column))
         record.co_id = co_id
         record.insert_db()
         self.get_house_info(co_id, record.bu_id)
Ejemplo n.º 23
0
    def bu_parse(self, co_id, bulist):
        """Fetch each building page, store its Building and hand off its houses.

        Args:
            co_id: community id copied onto every ``Building`` and passed
                on to ``self.ho_parse``.
            bulist: iterable of URL suffixes appended to
                ``http://110.89.45.7:8082``; the building id is taken from
                the ``buildingInfoID=...&`` part of each suffix.

        Nothing is caught here: a failed request or a missing field raises.
        """
        host = "http://110.89.45.7:8082"

        def first_group(pattern, text):
            # First capture group of `pattern` in `text` (DOTALL | MULTILINE).
            return re.search(pattern, text, re.S | re.M).group(1)

        for path in bulist:
            page = requests.get(host + path, headers=self.headers).text

            record = Building(co_index)
            record.co_id = co_id
            record.bu_id = re.search('buildingInfoID=(.*?)&', path).group(1)
            record.bu_num = first_group('幢号.*?">(.*?)</', page)
            record.bu_floor = first_group('总 层 数.*?">(.*?)</', page)
            record.bu_live_size = first_group(
                '批准销售.*?">.*?</td.*?">(.*?)</td', page)
            record.bu_all_size = first_group('总面积.*?">(.*?)</', page)
            record.bu_type = first_group('设计用途.*?">(.*?)</', page)
            record.insert_db()

            # House links are the anchors inside styled table cells.
            house_links = etree.HTML(page).xpath("//td[@style]/a")
            self.ho_parse(co_id, record.bu_id, house_links)
Ejemplo n.º 24
0
    def build_info(self, bu_list, co_id):
        """Store the buildings listed on each pre-sale page and scrape them.

        Args:
            bu_list: parsed table rows; the link in the fourth cell of each
                row is joined to ``self.start_url`` and fetched (decoded as
                GBK).
            co_id: community id copied onto every ``Building`` and passed
                on to ``self.house_info``.

        The pre-sale permit number and its validity date are read once per
        page and shared by every row of that page's ``donglist`` table.
        Nothing is caught here; any missing link or field raises.
        """
        flags = re.S | re.M
        for row in bu_list:
            href = row.xpath("./td[4]/a/@href")[0]
            page_url = self.start_url + '/' + href
            reply = requests.get(page_url, headers=self.headers)
            reply.encoding = 'gbk'
            page = reply.text

            permit_no = re.search('预售许可证编号.*?blank">(.*?)</a', page,
                                  flags).group(1)
            permit_date = re.search('预售证有效日期.*?">(.*?)</td', page,
                                    flags).group(1)

            for dong in etree.HTML(page).xpath("//table[@id='donglist']/tr"):
                dong_url = dong.xpath("./td/a/@href")[0]
                building = Building(co_index)
                building.co_id = co_id
                building.bu_id = re.search('ID={(.*?)}', dong_url).group(1)
                building.bu_num = dong.xpath("./td[3]/text()")[0]
                building.bu_floor = dong.xpath("./td[4]/text()")[0]
                building.bu_pre_sale = permit_no
                building.bu_pre_sale_date = permit_date
                building.insert_db()
                self.house_info(co_id, building.bu_id, dong_url)
Ejemplo n.º 25
0
 def get_build_info(self, build_url_list, co_id):
     """Fetch each building page, store its Building and scrape its houses.

     Args:
         build_url_list: iterable of URL suffixes appended to
             ``http://www.fjlyfdc.com.cn/``; the building id is taken from
             the ``buildingInfoID=...&`` part of the resulting URL.
         co_id: community id stored on each ``Building`` and passed on to
             ``self.get_house_info``.

     Labelled fields are read from the page, the record is saved with
     ``insert_db()`` and the ``/House/HouseInfo?HouseCenterID=`` links on
     the page are handed to ``self.get_house_info``.  Errors are printed
     with the URL and the item is skipped.
     """
     # (Building attribute, label that precedes the value cell)
     labelled_fields = (
         ('bo_develops', '开发商:'),
         ('co_name', '项目名称:'),
         ('bu_address', '坐落位置:'),
         ('bu_num', '幢号:'),
         ('co_build_structural', '建筑结构:'),
         ('bu_type', '设计用途:'),
         ('bu_floor', '总 层 数:'),
         ('co_all_size', '总面积:'),
         ('bo_build_start_time', '开工日期:'),
     )
     for i in build_url_list:
         build_url = 'http://www.fjlyfdc.com.cn/' + i
         try:
             build = Building(co_index)
             response = requests.get(build_url, headers=self.headers)
             html = response.text
             build.bu_id = re.search('buildingInfoID=(.*?)&',
                                     build_url).group(1)
             build.co_id = co_id
             for attr, label in labelled_fields:
                 value = re.search(label + '.*?<td.*?>(.*?)<', html,
                                   re.S | re.M).group(1)
                 setattr(build, attr, value)
             build.insert_db()
             # Raw string: "\?" is an invalid escape in a plain literal.
             house_url_list = re.findall(
                 r'href="(/House/HouseInfo\?HouseCenterID=.*?)"', html,
                 re.S | re.M)
             self.get_house_info(house_url_list, build.bu_id, co_id)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, build_url), e)
Ejemplo n.º 26
0
    def get_build_info(self, build_url_list):
        """Fetch each building page, store its Building and scrape its houses.

        Each entry of ``build_url_list`` is appended to
        ``http://www.fjnpfdc.com/House/`` and the response is decoded as
        GBK.  Labelled fields are read from the page; the community and
        building ids come from the ``ProjectId=`` and ``BuildingId=``
        parts of the entry itself.  The record is saved with
        ``insert_db()`` and the ``HouseInfo...`` links on the page go to
        ``self.get_house_info``.  Errors are printed and the entry is
        skipped.
        """
        def labelled(label, page):
            # Text of the first <td> that follows `label` in `page`.
            return re.search(label + ".*?<td.*?>(.*?)<", page,
                             re.S | re.M).group(1)

        for suffix in build_url_list:
            try:
                record = Building(co_index)
                page_url = 'http://www.fjnpfdc.com/House/' + suffix
                page = requests.get(page_url,
                                    headers=self.headers).content.decode('gbk')

                record.co_name = labelled("项目名称:", page)
                record.bu_num = labelled("幢  号:", page)
                record.co_use = labelled("设计用途:", page)
                record.co_build_structural = labelled("建筑结构:", page)
                record.bu_floor = labelled("总 层 数:", page)
                record.bu_build_size = labelled("总 面 积:", page)
                record.co_build_end_time = labelled("竣工日期:", page)

                house_links = re.findall('<a href="(HouseInfo.*?)"', page)

                record.co_id = re.search('ProjectId=(.*?)&', suffix).group(1)
                record.bu_id = re.search('BuildingId=(.*?)&P', suffix).group(1)
                record.insert_db()
                self.get_house_info(house_links, record.bu_id, record.co_id)
            except Exception as e:
                print("co_index={},楼栋{}错误".format(co_index, suffix), e)
Ejemplo n.º 27
0
    def get_build_info(self, bu_pre_sale, bo_develops, bu_co_name, bu_con):
        """Build one Building from a page fragment and store it.

        Args:
            bu_pre_sale: stored as ``bu_pre_sale``.
            bo_develops: stored as ``bo_develops``.
            bu_co_name: stored as ``co_name``.
            bu_con: HTML text the labelled fields are extracted from.

        Raises:
            AttributeError: when a label is not found in ``bu_con``
                (``re.search`` returns ``None``).
        """
        flags = re.S | re.M

        def field(pattern):
            # First capture group of `pattern` in the fragment.
            return re.search(pattern, bu_con, flags).group(1)

        build = Building(co_index)

        # Raw strings: "\d" is an invalid escape in a plain literal.
        # NOTE(review): the "." in "\d+.\d+" is unescaped and matches any
        # character; kept as-is so values that match today still match.
        build.bu_id = field(r'编号.*?>(\d+)<')
        build.bu_num = field(r'幢号.*?>(\d+)<')
        build.bu_floor = field(r'总层数.*?>(\d+)<')
        build.bu_build_size = field(r'预售建筑面积.*?>(\d+.\d+)<')
        build.bu_address = field('楼房坐落.*?;">(.*?)</span')
        build.bu_live_size = field(r'住宅建筑面积.*?>(\d+.\d+)<')
        build.bu_not_live_size = field('非住宅建筑面积.*?;">(.*?)</span')
        build.bo_build_start_time = field('开工日期.*?;">(.*?)</span')
        build.bu_all_house = field(r'总套数.*?>(\d+)<')
        build.bu_pre_sale = bu_pre_sale
        build.bo_develops = bo_develops
        build.co_name = bu_co_name
        build.insert_db()
Ejemplo n.º 28
0
 def get_build_info(self, more_build_url):
     """Run a ``ProducerListUrl`` over each building page.

     For every entry of ``more_build_url`` (appended to
     ``http://www.jmfc.com.cn/``) a ``Building`` is filled with regex
     rules rather than values; its ``to_dict()`` is passed to
     ``ProducerListUrl`` as ``analyzer_rules_dict``.  Whatever
     ``get_details()`` returns is forwarded to ``self.get_house_info``.
     Errors are printed and the entry is skipped.
     """
     row_open = '<tr bgcolor="#FFFFFF">.*?<td.*?>'
     next_cell = '.*?<.*?<td.*?>'

     def nth_cell(n):
         # Rule capturing the text of cell `n` (0-based) of a white row.
         return row_open + next_cell * n + '(.*?)<'

     for suffix in more_build_url:
         try:
             rules = Building(co_index)
             page_url = 'http://www.jmfc.com.cn/' + suffix
             rules.bu_num = nth_cell(0)
             rules.co_id = '楼盘首页.*?aid-(.*?)/'
             rules.bu_id = '&addno=12&action=loupantable&lzbm=(.*?)&ql_xh='
             rules.bu_pre_sale = nth_cell(1)
             rules.bu_floor = nth_cell(2)
             rules.bu_all_house = nth_cell(3)
             producer = ProducerListUrl(
                 page_url=page_url,
                 request_type='get',
                 encode='gbk',
                 analyzer_rules_dict=rules.to_dict(),
                 current_url_rule=row_open[:-len('<td.*?>')] +
                 'align="left".*?href="(.*?)"',
                 analyzer_type='regex',
                 headers=self.headers)
             self.get_house_info(producer.get_details())
         except Exception as e:
             print(e)
Ejemplo n.º 29
0
    def bu_info(self, bu_list, co_id):
        """Fetch each building page, store its Building and scrape its houses.

        Args:
            bu_list: iterable of URL suffixes appended to
                ``http://www.fxfdcw.com/``; the building id is the digits
                after ``bdid=`` in each suffix.
            co_id: community id stored on each ``Building`` and passed on
                to ``self.ho_info``.

        Pages are decoded as GBK.  When fetching or parsing fails the
        error is printed and that building is skipped; ``self.ho_info``
        runs outside the handler, so its own errors propagate.
        """
        flags = re.S | re.M
        for bu in bu_list:
            try:
                bu_url = 'http://www.fxfdcw.com/' + bu
                res = requests.get(bu_url, headers=self.headers)
                con = res.content.decode('gbk')
                html = etree.HTML(con)
                build = Building(co_index)
                build.co_id = co_id
                # Raw string: "\d" is an invalid escape in a plain literal.
                build.bu_id = re.search(r'bdid=(\d+)', bu).group(1)
                build.bu_num = re.search('楼号.*?">(.*?)</', con, flags).group(1)
                build.bu_address = re.search('坐落.*?">(.*?)</', con, flags).group(1)
                build.bu_floor = re.search('地上层数.*?">(.*?)</', con, flags).group(1)
                build.bu_build_size = re.search('建筑面积.*?wrap">(.*?)</', con, flags).group(1)
                build.bu_all_house = re.search('套 数.*?">(.*?)</', con, flags).group(1)
                build.bu_type = re.search('用  途.*?wrap">(.*?)</', con, flags).group(1)
                build.insert_db()

                ho_list = html.xpath("//span[@title]")
            except Exception as e:
                print("楼栋信息错误{}".format(e))
            else:
                # Only reached when the building was parsed and stored.
                self.ho_info(ho_list, co_id, build.bu_id)
Ejemplo n.º 30
0
    def get_comm_info(self, comm_info):
        """Store one community and all of its buildings, then scrape houses.

        Args:
            comm_info: HTML fragment of one community row; name, address,
                area and the detail link are read from it.

        The detail page (decoded as GBK) supplies developer, unit count
        and total size when present, and the community is saved with
        ``insert_db()``.  Each building row that contains "进入" is then
        followed:

        * if the building URL carries ``ID=<digits>`` a single
          ``Building`` is stored and ``self.get_house_info`` is called
          with the building URL;
        * otherwise the page is treated as a pre-sale page: its shared
          fields are read once and every ``<tr onmouseover`` row is stored
          as a building, its house page fetched and passed to
          ``self.get_house_info``.

        If the detail page cannot be requested the error is printed and
        the method returns without storing anything.
        """
        base_url = "http://www.qyfgj.cn/newys/"

        def session_headers(referer):
            # Same browser/session headers everywhere; only Referer varies.
            return {
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                'Cookie': 'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
                'Referer': referer
            }

        co = Comm(co_index)
        co.co_name = re.search('_blank">(.*?)</a', comm_info).group(1)
        try:
            co.co_address = re.findall('px">(.*?)</td', comm_info)[1]
        except IndexError:
            co.co_address = None
        co.area = re.search('center">(.*?)</td>', comm_info).group(1)
        co_detail_url = re.search("href='(.*?)'", comm_info).group(1)
        co_url = base_url + co_detail_url
        try:
            res = requests.get(co_url, headers=self.headers)
        except Exception as e:
            print("co_index={}小区未请求到".format(co_index), e)
            # Without a response there is nothing to parse; falling
            # through would only fail on the unbound `res`.
            return
        con = res.content.decode('gbk')
        try:
            co.co_develops = re.search('开发商名称.*?px;">(.*?)</a', con,
                                       re.S | re.M).group(1)
            # Raw strings: "\d" is an invalid escape in a plain literal.
            co.co_all_house = re.search(r'总套数.*?">(\d+)&nbsp', con,
                                        re.S | re.M).group(1)
            co.co_all_size = re.search(r'总面积.*?">(\d+.\d+)&nbsp;m', con,
                                       re.S | re.M).group(1)
        except AttributeError:
            # re.search returned None for one of the labels.
            print("小区无开发商等信息")
        co.insert_db()

        # findall never raises for a str page; an empty result is the
        # "no buildings" case.
        build = re.findall('<tr bgcolor="white">(.*?)</tr>', con,
                           re.S | re.M)
        if not build:
            print("小区没有楼栋信息")
        build_headers = session_headers(co_url)

        for build_info in build:
            if "进入" in build_info:
                build_url = re.search('href="(.*?)"><font',
                                      build_info).group(1)
                build_url = base_url + build_url
                ho_headers = session_headers(build_url)
                build_res = requests.get(build_url, headers=build_headers)
                build_con = build_res.content.decode('gbk')

                id_match = re.search(r'ID=(\d+)', build_url)
                if id_match:  # currently on sale
                    bu = Building(co_index)
                    bu_id = id_match.group(1)
                    bu.bu_id = bu_id
                    bu.co_name = co.co_name
                    bu.insert_db()
                    self.get_house_info(headers=ho_headers,
                                        bu_id=bu_id,
                                        url=build_url)

                else:  # pre-sale
                    bu = Building(co_index)
                    bu.co_name = co.co_name
                    bu.bu_type = re.search('用途.*?">(.*?)</td>', build_con,
                                           re.S | re.M).group(1)
                    bu.bu_pre_sale = re.search('许可证编号.*?_blank">(.*?)</a>',
                                               build_con, re.S | re.M).group(1)
                    bu.bu_pre_sale_date = re.search('有效日期.*?">(.*?)</td>',
                                                    build_con,
                                                    re.S | re.M).group(1)
                    bu.bu_address = re.search('项目座落.*?">(.*?)</td>', build_con,
                                              re.S | re.M).group(1)
                    ret = re.findall('<tr onmouseover(.*?)</tr', build_con,
                                     re.S | re.M)
                    # The same record is re-saved per row: the page-level
                    # fields stay, the id/number/floor change.
                    for i in ret:
                        house_url = re.search('href="(.*?)"', i).group(1)
                        house_url = base_url + house_url
                        bu.bu_id = re.search('dbh=(.*?)&', i).group(1)
                        bu.bu_num = re.search('<td width="89.*?">(.*?)</',
                                              i).group(1)
                        bu.bu_floor = re.search(r'<td width="84.*?">(\d+)</td',
                                                i).group(1)
                        bu.insert_db()

                        ho_res = requests.get(house_url, headers=ho_headers)
                        ho_con = ho_res.content.decode('gbk')
                        new_headers = session_headers(house_url)
                        self.get_house_info(ho_con=ho_con,
                                            headers=new_headers,
                                            bu_id=bu.bu_id)
            else:
                print("楼栋无链接地址")