예제 #1
0
 def bu_parse(self,bu_url,co_id,co_url):
     """Crawl one building-list page and store every building row on it.

     The page at ``http://61.143.241.154/`` + ``bu_url`` is requested with
     ``co_url`` as Referer and decoded as GBK.  The presale permit number and
     its validity date are read once from the page and copied onto each
     building taken from the ``donglist`` table.  Every building is inserted
     and its link is handed to ``self.house_parse``.

     Note: the request headers are bound to the module-level name
     ``headers`` (it is rebound on every call).
     """
     global headers
     page_url = "http://61.143.241.154/" + bu_url
     headers = {
         'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
         'Referer': co_url,
     }
     response = requests.get(page_url, headers=headers)
     page_text = response.content.decode('gbk')

     # Page-level values shared by all buildings listed below.
     flags = re.S | re.M
     permit_no = re.search('预售许可证编号.*?blank">(.*?)</a', page_text, flags).group(1)
     permit_date = re.search('预售证有效日期.*?">(.*?)</td', page_text, flags).group(1)

     page_tree = etree.HTML(page_text)
     for row in page_tree.xpath("//table[@id='donglist']//tr"):
         building = Building(co_index)
         building.co_id = co_id
         row_link = row.xpath("./td/a/@href")[0]
         building.bu_id = re.search('dbh=(.*?)&', row_link).group(1)
         building.bu_num = row.xpath("./td[3]/text()")[0]
         building.bu_floor = row.xpath("./td[4]/text()")[0]
         building.bu_pre_sale = permit_no
         building.bu_pre_sale_date = permit_date
         building.insert_db()
         self.house_parse(row_link, co_id, building.bu_id)
예제 #2
0
    def parse(self, res):
        """Store one building per ``listCon`` entry of a listing response.

        ``res`` is decoded as GBK and parsed; for every entry the project
        page is fetched, its fields are extracted with regular expressions,
        the building id is read from the project's BuildingList page, and the
        record is inserted.  On any failure the entry is logged and skipped;
        on success ``self.house_parse`` is called with the building id.
        """
        flags = re.S | re.M
        listing = etree.HTML(res.content.decode('gbk'))
        for entry in listing.xpath("//div[@class='listCon']"):
            href = entry.xpath("./a[@class='listCon2']/@href")[0]
            title = entry.xpath("./a[@class='listCon1']/@title")[0]
            url = "http://www.hyfc365.com" + href
            try:
                page = requests.get(url, headers=self.headers)
                content = page.content.decode('gbk')
                building = Building(co_index)
                building.bu_num = title
                project_id = re.search('ID=(.*)', href).group(1)
                building.bu_pre_sale = re.search(
                    '预售证名称.*?NAME">(.*?)</span', content, flags).group(1)
                building.bu_pre_sale_date = re.search(
                    '申领时间.*?">(.*?)</span', content, flags).group(1)
                building.bo_develops = re.search(
                    '申领单位.*?">(.*?)</span', content, flags).group(1)
                building.bu_build_size = re.search(
                    '"SALE_HOUSE_AREA">(.*?)<', content, flags).group(1)
                building.bu_all_house = re.search(
                    '"SALE_HOUSE_COUNT">(.*?)<', content, flags).group(1)

                # The building id only appears on the project's BuildingList page.
                detail_page = requests.get(
                    'http://www.hyfc365.com/RealEstate/Project/BuildingList.aspx?ID=' + project_id)
                bu_id = re.search("BUILDING_ID=(.*?)'", detail_page.text).group(1)
                building.bu_id = bu_id
                building.insert_db()
            except Exception as e:
                log.error("{}楼栋页面解析失败{}".format(url, e))
            else:
                self.house_parse(bu_id)
예제 #3
0
    def get_build_info(self, build_lis, co_id):
        """Fetch each building page, store the building, then crawl its houses.

        Args:
            build_lis: iterable of building URL paths (each containing
                ``Bid=<digits>``) that are appended to
                ``http://xx.yyfdcw.com``.
            co_id: community id copied onto every building and forwarded to
                ``self.get_house_info``.

        A building whose page cannot be fetched *or parsed* is reported and
        skipped, so one malformed page no longer aborts the remaining ones.
        """
        for build_ in build_lis:
            build_url = "http://xx.yyfdcw.com" + build_
            try:
                build_res = requests.get(build_url, headers=self.headers)
                con = build_res.text
                bu = Building(co_index)
                bu.co_id = co_id
                bu.bu_id = re.search(r'Bid=(\d+)', build_).group(1)
                bu.bu_num = re.search('名称.*?">(.*?)</spa', con).group(1)
                bu.bu_pre_sale = re.search("编.*?red'>(.*?)</a", con).group(1)
                bu.bu_pre_sale_date = re.search('颁发日期.*?Date">(.*?)</span',
                                                con).group(1)
                bu.bo_build_start_time = re.search('开工日期.*?">(.*?)</span',
                                                   con).group(1)
                bu.bo_build_end_time = re.search('竣工日期.*?">(.*?)</span',
                                                 con).group(1)
                bu.bo_develops = re.search('单位.*?">(.*?)</span', con).group(1)
                bu.bu_floor = re.search('层数.*?">(.*?)</span', con).group(1)
                bu.bu_live_size = re.search('住宅面积.*?">(.*?)</span', con).group(1)
                bu.size = re.search('总面积.*?">(.*?)</span', con).group(1)

                bu.insert_db()

                # Value of the page's "测量号" field; needed by the house
                # crawler (previously stored in a local that shadowed ``id``).
                survey_no = re.search('测量号.*?">(.*?)</span', con).group(1)
            except Exception as e:
                print("co_index={},楼栋信息错误".format(co_index), e)
                continue
            self.get_house_info(co_id, bu.bu_id, survey_no)
예제 #4
0
    def build_info(self, bu_list, co_id):
        """Follow each row's link and store the buildings listed on that page.

        For every element of ``bu_list`` the link in its fourth cell is
        fetched relative to ``self.start_url`` (decoded as GBK).  The presale
        permit number and validity date found on the page are copied onto
        each building of the page's ``donglist`` table; every building is
        inserted and then passed to ``self.house_info``.
        """
        flags = re.S | re.M
        for row in bu_list:
            href = row.xpath("./td[4]/a/@href")[0]
            page = requests.get(self.start_url + '/' + href,
                                headers=self.headers)
            page.encoding = 'gbk'
            page_text = page.text

            permit_no = re.search(
                '预售许可证编号.*?blank">(.*?)</a', page_text, flags).group(1)
            permit_date = re.search(
                '预售证有效日期.*?">(.*?)</td', page_text, flags).group(1)

            page_tree = etree.HTML(page_text)
            for dong_row in page_tree.xpath("//table[@id='donglist']/tr"):
                dong_link = dong_row.xpath("./td/a/@href")[0]
                building = Building(co_index)
                building.co_id = co_id
                building.bu_id = re.search('ID={(.*?)}', dong_link).group(1)
                building.bu_num = dong_row.xpath("./td[3]/text()")[0]
                building.bu_floor = dong_row.xpath("./td[4]/text()")[0]
                building.bu_pre_sale = permit_no
                building.bu_pre_sale_date = permit_date
                building.insert_db()
                self.house_info(co_id, building.bu_id, dong_link)
예제 #5
0
    def get_build_info(self, comm_url_list):
        """Store one building and its houses for every entry of ``comm_url_list``.

        Each entry must contain two ``+<digits>+`` groups: the first is used
        as ``sid`` and the second as ``pid``.  The building page (queried by
        pid/sid) supplies the building fields; the house list is fetched from
        the ``proxp.aspx`` service keyed by ``sid``.

        Entries whose ids cannot be extracted are reported and skipped.
        Previously such a failure left ``sid``/``build``/``build_url``
        unbound (NameError) or carrying values from the previous entry, which
        attached houses to the wrong building.
        """
        flags = re.S | re.M
        for i in comm_url_list:
            # Identify the entry first so that everything below always works
            # with ids and a Building object belonging to *this* entry.
            try:
                ids = re.findall(r'\+(\d+)\+', i)
                sid, pid = ids[0], ids[1]
                build = Building(co_index)
                build.bu_id = pid
            except Exception as e:
                print('co_index={}, 楼栋错误,url={}'.format(co_index, i), e)
                continue

            build_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/bldg_query.aspx?pid=' + pid + '&sid=' + sid
            try:
                response = requests.get(build_url)
                html = response.text
                # The same field feeds both the building number and address.
                location = re.search('楼栋座落.*?<td.*?>(.*?)<', html,
                                     flags).group(1)
                build.bu_num = location
                build.bu_address = location
                build.bu_pre_sale = re.search('预售证号.*?">(.*?)&nbsp', html,
                                              flags).group(1)
                build.bu_pre_sale_date = re.search('时间.*?">(.*?)&nbsp', html,
                                                   flags).group(1)
                build.bu_all_house = re.search('dM.*?">(.*?)&nbsp', html,
                                               flags).group(1)
                build.insert_db()
            except Exception as e:
                # Best effort: houses are still crawled for this entry.
                print('co_index={}, 楼栋错误,url={}'.format(co_index, build_url),
                      e)

            house_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/proxp.aspx?key=WWW_LPB_001&params=' + sid
            result = requests.get(house_url)
            html_ = result.text

            for house_info in re.findall('<Result.*?</Result>', html_, flags):
                try:
                    house = House(co_index)
                    house.bu_id = build.bu_id
                    house.bu_num = build.bu_num
                    house.ho_name = re.search('<ONAME>(.*?)</ONAME>',
                                              house_info, flags).group(1)
                    house.ho_num = re.search('<OSEQ>(.*?)</OSEQ>', house_info,
                                             flags).group(1)
                    house.ho_build_size = re.search('<BAREA>(.*?)</BAREA>',
                                                    house_info,
                                                    flags).group(1)
                    house.ho_floor = re.search('<FORC>(.*?)</FORC>',
                                               house_info, flags).group(1)
                    house.ho_true_size = re.search('<PAREA>(.*?)</PAREA>',
                                                   house_info,
                                                   flags).group(1)
                    house.insert_db()
                except Exception as e:
                    print('co_index={}, 房号错误'.format(co_index), e)
예제 #6
0
 def get_build_info(self, build_url_list, co_name):
     """Store the buildings reachable from each listing page, then their houses.

     For every relative URL in ``build_url_list`` the listing page under
     ``http://www.sxczfdc.com/pubinfo/`` is fetched and every
     ``Pub_dtxx.aspx?ProjectBuildingID=...`` link on it is followed.  The
     building fields are read from the ``BuildingInfo1_*`` elements, the
     record is inserted, and the ``getMoreHouseInfo`` targets are passed to
     ``self.get_house_info``.  Errors are printed and the crawl continues.

     Note: one ``Building`` object is created per listing page and reused
     for every detail link found on it.
     """
     flags = re.S | re.M
     base = 'http://www.sxczfdc.com/pubinfo/'
     # (attribute name, pattern) pairs, applied in this order.
     field_patterns = (
         ('bu_num', 'BuildingInfo1_lblBuildingName">(.*?)<'),
         ('bu_all_house', 'BuildingInfo1_lblZts">(.*?)<'),
         ('bu_floor', 'BuildingInfo1_lblZcs">(.*?)<'),
         ('bu_build_size', 'BuildingInfo1_lblJzmj">(.*?)<'),
         ('bu_live_size', 'BuildingInfo1_lblZzmj">(.*?)<'),
         ('bu_pre_sale', 'BuildingInfo1_lblYsxkzh">(.*?)<'),
         ('bu_pre_sale_date', 'BuildingInfo1_lblYsxkzfzrq">(.*?)<'),
     )
     for rel_url in build_url_list:
         try:
             build = Building(co_index)
             build.co_name = co_name
             listing_url = base + rel_url
             listing_html = requests.get(listing_url,
                                         headers=self.headers).text
             detail_links = re.findall(
                 '(Pub_dtxx.aspx\?ProjectBuildingID=.*?)"', listing_html,
                 flags)
             for link in detail_links:
                 try:
                     detail_url = base + link
                     content = requests.get(detail_url,
                                            headers=self.headers).text
                     for attr, pattern in field_patterns:
                         setattr(build, attr,
                                 re.findall(pattern, content, flags)[0])
                     build.insert_db()
                     house_links = re.findall(
                         "onClick=.getMoreHouseInfo\('(.*?)'\)", content,
                         flags)
                     self.get_house_info(house_links, co_name, build.bu_num)
                 except Exception as e:
                     print(e)
         except Exception as e:
             print(e)
예제 #7
0
    def build_parse(self, co_id):
        """Store every building of project ``co_id`` and crawl its houses.

        The project frame ``prjleft.aspx?projectid=<co_id>`` supplies the
        building links and, in the ``td[@width='54%']`` cells, the house-list
        links; both lists are matched up by position.

        Each building now gets its own ``Building`` object (a single shared
        instance used to be mutated and re-inserted for every building), and
        failures while locating or parsing the house list are reported
        instead of being dropped silently.
        """
        flags = re.S | re.M
        url = "http://spf.tlfdc.cn/prjleft.aspx?projectid=" + str(co_id)
        res = requests.get(url, headers=self.headers)
        con_html = etree.HTML(res.text)
        # Slice kept from the original; presumably it drops links that are
        # not buildings - TODO confirm against the live page.
        build_url_list = con_html.xpath("//td[@colspan='2']/a/@href")[4:-1]
        house_cells = con_html.xpath("//td[@width='54%']")

        for index, build_href in enumerate(build_url_list):
            build_info_url = "http://spf.tlfdc.cn/" + build_href
            try:
                bu = Building(co_index)
                res = requests.get(build_info_url, headers=self.headers)
                con = res.text
                bu.co_id = co_id
                bu.bu_pre_sale_date = re.search('发证日期.*?Date">(.*?)<', con,
                                                flags).group(1)
                bu.bu_num = re.search('幢.*?did">(.*?)<', con,
                                      flags).group(1)
                bu.bu_pre_sale = re.search('编号.*?no">(.*?)<', con,
                                           flags).group(1)
                bu.bu_address = re.search('位置.*?ss">(.*?)<', con,
                                          flags).group(1)
                bu.bu_build_size = re.search('面积.*?Area">(.*?)<', con,
                                             flags).group(1)
                bu.bu_type = re.search('性质.*?type">(.*?)<', con,
                                       flags).group(1)
                bu.bu_all_house = re.search('套数.*?number">(.*?)<', con,
                                            flags).group(1)
                bu.bu_id = re.search(r'id=(\d+)', build_href).group(1)

                bu.insert_db()
            except Exception as e:
                print(
                    '楼栋错误,co_index={},url={}'.format(co_index, build_info_url),
                    e)
                continue
            try:
                house_url = house_cells[index].xpath("./a/@href")[0]
                self.house_parse(house_url, co_id, bu.bu_id)
            except Exception as e:
                # Still best effort, but leave a trace of what was skipped.
                print(
                    '房屋错误,co_index={},bu_id={}'.format(co_index, bu.bu_id),
                    e)
                continue
예제 #8
0
    def get_build_info(self, build_url_list, co_id):
        """Store the buildings listed on each project page, then crawl houses.

        Args:
            build_url_list: relative project URLs (HTML-escaped ``&amp;`` is
                un-escaped) under ``http://gold.ncfdc.com.cn/``.
            co_id: community id copied onto every building.

        Every ``<tr>`` of the building-list section that contains a link
        becomes one ``Building``.  Row-level failures are printed and
        skipped.  A page that lacks the project name or the building-list
        section is now reported and skipped as well, instead of raising
        ``AttributeError`` and aborting the remaining pages.  The house-list
        links of the page are finally passed to ``self.get_house_info``.
        """
        flags = re.S | re.M
        for i in build_url_list:

            build_url = 'http://gold.ncfdc.com.cn/' + i.replace('amp;', '')
            res = requests.get(build_url)

            name_match = re.search('ctl15_proname">(.*?)<', res.text, flags)
            # Formerly bound to a local called ``str`` (shadowing the builtin).
            section_match = re.search('项目楼栋列表.*?ctl17_fLinks_pDataShow',
                                      res.text, flags)
            if name_match is None or section_match is None:
                print(
                    '楼栋错误,co_index={},url={}'.format(co_index, build_url),
                    'project name or building list not found')
                continue
            co_name = name_match.group(1)
            section_html = section_match.group()

            for info in re.findall('<tr>.*?</tr>', section_html, flags):
                if 'href' not in info:
                    continue
                try:
                    build = Building(co_index)
                    build.co_name = co_name
                    build.bu_num = re.search(
                        '<tr>.*?<td>.*?<a href=.*?>(.*?)<', info,
                        flags).group(1)
                    build.bu_pre_sale = re.search(
                        'onclick="BinSHouseInfo.*?>(.*?)<', info,
                        flags).group(1)
                    build.bu_pre_sale_date = re.search(
                        'onclick="BinSHouseInfo.*?<td>(.*?)<', info,
                        flags).group(1)
                    build.bu_all_house = re.search('color:#ec5f00;">(.*?)<',
                                                   info, flags).group(1)
                    build.bu_id = re.search("DisplayB_ld&hrefID=(.*?)'", info,
                                            flags).group(1)
                    build.co_id = co_id
                    build.insert_db()

                except Exception as e:
                    print(
                        '楼栋错误,co_index={},url={}'.format(co_index, build_url),
                        e)
            house_url_list = re.findall(
                "</span>.*?</td><td>.*?<a href='(.*?xs.*?)' target=\"_blank\">.*?查看",
                res.text, flags)

            self.get_house_info(house_url_list)
예제 #9
0
 def build_parse(self, co_id):
     """Store every building row returned by ``GetBTable.ashx`` for ``co_id``.

     The service is POSTed with ``itemRecord=co_id`` and ``houseCode=0``;
     each ``<tr id ...>`` row of the reply becomes one ``Building``.  Missing
     presale details are logged but do not prevent the insert.  After the
     insert the building is handed to ``self.house_parse``.
     """
     response = requests.post(
         "http://www.zyfgj.org/spf/GetBTable.ashx",
         data={"itemRecord": co_id, "houseCode": 0},
         headers=self.headers,
     )
     body = response.content.decode()
     for row in re.findall('<tr id.*?</tr>', body):
         building = Building(co_index)
         building.co_id = co_id
         # The id is the last argument of the row's GetData(...) call and
         # arrives wrapped in single quotes.
         raw_id = re.search('GetData.*?,(.*?)\)', row).group(1)
         building.bu_id = raw_id.strip("'")
         try:
             building.bu_num = re.search(
                 '预售证时间:.*?<td>(.*?)</td', row).group(1)
             building.bu_pre_sale = re.search(
                 '预售证号:(.*?)</td', row).group(1)
             building.bu_pre_sale_date = re.search(
                 '预售证时间:(.*?)</td', row).group(1)
             building.bu_all_house = re.search(
                 '预售证号:.*?<td>(\d+)</td', row).group(1)
         except Exception as e:
             log.error("{}楼栋无预售号等信息{}".format(row, e))
         building.insert_db()
         self.house_parse(co_id, building.bu_id)
예제 #10
0
 def get_build_info(self, presell_url_list, co_id):
     """Store the buildings linked from each presale page, then their houses.

     Every URL in ``presell_url_list`` (relative to ``self.url``) is fetched
     and the building links on it are followed.  Each building page is parsed
     into a ``Building`` and inserted; a page that cannot be fetched or
     parsed is printed and skipped.  For stored buildings the
     ``getMoreHouseInfo`` targets are passed to ``self.get_house_info``.
     """
     flags = re.S | re.M
     for presell_url in presell_url_list:
         presell_page = requests.get(self.url + presell_url,
                                     headers=self.headers)
         building_links = re.findall('【<a href="(.*?)" target="_self"',
                                     presell_page.text, flags)
         for link in building_links:
             page_url = self.url + link
             try:
                 con = requests.get(page_url, headers=self.headers).text
                 building = Building(co_index)
                 building.co_id = co_id
                 building.bu_id = re.search('ID=(\d+)', link).group(1)
                 building.bu_num = re.search(
                     '栋.*?号.*?BuildingName">(.*?)</span', con,
                     flags).group(1)
                 building.bu_floor = re.search(
                     '总 层 数.*?(\d+)</span', con, flags).group(1)
                 building.bu_build_size = re.search(
                     '建筑面积.*?Jzmj">(.*?)</span', con, flags).group(1)
                 building.bu_live_size = re.search(
                     '住宅面积.*?Zzmj">(.*?)</span', con, flags).group(1)
                 building.bu_not_live_size = re.search(
                     '非住宅面积.*?Fzzmj">(.*?)</span', con, flags).group(1)
                 building.bu_pre_sale = re.search(
                     '预售许可证.*?xkzh">(.*?)</span', con, flags).group(1)
                 building.bu_pre_sale_date = re.search(
                     '发证日期.*?fzrq">(.*?)</span', con, flags).group(1)
                 building.bu_type = re.search(
                     '项目类型.*?Type">(.*?)</span', con, flags).group(1)
                 building.insert_db()
             except Exception as e:
                 print("co_index={},楼栋信息错误".format(co_index), e)
             else:
                 house_targets = re.findall(
                     "getMoreHouseInfo\('(.*?)'\)\"", con, flags)
                 self.get_house_info(co_id, building.bu_id, house_targets)
예제 #11
0
    def get_comm_info(self,comm_info):
        """Store one community, its buildings, and trigger the house crawl.

        Args:
            comm_info: HTML fragment of one community row; it must contain
                the community name, area and a ``href='...'`` detail link.

        The community detail page (GBK) supplies developer / totals (optional)
        and the building rows.  A row containing "进入" links to either an
        existing-sale page (URL carries ``ID=<digits>``) or a presale page
        listing several buildings; in both cases ``self.get_house_info`` is
        called with headers whose Referer is the page being left.

        If the detail page cannot be requested the method now reports it and
        returns; previously execution continued and failed with a NameError
        on the unbound response.
        """
        flags = re.S | re.M
        base_url = "http://www.qyfgj.cn/newys/"

        def make_headers(referer):
            # All follow-up requests share the UA and session cookie and only
            # differ in the Referer.
            return {
                'User-Agent':
                    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                'Cookie':
                    'ASP.NET_SessionId=irv0qjamqztp1pb0shoqrx2j',
                'Referer':
                    referer
            }

        co = Comm(co_index)
        co.co_name = re.search('_blank">(.*?)</a',comm_info).group(1)
        try:
            co.co_address = re.findall('px">(.*?)</td',comm_info)[1]
        except IndexError:
            co.co_address = None
        co.area = re.search('center">(.*?)</td>',comm_info).group(1)
        co_detail_url = re.search("href='(.*?)'",comm_info).group(1)
        co_url = base_url + co_detail_url
        try:
            res = requests.get(co_url,headers=self.headers)
        except Exception as e:
            print("co_index={}小区未请求到".format(co_index),e)
            return
        con = res.content.decode('gbk')
        try:
            co.co_develops = re.search('开发商名称.*?px;">(.*?)</a',con,flags).group(1)
            co.co_all_house = re.search(r'总套数.*?">(\d+)&nbsp',con,flags).group(1)
            co.co_all_size = re.search(r'总面积.*?">(\d+.\d+)&nbsp;m',con,flags).group(1)
        except AttributeError:
            # A field is absent from the page: keep what was found so far.
            print("小区无开发商等信息")
        co.insert_db()

        build_headers = make_headers(co_url)
        for build_info in re.findall('<tr bgcolor="white">(.*?)</tr>',con,flags):
            if "进入" not in build_info:
                print("楼栋无链接地址")
                continue

            build_url = re.search('href="(.*?)"><font',build_info).group(1)
            build_url = base_url + build_url
            ho_headers = make_headers(build_url)
            build_res = requests.get(build_url, headers=build_headers)
            build_con = build_res.content.decode('gbk')

            id_match = re.search(r'ID=(\d+)',build_url)
            if id_match:
                # Existing-sale page: the building id is in the URL itself.
                bu = Building(co_index)
                bu_id = id_match.group(1)
                bu.bu_id = bu_id
                bu.co_name = co.co_name
                bu.insert_db()
                self.get_house_info(headers=ho_headers,bu_id=bu_id,url=build_url)
                continue

            # Presale page: shared permit data, then one row per building.
            bu = Building(co_index)
            bu.co_name = co.co_name
            bu.bu_type = re.search('用途.*?">(.*?)</td>', build_con, flags).group(1)
            bu.bu_pre_sale = re.search('许可证编号.*?_blank">(.*?)</a>', build_con, flags).group(1)
            bu.bu_pre_sale_date = re.search('有效日期.*?">(.*?)</td>', build_con, flags).group(1)
            bu.bu_address = re.search('项目座落.*?">(.*?)</td>', build_con, flags).group(1)
            for i in re.findall('<tr onmouseover(.*?)</tr',build_con,flags):
                house_url = re.search('href="(.*?)"',i).group(1)
                house_url = base_url + house_url
                bu.bu_id = re.search('dbh=(.*?)&',i).group(1)
                bu.bu_num = re.search('<td width="89.*?">(.*?)</',i).group(1)
                bu.bu_floor = re.search(r'<td width="84.*?">(\d+)</td',i).group(1)
                bu.insert_db()

                ho_res = requests.get(house_url,headers=ho_headers)
                ho_con = ho_res.content.decode('gbk')
                new_headers = make_headers(house_url)
                self.get_house_info(ho_con=ho_con,headers=new_headers,bu_id=bu.bu_id)
예제 #12
0
 def start_crawler(self):
     """Crawl listing pages 1-20 and store every building and its houses.

     Each listing page is requested from the AjaxPro endpoint with a JSON
     payload.  Every ``[sid, ..., pid, ...]`` entry of the reply leads to one
     building page (stored as a ``Building`` keyed by ``sid``) and one house
     query (``GETLPBDS`` on ``sid``) whose entries are stored as ``House``
     records.  Failures at any level are logged and the crawl moves on to the
     next item of that level.
     """
     listing_url = 'http://zzx.zzfc.com/ajaxpro/xy_ysxk_more,App_Web_mjeeodb-.ashx'
     house_service_url = 'http://zzx.zzfc.com/ajaxpro/xy_housetag,App_Web_xg4ulr9n.ashx'
     house_request_headers = {
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
         'X-AjaxPro-Method': 'GETLPBDS'
     }
     flags = re.S | re.M

     for page_no in range(1, 21):
         payload = "{\"pageNo\":" + str(page_no) + ",\"pageSize\":30,\"rowcount\":589}"
         try:
             listing = requests.post(listing_url, data=payload,
                                     headers=self.headers)
             listing_text = listing.content.decode()
         except Exception as e:
             log.error('楼栋请求失败{}'.format(e))
             continue

         for comm in re.findall('\[\d+,.*?\d+\]', listing_text):
             # --- building ---
             try:
                 sid = re.search('\[(\d+),', comm).group(1)
                 pid = re.search('",(\d+),', comm).group(1)
                 bu_url = 'http://zzx.zzfc.com/xy_bldg.aspx?pid=' + pid + '&sid=' + sid
                 bu_page = requests.get(bu_url, headers=self.headers)
                 bu_text = bu_page.content.decode()
                 building = Building(co_index)
                 building.bu_id = sid
                 building.bu_address = re.search(
                     '楼栋座落.*?">(.*?)&nbsp', bu_text, flags).group(1)
                 building.bu_pre_sale = re.search(
                     '预售证号.*?">(.*?)&nbsp', bu_text, flags).group(1)
                 building.bu_pre_sale_date = re.search(
                     '预售日期.*?">(.*?)&nbsp', bu_text, flags).group(1)
                 building.bu_all_house = re.search(
                     '套数.*?">(.*?)&nbsp', bu_text, flags).group(1)
                 building.insert_db()
             except Exception as e:
                 log.error("{}楼栋解析失败{}".format(comm, e))
                 continue

             # --- houses of that building ---
             house_query = "{\"m_key\":\"WWW_LPB_001\",\"m_param\":\"" + sid + "\"}"
             try:
                 house_reply = requests.post(house_service_url,
                                             data=house_query,
                                             headers=house_request_headers)
                 house_text = house_reply.content.decode()
             except Exception as e:
                 log.error("房屋请求失败{}".format(e))
                 continue

             for house in re.findall('\["\d+.*?\d+\]', house_text):
                 try:
                     record = House(co_index)
                     record.bu_id = sid
                     parts = house.split(",")
                     record.ho_name = parts[4]
                     record.ho_floor = re.search('(\d+)层', house).group(1)
                     record.ho_build_size = parts[-3]
                     record.ho_true_size = parts[-2]
                     record.insert_db()
                 except Exception as e:
                     log.error("{}房屋解析错误{}".format(house, e))
                     continue
예제 #13
0
    def start_crawler(self):
        """Crawl the presale-licence listing: communities, buildings, houses.

        The listing is fetched from ``url`` (a name defined outside this
        method).  Column values of all ``tr.Row`` rows are collected into
        parallel lists and combined by position, so the lists are assumed to
        have one entry per row (NOTE(review): a cell without text would
        shift the affected list - confirm against the live page).

        For every listing row a ``Comm`` is stored; the licence detail page
        supplies the area and the building serial numbers.  For every
        building a ``Building`` is stored and each house link on its page is
        stored as a ``House``.  Failures at each level are printed and the
        crawl continues with the next item of that level.
        """
        response = requests.get(url)
        html = response.text
        tree = etree.HTML(html)
        # Parallel per-column lists; index ``co`` below addresses one row.
        comm_list = tree.xpath('//tr[@class="Row"]/td[1]/text()')
        co_develops_list = tree.xpath('//tr[@class="Row"]/td[3]/text()')
        co_address_list = tree.xpath('//tr[@class="Row"]/td[8]/text()')
        co_open_time_list = tree.xpath('//tr[@class="Row"]/td[9]/text()')
        co_pre_sale_list = tree.xpath('//tr[@class="Row"]/td[5]/text()')
        co_all_house_list = tree.xpath('//tr[@class="Row"]/td[11]/text()')
        co_build_size_list = tree.xpath('//tr[@class="Row"]/td[10]/text()')
        co_name_list = tree.xpath('//tr[@class="Row"]/td[4]/text()')
        for co in range(0, len(comm_list)):
            try:
                comm = Comm(co_index)
                comm_url = 'http://www.jyfg.cn/HouseWebSetup/PublicReport/PreSellLicenceDetailInfo.aspx?PreSellLicenceSN=' + \
                           comm_list[
                               co]
                result = requests.get(comm_url)
                html_build = result.text
                # ``tree`` is rebound to the detail page here; the listing
                # columns were already extracted above.
                tree = etree.HTML(html_build)
                build_list = tree.xpath('//tr[@class="Row"]/td[1]/text()')
                area = tree.xpath('//*[@id="LabSCFW"]/text()')[0]
                comm.co_id = comm_list[co]
                comm.area = area
                comm.co_develops = co_develops_list[co]
                comm.co_address = co_address_list[co]
                comm.co_open_time = co_open_time_list[co]
                comm.co_pre_sale = co_pre_sale_list[co]
                comm.co_all_house = co_all_house_list[co]
                comm.co_build_size = co_build_size_list[co]
                # Same value as assigned a few lines above (redundant).
                comm.co_develops = co_develops_list[co]
                comm.co_name = co_name_list[co]
                comm.insert_db()
                for bu in range(0, len(build_list)):
                    try:

                        build_url = 'http://www.jyfg.cn/HouseWebSetup/PublicReport/PubRptHouseList.aspx?BuildingSN=' + \
                                    build_list[bu]
                        res = requests.get(build_url, headers=self.headers)
                        # The building page is decoded explicitly as GBK.
                        con = res.content.decode('gbk')
                        building = Building(co_index)

                        building.co_id = comm.co_id
                        building.bu_id = build_list[bu]
                        building.bu_num = re.search(
                            '栋号.*?<span.*?">(.*?)</span', con,
                            re.S | re.M).group(1)
                        building.bu_build_size = re.search(
                            '总建筑面积.*?<span.*?">(.*?)</span', con,
                            re.S | re.M).group(1)
                        building.bu_floor = re.search(
                            '层数.*?<span.*?">(.*?)</span', con,
                            re.S | re.M).group(1)
                        building.bu_all_house = re.search(
                            '预售套数.*?<span.*?">(.*?)</span', con,
                            re.S | re.M).group(1)
                        building.bu_pre_sale_date = re.search(
                            '有效期.*?<span.*?">(.*?)</span', con,
                            re.S | re.M).group(1)
                        building.bu_type = re.search(
                            '土地用途.*?<span.*?">(.*?)</span', con,
                            re.S | re.M).group(1)
                        building.bu_pre_sale = re.search(
                            '许可证编号.*?<span.*?">(.*?)</span', con,
                            re.S | re.M).group(1)
                        building.insert_db()

                        # Relative links to the individual house pages.
                        house_list = re.findall('房号:<a href="(.*?)"', con)
                        for ho in house_list:
                            try:
                                house = House(co_index)
                                house_url = 'http://www.jyfg.cn/HouseWebSetup/PublicReport/' + ho
                                respon = requests.get(house_url)
                                # Rebinds ``html`` (the listing text is no
                                # longer needed at this point).
                                html = respon.text
                                house.co_id = comm.co_id
                                house.bu_id = building.bu_id
                                house.ho_name = re.search(
                                    '房号:.*?<span.*?>(.*?)<', html,
                                    re.M | re.S).group(1)
                                house.ho_build_size = re.search(
                                    '预测建筑面积:.*?<span.*?>(.*?)<', html,
                                    re.M | re.S).group(1)
                                house.ho_true_size = re.search(
                                    '预测套内面积:.*?<span.*?>(.*?)<', html,
                                    re.M | re.S).group(1)
                                house.ho_share_size = re.search(
                                    '预测分摊面积:.*?<span.*?>(.*?)<', html,
                                    re.M | re.S).group(1)
                                house.ho_type = re.search(
                                    '房屋用途:.*?<span.*?>(.*?)<', html,
                                    re.M | re.S).group(1)
                                house.ho_room_type = re.search(
                                    '户型结构:.*?<span.*?>(.*?)<', html,
                                    re.M | re.S).group(1)

                                house.insert_db()
                            except Exception as e:
                                # One bad house page must not stop the rest.
                                print("co_index={},房屋{}信息提取失败".format(
                                    co_index, house_url))
                                print(e)
                                continue
                    except Exception as e:
                        # Skip this building and go on with the next one.
                        print(e)
                        print('co_idnex={},楼栋{}提取失败'.format(
                            co_index, build_url))
                        continue
            except Exception as e:
                # Skip this community and go on with the next listing row.
                print('co_index={},小区{}提取失败'.format(co_index, comm_url))
                print(e)
                continue