Beispiel #1
0
    def house_crawler(self, house_url, bu_num, co_id, bu_id):
        """Scrape one building page and store a House record per listed unit.

        Downloads ``self.url + house_url``, extracts parallel lists of unit
        attributes from the page text with regular expressions and, for every
        floor number found on the page, calls ``insert_db()`` once per unit.

        :param house_url: path appended to ``self.url``.
        :param bu_num: building number handed to ``House``.
        :param co_id: community id handed to ``House``.
        :param bu_id: building id handed to ``House``.
        """
        ho = House(co_index, bu_num=bu_num, co_id=co_id, bu_id=bu_id)

        url = self.url + house_url
        tr = requests.get(url, headers=self.headers).text
        flags = re.S | re.M
        ho_name = re.findall('室号:(.*?)户', tr, flags)  # room label
        ho_floor = re.findall(r'(\d+)层', tr)  # floor numbers
        ho_type = re.findall('房屋属性:(.*?)"', tr, flags)  # usage type
        ho_room_type = re.findall('户型:(.*?)所', tr, flags)  # layout
        ho_build_size = re.findall('建筑面积:(.*?)房', tr, flags)  # built area

        # Pair the parallel lists once. zip() stops at the shortest list, so a
        # field missing from the page can no longer raise IndexError. The
        # original index loop ran up to len(ho_name) inclusive and therefore
        # always ended in an IndexError that a bare ``except`` swallowed.
        # NOTE(review): the first match of every pattern is still skipped,
        # exactly as the original loop (which started at index 1) did --
        # confirm that the first match is a header and not a real unit.
        units = list(zip(ho_name, ho_type, ho_room_type, ho_build_size))[1:]

        # NOTE(review): every unit is written once per floor match; verify
        # that this floor x unit cross product is intended.
        for floor in ho_floor:
            try:
                ho.ho_floor = floor
                for name, usage, room_type, build_size in units:
                    ho.ho_name = name
                    ho.ho_type = usage
                    ho.ho_room_type = room_type
                    ho.ho_build_size = build_size
                    ho.insert_db()
            except Exception as e:
                # Best effort: give up on this floor, keep going with the next
                # one, but no longer hide the reason (or swallow Ctrl-C).
                print('房号错误,co_index={},url={}'.format(co_index, url), e)
Beispiel #2
0
 def get_house_info(self, house_id_list, bu_id, co_id):
     """Fetch the detail page of every house id and store one House per page.

     Each id is appended to the RoomInfo URL, the page is downloaded and the
     values found by the patterns below are copied onto a fresh ``House``,
     which is then written with ``insert_db()``. Any exception raised while
     handling one id is printed and the loop moves on to the next id.
     """
     flags = re.S | re.M
     # (House attribute, regex whose first group holds the value), applied in
     # this order.
     field_patterns = (
         ('ho_name', 'id="ROOM_HH">(.*?)<'),
         ('ho_floor', 'id="ROOM_MYC">(.*?)<'),
         ('ho_type', 'id="ROOM_FWYT">(.*?)<'),
         ('ho_room_type', 'id="ROOM_HX">(.*?)<'),
         ('ho_build_size', 'id="ROOM_YCJZMJ">(.*?)<'),
         ('ho_true_size', 'id="ROOM_YCTNJZMJ">(.*?)<'),
         ('ho_share_size', 'id="ROOM_YCFTJZMJ">(.*?)<'),
     )
     for house_id in house_id_list:
         house_url = 'http://www.hbczfdc.com:4993/HPMS/RoomInfo.aspx?code=' + house_id
         try:
             house = House(co_index)
             html = requests.get(house_url, headers=self.headers).text
             house.bu_id = bu_id
             house.co_id = co_id
             for attr, pattern in field_patterns:
                 setattr(house, attr,
                         re.search(pattern, html, flags).group(1))
             house.insert_db()
         except Exception as e:
             print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
Beispiel #3
0
    def get_build_info(self, url, response,co_id, bu_id):
        """Store one House per entry of the JSON list in ``response.text``.

        The basic fields come from the JSON entry itself; the shared area and
        the price are read from the entry's detail page. When the detail page
        cannot be fetched or parsed the entry is reported and not inserted.
        ``url`` is accepted but not used by this method.
        """
        flags = re.S | re.M
        house = House(co_index)
        for entry in json.loads(response.text):
            room_no = entry['roomno']  # room number
            usage = entry['ghyt']  # usage
            inner_area = entry['tnmj']  # predicted inner area
            floor = entry['floorindex']  # floor
            build_area = entry['jzmj']  # built area
            house.co_id = co_id
            house.bu_id = bu_id
            house_code = entry["fwcode"]
            house.ho_name = room_no
            house.ho_type = usage
            house.ho_true_size = inner_area
            house.ho_floor = floor
            house.ho_build_size = build_area

            house_detail_url = "http://fsfc.fsjw.gov.cn/hpms_project/roomview.jhtml?id="+str(house_code)
            try:
                detail_html = requests.get(house_detail_url,
                                           headers=self.headers).text
                house.ho_share_size = re.search(
                    '实测分摊面积.*?<td>(.*?)</td>', detail_html, flags).group(1)
                house.ho_price = re.search(
                    '总价.*?<td>(.*?)</td>', detail_html, flags).group(1)
            except Exception as e:
                print("co_index={},房屋详情页{}请求失败!".format(co_index,house_detail_url))
                print(e)
            else:
                # Only reached when both detail fields were extracted.
                house.insert_db()
Beispiel #4
0
 def house_info(self,ho_url,co_id,bu_id):
     """Crawl a house list page and store one House per linked detail page.

     The list page at ``ho_url`` (relative to the site root) is downloaded as
     GBK, every ``//td/a[@target]`` link on it is followed, and the fields
     found on each detail page are written with ``insert_db()``. A failure on
     one detail page is printed and does not stop the remaining pages.

     :param ho_url: path of the list page, relative to the site root.
     :param co_id: community id stored on every House.
     :param bu_id: building id stored on every House.
     """
     base_url = "http://222.77.178.63:7002/"
     flags = re.S | re.M
     # NOTE(review): the original called ``url.rstrip('=')`` and discarded the
     # result (str is immutable), so the URL has always been requested with
     # any trailing '='. The dead call is removed and the request is left
     # unchanged; assign the stripped value here only if the server needs it.
     url = base_url + ho_url
     res = requests.get(url, headers=self.headers)
     res.encoding = 'gbk'
     html = etree.HTML(res.text)
     for house_detail in html.xpath("//td/a[@target]/@href"):
         try:
             detail_url = base_url + house_detail
             detail_res = requests.get(detail_url, headers=self.headers)
             detail_res.encoding = 'gbk'
             con = detail_res.text
             ho = House(co_index)
             ho.co_id = co_id
             ho.bu_id = bu_id
             ho.ho_name = re.search('室号.*?">(.*?)<', con, flags).group(1)
             ho.ho_floor = re.search('实际层.*?">(.*?)<', con, flags).group(1)
             ho.ho_type = re.search('房屋类型.*?">(.*?)<', con, flags).group(1)
             ho.ho_build_size = re.search('预测建筑面积.*?">(.*?)<', con, flags).group(1)
             ho.ho_true_size = re.search('预测套内面积.*?">(.*?)<', con, flags).group(1)
             ho.ho_share_size = re.search('预测分摊面积.*?">(.*?)<', con, flags).group(1)
             ho.ho_price = re.search('总价.*?">(.*?)<', con, flags).group(1)
             ho.insert_db()
         except Exception as e:
             print('房屋信息错误{}'.format(e))
Beispiel #5
0
 def get_house_info(self, house_url_list, bu_id, co_id):
     """Download each house page and persist the fields parsed from it.

     For every URL a fresh ``House`` is filled from the page text using the
     patterns below, tagged with ``bu_id`` / ``co_id`` and written with
     ``insert_db()``. Any exception for one URL is printed and the loop
     continues with the next URL.
     """
     flags = re.S | re.M
     # (House attribute, label regex) pairs, applied in this order.
     label_patterns = (
         ('ho_name', '门牌号:.*?<td.*?>(.*?)<'),
         ('ho_floor', '所在层:.*?<td.*?>(.*?)<'),
         ('ho_type', '房屋性质:.*?<td.*?>(.*?)<'),
         ('ho_build_size', '预测建筑面积:.*?<td.*?>(.*?)<'),
         ('ho_true_size', '预测套内面积:.*?<td.*?>(.*?)<'),
         ('ho_share_size', '预测分摊面积:.*?<td.*?>(.*?)<'),
         ('co_address', '房屋坐落:.*?<td.*?>(.*?)<'),
     )
     for page_url in house_url_list:
         try:
             house = House(co_index)
             html = requests.get(page_url, headers=self.headers).text
             for attr, pattern in label_patterns:
                 setattr(house, attr,
                         re.search(pattern, html, flags).group(1))
             house.bu_id = bu_id
             house.co_id = co_id
             house.insert_db()
         except Exception as e:
             print('房号错误,co_index={},url={}'.format(co_index, page_url), e)
0
 def get_house_info(self, house_url_list, co_name, bu_num):
     """Store one House for every relative URL in ``house_url_list``.

     Each page is fetched from the ``pubinfo`` directory of the site and the
     first match of every pattern below is copied onto a fresh ``House``
     before ``insert_db()`` is called. An exception while handling one URL is
     printed and the next URL is processed.
     """
     flags = re.S | re.M
     # (House attribute, regex) pairs; the first findall hit is used.
     span_patterns = (
         ('ho_floor', 'HouseInfo1_lblFwlc">(.*?)<'),
         ('ho_name', 'HouseInfo1_lblFwfh">(.*?)<'),
         ('ho_type', 'HouseInfo1_lblFwlx">(.*?)<'),
         ('ho_room_type', 'HouseInfo1_lblFwhx">(.*?)<'),
         ('ho_build_size', 'HouseInfo1_lblycfwjzmj">(.*?)<'),
         ('ho_true_size', 'HouseInfo1_lblycfwtnmj">(.*?)<'),
         ('ho_share_size', 'HouseInfo1_lblycfwftmj">(.*?)<'),
         ('orientation', 'HouseInfo1_lblCx">(.*?)<'),
     )
     for relative_url in house_url_list:
         try:
             house = House(co_index)
             house.co_name = co_name
             house.bu_num = bu_num
             house_url = 'http://www.sxczfdc.com/pubinfo/' + relative_url
             html = requests.get(house_url, headers=self.headers).text
             for attr, pattern in span_patterns:
                 setattr(house, attr, re.findall(pattern, html, flags)[0])
             house.insert_db()
         except Exception as e:
             print(e)
Beispiel #7
0
    def house_parse(self, house_url, co_id, bu_id):
        """Parse a unit list page and store one House per unit found on it.

        Name, layout, built area and price are read from the list page; the
        floor is read from each unit's own detail page (decoded as gb2312).
        A unit that cannot be completed is reported and skipped.

        :param house_url: path of the list page, relative to the site root.
        :param co_id: community id stored on every House.
        :param bu_id: building id stored on every House.
        """
        ho = House(co_index)
        base_url = "http://spf.tlfdc.cn/"
        url = base_url + house_url
        con = requests.get(url, headers=self.headers).text
        flags = re.S | re.M

        ho_name = re.findall('室号:(.*?)套', con, flags)
        ho_room_type = re.findall('套型:(.*?)建', con, flags)
        ho_build_size = re.findall('建筑面积:(.*?)参', con, flags)
        ho_price = re.findall('价格:(.*?)元', con, flags)
        # Raw string: the pattern contains regex escapes (\? and \d).
        ho_detail = re.findall(
            r'href="(show.*?\?id=\d+&id2=\d+&prjid=\d+)"', con, flags)

        # Indexing (rather than zip) is kept on purpose: a unit whose sibling
        # list is too short is reported through the handler below.
        for index, name in enumerate(ho_name):
            try:
                ho.co_id = co_id
                ho.bu_id = bu_id
                ho.ho_name = name
                ho.ho_room_type = ho_room_type[index]
                ho.ho_build_size = ho_build_size[index]
                ho.ho_price = ho_price[index]
                ho_detail_url = base_url + ho_detail[index]
                detail_res = requests.get(ho_detail_url, headers=self.headers)
                detail_html = detail_res.content.decode('gb2312')
                ho.ho_floor = re.findall('楼层.*?">(.*?)</td>', detail_html,
                                         flags)[0].strip()

                ho.insert_db()
            except Exception as e:
                # The original bare ``except:`` never bound ``e``, so this
                # print raised NameError and aborted the whole loop.
                print('房号错误,co_index={},url={}'.format(co_index, url), e)
Beispiel #8
0
 def get_house_info(self, bu_id, co_id):
     """Fetch the house table of a building and store one House per entry.

     Two XML requests are made: the first resolves ``bu_id`` to a
     ``LOGICBUILDING_ID``, the second returns the ``T_HOUSE`` elements of that
     logic building. A failure on one house is printed and the rest are still
     processed; a failure of either request is printed and ends the method.

     :param bu_id: building id, appended to the first request URL.
     :param co_id: community id stored on every House.
     """
     # (House attribute, child element of T_HOUSE whose text is the value)
     house_fields = (
         ('ho_name', 'ROOM_NUMBER'),
         ('ho_build_size', 'BUILD_AREA'),
         ('ho_true_size', 'BUILD_AREA_INSIDE'),
         ('ho_share_size', 'BUILD_AREA_SHARE'),
         ('ho_floor', 'FLOOR_REALRIGHT'),
         ('ho_type', 'USE_FACT'),
     )
     url = 'http://www.fzfgj.cn/website/presale/home/HouseTableControl/GetData.aspx?Building_ID=' + bu_id
     try:
         response = requests.get(url=url, headers=self.headers)
         tree = etree.XML(response.text)
         logo = tree.xpath('//LOGICBUILDING_ID/text()')[0]
         url_2 = 'http://www.fzfgj.cn/website/presale/home/HouseTableControl/GetData.aspx?LogicBuilding_ID=' + logo
         result = requests.get(url_2, headers=self.headers)
         tree_2 = etree.XML(result.text)
         for node in tree_2.xpath('T_HOUSE'):
             try:
                 # NOTE(review): 11 is hard-coded here while the messages
                 # below use co_index -- confirm both denote the same source.
                 house = House(11)
                 values = [(attr, node.xpath(tag + '/text()')[0])
                           for attr, tag in house_fields]
                 house.co_id = co_id
                 house.bu_id = bu_id
                 for attr, value in values:
                     setattr(house, attr, value)
                 house.insert_db()
             except Exception as e:
                 print('房号错误,co_index={},url={}'.format(co_index, url_2), e)
     except Exception as e:
         # Exception instead of BaseException: KeyboardInterrupt and
         # SystemExit must be allowed to stop the crawler.
         print('房号错误,co_index={},url={}'.format(co_index, url), e)
Beispiel #9
0
    def get_house_info(self, house_url_list, bu_id, co_id):
        """Store one House for every relative URL in ``house_url_list``.

        Each page is fetched from the ``House/`` directory of the site,
        decoded as GBK and searched with the patterns below; the resulting
        record is written with ``insert_db()``. An exception while handling
        one URL is printed and the next URL is processed.
        """
        flags = re.S | re.M
        # (House attribute, label regex) pairs, applied in this order.
        label_patterns = (
            ('bu_num', '幢  号:.*?<td>(.*?)<'),
            ('ho_name', '房  号:.*?<td>(.*?)<'),
            ('co_name', '项目名称:.*?<td>(.*?)<'),
            ('ho_build_size', '建筑面积:.*?<td>(.*?)<'),
            ('ho_true_size', '套内面积:.*?<td>(.*?)<'),
            ('ho_share_size', '分摊面积:.*?<td>(.*?)<'),
            ('ho_floor', '所 在 层:.*?<td>(.*?)<'),
        )
        for relative_url in house_url_list:
            try:
                house = House(co_index)
                house_url = 'http://www.fjnpfdc.com/House/' + relative_url
                page = requests.get(house_url, headers=self.headers)
                house_con = page.content.decode('gbk')

                house.bu_id = bu_id
                house.co_id = co_id
                for attr, pattern in label_patterns:
                    setattr(house, attr,
                            re.search(pattern, house_con, flags).group(1))

                house.insert_db()
            except Exception as e:
                print("co_index={},房屋{}错误".format(co_index, relative_url), e)
Beispiel #10
0
 def get_house_detail(self, house_id, bu_id):
     """Fetch one house detail page and store the fields parsed from it.

     Any exception raised while fetching, parsing or inserting is printed
     together with the detail URL; nothing is re-raised.

     :param house_id: id appended to the detail URL.
     :param bu_id: building id stored on the House.
     """
     # Built before the ``try`` so the handler can always print it; the
     # original bound it inside the ``try`` and hit UnboundLocalError in the
     # handler whenever an earlier statement failed. str() also lets
     # non-string ids through.
     detail_url = 'http://222.223.160.199:8088/website/Hutu?id=' + str(house_id)
     flags = re.S | re.M
     try:
         house = House(co_index)
         response = requests.get(detail_url, headers=self.headers)
         html = response.text
         # ho_floor was assigned twice with the same expression; once is enough.
         house.ho_floor = re.search('层号.*?value="(.*?)"', html, flags).group(1)
         house.ho_build_size = re.search('总面积.*?value="(.*?)"', html, flags).group(1)
         house.ho_share_size = re.search('分摊面积.*?value="(.*?)"', html, flags).group(1)
         house.ho_true_size = re.search('套内面积.*?value="(.*?)"', html, flags).group(1)
         house.ho_type = re.search('房屋用途.*?value="(.*?)"', html, flags).group(1)
         house.bu_id = bu_id
         house.insert_db()
     except Exception as e:
         print('请求错误,url={}'.format(detail_url), e)
Beispiel #11
0
    def house_info(self, co_id, bu_id, house_url_list):
        """Store one House for every relative URL in ``house_url_list``.

        Each page is requested through ``Proxy_contact``, decoded as GBK and
        searched with the patterns below. When fetching or parsing fails the
        error is logged and the URL is skipped; otherwise the record is
        written with ``insert_db()``.
        """
        flags = re.S | re.M
        # (House attribute, regex) pairs, applied in this order.
        field_patterns = (
            ('ho_name', '房号.*?;">(.*?)</td'),
            ('ho_price', '价格.*?<td>(.*?)元'),
            ('ho_floor', '楼层.*?;">(.*?)</td'),
            ('ho_build_size', '建筑面积.*?<td>(.*?)m'),
            ('ho_true_size', '套内面积.*?<td>(.*?)m'),
            ('ho_share_size', '分摊面积.*?<td>(.*?)m'),
            ('ho_type', '房屋类型.*?<td>(.*?)</td'),
        )
        for house_ in house_url_list:
            house_url = "http://www.njhouse.com.cn/2016/spf/" + house_
            try:
                fetcher = Proxy_contact(app_name="nanjing",
                                        method='get',
                                        url=house_url,
                                        headers=self.headers)
                ho_con = fetcher.contact().decode('gbk')

                ho = House(co_index)
                ho.co_id = co_id
                ho.bu_id = bu_id
                for attr, pattern in field_patterns:
                    setattr(ho, attr,
                            re.search(pattern, ho_con, flags).group(1))
            except Exception as e:
                log.error("房屋详情页错误{}".format(e))
            else:
                # The insert stays outside the guarded section, as before.
                ho.insert_db()
Beispiel #12
0
    def get_house_info(self, co_id, bu_id, id):
        """Crawl the house list of one building and store every house on it.

        The list page for ``id`` is downloaded (GBK), the target of every
        ``onClick=...open('...')`` handler is followed, and the fields parsed
        from each house page are written with ``insert_db()``. A house page
        that cannot be fetched, parsed or stored is reported and skipped.

        :param co_id: community id stored on every House.
        :param bu_id: building id stored on every House.
        :param id: list-page id; the name shadows the builtin but is part of
            the signature callers use.
        """
        flags = re.S | re.M
        house_list_url = "http://xx.yyfdcw.com/hetong/fdc_xxdxx.asp?id=" + str(
            id)
        res = requests.get(house_list_url, headers=self.headers)
        con = res.content.decode('gbk')
        # Raw string: the pattern contains the regex escape \(.
        house_list = re.findall(r"onClick=.*?open\('(.*?)',", con, flags)
        for house_ in house_list:
            # The original guarded only the URL concatenation, so a single
            # unreachable or malformed house page aborted the whole building.
            # The guard now covers the work that can actually fail.
            try:
                house_url = "http://xx.yyfdcw.com/hetong/" + house_
                ho_res = requests.get(house_url, headers=self.headers)
                ho_con = ho_res.content.decode('gbk')

                ho = House(co_index)
                ho.co_id = co_id
                ho.bu_id = bu_id
                ho.ho_name = re.search('室号.*?fafa>(.*?)</TD', ho_con,
                                       flags).group(1)
                ho.ho_floor = re.search('实际层.*?fafa>(.*?)</TD', ho_con,
                                        flags).group(1)
                ho.ho_build_size = re.search('建筑面积.*?fafa>(.*?)</TD', ho_con,
                                             flags).group(1)
                ho.ho_true_size = re.search('套内面积.*?fafa>(.*?)</TD', ho_con,
                                            flags).group(1)
                ho.ho_share_size = re.search('分摊面积.*?fafa>(.*?)</TD', ho_con,
                                             flags).group(1)
                ho.ho_price = re.search('价格.*?fafa>(.*?)</TD', ho_con,
                                        flags).group(1)
                ho.ho_type = re.search('用途.*?fafa>(.*?)</TD', ho_con,
                                       flags).group(1)

                ho.insert_db()
            except Exception as e:
                print("co_index={},房屋信息错误".format(co_index), e)
Beispiel #13
0
 def get_house_info(self, co_id, bu_id):
     """POST for the house table of a building and store every house in it.

     The response text is split into ``HouseID...}`` fragments and one House
     is built from each fragment. A failed request is reported and ends the
     method; a fragment that cannot be parsed or stored is reported and the
     remaining fragments are still processed.

     :param co_id: community id stored on every House.
     :param bu_id: building id; sent in the form body and stored on every
         House.
     """
     house_url = "http://202.103.219.149:7000/LeadingEstate/buildingtable/ShowNewBuildingTable.aspx"
     payload = "IsShowHouse=1&BuidID=" + bu_id
     headers = {'Content-Type': "application/x-www-form-urlencoded"}
     flags = re.S | re.M
     try:
         response = requests.request("POST",
                                     house_url,
                                     data=payload,
                                     headers=headers)
         html = response.text
     except Exception as e:
         # The original dropped the exception; print it with the context.
         print('请求错误,url={},data={}'.format(house_url, payload), e)
         return

     # Raw string: the pattern contains the regex escape \}.
     for fragment in re.findall(r'HouseID.*?\}', html, flags):
         # Guard each record separately so that one malformed fragment no
         # longer discards all the records after it.
         try:
             house = House(co_index)
             house.bu_id = bu_id
             house.co_id = co_id
             house.ho_name = re.search('"YCHouseNo":"(.*?)"', fragment,
                                       flags).group(1)
             house.ho_floor = re.search('"ActFLoor":"(.*?)"', fragment,
                                        flags).group(1)
             house.ho_build_size = re.search('"YCJZArea":"(.*?)"', fragment,
                                             flags).group(1)
             house.ho_true_size = re.search('"YCTNJZArea":"(.*?)"', fragment,
                                            flags).group(1)
             house.ho_share_size = re.search('"YCFTJZArea":"(.*?)"', fragment,
                                             flags).group(1)
             house.insert_db()
         except Exception as e:
             print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
Beispiel #14
0
    def get_build_info(self, comm_url_list):
        """Store the building behind each URL and then all of its houses.

        For every entry of ``comm_url_list`` the two ``+digits+`` ids are
        extracted (first ``sid``, second ``pid``), the building page is parsed
        into a ``Building`` and inserted, and the house list for ``sid`` is
        parsed into ``House`` records linked to that building.

        When the building step fails the entry is reported and its houses are
        skipped. The original carried on with ``sid`` / ``build`` left over
        from the previous entry (or unbound on the first one), which attached
        houses to the wrong building or crashed.
        """
        flags = re.S | re.M
        for i in comm_url_list:
            build_url = None  # lets the handler report before it is built
            try:
                # Raw string: the pattern contains regex escapes (\+ and \d).
                ids = re.findall(r'\+(\d+)\+', i)
                sid, pid = ids[0], ids[1]
                build_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/bldg_query.aspx?pid=' + pid + '&sid=' + sid
                response = requests.get(build_url)
                html = response.text
                build = Building(co_index)
                build.bu_id = pid
                # NOTE(review): bu_num and bu_address are read with the same
                # pattern, as in the original -- confirm this is intended.
                build.bu_num = re.search('楼栋座落.*?<td.*?>(.*?)<', html,
                                         flags).group(1)
                build.bu_address = re.search('楼栋座落.*?<td.*?>(.*?)<', html,
                                             flags).group(1)
                build.bu_pre_sale = re.search('预售证号.*?">(.*?)&nbsp', html,
                                              flags).group(1)
                build.bu_pre_sale_date = re.search('时间.*?">(.*?)&nbsp', html,
                                                   flags).group(1)
                build.bu_all_house = re.search('dM.*?">(.*?)&nbsp', html,
                                               flags).group(1)
                build.insert_db()
            except Exception as e:
                print('co_index={}, 楼栋错误,url={}'.format(co_index, build_url or i),
                      e)
                # Without a valid building there is nothing to link houses to.
                continue

            house_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/proxp.aspx?key=WWW_LPB_001&params=' + sid
            result = requests.get(house_url)
            html_ = result.text

            for house_info in re.findall('<Result.*?</Result>', html_, flags):
                try:
                    house = House(co_index)
                    house.bu_id = build.bu_id
                    house.bu_num = build.bu_num
                    house.ho_name = re.search('<ONAME>(.*?)</ONAME>',
                                              house_info, flags).group(1)
                    house.ho_num = re.search('<OSEQ>(.*?)</OSEQ>', house_info,
                                             flags).group(1)
                    house.ho_build_size = re.search('<BAREA>(.*?)</BAREA>',
                                                    house_info,
                                                    flags).group(1)
                    house.ho_floor = re.search('<FORC>(.*?)</FORC>',
                                               house_info,
                                               flags).group(1)
                    house.ho_true_size = re.search('<PAREA>(.*?)</PAREA>',
                                                   house_info,
                                                   flags).group(1)
                    house.insert_db()
                except Exception as e:
                    print('co_index={}, 房号错误'.format(co_index), e)
Beispiel #15
0
    def room_crawler(self, room):  # houses
        """Crawl the floor table at ``room`` and store every unit listed in it.

        Rows with ``bgcolor='#fbf3e6'`` are read, leaving out the first and the
        last one. Each ``td[@height='40']`` cell of a row is one unit: when its
        link is a single character the data comes from the link's ``wf``
        attribute, otherwise the linked detail page is fetched. The community
        and building ids are taken from the two numbers in ``room`` itself.

        :param room: path of the floor table, relative to the site root.
        """
        house_url = "http://www.hzszjj.gov.cn" + room

        res = requests.get(house_url)
        con = etree.HTML(res.text)

        ho_table = con.xpath("//tr[@bgcolor='#fbf3e6']")
        for ho_list in ho_table[1:-1]:
            ho_floor = ho_list.xpath("./td[@align='center']/text()")[0]
            honum_list = ho_list.xpath(".//tr/td[@height='40']")
            for cell in honum_list:
                # A fresh record per unit. The original reused one House for
                # the whole page, so ho_share_size / ho_true_size written for
                # a detail-page unit leaked into later units that never set
                # those fields.
                ho = House(co_index)
                ho.ho_floor = ho_floor  # floor
                id_num = re.search(r"(\d+)&\w+=(\d+)", room)
                ho.co_id = id_num.group(1)  # community id
                ho.bu_id = id_num.group(2)  # building id
                ho_url = cell.xpath("./a/@href")[0]
                if len(ho_url) == 1:
                    # Placeholder link: the data sits in the ``wf`` attribute.
                    ho_info = cell.xpath("./a/@wf")[0]

                    ho.ho_name = cell.xpath("./a/text()")[0]
                    info = re.search(
                        r":(.*?)<br>.*?:(.*?)<br>(.*?)<br><hr>.*?:(.*?)m.*?<br>.*?:(.*?)<br>.*?:(.*?)m",
                        ho_info)
                    ho.ho_type = info.group(5)
                    ho.ho_build_size = info.group(4)
                    ho.ho_room_type = info.group(2)

                else:
                    detail_url = "http://www.hzszjj.gov.cn/ts_web_dremis/web_house_dir/" + ho_url
                    detail_res = requests.get(detail_url)
                    detail = etree.HTML(detail_res.text)
                    ho.ho_name = detail.xpath(
                        "//span[@id='ctl00_ContentPlaceHolder2_lb_house_name']/text()"
                    )[0]
                    ho.ho_type = detail.xpath(
                        "//span[@id='ctl00_ContentPlaceHolder2_lb_house_type']/text()"
                    )[0]
                    ho.ho_build_size = detail.xpath(
                        "//span[@id='ctl00_ContentPlaceHolder2_lb_house_build_area']/text()"
                    )[0]
                    ho.ho_share_size = detail.xpath(
                        "//span[@id='ctl00_ContentPlaceHolder2_lb_house_share_area']/text()"
                    )[0]
                    ho.ho_true_size = detail.xpath(
                        "//span[@id='ctl00_ContentPlaceHolder2_lb_house_inside_area']/text()"
                    )[0]

                ho.insert_db()
Beispiel #16
0
    def get_house_info(self, bu_url, bu_id):
        """Store every room of the building identified by ``bu_url``.

        The ``qrykey`` query value of ``bu_url`` selects the house table; each
        ``select_room('...')`` code found in it is opened as a room page. A
        room page that cannot be requested is reported and skipped. After a
        successful request the method sleeps two seconds, then parses the
        page; a parse or insert failure is reported and the next room is
        tried.
        """
        flags = re.S | re.M
        # (House attribute, regex) pairs, applied in this order.
        field_patterns = (
            ('ho_name', '室号.*?">(.*?)</td>'),
            ('ho_floor', '楼层.*?">(.*?)</td>'),
            ('ho_room_type', '户型.*?">(.*?)</td>'),
            ('ho_type', '用途.*?">(.*?)</td>'),
            ('ho_build_size', '预测建筑面积.*?">(.*?)</td>'),
            ('ho_true_size', '预测套内面积.*?">(.*?)</td>'),
            ('ho_share_size', '预测分摊面积.*?">(.*?)</td>'),
        )
        qrykey = re.search('qrykey=(.*?)&', bu_url).group(1)
        house_url = 'http://old.newhouse.cnnbfdc.com/GetHouseTable.aspx?qrykey=' + qrykey
        html = requests.get(house_url, headers=self.headers).text
        room_codes = re.findall("onclick=select_room\('(.*?)'", html, flags)
        for room_code in room_codes:
            house_detail_url = 'http://old.newhouse.cnnbfdc.com/openRoomData.aspx?roomId=' + str(
                room_code)
            try:
                res = requests.get(house_detail_url, headers=self.headers)
            except Exception as e:
                print("{}城市无法访问房屋页面{}".format(city, house_detail_url), e)
                continue
            # Pause between room pages.
            time.sleep(2)
            content = res.text
            ho = House(co_index)
            ho.bu_id = bu_id
            try:
                for attr, pattern in field_patterns:
                    setattr(ho, attr,
                            re.search(pattern, content, flags).group(1))

                ho.insert_db()
            except Exception as e:
                print("{}房号错误,请求频繁,当前页面{}未提取".format(city, house_detail_url),
                      e)
Beispiel #17
0
 def get_house_info(self, house_url, bu_id, co_id):
     """POST to ``house_url`` and store the single house described there.

     The response body is decoded as GBK and searched with the patterns
     below. Any exception is printed together with the URL; nothing is
     re-raised.
     """
     flags = re.M | re.S
     # (House attribute, label regex) pairs, applied in this order.
     label_patterns = (
         ('ho_floor', '所在楼层:.*?<td>(.*?)<'),
         ('ho_name', '房号:.*?<td>(.*?)<'),
         ('ho_build_size', '预测总面积:.*?<td>(.*?)<'),
         ('ho_true_size', '预测套内面积.*?<td>(.*?)<'),
         ('ho_share_size', '预测公摊面积.*?<td>(.*?)<'),
     )
     try:
         house = House(co_index)
         house.bu_id = bu_id
         house.co_id = co_id
         page = requests.post(house_url, headers=self.headers)
         html = page.content.decode('gbk')
         for attr, pattern in label_patterns:
             setattr(house, attr, re.search(pattern, html, flags).group(1))
         house.insert_db()
     except Exception as e:
         print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
Beispiel #18
0
    def comm(self, id):
        """Store the building ``id`` and every house returned for it.

        Three JSON endpoints under ``self.start_url`` are queried: the home
        page info, the detail info and the house list of the block. The
        building is inserted first; afterwards each house number in the list
        is looked up individually and inserted. An exception for one house is
        printed and the next house is processed.
        """
        bu = Building(co_index)

        id_text = str(id)
        house_url = self.start_url + "/api/buildInfos/getHouseInfosByPannelNumber?pannelNumber=" + id_text
        comm_url = self.start_url + "/api/buildInfos/getHomePageBuildingInfo?blockNumber=" + id_text
        comm_detail_url = self.start_url + "/api/buildInfos/getDetailsBuildingInfo?blockNumber=" + id_text

        # Request order and decode order are kept as they were.
        comm_res = requests.get(comm_url)
        comm_detail_res = requests.get(comm_detail_url)
        house_res = requests.get(house_url)
        comm_dict = json.loads(comm_res.text)
        comm_detail_dict = json.loads(comm_detail_res.text)
        house_dict = json.loads(house_res.text)

        bu.bu_id = id
        summary = comm_dict["data"]
        bu.bu_num = summary["nameBuildings"]
        detail = comm_detail_dict['data']
        bu.area = detail['houseingArea']
        bu.bu_address = summary["houseaddress"]
        bu.bu_pre_sale = detail["yszh"]
        bu.bu_type = summary["propertycategory"]
        bu.bo_develops = summary["companyName"]

        bu.insert_db()

        for group in house_dict["data"]:
            ho = House(co_index)
            # An empty group simply yields no iterations.
            for entry in group["data"]:
                try:
                    room_id = entry["houseNumber"]
                    room_url = self.start_url + "/api/buildInfos/getHouseInfoByHouseNumber?houseNumber=" + str(
                        room_id)
                    room_res = requests.get(room_url, headers=self.headers)
                    room = json.loads(room_res.text)
                    ho.bu_id = id
                    ho.ho_name = room["data"]["houseNo"]
                    ho.ho_build_size = room["data"]["buildArea"]
                    ho.ho_true_size = room["data"]["jacketArea"]
                    ho.ho_share_size = room["data"]["apportionedArea"]
                    ho.ho_floor = room["data"]["nominalLevel"]
                    ho.insert_db()
                except Exception as e:
                    print(e)
Beispiel #19
0
 def get_house_info(self, co_id, bu_id):
     """POST for the house rows of a building and store one House per row.

     The JSON response is expected to hold the rows under
     ``['ROWS']['ROW']``; each column listed below is read through
     ``self.dict_get`` and copied onto a fresh ``House``.
     """
     # (House attribute, row column) pairs, copied in this order.
     column_map = (
         ('ho_build_size', 'BUILDING_AREA'),
         ('ho_floor', 'UNIT'),
         ('ho_type', 'PLANNING_USAGE'),
         ('ho_true_size', 'INNER_AREA'),
         ('co_build_structural', 'STRUCTURE'),
         ('ho_name', 'PART'),
     )
     house_url = 'http://www.yanjifc.com/jdi'
     payload = "page=1&rows=10000&module=jtsActHouses&buildingGuid=" + bu_id + "&activityId=" + co_id
     reply = requests.post(house_url, data=payload, headers=self.headers).json()
     for row in reply['ROWS']['ROW']:
         house = House(co_index)
         for attr, column in column_map:
             setattr(house, attr, self.dict_get(row, column))
         house.bu_id = bu_id
         house.co_id = co_id
         house.insert_db()
Beispiel #20
0
 def get_house_info(self, house_url, bu_id, co_id):
     """Store every house cell of the table found at ``house_url``.

     The page is decoded as GBK and the part from the room-number header to
     the second closing ``</table>`` is cut out. Every ``title...</td>`` cell
     in it yields one House. When the extracted name contains a link, the
     linked detail page is fetched and its values replace or extend the ones
     from the cell. An exception for one cell is printed and the next cell is
     processed.
     """
     flags = re.S | re.M
     # (House attribute, regex) pairs read from a detail page, in this order.
     detail_patterns = (
         ('ho_floor', '实际层.*?<TD.*?>(.*?)<'),
         ('ho_name', '室号.*?<TD.*?>(.*?)<'),
         ('ho_type', '房屋类型.*?<TD.*?>(.*?)<'),
         ('ho_room_type', '房型.*?<TD.*?>(.*?)<'),
         ('ho_build_size', '实测建筑面积.*?<TD.*?>(.*?)<'),
         ('ho_true_size', '实测套内面积.*?<TD.*?>(.*?)<'),
         ('ho_share_size', '实测分摊面积.*?<TD.*?>(.*?)<'),
     )
     ho_url = 'http://www.fangdi.com.cn/' + house_url
     page = requests.get(ho_url, headers=self.headers)
     html = page.content.decode('gbk')
     house_html = re.search('室号 <.*?</table>.*?</table>', html,
                            flags).group()
     for cell in re.findall('title.*?</td>', house_html, flags):
         try:
             house = House(co_index)
             house.ho_build_size = re.search('实测面积:(.*?)>', cell,
                                             flags).group(1)
             house.ho_name = re.search('实测面积.*?>(.*?)<br>', cell,
                                       flags).group(1).strip()
             house.bu_id = bu_id
             house.co_id = co_id
             if '<a' in house.ho_name:
                 link = re.search('href="(.*?)"', house.ho_name,
                                  flags).group(1)
                 house_detail_url = 'http://www.fangdi.com.cn/' + link
                 detail_page = requests.get(house_detail_url,
                                            headers=self.headers)
                 html_str = detail_page.content.decode('gbk')
                 for attr, pattern in detail_patterns:
                     setattr(house, attr,
                             re.search(pattern, html_str, flags).group(1))
             house.insert_db()
         except Exception as e:
             print('房号错误,co_index={},url={}'.format(co_index, ho_url), e)
Beispiel #21
0
 def house_info(self, house_list, bu_id, co_id):
     """Crawl every house detail page on ris.szpl.gov.cn and store it.

     :param house_list: relative detail-page URLs, appended to
         ``http://ris.szpl.gov.cn/bol/``; each must contain ``id=<digits>``
     :param bu_id: stored as ``bu_id`` on every house
     :param co_id: stored as ``co_id`` on every house
     :raises AttributeError: when a page lacks one of the expected fields
         (``re.search`` returns ``None``); the crawl stops at that page
     """
     for house_url in house_list:
         url = "http://ris.szpl.gov.cn/bol/" + house_url
         res = requests.get(url, headers=self.headers)
         # One record per page, so a record never starts out holding the
         # values of the previously inserted house.
         ho = House(co_index)
         # NOTE(review): this id taken from the URL is overwritten by the
         # on-page room number a few lines below -- confirm which is wanted.
         ho.ho_num = re.search(r'id=(\d+)', house_url).group(1)
         con = res.text
         ho.bu_num = re.search('情况.*?">(.*?)&', con).group(1)
         ho.bu_id = bu_id
         ho.co_id = co_id
         ho.ho_floor = re.search(r'楼层.*?">(\d+)&', con).group(1)
         ho.ho_num = re.search(r'房号.*?">(\d+)&', con).group(1)
         ho.ho_type = re.search(r'用途.*?">(\d+)&', con).group(1)
         ho.ho_room_type = re.search(r'户型.*?">(\d+)&', con).group(1)
         # NOTE(review): the '.' between the digit runs is unescaped, so it
         # matches any character, not only a decimal point; left unchanged
         # because tightening it could reject values that match today.
         ho.ho_build_size = re.search(r'建筑面积<.*?">(\d+.\d+)平方米',
                                      con).group(1)
         ho.ho_true_size = re.search(r'户内面积<.*?">(\d+.\d+)平方米', con).group(1)
         ho.insert_db()
Beispiel #22
0
 def house_info(self, bu_id, bu_url, co_id):
     """Read the room grid of one building page and store every listed room.

     The page is fetched through ``Proxy_contact`` and decoded as gb18030.
     Each white table row supplies the floor (third text node of its cells)
     and each ``#CCFFFF`` cell in that row supplies one room name.  The same
     ``House`` object is inserted once per room name of a row.  Any error in
     a row is logged and the remaining rows are still processed.

     :param bu_id: stored as ``bu_id`` on every house
     :param bu_url: URL of the building page
     :param co_id: stored as ``co_id`` on every house
     """
     fetcher = Proxy_contact(app_name='wuhan',
                             method='get',
                             url=bu_url,
                             headers=self.headers)
     raw_page = fetcher.contact()
     tree = etree.HTML(raw_page.decode('gb18030'))
     for row in tree.xpath("//tr[@bgcolor='#FFFFFF']"):
         try:
             ho = House(co_index)
             ho.bu_id = bu_id
             ho.co_id = co_id
             ho.ho_floor = row.xpath("./td/text()")[2]
             for room_cell in row.xpath("./td[@bgcolor='#CCFFFF']"):
                 ho.ho_name = room_cell.xpath(".//a/text()")[0]
                 ho.insert_db()
         except Exception as e:
             log.error('房号错误{}'.format(e))
Beispiel #23
0
 def get_house_info(self, house_url, co_id, bu_id):
     """Store every house listed in a page's embedded ``houselist`` script value.

     The page carries ``var houselist = ... eval(<json>);``.  The argument
     of ``eval(...)`` is parsed as JSON and each element becomes one
     ``House``.  A failure on one element (for example a missing key) is
     printed and the remaining elements are still processed.

     :param house_url: absolute URL of the page to fetch
     :param co_id: stored as ``co_id`` on every house
     :param bu_id: stored as ``bu_id`` on every house
     :raises AttributeError: when the page has no ``houselist`` assignment
     """
     response = requests.get(house_url)
     html = response.text
     # Raw string: "\(" and "\)" are regex escapes, not Python string escapes
     # (non-raw they trigger an invalid-escape warning on recent Pythons).
     info = re.search(r'var houselist =.*?eval\((.*?)\);', html, re.S | re.M).group(1)
     data_list = json.loads(info)
     for data in data_list:
         try:
             house = House(co_index)
             house.ho_name = data['HouseName']
             house.unit = data['UnitName']
             house.co_build_structural = data['StruTypeName']
             house.ho_build_size = data['PreBuildArea']
             house.ho_true_size = data['PreInnerArea']
             house.ho_share_size = data['PreApportionArea']
             house.ho_floor = data['FloorName']
             house.ho_type = data['LayoutTypeName']
             house.co_id = co_id
             house.bu_id = bu_id
             house.insert_db()
         except Exception as e:
             print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
Beispiel #24
0
 def get_house_info(self, house_url_list, bu_id, co_id):
     """Fetch each house page on fjlyfdc.com.cn and store its fields.

     Every field is the text of the first ``<td>`` following its label.
     A request or parse failure for one page is printed and the loop
     continues with the next page.

     :param house_url_list: paths appended to ``http://www.fjlyfdc.com.cn/``
     :param bu_id: stored as ``bu_id`` on every house
     :param co_id: stored as ``co_id`` on every house
     """
     flags = re.S | re.M

     def first_group(pattern, text):
         # First capture group of the first match; AttributeError if absent.
         return re.search(pattern, text, flags).group(1)

     # (House attribute, label pattern), extracted in this order.
     page_fields = (
         ('ho_name', '房  号:.*?<td.*?>(.*?)<'),
         ('ho_build_size', '建筑面积:.*?<td.*?>(.*?)<'),
         ('ho_true_size', '套内面积:.*?<td.*?>(.*?)<'),
         ('ho_share_size', '分摊面积:.*?<td.*?>(.*?)<'),
         ('ho_floor', '所 在 层:.*?<td.*?>(.*?)<'),
     )

     for path in house_url_list:
         house_url = 'http://www.fjlyfdc.com.cn/' + path
         try:
             page = requests.get(house_url, headers=self.headers).text
             house = House(co_index)
             house.bu_id = bu_id
             house.co_id = co_id
             for attr_name, pattern in page_fields:
                 setattr(house, attr_name, first_group(pattern, page))
             house.insert_db()
         except Exception as e:
             print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
Beispiel #25
0
    def get_house_info(self, house_url_list):
        """Store one house per centred table row of every listed page.

        The text nodes of a row's cells are read positionally: floor, name,
        room type, build size, true size, share size from the front;
        orientation and price from the back.  The building id is the
        ``ID=<digits>`` value of the page URL.  A failure on one row is
        printed and the remaining rows are still processed.

        :param house_url_list: absolute page URLs, each containing
            ``ID=<digits>``
        """
        for url in house_url_list:
            response = requests.get(url)

            html = etree.HTML(response.text)
            con = html.xpath("//tr[@align='center']")
            for i in con:
                try:
                    house = House(co_index)
                    # Evaluate the XPath once per row instead of per field.
                    cells = i.xpath("./td/text()")
                    house.ho_name = cells[1]
                    house.ho_floor = cells[0]
                    house.ho_build_size = cells[3]
                    house.ho_true_size = cells[4]
                    house.ho_share_size = cells[5]
                    house.ho_room_type = cells[2]
                    house.ho_price = cells[-1]
                    house.orientation = cells[-2]
                    house.bu_id = re.search(r'ID=(\d+)', url).group(1)
                    house.insert_db()
                except Exception as e:
                    print('房号错误,co_index={},url={}'.format(co_index, url), e)
Beispiel #26
0
 def get_house_info(self, house_url_list):
     """Hand every ndjsj.gov.cn house page to ``ProducerListUrl`` for extraction.

     The ``House`` object is used as a rule container here: each attribute
     is set to a regex pattern, and ``house.to_dict()`` is passed on as
     ``analyzer_rules_dict`` with ``analyzer_type='regex'``.  A failure on
     one page is printed and the loop continues with the next page.

     :param house_url_list: paths appended to
         ``http://www.ndjsj.gov.cn/House/``
     """
     for i in house_url_list:
         # Built before the ``try`` so the error message below can always
         # name the page, even when the failure happens on the first line.
         house_url = 'http://www.ndjsj.gov.cn/House/' + i
         try:
             house = House(co_index)
             house.bu_num = '幢  号:.*?<td.*?>(.*?)<'
             house.ho_name = '房  号:.*?<td.*?>(.*?)<'
             house.co_name = '项目名称:.*?<td.*?>(.*?)<'
             house.ho_build_size = '建筑面积:.*?<td.*?>(.*?)<'
             house.ho_true_size = '套内面积:.*?<td.*?>(.*?)<'
             house.ho_share_size = '分摊面积:.*?<td.*?>(.*?)<'
             house.ho_type = '房屋用途:.*?<td.*?>(.*?)<'
             house.ho_floor = '所 在 层:.*?<td.*?>(.*?)<'
             house.ho_room_type = '房屋户型:.*?<td.*?>(.*?)<'
             p = ProducerListUrl(page_url=house_url,
                                 request_type='get',
                                 encode='utf-8',
                                 analyzer_rules_dict=house.to_dict(),
                                 analyzer_type='regex',
                                 headers=self.headers)
             p.get_details()
         except Exception as e:
             print('宁德房号错误,url={}'.format(house_url), e)
Beispiel #27
0
    def get_house_info(self, bu_con):
        """Store one house per room link found in a building page.

        For every link the detail page is requested and parsed.  If the
        request fails, the room is skipped.  If anything else goes wrong
        (missing ``value`` attribute, a field not found on the page), a
        minimal record is stored instead, named after the link's ``id``
        attribute.

        :param bu_con: HTML text of the building page; must contain the
            building number after the ``编号`` label
        :raises AttributeError: when the building number is not found
        """
        bu_html = etree.HTML(bu_con)
        ho = bu_html.xpath("//tr[@height='30']//span/a")
        bu_id = re.search(r'编号.*?>(\d+)<', bu_con, re.S | re.M).group(1)
        for ho_info in ho:
            # A fresh record per link: with one shared object the fallback
            # branch below would insert the previous room's floor, sizes
            # and usage under this room's name.
            house = House(co_index)
            try:
                ho_detail = "http://www.hcsfcglj.com/Templets/BoZhou/aspx/" + ho_info.xpath(
                    "./@value")[0]
                try:
                    ho_res = requests.get(ho_detail, headers=self.headers)
                    ho_con = ho_res.text
                except Exception as e:
                    print("co_index={},房屋详情页{}请求失败".format(
                        co_index, ho_detail))
                    print(e)
                    continue
                house.ho_name = re.search('房号.*?<td>(.*?)<', ho_con,
                                          re.S | re.M).group(1)
                house.ho_floor = re.search('所在层.*?<td>(.*?)<', ho_con,
                                           re.S | re.M).group(1)
                house.ho_share_size = re.search('分摊共有面积.*?<td>(.*?)<', ho_con,
                                                re.S | re.M).group(1)
                house.ho_build_size = re.search('建筑面积.*?<td>(.*?)<', ho_con,
                                                re.S | re.M).group(1)
                house.ho_true_size = re.search('套内面积.*?<td>(.*?)<', ho_con,
                                               re.S | re.M).group(1)
                house.ho_type = re.search('房屋用途.*?<td>(.*?)<', ho_con,
                                          re.S | re.M).group(1)
                house.bu_num = re.search('幢号.*?<td>(.*?)<', ho_con,
                                         re.S | re.M).group(1)
                house.bu_id = bu_id
            except Exception:
                # ``Exception`` rather than a bare ``except`` so that
                # Ctrl-C / SystemExit still stop the crawler.
                house.ho_name = ho_info.xpath("./@id")[0]
                house.bu_id = bu_id

            house.insert_db()
Beispiel #28
0
 def ho_parse(self, co_id, bu_id, ho_list):
     """Fetch the detail page behind every house link and store its fields.

     A page whose request fails is reported and skipped.  After each stored
     house the crawler sleeps for a random 0-3 seconds.

     :param co_id: stored as ``co_id`` on every house
     :param bu_id: stored as ``bu_id`` on every house
     :param ho_list: elements whose ``href`` attribute is a path on
         ``http://110.89.45.7:8082``
     :raises AttributeError: when a fetched page lacks an expected field
     :raises IndexError: when an element has no ``href`` attribute
     """
     for ho in ho_list:
         ho_url = ho.xpath("./@href")[0]
         house_url = "http://110.89.45.7:8082" + ho_url
         try:
             ho_res = requests.get(
                 house_url,
                 headers=self.headers,
             )
         except Exception as e:
             # Was a bare ``except`` that also swallowed KeyboardInterrupt
             # and hid which page had failed.
             print('co_index={}, house page request failed, url={}'.format(
                 co_index, house_url), e)
             continue
         con = ho_res.text
         house = House(co_index)
         house.co_id = co_id
         house.bu_id = bu_id
         house.ho_name = re.search('房  号.*?<td>(.*?)</td', con,
                                   re.S | re.M).group(1)
         house.ho_build_size = re.search('建筑面积.*?<td>(.*?)</td', con,
                                         re.S | re.M).group(1)
         house.ho_true_size = re.search('套内面积.*?<td>(.*?)</td', con,
                                        re.S | re.M).group(1)
         house.ho_share_size = re.search('分摊面积.*?<td>(.*?)</td', con,
                                         re.S | re.M).group(1)
         house.ho_floor = re.search('所 在 层.*?<td>(.*?)</td', con,
                                    re.S | re.M).group(1)
         house.ho_price = re.search('申报单价.*?">(.*?)</td', con,
                                    re.S | re.M).group(1)
         house.ho_type = re.search('房屋用途.*?<td>(.*?)</td', con,
                                   re.S | re.M).group(1)
         house.insert_db()
         # Random pause between pages.
         time.sleep(random.randint(0, 3))
Beispiel #29
0
    def get_house_info(self,co_id,bu_id,house_detail_list):
        """Fetch every house detail page and store its fields.

        A page whose request raises, or whose HTTP status is not 200, is
        reported and skipped.

        :param co_id: stored as ``co_id`` on every house
        :param bu_id: stored as ``bu_id`` on every house
        :param house_detail_list: paths appended to ``self.url``
        :raises AttributeError: when a fetched page lacks an expected field
        """
        for house_detail in house_detail_list:
            house_url = self.url + house_detail
            try:
                house_res = requests.get(house_url,headers=self.headers)
            except Exception as e:
                print("co_index={},房屋信息错误".format(co_index),e)
                continue
            # The original compared the status to 200 but discarded the
            # result, so error pages were parsed anyway; now they are skipped.
            if house_res.status_code != 200:
                print("co_index={},房屋信息错误".format(co_index),house_res.status_code)
                continue
            house_con = house_res.text

            ho = House(co_index)
            ho.co_id = co_id
            ho.bu_id = bu_id
            ho.ho_name = re.search('房号.*?fh">(.*?)</span',house_con,re.S|re.M).group(1)
            ho.orientation = re.search('朝向.*?Cx">(.*?)</span',house_con,re.S|re.M).group(1)
            ho.ho_floor = re.search('层.*?lc">(.*?)</span',house_con,re.S|re.M).group(1)
            ho.ho_room_type = re.search('房型.*?hx">(.*?)</span',house_con,re.S|re.M).group(1)
            ho.ho_build_size = re.search('建筑面积.*?jzmj">(.*?)</span',house_con,re.S|re.M).group(1)
            ho.ho_share_size = re.search('分摊面积.*?ftmj">(.*?)</span',house_con,re.S|re.M).group(1)
            ho.ho_true_size= re.search('套内面积.*?tnmj">(.*?)</span',house_con,re.S|re.M).group(1)
            ho.ho_type = re.search('用途.*?lx">(.*?)</span',house_con,re.S|re.M).group(1)

            ho.insert_db()
Beispiel #30
0
 def start_crawler(self):
     """Crawl pages 1-20 of the zzx.zzfc.com listing; store buildings and houses.

     For every listing entry the building page is fetched and stored as a
     ``Building``; then the house list of that building is requested and
     every entry is stored as a ``House``.  Request and parse failures are
     logged and the affected page / entry is skipped.
     """
     url = 'http://zzx.zzfc.com/ajaxpro/xy_ysxk_more,App_Web_mjeeodb-.ashx'
     for i in range(1, 21):
         # JSON request body built by hand; only pageNo varies per request.
         payload = "{\"pageNo\":" + str(
             i) + ",\"pageSize\":30,\"rowcount\":589}"
         try:
             response = requests.post(url,
                                      data=payload,
                                      headers=self.headers)
             con = response.content.decode()
         except Exception as e:
             log.error('楼栋请求失败{}'.format(e))
             continue
         # Each listing entry is a bracketed list that starts and ends with digits.
         co_list = re.findall('\[\d+,.*?\d+\]', con)
         for comm in co_list:
             try:
                 # sid: leading number of the entry; pid: first number that
                 # directly follows a closing quote and comma.
                 sid = re.search('\[(\d+),', comm).group(1)
                 pid = re.search('",(\d+),', comm).group(1)
                 bu_url = 'http://zzx.zzfc.com/xy_bldg.aspx?pid=' + pid + '&sid=' + sid
                 bu_res = requests.get(bu_url, headers=self.headers)
                 bu_con = bu_res.content.decode()
                 bu = Building(co_index)
                 bu.bu_id = sid
                 bu.bu_address = re.search('楼栋座落.*?">(.*?)&nbsp', bu_con,
                                           re.S | re.M).group(1)
                 bu.bu_pre_sale = re.search('预售证号.*?">(.*?)&nbsp', bu_con,
                                            re.S | re.M).group(1)
                 bu.bu_pre_sale_date = re.search('预售日期.*?">(.*?)&nbsp',
                                                 bu_con,
                                                 re.S | re.M).group(1)
                 bu.bu_all_house = re.search('套数.*?">(.*?)&nbsp', bu_con,
                                             re.S | re.M).group(1)
                 bu.insert_db()
             except Exception as e:
                 log.error("{}楼栋解析失败{}".format(comm, e))
                 # Without a stored building the house request is skipped too.
                 continue
             ho_url = 'http://zzx.zzfc.com/ajaxpro/xy_housetag,App_Web_xg4ulr9n.ashx'
             data = "{\"m_key\":\"WWW_LPB_001\",\"m_param\":\"" + sid + "\"}"
             # This request uses its own headers (with X-AjaxPro-Method)
             # instead of self.headers.
             headers = {
                 'User-Agent':
                 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.119Safari/537.36',
                 'X-AjaxPro-Method': 'GETLPBDS'
             }
             try:
                 ho_res = requests.post(ho_url, data=data, headers=headers)
                 ho_con = ho_res.content.decode()
             except Exception as e:
                 log.error("房屋请求失败{}".format(e))
                 continue
             # Each house entry is a bracketed list opening with a quoted number.
             ho_list = re.findall('\["\d+.*?\d+\]', ho_con)
             for house in ho_list:
                 try:
                     ho = House(co_index)
                     ho.bu_id = sid
                     # Fields are read by position after a plain comma split:
                     # fifth item is the name, the third- and second-from-last
                     # items are the build size and true size.
                     info_list = house.split(",")
                     ho.ho_name = info_list[4]
                     ho.ho_floor = re.search('(\d+)层', house).group(1)
                     ho.ho_build_size = info_list[-3]
                     ho.ho_true_size = info_list[-2]
                     ho.insert_db()
                 except Exception as e:
                     log.error("{}房屋解析错误{}".format(house, e))
                     continue