Exemple #1
0
    def get_house_info(self, house_url_list, bu_id, co_id):
        """Scrape every house detail page and store one ``House`` per link.

        Each entry of ``house_url_list`` is appended to the
        ``http://www.fjnpfdc.com/House/`` prefix, downloaded with
        ``self.headers``, decoded as GBK and mined with regular expressions.
        ``bu_id`` and ``co_id`` are copied onto every record unchanged.

        Any exception raised while handling one link (request, decoding, a
        label that is not found, or the insert) is printed and the loop moves
        on to the next link. Nothing is returned.
        """
        # (attribute, pattern) pairs, listed in the order they are applied.
        label_patterns = (
            ('bu_num', '幢  号:.*?<td>(.*?)<'),
            ('ho_name', '房  号:.*?<td>(.*?)<'),
            ('co_name', '项目名称:.*?<td>(.*?)<'),
            ('ho_build_size', '建筑面积:.*?<td>(.*?)<'),
            ('ho_true_size', '套内面积:.*?<td>(.*?)<'),
            ('ho_share_size', '分摊面积:.*?<td>(.*?)<'),
            ('ho_floor', '所 在 层:.*?<td>(.*?)<'),
        )
        flags = re.S | re.M
        for rel_link in house_url_list:
            try:
                record = House(co_index)
                page_url = 'http://www.fjnpfdc.com/House/' + rel_link
                reply = requests.get(page_url, headers=self.headers)
                page_text = reply.content.decode('gbk')

                record.bu_id = bu_id
                record.co_id = co_id
                for attr, pattern in label_patterns:
                    # A missing label makes re.search return None, so the
                    # .group call raises and the link is reported below.
                    setattr(record, attr,
                            re.search(pattern, page_text, flags).group(1))

                record.insert_db()
            except Exception as err:
                print("co_index={},房屋{}错误".format(co_index, rel_link), err)
Exemple #2
0
    def get_house_detail(self, house_url_list):
        """Parse each building page and insert one ``House`` per room entry.

        For every URL in ``house_url_list`` the page is downloaded, decoded as
        GBK, and the building number and project name are read from it. Room
        ids are mapped to their displayed room numbers, then every
        ``<Room><Cell ...>`` entry is turned into a ``House`` and inserted.

        A failure while building or inserting a single house is printed and
        the remaining houses are still processed. A failure while fetching or
        parsing the page itself is not caught here and propagates.
        """
        for page_url in house_url_list:
            res = requests.get(page_url)
            html = res.content.decode('gbk')
            bu_name = re.search('楼号:.*?HouseNum">(.*?)</span>', html, re.S | re.M).group(1)
            co_name = re.search('项目名称.*?PrjName">(.*?)</span>', html, re.S | re.M).group(1)
            # Map room id -> displayed room number. The pattern is a raw
            # string so that "\?" is not an invalid string escape.
            ho_id_dict = dict(
                re.findall(r"aspx\?Room=(.*?)'.*?<b>(.*?)</b>", html, re.S | re.M))

            house_info = re.findall("<Room><Cell RoomID='(.*?)'.*?BArea='(.*?)'.*?HouseUse='(.*?)'.*?</Room>",
                                    html,
                                    re.S | re.M)
            for room_id, true_size, ho_type in house_info:
                try:
                    h = House(self.co_index)
                    # Raises KeyError (reported below) when the room id has
                    # no matching link on the page.
                    h.ho_name = ho_id_dict[room_id]
                    h.ho_true_size = true_size
                    h.ho_type = ho_type
                    h.co_name = co_name
                    h.bu_num = bu_name
                    h.insert_db()
                except Exception as e:
                    # Report the same co_index the record was built with; the
                    # original referenced a bare ``co_index`` name here.
                    print('房屋错误,co_index={},url={}'.format(self.co_index, page_url), e)
Exemple #3
0
 def get_house_info(self, house_url_list, co_name, bu_num):
     """Download each house page and insert a ``House`` built from it.

     ``co_name`` and ``bu_num`` are copied onto every record. Each entry of
     ``house_url_list`` is appended to ``http://www.sxczfdc.com/pubinfo/``
     and fetched with ``self.headers``; the remaining attributes are the
     first match of the corresponding ``HouseInfo1_lbl...`` pattern.

     Any exception for one link (including a pattern with no match, which
     raises IndexError) is printed and the next link is processed.
     """
     # (attribute, pattern) pairs in the order they are assigned.
     span_patterns = (
         ('ho_floor', 'HouseInfo1_lblFwlc">(.*?)<'),
         ('ho_name', 'HouseInfo1_lblFwfh">(.*?)<'),
         ('ho_type', 'HouseInfo1_lblFwlx">(.*?)<'),
         ('ho_room_type', 'HouseInfo1_lblFwhx">(.*?)<'),
         ('ho_build_size', 'HouseInfo1_lblycfwjzmj">(.*?)<'),
         ('ho_true_size', 'HouseInfo1_lblycfwtnmj">(.*?)<'),
         ('ho_share_size', 'HouseInfo1_lblycfwftmj">(.*?)<'),
         ('orientation', 'HouseInfo1_lblCx">(.*?)<'),
     )
     for rel_link in house_url_list:
         try:
             record = House(co_index)
             record.co_name = co_name
             record.bu_num = bu_num
             page_url = 'http://www.sxczfdc.com/pubinfo/' + rel_link
             page = requests.get(page_url, headers=self.headers).text
             for attr, pattern in span_patterns:
                 matches = re.findall(pattern, page, re.S | re.M)
                 setattr(record, attr, matches[0])
             record.insert_db()
         except Exception as err:
             print(err)
0
 def get_house_detail(self, house_detail_url_list, co_id, bu_id):
     """Fetch each house detail page and insert the parsed ``House``.

     Every entry of ``house_detail_url_list`` is appended to
     ``http://www.yzfdc.cn/``. For each one a ``House`` is created, the
     method sleeps three seconds, then the page is fetched through the
     session ``self.s`` with ``self.headers``. Attributes are taken from the
     first match of each ``lbl...`` pattern; ``bu_id`` and ``co_id`` are
     copied from the arguments before the insert.

     Any exception for one page is printed together with its URL and the
     loop continues with the next page.
     """
     # (attribute, pattern) pairs in the order they are assigned.
     label_patterns = (
         ('co_name', 'lblxmmc.*?>(.*?)<'),
         ('bu_num', 'lbldh.*?>(.*?)<'),
         ('ho_name', 'lblfh.*?>(.*?)<'),
         ('ho_build_size', 'lbljzmj.*?>(.*?)<'),
         ('ho_true_size', 'lbltnmj.*?>(.*?)<'),
         ('ho_share_size', 'lblftmj.*?>(.*?)<'),
         ('ho_type', 'lblfwxz.*?>(.*?)<'),
         ('ho_room_type', 'lblhuxin.*?>(.*?)<'),
     )
     for rel_link in house_detail_url_list:
         detail_url = 'http://www.yzfdc.cn/' + rel_link
         try:
             record = House(co_index)
             time.sleep(3)
             page = self.s.get(detail_url, headers=self.headers).text
             for attr, pattern in label_patterns:
                 found = re.search(pattern, page, re.S | re.M)
                 setattr(record, attr, found.group(1))
             record.bu_id = bu_id
             record.co_id = co_id
             record.insert_db()
         except Exception as err:
             print('房号错误,co_index={},url={}'.format(co_index, detail_url),
                   err)
Exemple #5
0
 def get_house_info(self, house_url_list):
     """Build the room-info URL for each link and hand it to ProducerListUrl.

     ``dongid`` and ``roomid`` are extracted from every entry of
     ``house_url_list`` and used to build the ``roominfo.aspx`` URL. A
     ``House`` is filled with regular-expression *rules* (not values) and its
     ``to_dict()`` is passed as ``analyzer_rules_dict`` to a
     ``ProducerListUrl`` whose ``get_details()`` is then called.

     Any exception for one link is printed with the URL and the loop
     continues with the next link.
     """
     for i in house_url_list:
         # Until the real URL is built, report the raw link on failure.
         # Otherwise a failed dongid/roomid search would leave ``house_url``
         # unbound (or stale from the previous link) inside the handler.
         house_url = i
         try:
             dongid = re.search('dongid=(.*?)&', i).group(1)
             roomid = re.search('roomid=(.*?)&', i).group(1)
             house_url = 'http://zjjg.0557fdc.com:9555/xiaoqu/roominfo.aspx?dongid=' + dongid + '&roomid=' + roomid
             house = House(co_index)
             # The attributes below hold extraction patterns.
             house.co_name = 'Labelxqmc">(.*?)<'
             house.area = 'Labelxzq">(.*?)<'
             house.bu_num = 'Labeldongmc">(.*?)<'
             house.ho_type = 'Labelyxyongtu">(.*?)<'
             house.ho_name = '<span id="Labelroommc".*?>(.*?)</span>'
             house.ho_build_size = 'Labeljzmianji">(.*?)<'
             house.ho_true_size = 'Labeltaonei">(.*?)<'
             house.ho_share_size = 'Labelgongtan">(.*?)<'
             house.ho_room_type = 'Labelhuxing">(.*?)<'
             house.bu_id = 'dongid=(.*?)&'
             p = ProducerListUrl(page_url=house_url,
                                 request_type='get',
                                 encode='utf-8',
                                 analyzer_rules_dict=house.to_dict(),
                                 analyzer_type='regex',
                                 headers=self.headers)
             p.get_details()
         except Exception as e:
             print('房号错误,co_index={},url={}'.format(co_index, house_url), e)
Exemple #6
0
    def get_build_info(self, comm_url_list):
        """Insert one ``Building`` per link and one ``House`` per result row.

        For every entry of ``comm_url_list`` the first two ``+<digits>+``
        groups are read as ``sid`` and ``pid``. The building page is fetched
        and parsed into a ``Building`` (``bu_id`` is ``pid``). The house list
        for ``sid`` is then fetched and every ``<Result>...</Result>`` block
        becomes a ``House`` linked to that building.

        Error handling:
        * a link without both ids is reported and skipped entirely;
        * a failure while fetching or parsing the building page is reported,
          and the houses are still processed against the partially filled
          building;
        * a failure on one house is reported and the next house is tried;
        * a failure while fetching the house list itself is not caught here.
        """
        for i in comm_url_list:
            try:
                ids = re.findall(r'\+(\d+)\+', i)
                sid, pid = ids[0], ids[1]
            except (IndexError, TypeError) as e:
                # Without both ids there is neither a building page nor a
                # house list to request, so skip the link.
                print('co_index={}, 楼栋错误,url={}'.format(co_index, i), e)
                continue

            build_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/bldg_query.aspx?pid=' + pid + '&sid=' + sid
            # Create the building before any network access so the houses
            # below never attach to a building left over from another link.
            build = Building(co_index)
            build.bu_id = pid
            try:
                html = requests.get(build_url).text
                # NOTE(review): bu_num and bu_address are both read from the
                # same '楼栋座落' cell, as before - confirm this is intended.
                location = re.search('楼栋座落.*?<td.*?>(.*?)<', html,
                                     re.S | re.M).group(1)
                build.bu_num = location
                build.bu_address = location
                build.bu_pre_sale = re.search('预售证号.*?">(.*?)&nbsp', html,
                                              re.S | re.M).group(1)
                build.bu_pre_sale_date = re.search('时间.*?">(.*?)&nbsp', html,
                                                   re.S | re.M).group(1)
                build.bu_all_house = re.search('dM.*?">(.*?)&nbsp', html,
                                               re.S | re.M).group(1)
                build.insert_db()
            except Exception as e:
                print('co_index={}, 楼栋错误,url={}'.format(co_index, build_url),
                      e)

            house_url = 'http://www.jjzzfdc.com.cn/WebClient/ClientService/proxp.aspx?key=WWW_LPB_001&params=' + sid
            result = requests.get(house_url)
            html_ = result.text

            for house_info in re.findall('<Result.*?</Result>', html_,
                                         re.S | re.M):
                try:
                    house = House(co_index)
                    house.bu_id = build.bu_id
                    house.bu_num = build.bu_num
                    house.ho_name = re.search('<ONAME>(.*?)</ONAME>',
                                              house_info, re.S | re.M).group(1)
                    house.ho_num = re.search('<OSEQ>(.*?)</OSEQ>', house_info,
                                             re.S | re.M).group(1)
                    house.ho_build_size = re.search('<BAREA>(.*?)</BAREA>',
                                                    house_info,
                                                    re.S | re.M).group(1)
                    house.ho_floor = re.search('<FORC>(.*?)</FORC>',
                                               house_info,
                                               re.S | re.M).group(1)
                    house.ho_true_size = re.search('<PAREA>(.*?)</PAREA>',
                                                   house_info,
                                                   re.S | re.M).group(1)
                    house.insert_db()
                except Exception as e:
                    print('co_index={}, 房号错误'.format(co_index), e)
Exemple #7
0
 def get_house_info(self, zu_house_url, bu_num, co_id):
     """Read the postback targets from a page and delegate to get_house_detail.

     A ``House`` is created with ``bu_num`` and ``co_id``. The page at
     ``zu_house_url`` is fetched through the session ``self.s``, and
     ``house.info`` is set to the stripped text following ``ItemName``. The
     two arguments of every ``__doPostBack(...)`` click handler are collected
     into two lists, and everything is passed on to
     ``self.get_house_detail``.

     Any exception is printed and swallowed; nothing is returned.
     """
     try:
         house = House(co_index)
         house.bu_num = bu_num
         house.co_id = co_id
         result = self.s.get(zu_house_url, headers=self.headers).text
         house.info = re.search('ItemName.*?>(.*?)<',
                                result).group(1).strip()
         # Raw strings: "\(" and "\)" are regex escapes, not string escapes.
         # Second argument of each __doPostBack call.
         ho_code_list = re.findall(r"OnClick=.__doPostBack\(.*?,'(.*?)'\)",
                                   result, re.S | re.M)
         # First argument of each __doPostBack call.
         ho_msg_list = re.findall(r"OnClick=.__doPostBack\('(.*?)'", result,
                                  re.S | re.M)
         self.get_house_detail(zu_house_url, ho_msg_list, ho_code_list,
                               house)
     except Exception as e:
         print(e)
Exemple #8
0
 def house_info(self, house_list, bu_id, co_id):
     """Fetch each house page and insert the parsed values.

     One ``House`` object is created up front and reused for every entry of
     ``house_list``; each entry is appended to
     ``http://ris.szpl.gov.cn/bol/`` and fetched with ``self.headers``.
     ``bu_id`` and ``co_id`` are copied from the arguments.

     There is no error handling: a failed request or a pattern without a
     match raises and ends the whole loop.
     """
     ho = House(co_index)
     for house_url in house_list:
         url = "http://ris.szpl.gov.cn/bol/" + house_url
         res = requests.get(url, headers=self.headers)
         # NOTE(review): this value is overwritten by the '房号' match a few
         # lines below; the lookup is kept because it still raises when the
         # link carries no numeric id. Confirm which value ho_num should hold.
         ho.ho_num = re.search(r'id=(\d+)', house_url).group(1)
         con = res.text
         ho.bu_num = re.search('情况.*?">(.*?)&', con).group(1)
         ho.bu_id = bu_id
         ho.co_id = co_id
         # Raw strings so that "\d" is a regex escape, not a string escape.
         ho.ho_floor = re.search(r'楼层.*?">(\d+)&', con).group(1)
         ho.ho_num = re.search(r'房号.*?">(\d+)&', con).group(1)
         ho.ho_type = re.search(r'用途.*?">(\d+)&', con).group(1)
         ho.ho_room_type = re.search(r'户型.*?">(\d+)&', con).group(1)
         # NOTE(review): the "." between the digit groups is unescaped and
         # therefore matches any character; left unchanged.
         ho.ho_build_size = re.search(r'建筑面积<.*?">(\d+.\d+)平方米',
                                      con).group(1)
         ho.ho_true_size = re.search(r'户内面积<.*?">(\d+.\d+)平方米', con).group(1)
         ho.insert_db()
Exemple #9
0
    def get_build_info(self, build_url_list, bu_pre_sale_list, co_name, co_id):
        """Insert a ``Building`` per link and a ``House`` per room on its page.

        ``build_url_list`` and ``bu_pre_sale_list`` are read by the same
        index. Each building link is appended to
        ``http://221.2.144.162:8090/``, fetched with ``self.headers`` and
        decoded as GBK. After the building is inserted, every room link found
        on the page is fetched and turned into a ``House`` carrying the
        building's ``bu_id`` / ``bu_num`` plus ``co_name`` and ``co_id``.

        A failure on one house is printed and the next house is tried; a
        failure on the building itself is printed and the next building is
        tried.
        """
        base_url = 'http://221.2.144.162:8090/'
        for bu_idx, bu_rel_url in enumerate(build_url_list):
            try:
                build = Building(co_index)
                build.co_id = co_id

                build.co_name = co_name
                # Raises IndexError (reported below) when the pre-sale list
                # is shorter than the URL list.
                build.bu_pre_sale = bu_pre_sale_list[bu_idx]
                build.bu_id = re.search(r'lh=(\d+)', bu_rel_url).group(1)
                build_url = base_url + bu_rel_url
                response = requests.get(build_url, headers=self.headers)
                html = response.content.decode('gbk')
                build.bu_num = re.findall('<font color=white.*?><b>(.*?)<',
                                          html, re.S | re.M)[0]
                build.bu_address = re.findall('坐落位置:</b>(.*?)<', html,
                                              re.S | re.M)[0]
                build.insert_db()
                ho_url_list = re.findall('background-.*?href=(.*?) ', html,
                                         re.S | re.M)
                ho_name_list = re.findall('background-color.*?<a.*?>(.*?)<',
                                          html, re.S | re.M)
                # A separate index name keeps the outer loop variable intact
                # (the original reused ``i`` for both loops).
                for ho_idx, ho_rel_url in enumerate(ho_url_list):
                    try:
                        house = House(co_index)
                        house_url = base_url + ho_rel_url
                        result = requests.get(
                            house_url,
                            headers=self.headers).content.decode('gbk')
                        house.bu_id = build.bu_id
                        house.co_id = co_id
                        house.ho_type = re.findall(
                            '用&nbsp;&nbsp;&nbsp;途:.*?<td.*?>(.*?)<', result,
                            re.S | re.M)[0]
                        house.ho_build_size = re.findall(
                            '建筑面积:.*?<td>(.*?)<', result, re.S | re.M)[0]
                        house.bu_num = build.bu_num
                        house.co_name = co_name
                        house.ho_name = ho_name_list[ho_idx]
                        house.insert_db()
                    except Exception as e:
                        print("co_index={},房屋信息错误".format(co_index), e)
            except Exception as e:
                print("co_index={},楼栋信息错误".format(co_index), e)
Exemple #10
0
 def get_house_info(self, house_url_list):
     """Hand every house page to ProducerListUrl with regex extraction rules.

     Each entry of ``house_url_list`` is appended to
     ``http://www.ndjsj.gov.cn/House/``. A ``House`` is filled with
     regular-expression *rules* (not values) and its ``to_dict()`` is passed
     as ``analyzer_rules_dict`` to a ``ProducerListUrl`` whose
     ``get_details()`` is then called.

     Any exception for one link is printed with its URL and the loop
     continues with the next link.
     """
     for i in house_url_list:
         # Built before the try block so the handler can always report it,
         # even when creating the House object is what failed.
         house_url = 'http://www.ndjsj.gov.cn/House/{}'.format(i)
         try:
             house = House(co_index)
             # The attributes below hold extraction patterns.
             house.bu_num = '幢  号:.*?<td.*?>(.*?)<'
             house.ho_name = '房  号:.*?<td.*?>(.*?)<'
             house.co_name = '项目名称:.*?<td.*?>(.*?)<'
             house.ho_build_size = '建筑面积:.*?<td.*?>(.*?)<'
             house.ho_true_size = '套内面积:.*?<td.*?>(.*?)<'
             house.ho_share_size = '分摊面积:.*?<td.*?>(.*?)<'
             house.ho_type = '房屋用途:.*?<td.*?>(.*?)<'
             house.ho_floor = '所 在 层:.*?<td.*?>(.*?)<'
             house.ho_room_type = '房屋户型:.*?<td.*?>(.*?)<'
             p = ProducerListUrl(page_url=house_url,
                                 request_type='get',
                                 encode='utf-8',
                                 analyzer_rules_dict=house.to_dict(),
                                 analyzer_type='regex',
                                 headers=self.headers)
             p.get_details()
         except Exception as e:
             print('宁德房号错误,url={}'.format(house_url), e)
Exemple #11
0
 def get_house_info(self, code, co_name):
     """Request a building table and insert one ``House`` per ``title`` entry.

     ``code[0]`` is embedded as the first ``<item>`` of the XML payload
     posted to ``ExeFunCommon.aspx``; ``code[1]`` becomes ``bu_num`` of every
     record and ``co_name`` is copied unchanged. Each ``title='...'``
     attribute in the response is parsed for room number, usage, layout and
     total area; every value is expected to end with CRLF.

     A failure on one entry is printed and the next entry is tried. The POST
     request itself is not guarded.
     """
     endpoint = 'http://house.bffdc.gov.cn/Common/Agents/ExeFunCommon.aspx?'
     payload = (
         '<?xml version="1.0" encoding="utf-8" standalone="yes"?>\r\n'
         '<param funname="SouthDigital.Wsba.CBuildTableEx.GetBuildHTMLEx">\r\n'
         '<item>' + code[0] + '</item>\r\n'
         '<item>1</item>\r\n'
         '<item>1</item>\r\n'
         '<item>55</item>\r\n'
         '<item>840</item>\r\n'
         '<item>g_oBuildTable</item>\r\n'
         '<item>false</item>\r\n'
         '<item> 1=1</item>\r\n'
         '</param>\r\n'
     )
     reply = requests.post(url=endpoint, data=payload,
                           headers={'Content-Type': "text/xml"})
     titles = re.findall("title='(.*?)'", reply.text, re.S | re.M)
     # (attribute, pattern) pairs in the order they are assigned.
     title_patterns = (
         ('ho_name', '房号:(.*?)\r\n'),
         ('ho_type', '用途:(.*?)\r\n'),
         ('ho_room_type', '户型:(.*?)\r\n'),
         ('ho_build_size', '总面积:(.*?)\r\n'),
     )
     for title in titles:
         try:
             record = House(co_index)
             record.bu_num = code[1]
             for attr, pattern in title_patterns:
                 setattr(record, attr, re.search(pattern, title).group(1))
             record.co_name = co_name
             record.insert_db()
         except Exception as err:
             print(err)
Exemple #12
0
    def get_house_info(self, bu_con):
        """Insert one ``House`` for every room link found in a building page.

        ``bu_con`` is the building page as text. The building id is the
        number following '编号'; every ``<a>`` under
        ``//tr[@height='30']//span`` is treated as a room. The room's detail
        page (its ``value`` attribute appended to the site prefix) is fetched
        with ``self.headers`` and parsed.

        Error handling:
        * if the detail request fails, the error is printed and the room is
          skipped without an insert;
        * if anything else fails (no ``value`` attribute, a label not found),
          the room is still inserted with ``ho_name`` replaced by the
          anchor's ``id`` attribute.

        A fresh ``House`` is created for every room, so a room that falls
        back to the minimal record no longer inherits floor and area values
        from the room processed before it.
        """
        bu_html = etree.HTML(bu_con)
        ho = bu_html.xpath("//tr[@height='30']//span/a")
        bu_id = re.search(r'编号.*?>(\d+)<', bu_con, re.S | re.M).group(1)
        for ho_info in ho:
            house = House(co_index)
            house.bu_id = bu_id
            try:
                ho_detail = "http://www.hcsfcglj.com/Templets/BoZhou/aspx/" + ho_info.xpath(
                    "./@value")[0]
                try:
                    ho_res = requests.get(ho_detail, headers=self.headers)
                    ho_con = ho_res.text
                except Exception as e:
                    print("co_index={},房屋详情页{}请求失败".format(
                        co_index, ho_detail))
                    print(e)
                    continue
                house.ho_name = re.search('房号.*?<td>(.*?)<', ho_con,
                                          re.S | re.M).group(1)
                house.ho_floor = re.search('所在层.*?<td>(.*?)<', ho_con,
                                           re.S | re.M).group(1)
                house.ho_share_size = re.search('分摊共有面积.*?<td>(.*?)<', ho_con,
                                                re.S | re.M).group(1)
                house.ho_build_size = re.search('建筑面积.*?<td>(.*?)<', ho_con,
                                                re.S | re.M).group(1)
                house.ho_true_size = re.search('套内面积.*?<td>(.*?)<', ho_con,
                                               re.S | re.M).group(1)
                house.ho_type = re.search('房屋用途.*?<td>(.*?)<', ho_con,
                                          re.S | re.M).group(1)
                house.bu_num = re.search('幢号.*?<td>(.*?)<', ho_con,
                                         re.S | re.M).group(1)
            except Exception:
                # Fall back to the anchor id as the room name.
                house.ho_name = ho_info.xpath("./@id")[0]

            house.insert_db()
Exemple #13
0
    def get_build_url_list(self, url_list):
        """Crawl listing pages and insert Comm, Building and House records.

        For every URL in ``url_list`` the page is fetched and decoded as GBK.
        Each '项目名称 ... </dl>' block becomes a ``Comm``; the project page
        linked from it is fetched and every ``<tr>`` of its building table
        becomes a ``Building``; the building page is followed through its
        iframe to a ``GetData.aspx`` document, and every ``<ROOM_NUMBER>`` in
        that document becomes a ``House``.

        Every nesting level has its own ``except Exception: continue``, so a
        failure silently skips the current item at that level. The
        module-level ``count`` is incremented and printed after each
        ``Comm`` insert.
        """
        for i in url_list:
            try:
                res = requests.get(i)
                html = res.content.decode('gbk')
                # The list of project blocks is built here, before ``html``
                # is rebound further down.
                for k in re.findall('项目名称.*?</dl>', html, re.S | re.M):
                    try:
                        c = Comm(self.co_index)
                        c.co_name = re.search('html">(.*?)</a>', k,
                                              re.S | re.M).group(1)
                        c.co_address = re.search('class="address"(.*?)</dd>',
                                                 k, re.S | re.M).group(1)
                        c.area = re.search('"city">(.*?)</dd>', k,
                                           re.S | re.M).group(1)
                        c.co_develops = re.search('"average">(.*?)</dd>', k,
                                                  re.S | re.M).group(1)
                        c.insert_db()
                        global count
                        count += 1
                        print(count)

                        # Follow the first link of the block to the project page.
                        url = re.search('a href="(.*?)">', k,
                                        re.S | re.M).group(1)
                        complete_url = self.url_source + url
                        res = requests.get(complete_url)
                        html = res.content.decode('gbk')
                        build_info_str = re.search('楼盘表</td>(.*?)合  计', html,
                                                   re.S | re.M).group(1)
                        # One table row per building.
                        for j in re.findall('<tr.*?</tr>', build_info_str,
                                            re.S | re.M):
                            try:
                                b = Building(self.co_index)
                                b.co_name = re.search('html">(.*?)</a>', k,
                                                      re.S | re.M).group(1)
                                b.bu_all_house = re.search(
                                    'absmiddle"  />(.*?)</a>', j,
                                    re.S | re.M).group(1)
                                b.bu_num = re.search(
                                    '="absmiddle"  />(.*?)</a></strong></', j,
                                    re.S | re.M).group(1)
                                b.bu_build_size = re.search(
                                    'td class="t_c">.*?td class="t_c">(.*?㎡)</td>',
                                    j, re.S | re.M).group(1)
                                b.insert_db()

                                # Follow the row's link to the building page.
                                url = re.search('a href="(.*?)"', j,
                                                re.S | re.M).group(1)
                                complete_url = self.url_source + url
                                res = requests.get(complete_url)
                                html = res.content.decode('gbk')
                                # Parse the html to get the iframe form data
                                house_url = self.url_source + re.search(
                                    '<iframe.*?"(.*?)"', html,
                                    re.S | re.M).group(1)
                                # Same URL with 'Default' swapped for 'GetData'.
                                logic_house_url = house_url.replace(
                                    'Default', 'GetData')
                                logic_house_html = requests.get(
                                    url=logic_house_url).content.decode()
                                logic_id = re.search(
                                    '<LOGICBUILDING_ID>(.*?)<',
                                    logic_house_html, re.S | re.M).group(1)
                                final_url = 'http://www.yingtanfdc.com/website/presale/home/HouseTableControl/GetData.aspx?LogicBuilding_ID=' + logic_id
                                final_html = requests.get(
                                    url=final_url).content.decode('gbk')
                                # One House per room number in the document.
                                for l in re.findall(
                                        '<ROOM_NUMBER>(.*?)</ROOM_NUMBER>',
                                        final_html, re.S | re.M):
                                    try:
                                        h = House(self.co_index)
                                        # The whole document is stored on
                                        # every house record.
                                        h.info = final_html
                                        h.ho_name = l
                                        h.co_name = re.search(
                                            'html">(.*?)</a>', k,
                                            re.S | re.M).group(1)
                                        h.bu_num = re.search(
                                            '="absmiddle"  />(.*?)</a></strong></',
                                            j, re.S | re.M).group(1)
                                        h.insert_db()
                                    except Exception as e:
                                        continue
                            except Exception as e:
                                continue
                    except Exception as e:
                        continue
            except Exception as e:
                continue
Exemple #14
0
    def comm_crawler(self, comm_url, co_develops, co_pre_sale, co_name,
                     co_pre_sale_date):
        """Crawl one community page: its buildings and every listed house.

        The community page at ``comm_url`` supplies the ``propertyid`` and
        ``sid`` hidden inputs. ``co_develops``, ``co_pre_sale``, ``co_name``
        and ``co_pre_sale_date`` are forwarded, together with the property
        id, to ``self.comm_info``. Every anchor under ``building_dd`` except
        the first is inserted as a ``Building``. For each building the price
        listing is paged through, and every house link on every page is
        fetched and inserted as a ``House``.

        Areas and the price are decoded by ``self.number`` from the CSS class
        names found in the corresponding page fragment.

        A failure on a single house is printed and the next house is tried.
        Failures on the community page, a building or the page counter are
        not caught here.
        """
        ho = House(co_index)
        comm_res = requests.get(comm_url, headers=self.headers)
        comm_html = etree.HTML(comm_res.text)
        value = comm_html.xpath("//input[@id='propertyid']/@value")[0]
        sid = comm_html.xpath("//input[@id='sid']/@value")[0]

        bu = Building(co_index)
        bu_num = comm_html.xpath("//div[@id='building_dd']//a")[1:]
        self.comm_info(co_develops, co_pre_sale, co_name, co_pre_sale_date,
                       value)
        for bu_ in bu_num:
            bu.bu_num = bu_.xpath("./text()")[0]
            bu_id = bu_.xpath("./@id")[0]
            bu.bu_id = re.search(r'\d+', bu_id).group(0)
            bu.co_id = value
            bu.insert_db()
            building_url = "http://hu.tmsf.com/newhouse/property_" + str(
                sid) + "_" + str(value) + "_price.htm?buildingid=" + str(
                    bu.bu_id)
            page_html = requests.get(building_url, headers=self.headers)

            page = re.search(r'页数 \d+/(\d+)', page_html.text).group(1)
            for i in range(1, int(page) + 1):
                # Build each page URL from the unchanged building URL. The
                # original appended to the same variable, so page 2 was
                # requested as "...?page=1?page=2". The URL already has a
                # query string, so the parameter is joined with "&".
                page_url = building_url + "&page=" + str(i)

                detail_res = requests.get(page_url, headers=self.headers)
                list_tree = etree.HTML(detail_res.text)
                house_url_list = list_tree.xpath("//td[@width='100']/a/@href")
                house_bu_num = list_tree.xpath("//td[@width='100']/a/text()")
                house_name = list_tree.xpath(
                    "//td[@width='101'][1]/a/div/text()")

                # NOTE(review): the original looped over indexes
                # 1..len(list), skipping the first row and always overrunning
                # on the last one. All rows are visited now - confirm the
                # first row is a real house.
                for index, house_path in enumerate(house_url_list):
                    house_url = "http://hu.tmsf.com" + house_path
                    try:
                        ho.bu_num = house_bu_num[index]  # building number
                        house_res = requests.get(house_url,
                                                 headers=self.headers)
                        house_text = house_res.text
                        ho.bu_id = bu.bu_id
                        ho.co_id = re.search(r'楼盘主页.*?_\d+_(\d+)_info',
                                             house_text).group(1)  # community id
                        ho.ho_name = house_name[index]  # room label

                        # usage of the house
                        ho.ho_type = re.search('房屋用途:.*?>(.*?)<',
                                               house_text).group(1)
                        ho.ho_floor = re.search('第(.*?)层', house_text).group(1)

                        build_text = re.search('建筑面积:(.*?)平方米',
                                               house_text).group(1)
                        build_num = re.findall('class="(.*?)"', build_text)
                        ho.ho_build_size = self.number(build_num)  # build area

                        size_text = re.search('套内面积:(.*?)平方米',
                                              house_text).group(1)
                        size_num = re.findall('class="(.*?)"', size_text)
                        ho.ho_true_size = self.number(size_num)  # inner area

                        price_text = re.search('总  价:(.*?)万元',
                                               house_text).group(1)  # price
                        price_num = re.findall('class="(.*?)"', price_text)
                        ho.ho_price = self.number(price_num)

                        ho.insert_db()
                    except Exception as e:
                        # Report the failure instead of dropping it silently.
                        print('co_index={}, 房号错误,url={}'.format(
                            co_index, house_url), e)
                        continue