Example 1
    def comm_info(self, comm_url_list):  # community info
        co = Comm(co_index)
        build_url_list = []
        for comm_url in comm_url_list:
            co.co_id = re.search('id=(\d+)', comm_url).group(1)
            detail_url = "http://ris.szpl.gov.cn/bol/" + comm_url.lstrip(".")
            url = "http://ris.szpl.gov.cn/bolprojectdetail.aspx?id=" + str(co.co_id)
            try:
                res = requests.get(detail_url, headers=self.headers)
                con = res.text

                co.co_pre_sale = re.search('许可证号.*?">(.*?)&', con).group(1)
                co.co_name = re.search('项目名称.*?">(.*?)&', con).group(1)
                co.co_address = re.search('所在位置.*?">(.*?)&', con).group(1)
                co.co_develops = re.search('发展商.*?">(.*?)&', con).group(1)
                co_type = re.search('住宅.*?面积.*?">(.*?)平方米.*?套数.*?">(.*?)&', con)
                co.co_build_size = co_type.group(1)
                co.co_all_house = co_type.group(2)
                co.insert_db()

                response = requests.get(url, headers=self.headers)
                content = etree.HTML(response.text)
                build_url = content.xpath("//td/a/@href")
                build_url_list.extend(build_url)
            except:
                continue
        self.build_info(build_url_list)
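
These snippets all follow the same pattern: chain re.search(...).group(1) for each field and rely on a broad try/except to swallow pages where a field is missing. The helper below is not part of the original project; it is only a minimal sketch of how the per-field lookups could be made non-fatal, reusing the patterns from Example 1.

import re

def extract(pattern, text, default=''):
    # Return group(1) of the first match, or a default when the field is absent,
    # instead of raising AttributeError like a bare re.search(...).group(1).
    match = re.search(pattern, text, re.S | re.M)
    return match.group(1).strip() if match else default

# Hypothetical usage with the patterns from Example 1:
# co.co_pre_sale = extract('许可证号.*?">(.*?)&', con)
# co.co_name = extract('项目名称.*?">(.*?)&', con)
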
Example 2
 def get_comm_info(self, comm_url):
     comm = Comm(co_index)
     comm_url = comm_url.replace('buildingdetail', 'buildinfo')
     response = self.request_proxy(comm_url, headers=self.headers)
     html = response.content.decode('gbk')
     comm.co_name = re.search('class="sf_xq_xmmc">(.*?)<', html,
                              re.S | re.M).group(1).strip()
     comm.area = re.search('id="Label_CityArea">(.*?)<', html,
                           re.S | re.M).group(1).strip()
     comm.co_pre_sale_date = re.search('class="sf_xq_jfsj">(.*?)<', html,
                                       re.S | re.M).group(1).strip()
     comm.co_build_type = re.search('id="lbl_JZJG".*?>(.*?)<', html,
                                    re.S | re.M).group(1).strip()
     comm.co_address = re.search('id="Label_ProjectAdress">(.*?)<', html,
                                 re.S | re.M).group(1).strip()
     comm.co_pre_sale = re.search('id="Label_SallPreDocuments">(.*?)<',
                                  html, re.S | re.M).group(1).strip()
     comm.co_all_house = re.search('id="lbl_ZTS".*?>(.*?)<', html,
                                   re.S | re.M).group(1).strip()
     comm.co_build_size = re.search('id="lbl_JZMJ".*?>(.*?)<', html,
                                    re.S | re.M).group(1).strip()
     comm.co_all_size = re.search('id="lbl_ZDMJ".*?>(.*?)<', html,
                                  re.S | re.M).group(1).strip()
     comm.co_develops = re.search('id="Label_DevName">.*?>(.*?)<', html,
                                  re.S | re.M).group(1).strip()
     comm.co_id = re.search('action=.*?buildingid=(.*?)"', html,
                            re.S | re.M).group(1).strip()
     comm.insert_db()
     buildingid = re.search('buildingid=(.*?)$', comm_url,
                            re.S | re.M).group(1)
     self.get_build_info(buildingid, comm.co_id)
Example 3
 def get_comm_info(self, comm_url_list):
     for i in comm_url_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://221.2.144.162:8090/' + i
             response = requests.get(comm_url, headers=self.headers)
             html = response.content.decode('gbk')
             comm.co_id = re.search('id=(\d+)', i).group(1)
             comm.co_name = re.findall('项目名称:.*?<td.*?>(.*?)<', html,
                                       re.S | re.M)[0]
             comm.co_develops = re.findall('开 发 商:.*?<td.*?>(.*?)<', html,
                                           re.S | re.M)[0]
             comm.area = re.findall(
                 '城 &nbsp;&nbsp;&nbsp;区:.*?<td.*?>(.*?)<', html,
                 re.S | re.M)[0]
             comm.co_type = re.findall('物业类型:.*?<td.*?>(.*?)<', html,
                                       re.S | re.M)[0]
             comm.co_address = re.findall('物业位置:.*?<td.*?>(.*?)<', html,
                                          re.S | re.M)[0]
             comm.co_build_size = re.findall('建筑面积:.*?<td.*?>(.*?)<', html,
                                             re.S | re.M)[0]
             comm.insert_db()
             build_url_list = re.findall("height=20.*?<a href=(.*?) ", html,
                                         re.S | re.M)
             bu_pre_sale_list = re.findall("height=20.*?<Td>(.*?)<", html,
                                           re.S | re.M)
             self.get_build_info(build_url_list, bu_pre_sale_list,
                                 comm.co_name, comm.co_id)
         except Exception as e:
             print("co_index={},小区信息错误".format(co_index), e)
Example 4
    def get_comm_info(self, all_comm_url):
        for i in all_comm_url:
            try:
                comm = Comm(co_index)
                comm_url = 'http://gold.ncfdc.com.cn/' + i
                res = requests.get(comm_url, headers=self.headers)
                comm.co_name = re.search('ctl15_proname">(.*?)<', res.text,
                                         re.S | re.M).group(1)
                comm.co_address = re.search('ctl20_ADDRESS">(.*?)<', res.text,
                                            re.S | re.M).group(1)
                comm.co_develops = re.search('ctl20_developer_name">(.*?)<',
                                             res.text, re.S | re.M).group(1)
                comm.co_build_size = re.search('ctl20_build_area">(.*?)<',
                                               res.text, re.S | re.M).group(1)
                comm.area = re.search('ctl20_region_name">(.*?)<', res.text,
                                      re.S | re.M).group(1)
                comm.co_type = re.search('ctl20_PropertyType">(.*?)<',
                                         res.text, re.S | re.M).group(1)
                comm.co_green = re.search('ctl20_VIRESCENCE">(.*?)<', res.text,
                                          re.S | re.M).group(1)
                comm.co_volumetric = re.search('ctl20_PLAT_RATIO">(.*?)<',
                                               res.text, re.S | re.M).group(1)
                comm.co_id = re.search('name="form1.*?hrefID=(.*?)"', res.text,
                                       re.S | re.M).group(1)
                comm.insert_db()

                build_url_list = []
                for j in re.findall('doc_nav_LD" href="(.*?)"', res.text,
                                    re.S | re.M):
                    build_url_list.append(j)
                self.get_build_info(build_url_list, comm.co_id)

            except Exception as e:
                print('小区错误,co_index={}, url={}'.format(co_index, comm_url), e)
Example 5
 def get_comm_detail(self, comm_url):
     comm = Comm(co_index)
     co_url = 'http://tz.tmsf.com' + comm_url
     response = requests.get(co_url, headers=self.headers)
     html = response.content.decode('utf-8')
     comm.co_name = re.search('<span class="buidname colordg">(.*?)<', html,
                              re.S | re.M).group(1)
     comm.co_address = re.search('楼盘地址:.*?<span.*?>(.*?)<', html,
                                 re.S | re.M).group(1)
     if '[' in comm.co_address:
         comm.area = re.search('\[(.*?)\]', comm.co_address,
                               re.S | re.M).group(1)
     comm.co_type = re.search('物业类型:.*?<span title="(.*?)"', html,
                              re.S | re.M).group(1)
     comm.co_open_time = re.search('最新开盘:</strong>(.*?)<', html,
                                   re.S | re.M).group(1)
     comm.co_develops = re.search('项目公司:</strong>(.*?)<', html,
                                  re.S | re.M).group(1)
     comm.co_build_type = re.search('建筑形式:</strong>(.*?)<', html,
                                    re.S | re.M).group(1)
     comm.co_id = re.search('id="propertyid".*?value="(.*?)"', html,
                            re.S | re.M).group(1)
     comm.insert_db()
     sid = re.search('id="sid" name="sid" value="(.*?)"', html,
                     re.S | re.M).group(1)
     build_url = re.search('id="index_bar">楼盘主页.*?href="(.*?)"', html,
                           re.S | re.M).group(1)
     self.get_build_info(build_url, comm.co_id, sid)
Example 6
 def comm_parse(self, url_list, region):
     for co_url in url_list:
         comm_url = "http://110.89.45.7:8082" + co_url
         comm_res = requests.get(comm_url, headers=self.headers)
         con = comm_res.text
         co = Comm(co_index)
         co.co_id = re.search('ProjectId=(.*)', co_url).group(1)
         co.co_name = re.search('项目名称.*?">(.*?)</td', con,
                                re.S | re.M).group(1)
         co.co_develops = re.search('公司名称.*?">(.*?)</td', con,
                                    re.S | re.M).group(1)
         co.co_address = re.search('项目坐落.*?">(.*?)</td', con,
                                   re.S | re.M).group(1)
         co.co_use = re.search('规划用途.*?">(.*?)</td', con,
                               re.S | re.M).group(1)
         co.co_build_size = re.search('建筑面积.*?">(.*?)</td', con,
                                      re.S | re.M).group(1)
         co.area = region
         co.co_residential_size = re.search(
             '批准销售.*?">.*?</td.*?">(.*?)</td', con, re.S | re.M).group(1)
         co.co_pre_sale = re.search('预售许可证.*?">(.*?)</td', con,
                                    re.S | re.M).group(1)
         co.insert_db()
         co_html = etree.HTML(comm_res.text)
         bu_urllist = co_html.xpath("//span/a/@href")
         self.bu_parse(co.co_id, bu_urllist)
Example 7
 def get_comm_info(self, comm_url_list):
     for i in comm_url_list:
         try:
             comm = Comm(co_index)
             comm_url = 'https://www.qdfd.com.cn/qdweb/realweb/fh/FhProjectInfo.jsp'
             data = {'projectID': i}
             response = requests.post(url=comm_url,
                                      data=data,
                                      headers=self.headers)
             html = response.text
             comm.co_id = i
             comm.co_name = re.findall('bszn_title">(.*?)<', html,
                                       re.S | re.M)[0].strip()
             comm.area = re.findall('所在区县:.*?<span>(.*?)<', html,
                                    re.S | re.M)[0].strip()
             comm.co_address = re.findall('项目地址:.*?<span>(.*?)<', html,
                                          re.S | re.M)[0].strip()
             comm.co_develops = re.findall('企业名称:.*?<a.*?>(.*?)<', html,
                                           re.S | re.M)[0].strip()
             comm.co_all_house = re.findall(
                 '<td>总套数.*?<td class="xxxx_list3">(.*?)<', html,
                 re.S | re.M)[0].strip()
             comm.co_build_size = re.findall(
                 '<td>总面积.*?<td class="xxxx_list3">(.*?)<', html,
                 re.S | re.M)[0].strip()
             comm.insert_db()
             build_logo_list = re.findall(
                 'javascript:getBuilingList\("(.*?)"', html, re.S | re.M)
             self.get_build_info(build_logo_list, i)
         except Exception as e:
             print('青岛小区问题,url post data is:={}'.format(data), e)
Example 8
    def comm_info(self, comm_url_list):
        for comm_url in comm_url_list:
            try:
                co_url = 'http://222.77.178.63:7002/' + comm_url
                co_res = requests.get(co_url, headers=self.headers)
                con = co_res.content.decode('gbk')
                co = Comm(co_index)
                co.co_id = re.search('projectID=(.*)', comm_url).group(1)
                co.co_name = re.search('项目名称:.*?">(.*?)</', con,
                                       re.S | re.M).group(1)
                co.area = re.search('所在区县:.*?">(.*?)</', con,
                                    re.S | re.M).group(1)
                co.co_address = re.search('项目地址:.*?">(.*?)</', con,
                                          re.S | re.M).group(1)
                co.co_develops = re.search('企业名称:.*?blank">(.*?)</', con,
                                           re.S | re.M).group(1)
                co.co_all_house = re.search('>总套数.*?">(\d+)<', con,
                                            re.S | re.M).group(1)
                co.co_all_size = re.search('>总面积.*?">(.*?)<', con,
                                           re.S | re.M).group(1)
                project_name = parse.quote(co.co_name)
                co.insert_db()
            except Exception as e:
                # log.error('小区信息错误{}'.format(e))
                print('小区信息错误{}'.format(e))
                continue  # skip the presale lookup below when the community parse failed

            sale_url = "http://222.77.178.63:7002/Presell.asp?projectID=" + co.co_id + "&projectname=" + project_name
            res = requests.get(sale_url, headers=self.headers)
            html = etree.HTML(res.content.decode('gbk'))
            temp_url_list = html.xpath("//a/@href")
            self.build_info(co.co_id, temp_url_list)
Example 9
 def get_comm_info(self, comm_url_list):
     for i in comm_url_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://www.gzbjfc.com/' + i
             comm.co_name = 'cph_hif1_xmmc.*?<.*?>(.*?)<'
             comm.co_pre_sale = 'cph_hif1_xsxkz.*?<.*?>(.*?)<'
             comm.co_address = 'cph_hif1_zl.*?<.*?>(.*?)<'
             comm.co_develops = 'cph_hif1_kfs.*?<.*?>(.*?)<'
             comm.co_handed_time = 'cph_hif1_jfsj.*?<.*?>(.*?)<'
             comm.co_build_size = 'cph_hif1_jzmj.*?>(.*?)<'
             comm.co_all_house = 'cph_hif1_fwts.*?>(.*?)<'
             comm.co_id = 'hdl1_hfYszh" value="(.*?)"'
             p = ProducerListUrl(page_url=comm_url,
                                 request_type='get',
                                 encode='utf-8',
                                 analyzer_rules_dict=comm.to_dict(),
                                 analyzer_type='regex',
                                 headers=self.headers)
             p.get_details()
             # building info
             build_url = comm_url.replace('Info', 'Building')
             self.get_build_info(build_url)
         except Exception as e:
             print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
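
Several examples (9, 15, 19, 26) do not call re directly; they assign a regex to each Comm attribute and hand comm.to_dict() to ProducerListUrl as analyzer_rules_dict. That helper's implementation is not included in these snippets, so the following is only a sketch, under the assumption that it fetches the page once and resolves each rule back onto the matched field.

import re
import requests

def apply_regex_rules(page_url, rules, headers=None, encode='utf-8'):
    # Rough stand-in for ProducerListUrl.get_details() with analyzer_type='regex':
    # fetch the page, then resolve a {field_name: regex} dict into {field_name: value}.
    response = requests.get(page_url, headers=headers)
    html = response.content.decode(encode, errors='replace')
    details = {}
    for field, pattern in rules.items():
        match = re.search(pattern, html, re.S | re.M)
        details[field] = match.group(1).strip() if match else None
    return details
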
Example 10
 def start_crawler(self):
     res = requests.get(self.start_url,headers=self.headers)
     content = res.content.decode()
     count = re.search('共(\d+)条记录',content).group(1)
     page = math.ceil(int(count)/15)
     for i in range(1,int(page)+1):
         url = "http://218.28.223.13/zzzfdc/zhengzhou/permission.jsp?pn=&cn=&it=&pager.offset=15&page="+str(i)
         page_res = requests.get(url,headers=self.headers)
         html = etree.HTML(page_res.content.decode())
         project_list = html.xpath("//table//td/a/@href")
         for project in project_list:
             try:
                 project_url = 'http://218.28.223.13'+project
                 co_res = requests.get(project_url,headers=self.headers)
                 con = co_res.content.decode()
                 co_html = etree.HTML(con)
                 co = Comm(co_index)
                 co.co_id = re.search('number=(.*)',project).group(1)
                 co.co_name = re.search('LpName=(.*?)"',con,re.S|re.M).group(1)
                 co.co_pre_sale = re.search('预售许可证号.*?">(.*?)</td',con,re.S|re.M).group(1)
                 co.co_pre_sale_date = re.search('发证日期.*?">(.*?)</td',con,re.S|re.M).group(1)
                 co.co_develops = re.search('开发建设单位.*?">(.*?)</td',con,re.S|re.M).group(1)
                 co.co_address = re.search('项 目 坐 落.*?">(.*?)</td',con,re.S|re.M).group(1)
                 co.co_build_start_time = re.search('竣工时间.*?">(.*?)-----',con,re.S|re.M).group(1)
                 co.co_build_end_time = re.search('-----(.*?)</td',con,re.S|re.M).group(1)
                 co.co_build_size = co_html.xpath("//tr[9]/td[1]/text()")[0]
                 co.co_plan_pro = co_html.xpath("//tr[13]/td[4]/text()")[0]
                 co.co_land_use = co_html.xpath("//tr[13]/td[3]/text()")[0]
                 co.co_work_pro = co_html.xpath("//tr[13]/td[5]/text()")[0]
                 co.insert_db()
             except:
                 log.error("{}小区解析失败".format(project_url))
Example 11
    def comm_info(self, comm_url_list):
        for comm_url in comm_url_list:
            try:
                co_res = requests.get(comm_url, headers=self.headers)
                co = Comm(co_index)
                co.co_id = re.search('bh=(\d+)', comm_url).group(1)
                co.co_name = re.search('项目名称.*?td>(.*?)</', co_res.text,
                                       re.S | re.M).group(1)
                co.co_develops = re.search('公司名称.*?strong>(.*?)</s',
                                           co_res.text, re.S | re.M).group(1)
                co.co_address = re.search('项目坐落.*?">(.*?)</', co_res.text,
                                          re.S | re.M).group(1)
                co.co_pre_sale = re.search('预售证号.*?td>(.*?)</', co_res.text,
                                           re.S | re.M).group(1)
                co.co_pre_sale_date = re.search('批准时间.*?td>(.*?)</',
                                                co_res.text,
                                                re.S | re.M).group(1)
                co.co_build_size = re.search('预售面积.*?">(.*?)</', co_res.text,
                                             re.S | re.M).group(1)
                co.insert_db()

                html = etree.HTML(co_res.text)
                bu_info_list = html.xpath("//tr[@style]")
            except Exception as e:
                log.error('小区信息错误', e)
                continue
            self.build_info(bu_info_list, co.co_id)
            bu_url_list = re.findall("window.open\('(.*?)'\)", co_res.text,
                                     re.S | re.M)
            self.ho_info(bu_url_list, co.co_id)
Example 12
    def start_crawler(self):
        res = requests.get(self.start_url, headers=self.headers)
        html = etree.HTML(res.text)
        comm_url_list = html.xpath("//div[@class='post']//a/@href")
        for comm_url in comm_url_list:
            try:
                url = 'http://www.ggsfcw.com/' + comm_url
                comm_res = requests.get(url, headers=self.headers)
                com_html = etree.HTML(comm_res.text)
                comm = Comm(co_index)
                comm.co_name = re.search('<h3.*?">(.*?)</',
                                         comm_res.text).group(1)
                comm.co_id = re.search('n=(\d+)', comm_res.text).group(1)
                comm.co_address = re.search('地址.*?">(.*?)</',
                                            comm_res.text).group(1)
                comm.area = re.search('区县.*?">(.*?)</', comm_res.text).group(1)
                comm.co_develops = re.search('开发商.*?">(.*?)</',
                                             comm_res.text).group(1)
                comm.co_use = re.search('规划用途.*?">(.*?)</',
                                        comm_res.text).group(1)
                comm.insert_db()
            except Exception as e:
                log.error("小区信息错误", e)
                continue

            bu_list = com_html.xpath("//div[@id='MainContent_divResult']/a")
            self.build_info(bu_list, comm.co_id)
Example 13
 def start_crawler(self):
     for i in range(1, 10000):
         formdata = {
             "currentpage": i,
             "pagesize": 20,
         }
         try:
             res = requests.post(
                 "http://fdc.xmtfj.gov.cn:8001/home/Getzslp",
                 data=formdata,
                 headers=self.headers)
             con = json.loads(res.text)
             body = con['Body']
             info_dict = json.loads(body)['bodylist']
             for i in info_dict:
                 comm = Comm(co_index)
                 comm.co_name = i['XMMC']
                 comm.co_id = i['TRANSACTION_ID']
                 comm.co_address = i['XMDZ']
                 comm.co_pre_sale = i['YSXKZH']
                 comm.co_all_house = i['PZTS']
                 comm.co_build_size = i['PZMJ']
                 comm.co_area = i['XMDQ']
                 comm.co_pre_date = i['GETDATE']
                 comm.insert_db()
         except Exception as e:
             print(
                  '小区错误,co_index={},url={},data={}'.format(
                     co_index, 'http://fdc.xmtfj.gov.cn:8001/home/Getzslp',
                     formdata), e)
Example 14
    def co_parse(self, url_list):
        for url in url_list:
            try:
                co_url = url.xpath("./@href")[0]
                new_url = "http://tmsf.qzfdcgl.com" + co_url
                co_res = requests.get(new_url, headers=self.headers)
                con = co_res.text
                co = Comm(co_index)
                co.co_id = re.search('property_(.*?)_info', co_url).group(1)
                co.co_name = re.search('楼盘名称:</span>(.*)', con).group(1)
                co.co_develops = re.search('项目公司:</span>(.*)', con).group(1)
                co.co_address = re.search('物业地址:</span>(.*?)</p', con,
                                          re.S | re.M).group(1)
                co.area = re.search('所属城区:</span>(.*)', con).group(1)
                co.insert_db()
                sid = re.search('property_(\d+)_', co_url).group(1)
                propertyid = re.search('(\d+)_info', co_url).group(1)
                bu_url = new_url.replace('info', 'price')
                res = requests.get(bu_url, headers=self.headers)
                bu_html = etree.HTML(res.text)
                bu_idlist = bu_html.xpath("//dd[@id='building_dd']/a")
            except:
                continue
            for bu_ in bu_idlist[1:]:
                id = bu_.xpath("./@id")[0]
                bu_id = re.search('.*?(\d+)', id).group(1)
                bu = Building(co_index)
                bu.bu_id = bu_id
                bu.co_id = co.co_id
                bu.bu_num = bu_.xpath("./text()")[0]

                bu.insert_db()
                self.house_parse(bu_id, co.co_id, sid, propertyid)
Example 15
 def get_comm_info(self, comm_url_list):
     for i in comm_url_list:
         try:
             comm = Comm(co_index)
             comm_url = i.replace('view', 'detail')
             comm.co_type = '物业类型:.*?<dd>(.*?)<'
             comm.area = '区域所属:.*?<dd>(.*?)<'
             comm.co_build_size = '建筑面积:.*?<dd>(.*?)<'
             comm.co_size = '占地面积:.*?<dd>(.*?)<'
             comm.co_green = '绿化率:.*?<dd><.*?>(.*?)<'
             comm.co_build_type = '楼  层:.*?<dd>(.*?)<'
             comm.co_volumetric = '容积率:.*?<dd><.*?>(.*?)<'
             comm.co_id = '楼盘首页.*?newhouse/.*?/(.*?)/'
             comm.co_name = '<h1 class="title">(.*?)<'
             comm.co_address = '楼盘地址:.*?<dd>(.*?)<'
             comm.co_develops = '开发商:.*?<dd(.*?)<'
             p = ProducerListUrl(page_url=comm_url,
                                 request_type='get',
                                 encode='gbk',
                                 analyzer_rules_dict=comm.to_dict(),
                                 analyzer_type='regex',
                                 headers=self.headers)
             p.get_details()
         except Exception as e:
             print(e)
Example 16
 def get_comm_detail(self, comm_list):
     for i in comm_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://house.bffdc.gov.cn/public/project/' + i
             response = requests.get(comm_url)
             html = response.text
             comm.co_name = re.search('PROJECT_XMMC">(.*?)<', html,
                                      re.S | re.M).group(1)
             comm.co_develops = re.search('PROJECT_KFQY_NAME">(.*?)<', html,
                                          re.S | re.M).group(1)
             comm.co_address = re.search('PROJECT_XMDZ">(.*?)<', html,
                                         re.S | re.M).group(1)
             comm.area = re.search('PROJECT_SZQY">(.*?)<', html,
                                   re.S | re.M).group(1)
             comm.co_pre_sale = re.search('YSXKZH">(.*?)<', html,
                                          re.S | re.M).group(1)
             comm.insert_db()
             build_info = re.search('id="buildInfo".*?value="(.*?)"', html,
                                    re.S | re.M).group(1)
             build_url_list = build_info.split(';;')
             self.get_build_info(build_url_list, comm.co_name)
             global count
             count += 1
             print(count)
         except Exception as e:
             print(e)
Example 17
 def get_comm_detail(self, comm_detail_url):
     comm_url = 'http://www.kmhouse.org' + comm_detail_url
     try:
         comm = Comm(co_index)
         response = requests.get(comm_url, headers=self.headers)
         html = response.content.decode('gbk')
         co_id = re.search('Preid=(.*?)&', comm_detail_url).group(1)
         co_name = re.search('楼盘名称.*?<td.*?>(.*?)<', html,
                             re.S | re.M).group(1)
         area = re.search('所在地区.*?<td.*?>(.*?)<', html,
                          re.S | re.M).group(1)
         co_address = re.search('楼盘地址.*?<td.*?>(.*?)<', html,
                                re.S | re.M).group(1)
         co_pre_sale = re.search('预售证号.*?<td.*?>(.*?)<', html,
                                 re.S | re.M).group(1)
         co_volumetric = re.search('容&nbsp;积&nbsp;率.*?<td.*?>(.*?)<', html,
                                   re.S | re.M).group(1)
         co_green = re.search('绿&nbsp;化&nbsp;率.*?<td.*?>(.*?)<', html,
                              re.S | re.M).group(1)
         co_build_start_time = re.search('开工时间.*?<td.*?>(.*?)<', html,
                                         re.S | re.M).group(1)
         comm.co_name = co_name
         comm.area = area
         comm.co_id = co_id
         comm.co_address = co_address
         comm.co_pre_sale = co_pre_sale
         comm.co_volumetric = co_volumetric
         comm.co_green = co_green
         comm.co_build_start_time = co_build_start_time
         comm.insert_db()
         global count
         count += 1
         print('count:', count)
     except Exception as e:
         print('小区详情错误,co_index={},url={}'.format(co_index, comm_url), e)
Example 18
 def get_comm_info(self, comm_list):
     for i in comm_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://www.ytfcjy.com/public/project/' + i
             response = requests.get(comm_url, headers=self.headers)
             html = response.text
             comm.co_name = re.findall('PROJECT_XMMC">(.*?)<', html, re.S | re.M)[0]
             comm.co_id = re.findall('ProjectInfo.aspx\?code=(.*?)&', html, re.S | re.M)[0]
             comm.co_address = re.findall('PROJECT_XMDZ">(.*?)<', html, re.S | re.M)[0]
             comm.co_develops = re.findall('PROJECT_KFQY_NAME">(.*?)<', html, re.S | re.M)[0]
             comm.area = re.findall('PROJECT_SZQY">(.*?)<', html, re.S | re.M)[0]
             comm.co_volumetric = re.findall('PROJECT_RJL">(.*?)<', html, re.S | re.M)[0]
             comm.co_build_size = re.findall('PROJECT_GHZJZMJ">(.*?)<', html, re.S | re.M)[0]
             comm.co_pre_sale = re.findall('YSXKZH">(.*?)<', html, re.S | re.M)[0]
             comm.co_all_house = re.findall('YSZTS">(.*?)<', html, re.S | re.M)[0]
             comm.co_plan_pro = re.findall('id="ghxkzInfo" value=".*?,,(.*?)"', html, re.S | re.M)[0]
             comm.co_work_pro = re.findall('id="sgxkzInfo" value=".*?,,(.*?)"', html, re.S | re.M)[0]
             comm.co_land_use = re.findall('id="tdzInfo" value=".*?,,(.*?)"', html, re.S | re.M)[0]
             comm.insert_db()
             global count
             count += 1
             print(count)
             build_url_list = re.findall('id="buildInfo" value="(.*?)"', html, re.S | re.M)
             self.get_build_info(build_url_list, comm.co_id)
         except Exception as e:
             print(e)
Example 19
 def get_comm_info(self, comm_url_list):
     for i in comm_url_list:
         try:
             comm = Comm(co_index)
             comm.co_id = '楼盘首页.*?aid-(.*?)/'
             comm.co_name = 'class="ls">(.*?)<'
             comm.co_type = '物业类型</em>(.*?)<'
             comm.area = '区域所属:</em>(.*?)<'
             comm.co_green = '绿 化 率:</em>(.*?)<'
             comm.co_volumetric = '容 积 率:</em>(.*?)<'
             comm.co_build_type = '楼&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;层:</em>(.*?)<'
             comm.co_size = '占地面积:</em>(.*?)<'
             comm.co_build_size = '建筑面积:</em>(.*?)<'
             comm.co_develops = '开&nbsp;&nbsp;发&nbsp;&nbsp;商:</em><.*?target="_blank">(.*?)<'
             comm.co_address = '项目地址:</em>(.*?)<'
             data_list = comm.to_dict()
             p = ProducerListUrl(
                 page_url=i,
                 request_type='get',
                 encode='gbk',
                 analyzer_rules_dict=data_list,
                 current_url_rule=
                 'colspan="3" align="right"><a href="(.*?)"',
                 analyzer_type='regex',
                 headers=self.headers)
             more_build_url = p.get_details()
             self.get_build_info(more_build_url)
         except Exception as e:
             print(e)
Example 20
 def get_comm_detail(self, comm_list):
     for i in comm_list:
         comm_url = 'http://www.yzfdc.cn/' + i
         try:
             comm = Comm(co_index)
             content = self.s.get(comm_url, headers=self.headers)
             html = content.text
             comm.co_name = re.search('class="zxlp_08".*?>(.*?)<', html,
                                      re.S | re.M).group(1)
             comm.co_id = re.search(
                 'class="zxlp_08" href=.*?ProjectId=(.*?)"', html,
                 re.S | re.M).group(1)
             comm.co_develops = re.search('开 发 商:.*?<span.*?>(.*?)<', html,
                                          re.S | re.M).group(1)
             comm.co_type = re.search('项目类型:.*?<span.*?>(.*?)<', html,
                                      re.S | re.M).group(1)
             comm.area = re.search('所属区位:.*?<span.*?>(.*?)<', html,
                                   re.S | re.M).group(1)
             comm.co_build_size = re.search('建筑面积:.*?<span.*?>(.*?)<', html,
                                            re.S | re.M).group(1)
             comm.co_open_time = re.search('开盘日期:.*?<span.*?>(.*?)<', html,
                                           re.S | re.M).group(1)
             comm.co_handed_time = re.search('交付日期:.*?<span.*?>(.*?)<',
                                             html, re.S | re.M).group(1)
             comm.co_address = re.search('项目具体地址:.*?<span.*?>(.*?)<', html,
                                         re.S | re.M).group(1)
             comm.insert_db()
             build_url = re.search(
                 '(/BuildingDish_Publicity.aspx\?Projectid=.*?)"', html,
                 re.S | re.M).group(1)
             self.get_build_info(build_url, comm.co_id)
         except Exception as e:
             print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
Example 21
 def start_crawler(self):
     for i in range(1, 478):
         data = {
             "method": "GetYszData",
             "page": str(i),
             "ysxkz": '',
             "kfs": '',
             "lpmc": ''
         }
         res = requests.post(self.start_url,
                             headers=self.headers,
                             data=data)
         info = res.json()
         comm = json.loads(info)
         for detail in comm['Rows']:
             co = Comm(co_index)
             co.co_name = detail['PRJNAME']
             co.co_pre_sale = detail['PRENUM']
             co.area = detail['CZAREA']
             co.co_pre_sale_date = detail['PresaleCertificateDate']
             co.co_address = detail['BSIT']
             co.co_develops = detail['NAME']
             co.co_build_size = detail['YSROOMBAREA']
             co.co_all_house = detail['YSROOMNUMS']
             co.insert_db()
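
In Example 21 the endpoint apparently returns a JSON-encoded string that itself contains JSON: res.json() yields a str, and a second json.loads() is needed before 'Rows' can be read. A minimal sketch of that double decode, with a hypothetical response body:

import json

# Hypothetical double-encoded body, shaped like the response Example 21 expects
raw_body = '"{\\"Rows\\": [{\\"PRJNAME\\": \\"Sample Project\\", \\"PRENUM\\": \\"2019-001\\"}]}"'

inner_text = json.loads(raw_body)   # first decode (what res.json() does here): a str
payload = json.loads(inner_text)    # second decode: the actual dict
for detail in payload['Rows']:
    print(detail['PRJNAME'], detail['PRENUM'])
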
Example 22
    def start_crawler(self):
        for region in self.region.items():
            region_code = region[0]
            region_name = region[1]
            url = self.start_url + region_code + '.html'
            b = AllListUrl(
                first_page_url=url,
                request_method='get',
                analyzer_type='regex',
                encode='utf-8',
                page_count_rule='共(\d+)页>',
            )
            page = b.get_page_count()
            for i in range(1, int(page) + 1):
                new_url = url + "?page=" + str(i)
                res = requests.get(new_url, headers=self.headers)
                html = etree.HTML(res.text)
                co_list = html.xpath("//dl[@class='spf_lp_searchlist bg1']")
                for co in co_list:
                    comm = Comm(co_index)
                    co_url = co.xpath("./dt/h4/a/@href")[0]
                    comm.co_name = co.xpath("./dt/h4/a/text()")[0]
                    comm.co_address = co.xpath(".//address/text()")[0]
                    comm.co_id = re.search('\d+', co_url).group(0)
                    comm.co_develops = co.xpath(
                        "./dd[@class='dev']/a/text()")[0]
                    comm.co_plan_pro = co.xpath("./dt/h4/span/text()")[0]
                    comm.co_type = co.xpath(".//p/span[2]/text()")[0]
                    comm.area = region_name
                    comm.insert_db()

                    detail_url = "http://www.zstmsf.com" + co_url
                    self.bu_parse(detail_url, comm.co_id)
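
Examples 22 and 28 both rely on AllListUrl(...).get_page_count(), which is not shown in these snippets. Presumably it requests the first listing page and applies page_count_rule as a regex; the sketch below is an approximation under that assumption, not the real class.

import re
import requests

def get_page_count(first_page_url, page_count_rule, headers=None, encode='utf-8'):
    # Fetch the first page of the listing and pull the total page count with the
    # same kind of regex passed as page_count_rule (e.g. '共(\d+)页').
    response = requests.get(first_page_url, headers=headers)
    text = response.content.decode(encode, errors='replace')
    match = re.search(page_count_rule, text)
    return int(match.group(1)) if match else 1
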
Example 23
    def start_crawler(self):
        data = {"Submit": "(unable to decode value)"}
        res = requests.post(self.start_url, data=data, headers=self.headers)
        html = etree.HTML(res.content.decode('gbk'))
        comm_url_list = html.xpath(
            "//tr//span[@style='width:270px; color:#006']//a/@href")
        for comm_url in comm_url_list:
            try:
                url = 'http://www.fxfdcw.com/' + comm_url
                com_res = requests.get(url, headers=self.headers)
                con = com_res.content.decode('gbk')
                co = Comm(co_index)
                co.co_id = re.search('xmid=(\d+)', comm_url).group(1)
                co.co_name = re.search('项目名称.*?">(.*?)</', con,
                                       re.S | re.M).group(1)
                co.co_develops = re.search('开发企业:(.*?) &nbsp', con,
                                           re.S | re.M).group(1)
                co.co_address = re.search('项目地址.*?">(.*?)</', con,
                                          re.S | re.M).group(1)
                co.co_build_size = re.search('建筑面积.*?">(.*?)</', con,
                                             re.S | re.M).group(1)
                co.co_all_house = re.search('总套数.*?">(.*?)</', con,
                                            re.S | re.M).group(1)
                co.insert_db()

                bu_list = re.findall("window.open\('(.*?)'\)", con,
                                     re.S | re.M)
            except Exception as e:
                # log.error("小区信息错误{}".format(e))
                print("小区信息错误{}".format(e))
                continue

            self.bu_info(bu_list, co.co_id)
Example 24
 def start_crawler(self):
     querystring = {"_method": "GetDataToDynamicInXml", "_session": "rw"}
     payload = "xmlInfo=%263Croot%2620QueryCode%263D%2622ProjectIntroduce%2622%2620PageIndex%263D%26221%2622%2620PageSize%263D%262215%2622%2620SortField%263D%2622%2620ORDER%2620BY%2620Name%2622%2620QueryString%263D%2622QueryCode%263DProjectIntroduce%2626amp%263BShowModeCode%263Ddefault%2622%2620BeginDate%263D%2622%262000%263A00%263A00%2622%2620EndDate%263D%2622%262023%263A59%263A59%2622%2620Flag%263D%2622TitleBody%2622%2620TitlesWidthInfo%263D%2622EnterPriseName%267C0%2624Name%267C0%2624Location%267C0%2624SoilUse%267C0%2622%2620IsUseOCache%263D%26220%2622%2620IsUserID%263D%26220%2622%2620SiteId%263D%26228907bd13-1d14-4f9e-8c01-e482d9590d10%2622%2620LockedColumn%263D%26220%2622%2620IsLocked%263D%26220%2622%2620ClientWidth%263D%26221601%2622%2620ShowModeCode%263D%2622default%2622%2620Language%263D%2622chinese%2622/%263E"
     response = requests.request("POST",
                                 url,
                                 data=payload,
                                 params=querystring)
     html = response.text
     comm_info_list = re.findall('class="tdctfield tdctwidthset ".*?</tr>',
                                 html, re.S | re.M)
     for i in comm_info_list:
         comm = Comm(co_index)
         comm.co_develops = re.search('class="spanctfield".*?>(.*?)<', i,
                                      re.S | re.M).group(1)
         comm.co_name = re.search(
             'class="spanctfield".*?class="spanctfield".*?<a.*?>(.*?)<', i,
             re.S | re.M).group(1)
         comm.co_address = re.search(
             'class="spanctfield".*?class="spanctfield".*?class="spanctfield".*?>(.*?)<',
             i, re.S | re.M).group(1)
         comm.co_type = re.search(
             'class="spanctfield".*?class="spanctfield".*?class="spanctfield".*?class="spanctfield".*?>(.*?)<',
             i, re.S | re.M).group(1)
         comm.co_id = re.search('EnterPriseName_(.*?)"', i,
                                re.S | re.M).group(1)
         comm.insert_db()
         self.get_build_info(comm.co_id)
Example 25
 def get_comm_detail(self, comm_detail_url, co_id):
     comm = Comm(co_index)
     try:
         response = requests.get(comm_detail_url, headers=self.headers)
         html = response.text
         comm.co_name = re.search('项目名称:.*?<td.*?>(.*?)<', html,
                                  re.S | re.M).group(1)
         comm.co_type = re.search('项目主体性质:.*?<td.*?>(.*?)<', html,
                                  re.S | re.M).group(1)
         comm.co_develops = re.search('主开发商:.*?<td.*?>(.*?)<', html,
                                      re.S | re.M).group(1)
         comm.co_address = re.search('项目建设地址:.*?<td.*?>(.*?)<', html,
                                     re.S | re.M).group(1)
         comm.co_all_size = re.search('项目总规划面积\(㎡\):.*?<td.*?>(.*?)<', html,
                                      re.S | re.M).group(1)
         comm.co_build_start_time = re.search('计划开工日期:.*?<td.*?>(.*?)<',
                                              html, re.S | re.M).group(1)
         comm.co_build_end_time = re.search('计划竣工日期:.*?<td.*?>(.*?)<', html,
                                            re.S | re.M).group(1)
         comm.co_id = co_id
         comm.insert_db()
         build_info_list = re.findall('id="lpan".*?</tr>', html,
                                      re.S | re.M)
         self.get_build_info(build_info_list, co_id)
     except Exception as e:
         print('小区错误,co_index={},url={}'.format(co_index, comm_detail_url),
               e)
Example 26
 def get_comm_info(self, comm_url_list):
     for i in comm_url_list:
         try:
             code = i.split(',')
             comm_url = 'http://www.tmsf.com/newhouse/property_' + code[
                 0] + '_' + code[1] + '_info.htm'
             comm = Comm(co_index)
             comm.co_name = 'buidname.*?>(.*?)<'
             comm.co_address = '--位置行--.*?<span.*?title="(.*?)"'
             comm.co_build_type = '建筑形式:<.*?>(.*?)<'
             comm.co_develops = '项目公司:<.*?>(.*?)<'
             comm.co_volumetric = '容 积 率:</span>(.*?)<'
             comm.co_green = '绿 化 率:</span>(.*?)<'
             comm.co_size = '占地面积:</span>(.*?)<'
             comm.co_build_size = '总建筑面积:</span>(.*?)<'
             comm.co_all_house = '总户数:</span>(.*?)<'
             comm.co_id = 'info" href="/newhouse/property_(.*?)_info'
             p = ProducerListUrl(page_url=comm_url,
                                 request_type='get',
                                 encode='utf-8',
                                 analyzer_rules_dict=comm.to_dict(),
                                 current_url_rule='一房一价<.*?href="(.*?)"',
                                 analyzer_type='regex',
                                 headers=self.headers)
             build_all_url = p.get_details()
             global count
             count += 1
             print('comm:', count)
             self.get_build_info(build_all_url)
         except Exception as e:
             print('小区页面,co_index={},url={}'.format(co_index, comm_url), e)
Example 27
 def get_comm_info(self, comm_url_list):
     for i in comm_url_list:
         comm_url = 'http://www.fjlyfdc.com.cn/' + i
         try:
             comm = Comm(co_index)
             response = requests.get(comm_url, headers=self.headers)
             html = response.text
             comm.co_develops = re.search('公司名称:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             comm.co_name = re.search('项目名称:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             comm.co_pre_sale = re.search('预售许可证:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             comm.co_address = re.search('项目坐落:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             comm.co_type = re.search('规划用途:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             comm.co_build_size = re.search('建筑面积:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             comm.co_volumetric = re.search('容积率:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             comm.co_green = re.search('绿地率:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             comm.co_open_time = re.search('开工日期:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             comm.co_build_end_time = re.search('竣工日期:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             comm.co_all_house = re.search('批准销售:.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             comm.co_all_size = re.search('批准销售:.*?<td.*?<td.*?>(.*?)<', html, re.S | re.M).group(1)
             comm.co_id = re.search('CaseId=(.*?)$', comm_url).group(1)
             comm.insert_db()
             build_url_list = re.findall('href="(/House/BuildingInfo\?buildingInfoID=.*?&amp;caseID=.*?)"', html,
                                         re.S | re.M)
             self.get_build_info(build_url_list, comm.co_id)
         except Exception as e:
             print('楼栋错误,co_index={},url={}'.format(co_index, comm_url), e)
Example 28
    def start_crawler(self):
        b = AllListUrl(first_page_url=self.start_url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='utf-8',
                       page_count_rule='共(\d+)页',
                       )
        page = b.get_page_count()
        for i in range(1,int(page)+1):
            url = self.start_url + '?pageIndex=' + str(i)
            page_res = requests.get(url,headers=self.headers)

            html = etree.HTML(page_res.text)
            comm_info_list = html.xpath("//ul/li/div")
            for comm_info in comm_info_list:
                try:
                    co = Comm(co_index)
                    co.co_name = comm_info.xpath("./p/a/text()")[0]
                    deve = comm_info.xpath("./p[2]/text()")[0]
                    addr = comm_info.xpath("./p[3]/text()")[0]
                    co.co_develops = re.search('开发商:(.*)',deve).group(1)
                    co.co_address = re.search('楼盘地址.*?:(.*)',addr).group(1)
                    comm_url = comm_info.xpath("./p/a/@href")[0]
                    co.co_id = re.search('projectId=(\d+)',comm_url).group(1)
                    co.insert_db()
                    co_url = 'http://www.bdfdc.net' + comm_url
                    co_res = requests.get(co_url,headers=self.headers)
                    time.sleep(5)
                    bu_html = etree.HTML(co_res.text)
                    bu_url_list = bu_html.xpath("//div[@style]/a")[1:]
                except Exception as e:
                    # log.error("小区信息错误{}".format(e))
                    print("小区信息错误{}".format(e))
                    continue
                self.bu_info(bu_url_list,co.co_id)
Example 29
 def start_crawler(self):
     for i in range(1, 40):
         url = 'http://zfbzj.baotou.gov.cn/index.php?m=content&c=permit&a=init&page=' + str(
             i)
         res = requests.get(url, headers=self.headers)
         html = etree.HTML(res.content.decode())
         temp_list = html.xpath("//tr/td[@align='left']/a")
         for temp in temp_list:
             try:
                 temp_url = temp.xpath("./@href")[0]
                 co_res = requests.get(temp_url, headers=self.headers)
                 content = co_res.content.decode()
                 co = Comm(co_index)
                 co.co_id = re.search('id=(\d+)', temp_url).group(1)
                 co.co_name = re.search('项 目 名 称.*?">(.*?)</td', content,
                                        re.S | re.M).group(1)
                 co.co_develops = re.search('开发建设单位.*?">(.*?)</td', content,
                                            re.S | re.M).group(1)
                 co.co_address = re.search('项 目 座 落.*?">(.*?)</td', content,
                                           re.S | re.M).group(1)
                 co.co_pre_sale = re.search('预销售许可证号.*?">(.*?)</td',
                                            content, re.S | re.M).group(1)
                 co.co_pre_sale_date = re.search('发证日期.*?">(.*?)</td',
                                                 content,
                                                 re.S | re.M).group(1)
                 co.insert_db()
             except Exception as e:
                 log.error("{}小区解析失败{}".format(temp_url, e))
                 continue
Example 30
 def get_comm_info(self, comm_url_list):
     for i in comm_url_list:
         try:
             response = requests.get(i, headers=self.headers)
             html = response.text
             comm = Comm(co_index)
             comm.co_name = re.findall('PROJECT_XMMC">(.*?)<', html,
                                       re.S | re.M)[0]
             comm.co_develops = re.findall('PROJECT_KFQY_NAME">(.*?)<',
                                           html, re.S | re.M)[0]
             comm.co_address = re.findall('PROJECT_XMDZ">(.*?)<', html,
                                          re.S | re.M)[0]
             comm.area = re.findall('PROJECT_SZQY">(.*?)<', html,
                                    re.S | re.M)[0]
             comm.co_volumetric = re.findall('PROJECT_RJL">(.*?)<', html,
                                             re.S | re.M)[0]
             comm.co_build_size = re.findall('PROJECT_GHZJZMJ">(.*?)<',
                                             html, re.S | re.M)[0]
             comm.co_pre_sale = re.findall('YSXKZH">(.*?)<', html,
                                           re.S | re.M)[0]
             comm.co_id = re.findall('PROJECT_XMBH">(.*?)<', html,
                                     re.S | re.M)[0]
             comm.insert_db()
             global count
             count += 1
             print(count)
             bu_info = re.search('id="buildInfo".*?value="(.*?)"', html,
                                 re.S | re.M).group(1)
             self.get_build_info(bu_info, comm.co_id, i)
         except Exception as e:
             print('小区错误,co_index={},url={}'.format(co_index, i), e)
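
Examples 16, 18 and 30 read a hidden buildInfo input and, in Example 16, split it on ';;' to get one entry per building. A small sketch of that step, using a hypothetical HTML fragment shaped like the pages these crawlers parse:

import re

html = '<input type="hidden" id="buildInfo" value="b1.aspx?id=1;;b2.aspx?id=2" />'

build_info = re.search('id="buildInfo".*?value="(.*?)"', html, re.S | re.M).group(1)
build_url_list = build_info.split(';;')   # ['b1.aspx?id=1', 'b2.aspx?id=2']
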