Example #1
0
    def start_crawler(self):
        """Crawl the community list page of www.ggsfcw.com, parse each
        community detail page into a Comm record, and hand the building
        links on to ``self.build_info``.

        Failures on a single community are logged and skipped so the crawl
        continues (best-effort scraping).
        """
        res = requests.get(self.start_url, headers=self.headers)
        html = etree.HTML(res.text)
        comm_url_list = html.xpath("//div[@class='post']//a/@href")
        for comm_url in comm_url_list:
            try:
                url = 'http://www.ggsfcw.com/' + comm_url
                comm_res = requests.get(url, headers=self.headers)
                # hoist the response body: it is scanned by six regexes below
                page = comm_res.text
                com_html = etree.HTML(page)
                comm = Comm(co_index)
                comm.co_name = re.search('<h3.*?">(.*?)</', page).group(1)
                # raw string: '\d' in a plain literal is an invalid escape
                comm.co_id = re.search(r'n=(\d+)', page).group(1)
                comm.co_address = re.search('地址.*?">(.*?)</', page).group(1)
                comm.area = re.search('区县.*?">(.*?)</', page).group(1)
                comm.co_develops = re.search('开发商.*?">(.*?)</',
                                             page).group(1)
                comm.co_use = re.search('规划用途.*?">(.*?)</', page).group(1)
                comm.insert_db()
            except Exception as e:
                log.error("小区信息错误", e)
                continue

            # only reached when the record above parsed and saved cleanly
            bu_list = com_html.xpath("//div[@id='MainContent_divResult']/a")
            self.build_info(bu_list, comm.co_id)
 def get_comm_detail(self, comm_list):
     """Fetch each project detail page from house.bffdc.gov.cn, persist a
     Comm record, and dispatch the page's building list to
     ``self.get_build_info``. Errors on one item are printed and the loop
     moves on.
     """
     global count
     flags = re.S | re.M

     def grab(pattern, text):
         # First capture group of *pattern* in *text* (raises on no match,
         # which the per-item except below absorbs).
         return re.search(pattern, text, flags).group(1)

     for item in comm_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://house.bffdc.gov.cn/public/project/' + item
             html = requests.get(comm_url).text
             comm.co_name = grab('PROJECT_XMMC">(.*?)<', html)
             comm.co_develops = grab('PROJECT_KFQY_NAME">(.*?)<', html)
             comm.co_address = grab('PROJECT_XMDZ">(.*?)<', html)
             comm.area = grab('PROJECT_SZQY">(.*?)<', html)
             comm.co_pre_sale = grab('YSXKZH">(.*?)<', html)
             comm.insert_db()
             build_info = grab('id="buildInfo".*?value="(.*?)"', html)
             self.get_build_info(build_info.split(';;'), comm.co_name)
             count += 1
             print(count)
         except Exception as e:
             print(e)
    def co_parse(self, url_list):
        """Parse each community detail page on tmsf.qzfdcgl.com into a Comm
        record, then walk the buildings listed on the matching ``price``
        page, storing one Building per entry and recursing into
        ``self.house_parse``.

        url_list -- lxml anchor elements whose @href looks like
        ``property_<sid>_<propertyid>_info``.
        """
        for url in url_list:
            try:
                co_url = url.xpath("./@href")[0]
                new_url = "http://tmsf.qzfdcgl.com" + co_url
                co_res = requests.get(new_url, headers=self.headers)
                con = co_res.text
                co = Comm(co_index)
                co.co_id = re.search('property_(.*?)_info', co_url).group(1)
                co.co_name = re.search('楼盘名称:</span>(.*)', con).group(1)
                co.co_develops = re.search('项目公司:</span>(.*)', con).group(1)
                co.co_address = re.search('物业地址:</span>(.*?)</p', con,
                                          re.S | re.M).group(1)
                co.area = re.search('所属城区:</span>(.*)', con).group(1)
                co.insert_db()
                # raw strings: '\d' in a plain literal is an invalid escape
                sid = re.search(r'property_(\d+)_', co_url).group(1)
                propertyid = re.search(r'(\d+)_info', co_url).group(1)
                # the "price" variant of the URL carries the building list
                bu_url = new_url.replace('info', 'price')
                res = requests.get(bu_url, headers=self.headers)
                bu_html = etree.HTML(res.text)
                bu_idlist = bu_html.xpath("//dd[@id='building_dd']/a")
            except Exception:
                # was a bare except: narrowed so Ctrl-C still interrupts;
                # best-effort crawl skips communities that fail to parse
                continue
            # first anchor is skipped (presumably a "select all" entry --
            # TODO confirm against the live page)
            for bu_ in bu_idlist[1:]:
                anchor_id = bu_.xpath("./@id")[0]  # 'id' shadowed a builtin
                bu_id = re.search(r'.*?(\d+)', anchor_id).group(1)
                bu = Building(co_index)
                bu.bu_id = bu_id
                bu.co_id = co.co_id
                bu.bu_num = bu_.xpath("./text()")[0]

                bu.insert_db()
                self.house_parse(bu_id, co.co_id, sid, propertyid)
 def comm_info(self, co_develops, co_pre_sale, co_name, co_pre_sale_date,
               sid):
     """Persist one Comm record from pre-extracted field values.

     ``sid`` is stored as the community id (``co_id``).
     """
     record = Comm(co_index)
     record.co_id = sid
     record.co_name = co_name
     record.co_develops = co_develops
     record.co_pre_sale = co_pre_sale
     record.co_pre_sale_date = co_pre_sale_date
     record.insert_db()
 def get_data_obj(self, analyzer, co_index):
     """Factory: map an analyzer tag to a freshly constructed data object.

     Returns Comm/Building/House bound to *co_index*, or None for an
     unrecognised tag (same fall-through as the if/elif original).
     """
     factory = {
         'comm': Comm,
         'build': Building,
         'house': House,
     }.get(analyzer)
     return factory(co_index) if factory is not None else None
Example #6
0
 def get_comm_info(self, comm_info_list):
     """Extract name, unit count and total floor area from each ``<tr>``
     HTML fragment and store one Comm record apiece; a bad row is reported
     and skipped.
     """
     flags = re.S | re.M
     for row in comm_info_list:
         try:
             comm = Comm(co_index)
             # successive <td> cells: 1st = name, 2nd = house count,
             # 3rd = total size
             comm.co_name = re.search('<td>(.*?)</td>', row,
                                      flags).group(1)
             comm.co_all_house = re.search('<td.*?<td>(.*?)</td>', row,
                                           flags).group(1)
             comm.co_all_size = re.search('<td.*?<td.*?<td>(.*?)</td>',
                                          row, flags).group(1)
             comm.insert_db()
         except Exception as e:
             print('小区错误,co_index={},html_str={}'.format(co_index, row), e)
 def get_comm_info(self, comm_url_list):
     """Queue one ProducerListUrl job per community page on www.fjnpfdc.com.

     NOTE(review): the ``comm.*`` attributes below are intentionally regex
     *pattern strings*, not extracted values — ``comm.to_dict()`` is handed
     to ProducerListUrl as ``analyzer_rules_dict`` (analyzer_type='regex'),
     so the extraction happens inside the producer. Do not rewrite these
     assignments as ``re.search`` calls.
     """
     for i in comm_url_list:
         try:
             comm = Comm(co_index)
             comm_url = 'http://www.fjnpfdc.com/House/' + i
             # extraction rules: one capture group each
             comm.co_develops = '公司名称:.*?<td.*?>(.*?)<'
             comm.co_pre_sale = '预售许可证:.*?<td.*?>(.*?)<'
             comm.co_name = '项目名称:.*?<td.*?>(.*?)<'
             comm.co_address = '项目坐落:.*?<td.*?>(.*?)<'
             comm.co_use = '规划用途:.*?<td.*?>(.*?)<'
             comm.co_build_size = '建筑面积:.*?<td.*?>(.*?)<'
             comm.co_id = 'ProjectId=(.*?)&'
             p = ProducerListUrl(
                 page_url=comm_url,
                 request_type='get',
                 encode='gbk',  # the site serves GBK, not UTF-8
                 analyzer_rules_dict=comm.to_dict(),
                 current_url_rule="<a href='(BuildingInfo.*?)'",
                 analyzer_type='regex',
                 headers=self.headers)
             # follow the per-building links discovered on the page
             build_url_list = p.get_details()
             self.get_build_info(build_url_list)
         except Exception as e:
             print("co_index={},小区{}错误".format(co_index, i), e)
 def get_comm_info(self, all_html_list):
     """Parse the community table rows out of each listing page and store a
     Comm record per row, then crawl that community's buildings.

     Fix: the try/except used to wrap the whole row loop, so one malformed
     ``<tr>`` silently discarded every remaining row of the page; the guard
     now covers a single row.
     """
     for html in all_html_list:
         comm_info_paper_list = re.findall('<tr>.*?</tr>', html,
                                           re.S | re.M)
         # first <tr> is the table header
         for i in comm_info_paper_list[1:]:
             try:
                 comm = Comm(co_index)
                 # successive align="center" cells: district, name,
                 # address (in title=), total house count
                 comm.area = re.search('align="center">(.*?)<', i,
                                       re.S | re.M).group(1)
                 comm.co_name = re.search(
                     'align="center".*?align="center".*?>(.*?)<', i,
                     re.S | re.M).group(1)
                 comm.co_address = re.search(
                     'align="center".*?align="center".*?align="center".*?title="(.*?)"',
                     i, re.S | re.M).group(1)
                 comm.co_all_house = re.search(
                     'align="center".*?align="center".*?align="center".*?align="center".*?>(.*?)<',
                     i, re.S | re.M).group(1)
                 comm.co_id = re.search('projectID=(.*?)&', i,
                                        re.S | re.M).group(1)
                 comm.insert_db()
                 self.get_build_info(comm.co_id)
             except Exception as e:
                 print('解析错误,co_index={},方法:get_comm_info'.format(co_index), e)
    def start(self):
        """Walk every listing page of www.czfdc.gov.cn, collect the
        community links and feed each to ``self.get_comm_info``. A failed
        page is reported and skipped.
        """
        page = self.get_all_page()
        count = 0  # running total of communities seen, for progress output
        for i in range(1, int(page) + 1):
            try:
                url = 'http://www.czfdc.gov.cn/spf/gs.php?pageid=' + str(i)
                response = requests.get(url, headers=self.headers)
                # the site serves GBK, not UTF-8
                html = response.content.decode('gbk')
                tree = etree.HTML(html)
                comm_url_list = tree.xpath('//td[@align="left"]/a/@href')

                for j in comm_url_list:
                    count += 1
                    print(count)
                    # was the magic literal Comm(6); use the instance's own
                    # index, which the error message below already reports
                    comm = Comm(self.co_index)
                    comm_url = 'http://www.czfdc.gov.cn/spf/' + j
                    self.get_comm_info(comm_url, comm)
            except Exception as e:
                print('co_index={},翻页有问题,url={}'.format(self.co_index, url), e)
                continue