Example #1
    def start_crawler(self):
        for region in self.region.items():
            region_code = region[0]
            region_name = region[1]
            url = self.start_url + region_code + '.html'
            b = AllListUrl(
                first_page_url=url,
                request_method='get',
                analyzer_type='regex',
                encode='utf-8',
                page_count_rule='共(\d+)页>',
            )
            page = b.get_page_count()
            for i in range(1, int(page) + 1):
                new_url = url + "?page=" + str(i)
                res = requests.get(new_url, headers=self.headers)
                html = etree.HTML(res.text)
                co_list = html.xpath("//dl[@class='spf_lp_searchlist bg1']")
                for co in co_list:
                    comm = Comm(co_index)
                    co_url = co.xpath("./dt/h4/a/@href")[0]
                    comm.co_name = co.xpath("./dt/h4/a/text()")[0]
                    comm.co_address = co.xpath(".//address/text()")[0]
                    comm.co_id = re.search('\d+', co_url).group(0)
                    comm.co_develops = co.xpath(
                        "./dd[@class='dev']/a/text()")[0]
                    comm.co_plan_pro = co.xpath("./dt/h4/span/text()")[0]
                    comm.co_type = co.xpath(".//p/span[2]/text()")[0]
                    comm.area = region_name
                    comm.insert_db()

                    detail_url = "http://www.zstmsf.com" + co_url
                    self.bu_parse(detail_url, comm.co_id)
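
The AllListUrl helper itself is not shown in any of these examples. A minimal sketch of what its get_page_count() presumably does, assuming it simply fetches first_page_url, decodes the body with the configured charset, and applies the page_count_rule regex (the constructor arguments are taken from the calls above; the implementation is an assumption, not the real class):

# Hypothetical sketch of AllListUrl, inferred from how the examples call it.
import re
import requests

class AllListUrl:
    def __init__(self, first_page_url, request_method='get', analyzer_type='regex',
                 encode='utf-8', page_count_rule=None, headers=None):
        self.first_page_url = first_page_url
        self.request_method = request_method
        self.analyzer_type = analyzer_type
        self.encode = encode
        self.page_count_rule = page_count_rule
        self.headers = headers

    def get_page_count(self):
        # Fetch the first list page and decode it with the configured charset.
        res = requests.request(self.request_method, self.first_page_url,
                               headers=self.headers, timeout=10)
        html = res.content.decode(self.encode, errors='ignore')
        # group(1) of the rule is the total page count, returned as a string,
        # which is why every caller wraps it in int().
        match = re.search(self.page_count_rule, html, re.S | re.M)
        return match.group(1) if match else '1'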
Example #2
 def start_crawler(self):
     b = AllListUrl(
         first_page_url=url,
         request_method='get',
         analyzer_type='regex',
         encode='utf-8',
         page_count_rule='green1">1/(.*?)<',
     )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         all_url = 'http://www.tmsf.com/newhouse/property_searchall.htm?&page=' + str(
             i)
         while True:
             try:
                 response = requests.get(all_url,
                                         headers=self.headers,
                                         timeout=10)
                 if response.status_code == 200:
                     break
             except Exception as e:
                 print(
                     '小区列表页加载不出来,co_index={},url={}'.format(
                         co_index, all_url), e)
         html = response.text
         comm_url_list = re.findall(
             'build_word01" onclick="toPropertyInfo\((.*?)\);', html,
             re.S | re.M)
         self.get_comm_info(comm_url_list)
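
The while True retry above never gives up if the server keeps returning a non-200 status. A bounded-retry variant of the same request, purely as a sketch (fetch_with_retry, max_retries and the back-off delay are illustrative names, not part of the original crawler):

# Hypothetical bounded retry; the caller gets None if the page never loads.
import time
import requests

def fetch_with_retry(url, headers, max_retries=5, timeout=10):
    for attempt in range(1, max_retries + 1):
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            if response.status_code == 200:
                return response
        except requests.RequestException as e:
            print('request failed (attempt {}): {}'.format(attempt, e))
        time.sleep(2)  # brief pause before the next attempt
    return None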
Example #3
 def start_crawler(self):
     b = AllListUrl(
         first_page_url=self.start_url,
         request_method='get',
         analyzer_type='regex',
         encode='utf-8',
         page_count_rule='共.*?Count">(\d+)</span>页',
     )
     page = b.get_page_count()
     data = {'__EVENTTARGET': 'navigate$LnkBtnGoto'}
     for i in range(1, int(page) + 1):
         if i == 1:
             res = requests.get(self.start_url, headers=self.headers)
             con = res.content.decode()
             html = etree.HTML(con)
             view_state = html.xpath("//input[@id='__VIEWSTATE']/@value")[0]
             valid = html.xpath(
                 "//input[@id='__EVENTVALIDATION']/@value")[0]
             data['__VIEWSTATE'] = view_state
             data['__EVENTVALIDATION'] = valid
             self.comm_list(html)
         else:
             data['navigate$txtNewPageIndex'] = i
             res = requests.post(self.start_url,
                                 data=data,
                                 headers=self.headers)
             con = res.content.decode()
             html = etree.HTML(con)
             view_state = html.xpath("//input[@id='__VIEWSTATE']/@value")[0]
             valid = html.xpath(
                 "//input[@id='__EVENTVALIDATION']/@value")[0]
             data['__VIEWSTATE'] = view_state
             data['__EVENTVALIDATION'] = valid
             self.comm_list(html)
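
Examples #3, #4, #13 and #29 all paginate ASP.NET WebForms sites by re-posting the hidden __VIEWSTATE / __EVENTVALIDATION fields taken from the previous response. A compact sketch of that round-trip, reusing the field and target names from Example #3 (the generator itself is illustrative):

# WebForms pagination: GET the first page, then POST the carried-over state.
import requests
from lxml import etree

def iter_webforms_pages(start_url, headers, page_count):
    data = {'__EVENTTARGET': 'navigate$LnkBtnGoto'}
    for i in range(1, int(page_count) + 1):
        if i == 1:
            res = requests.get(start_url, headers=headers)
        else:
            data['navigate$txtNewPageIndex'] = i
            res = requests.post(start_url, data=data, headers=headers)
        html = etree.HTML(res.content.decode())
        # Carry the hidden state fields over to the next POST.
        data['__VIEWSTATE'] = html.xpath("//input[@id='__VIEWSTATE']/@value")[0]
        data['__EVENTVALIDATION'] = html.xpath(
            "//input[@id='__EVENTVALIDATION']/@value")[0]
        yield html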
Example #4
    def start_crawler(self):
        b = AllListUrl(
            first_page_url=self.start_url,
            request_method='get',
            analyzer_type='regex',
            encode='gbk',
            page_count_rule='总共<b>(\d+)<',
        )
        page = b.get_page_count()

        formdata = {}
        comm_url_list = []
        for i in range(1, int(page) + 1):

            res = requests.post(
                self.start_url,
                data=formdata,
            )
            con = res.content.decode('gbk')
            con = etree.HTML(con)
            view_state = con.xpath("//input[@name='__VIEWSTATE']/@value")[0]
            valid = con.xpath("//input[@name='__EVENTVALIDATION']/@value")[0]
            view_state = parse.quote_plus(view_state, encoding='gbk')
            valid = parse.quote_plus(valid, encoding='gbk')
            formdata["__VIEWSTATE"] = view_state  # 保存当前页的信息作为下一页请求参数
            formdata["__EVENTVALIDATION"] = valid
            formdata["__EVENTTARGET"] = 'AspNetPager1'
            formdata["__VIEWSTATEGENERATOR"] = "248CD702"
            formdata["__EVENTARGUMENT"] = str(i + 1)
            formdata["AspNetPager1_input"] = str(i)

            url_list = con.xpath(
                "//tr[@bgcolor='#F5F9FC']/td[@bgcolor='white']/a/@href")
            comm_url_list.extend(url_list)
        self.comm_info(comm_url_list)
Example #5
    def start_crawler(self):
        b = AllListUrl(first_page_url=self.start_url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='utf-8',
                       page_count_rule='共(\d+)页',
                       )
        page = b.get_page_count()
        for i in range(1,int(page)+1):
            url = self.start_url + '?pageIndex=' + str(i)
            page_res = requests.get(url,headers=self.headers)

            html = etree.HTML(page_res.text)
            comm_info_list = html.xpath("//ul/li/div")
            for comm_info in comm_info_list:
                try:
                    co = Comm(co_index)
                    co.co_name = comm_info.xpath("./p/a/text()")[0]
                    deve = comm_info.xpath("./p[2]/text()")[0]
                    addr = comm_info.xpath("./p[3]/text()")[0]
                    co.co_develops = re.search('开发商:(.*)',deve).group(1)
                    co.co_address = re.search('楼盘地址.*?:(.*)',addr).group(1)
                    comm_url = comm_info.xpath("./p/a/@href")[0]
                    co.co_id = re.search('projectId=(\d+)',comm_url).group(1)
                    co.insert_db()
                    co_url = 'http://www.bdfdc.net' + comm_url
                    co_res = requests.get(co_url,headers=self.headers)
                    time.sleep(5)
                    bu_html = etree.HTML(co_res.text)
                    bu_url_list = bu_html.xpath("//div[@style]/a")[1:]
                except Exception as e:
                    # log.error("小区信息错误{}".format(e))
                    print("小区信息错误{}".format(e))
                    continue
                self.bu_info(bu_url_list,co.co_id)
Example #6
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='gbk',
                    page_count_rule='第1页 / 共(.*?)页',
                    )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         all_url = 'http://113.106.199.148/web/presale.jsp?page=' + str(i)
         try:
             self.get_comm_url(all_url)
         except Exception as e:
             print('page页错误,co_index={},url={}'.format(co_index, all_url), e)
Example #7
 def start_crawler(self):
     b = AllListUrl(first_page_url=self.start_url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='gbk',
                    page_count_rule='共(\d+)页',
                    )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         url = self.start_url + '?page2=' + str(i)
         res = requests.get(url,headers=self.headers)
         html = etree.HTML(res.content.decode('gbk'))
         comm_url_list = html.xpath("//td/a[@target]/@href")
         self.comm_info(comm_url_list)
Example #8
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='utf-8',
                    page_count_rule=' 1/(.*?)页',
                    headers=self.headers)
     page = b.get_page_count()
     for i in range(int(page)):
         all_page_url = url + '?page=' + str(i)
         response = requests.get(all_page_url, headers=self.headers)
         html = response.text
         comm_detail_url_list = re.findall(
             '(/House/ProjectInfo\?ProjectId=.*?)"', html, re.S | re.M)
         self.get_comm_info(comm_detail_url_list)
Example #9
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='utf-8',
                    page_count_rule='>><.*?aspx\?p=(.*?)"',
                    headers=self.headers)
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         all_page_url = url + '?p=' + str(i)
         response = requests.get(all_page_url, headers=self.headers)
         html = response.text
         tree = etree.HTML(html)
         comm_url_list = tree.xpath('//a[@class="sp_zi12c"]/@href')
         self.get_comm_info(comm_url_list)
Example #10
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='gbk',
                    page_count_rule=' 1/(.*?)页',
                    headers=self.headers)
     page = b.get_page_count()
     for i in range(int(page)):
         all_page_url = 'http://www.fjnpfdc.com/House/ListCanSell?page=' + str(
             i)
         response = requests.get(all_page_url)
         html = response.text
         comm_url_list = re.findall('<tr align="center">.*?<a href="(.*?)"',
                                    html, re.S | re.M)
         self.get_comm_info(comm_url_list)
Example #11
    def start_crawler(self):
        b = AllListUrl(
            first_page_url=self.url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='" >(\d+)</a>&nbsp',
        )
        page = b.get_page_count()

        for i in range(1, int(page) + 1):  # paginate
            url = "http://www.hufdc.com/presell.jspx?pageno=" + str(i)
            response = requests.get(url, headers=self.headers)

            url_html = etree.HTML(response.text)
            self.comm_parse(url_html)
Example #12
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='utf-8',
                    page_count_rule='1/(.*?)页',
                    headers=self.headers)
     page = b.get_page_count()
     for i in range(0, int(page) + 1):
         page_url = 'http://www.fjlyfdc.com.cn/House/Link/YSXXCX.cshtml?pagenumber=' + str(
             i)
         response = requests.get(page_url)
         html = response.text
         comm_url_list = re.findall('class="c".*?href="(.*?)"', html,
                                    re.S | re.M)
         self.get_comm_info(comm_url_list)
Example #13
    def start_crawler(self):
        b = AllListUrl(
            first_page_url=self.url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='共.*?Count">(\d+)</span>页',
        )
        page = b.get_page_count()
        formdata = {}
        for i in range(1, int(page) + 1):
            formdata["__EVENTTARGET"] = "navigate$LnkBtnGoto"
            formdata["navigate$txtNewPageIndex"] = i
            try:
                res = requests.post(self.url, data=formdata, headers=self.headers)
            except Exception as e:
                print("co_index={},第{}页翻页失败".format(co_index, i))
                print(e)
                continue
            con = etree.HTML(res.text)
            formdata["__VIEWSTATE"] = con.xpath(
                "//input[@id='__VIEWSTATE']/@value")[0]
            formdata["__EVENTVALIDATION"] = con.xpath(
                "//input[@id='__EVENTVALIDATION']/@value")[0]

            bu_url_list = con.xpath("//td[@style='width:13%']/a/@href")
            bu_pre = con.xpath("//td[@style='width:13%']/a/text()")
            bu_dev = con.xpath("//td[@style='width:24%']/text()")
            co_name = con.xpath("//td[@style='width:15%']/text()")
            # print(i)
            for index in range(len(bu_url_list)):
                bu_detail = "http://www.hcsfcglj.com/Templets/BoZhou/aspx/" + bu_url_list[
                    index]
                bu_pre_sale = bu_pre[index]
                bo_develops = bu_dev[index]
                bu_co_name = co_name[index]
                try:
                    bu_res = requests.get(bu_detail, headers=self.headers)
                except Exception as e:
                    print("co_index={},楼栋{}无法访问".format(co_index, bu_detail))
                    print(e)
                    continue
                bu_con = bu_res.text

                self.get_build_info(bu_pre_sale, bo_develops, bu_co_name,
                                    bu_con)
                self.get_house_info(bu_con)
Example #14
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='utf-8',
                    page_count_rule='下页</a>.*?page=(.*?)"',
                    )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         try:
             all_page_url = url + '?page=' + str(i)
             response = requests.get(all_page_url, headers=self.headers)
             html = response.text
             comm_url_list = re.findall('项目名称:.*?href="(.*?)"', html, re.S | re.M)
             self.get_comm_info(comm_url_list)
         except Exception as e:
             print('page页面错误,co_index={},url={}'.format(co_index, all_page_url), e)
Example #15
    def start_crawler(self):
        b = AllListUrl(
            first_page_url=self.start_url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='cite>.*?/(.*?)页<',
        )
        page = b.get_page_count()

        for i in range(1, int(page) + 1):

            url = self.start_url + '&page=' + str(i)
            res = requests.get(url, headers=self.headers)
            comm_url_list = re.findall("window.open\('(.*?)'\)", res.text,
                                       re.S | re.M)
            self.comm_info(comm_url_list)
Example #16
 def start_crawler(self):
     for region in self.region_list:
         region_url = self.start_url + region
         b = AllListUrl(
             first_page_url=region_url,
             request_method='get',
             analyzer_type='regex',
             encode='utf-8',
             page_count_rule='1/(\d+)页',
         )
         page = b.get_page_count()
         for i in range(1, int(page) + 1):
             url = region_url + "&pagenumber=" + str(i)
             res = requests.get(url, headers=self.headers)
             html = etree.HTML(res.text)
             url_list = html.xpath("//tr/td/a/@href")
             self.comm_parse(url_list, region)
Example #17
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='utf-8',
                    page_count_rule='> ..<.*?>(.*?)<',
                    headers=self.headers)
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         page_all_url = 'http://www.dzfgj.com/index.php?m=content&c=index&a=lists&catid=61&page=' + str(
             i)
         response = requests.get(page_all_url, headers=self.headers)
         html = response.text
         comm_html = re.search('<tbody>.*?</tbody>', html,
                               re.S | re.M).group()
         comm_info_list = re.findall('<tr>.*?</tr>', comm_html, re.S | re.M)
         self.get_comm_info(comm_info_list)
Example #18
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='utf-8',
                    page_count_rule='> ..<a.*?>(.*?)<',
                    headers=self.headers)
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         page_url = "http://zfbzj.baotou.gov.cn/index.php?m=content&c=permit&a=init&page=" + str(
             i)
         response = requests.get(page_url, headers=self.headers)
         html = response.text
         comm_url_list = re.findall(
             'href="(http://zfbzj.baotou\.gov\.cn/index\.php\?m=content&c=permit&a=show&id=.*?)".*?http://zfbzj',
             html, re.S | re.M)
         self.get_comm_info(comm_url_list)
Example #19
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='gbk',
                    page_count_rule='页次:<b><font color=red>1</font></b>/<b>(.*?)<',
                    headers=self.headers
                    )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         page_url = 'http://123.131.127.13/xy/dzlist.asp?page=' + str(i)
         time.sleep(5)
         response = requests.get(page_url, headers=self.headers)
         html = response.content.decode('gbk')
         comm_list_html = re.search('项目电子手册列表.*?<table(.*?)</table>', html, re.S | re.M).group(1)
         comm_html_list = re.findall('<tr>(.*?)</tr>', comm_list_html, re.S | re.M)[1:]
         self.get_comm_info(comm_html_list)
Example #20
    def start_crawler(self):
        b = AllListUrl(first_page_url=self.start_url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='utf-8',
                       page_count_rule='页数.*?/(\d+)',
                       )
        page = b.get_page_count()

        for i in range(1,int(page)+1):
            formdata = {
                'page':i,
                'keytype':1,
            }
            res = requests.post(self.start_url,data=formdata,headers=self.headers)
            html = etree.HTML(res.text)
            url_list = html.xpath("//h3/a")
            self.co_parse(url_list)
Example #21
    def start_crawler(self):
        b = AllListUrl(
            first_page_url=url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='green1">1/(.*?)<',
        )
        page = b.get_page_count()
        for i in range(1, int(page) + 1):
            all_url = 'http://tz.tmsf.com/newhouse/property_searchall.htm'
            data = {'keytype': 1, 'page': i}
            response = requests.post(all_url, data=data, headers=self.headers)

            html = response.text
            comm_url_list = re.findall(
                '<div class="build_txt">.*?<a href="(.*?)"', html, re.S | re.M)
            for comm_url in comm_url_list:
                self.get_comm_detail(comm_url)
Example #22
 def start_crawler(self):
     start_url = self.start_url + "searchSpf.jsp?nowPage=1"
     b = AllListUrl(
         first_page_url=start_url,
         request_method='get',
         analyzer_type='regex',
         encode='utf-8',
         page_count_rule='/(\d+)页',
     )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         url = self.start_url + "searchSpf.jsp?nowPage=" + str(i)
         res = requests.get(url, headers=self.headers)
         html = etree.HTML(res.content.decode())
         url_list = html.xpath("//b/a/@href")
         for comm_temp in url_list:
             try:
                 comm_url = self.start_url + comm_temp.replace(
                     "./xmxxmainNew", 'xmxx/xmjbxx')
                 com_res = requests.get(comm_url, headers=self.headers)
                 con = com_res.content.decode('gbk')
                 co = Comm(co_index)
                 co.co_id = re.search('Id_xmxq=(.*)', comm_temp).group(1)
                 co.co_name = re.search('3a3a3a">(.*?)</b', con).group(1)
                 co.co_address = re.search('项目地址.*?">(.*?)</td', con,
                                           re.S | re.M).group(1)
                 co.co_develops = re.search('开 发 商.*?">(.*?)</td', con,
                                            re.S | re.M).group(1)
                 co.co_all_house = re.search('总 套 数.*?<td>(.*?)</td', con,
                                             re.S | re.M).group(1)
                 co.co_green = re.search('绿 化 率.*?<td>(.*?)</td', con,
                                         re.S | re.M).group(1)
                 co.co_volumetric = re.search('容 积 率.*?<td>(.*?)</td', con,
                                              re.S | re.M).group(1)
                 try:
                     co.co_build_size = re.search('建设规模.*?" >(.*?)平', con,
                                                  re.S | re.M).group(1)
                 except:
                     co.co_build_size = None
                 co.insert_db()
             except Exception as e:
                 log.error('{}小区错误{}'.format(comm_temp, e))
                 continue
             self.build_parse(co.co_id)
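
Comm is also external to these snippets; from the way fields such as co_name, co_address and co_id are assigned before insert_db(), it is presumably a simple record object that writes one row per community. A purely illustrative stand-in (the sqlite backend, table and file names are invented here, not the project's real model):

# Hypothetical stand-in for Comm, inferred from attribute usage in the examples.
import sqlite3

class Comm:
    FIELDS = ('co_id', 'co_name', 'co_address', 'co_develops', 'co_plan_pro',
              'co_type', 'area', 'co_all_house', 'co_green', 'co_volumetric',
              'co_build_size')

    def __init__(self, co_index):
        self.co_index = co_index  # which city/site the record came from

    def insert_db(self):
        # Collect whichever fields the crawler managed to set; missing ones stay None.
        row = {'co_index': self.co_index}
        row.update({f: getattr(self, f, None) for f in self.FIELDS})
        with sqlite3.connect('comm.db') as conn:
            conn.execute('CREATE TABLE IF NOT EXISTS comm ({})'.format(
                ', '.join('{} TEXT'.format(c) for c in row)))
            conn.execute(
                'INSERT INTO comm ({}) VALUES ({})'.format(
                    ', '.join(row), ', '.join('?' for _ in row)),
                list(row.values()))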
Example #23
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='utf-8',
                    page_count_rule='共(.*?)页',
                    headers=self.headers)
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         if i == 1:
             all_page_url = 'http://www.xyfcj.com/html/jplp/index.html'
         else:
             all_page_url = 'http://www.xyfcj.com/html/jplp/index_' + str(
                 i) + '.html'
         response = requests.get(all_page_url, headers=self.headers)
         html = response.text
         comm_url_list = re.findall(
             '<a style="COLOR: #000000" target="_blank" href="(.*?)"', html,
             re.S | re.M)
         self.get_comm_info(comm_url_list)
Example #24
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='gbk',
                    page_count_rule='strong>1/(.*?)<',
                    headers=self.headers)
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         index_url = 'http://www.kmhouse.org/moreHousePriceList.asp?page=' + str(
             i)
         try:
             response = requests.get(url=index_url, headers=self.headers)
             html = response.content.decode('gbk')
             comm_url_list = re.findall("cellspacing='3'.*?<a href='(.*?)'",
                                        html)
             self.get_comm_info(comm_url_list)
         except Exception as e:
             print('page页错误,co_index={},url={}'.format(co_index, index_url),
                   e)
Example #25
 def start_crawler(self):
     b = AllListUrl(
         first_page_url=url,
         request_method='get',
         analyzer_type='regex',
         encode='utf-8',
         page_count_rule='pg.pageCount = (.*?);',
     )
     page = b.get_page_count()
     all_url_list = []
     for i in range(1, int(page) + 1):
         all_url = 'http://www.gafdc.cn/newhouse/houselist.aspx?hou=0-0-0-0-0-0-&page=' + str(
             i)
         comm_url_list = self.get_comm_url(all_url)
         all_url_list += comm_url_list
     # iterate over every community URL
     for i in all_url_list:
         comm_url = 'http://www.gafdc.cn/newhouse/' + str(
             i.replace('index', 'base'))
         try:
             self.get_comm_info(comm_url)
         except Exception as e:
             print('小区错误,co_index={},url={}'.format(co_index, comm_url), e)
     all_build_url_list = []
     for i in all_url_list:
         build_url = 'http://www.gafdc.cn/newhouse/' + str(
             i.replace('index', 'table'))
         house_url_list = self.get_build_info(build_url)
         if house_url_list:
             all_build_url_list += house_url_list
         else:
             print('楼栋错误,此小区没有楼栋,co_index={},url={}'.format(
                 co_index, build_url))
     all_house_url_list = []
     form_data_list = []
     for i in all_build_url_list:
         house_url = 'http://www.gafdc.cn/newhouse/GetBuildTableByAjax.ashx'
         data = {'itemRecord': i[0], 'houseCode': i[1]}
         all_house_url_list.append(house_url)
         form_data_list.append(data)
     self.get_house_info(form_data_list)
Example #26
    def start_crawler(self):
        b = AllListUrl(
            first_page_url=url,
            request_method='get',
            analyzer_type='regex',
            encode='gbk',
            page_count_rule='共(.*?)页',
        )
        page = b.get_page_count()

        for i in range(1, int(page) + 1):
            all_page_url = url + '&Page=' + str(i)
            p = ProducerListUrl(page_url=all_page_url,
                                request_type='get',
                                encode='gbk',
                                analyzer_rules_dict=None,
                                current_url_rule="eval\('openBldg\((.*?)\)",
                                analyzer_type='regex',
                                headers=self.headers)
            comm_url_list = p.get_current_page_url()
            self.get_build_info(comm_url_list)
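
ProducerListUrl (used here and in Example #30) is likewise not shown. Judging from its arguments, get_current_page_url() presumably fetches page_url and extracts the per-item links with current_url_rule, treating the rule as a regex or an XPath depending on analyzer_type. A rough sketch under that assumption:

# Hypothetical sketch of ProducerListUrl, inferred from usage; not the real class.
import re
import requests
from lxml import etree

class ProducerListUrl:
    def __init__(self, page_url, request_type='get', encode='utf-8',
                 analyzer_rules_dict=None, current_url_rule=None,
                 analyzer_type='regex', headers=None):
        self.page_url = page_url
        self.request_type = request_type
        self.encode = encode
        self.current_url_rule = current_url_rule
        self.analyzer_type = analyzer_type
        self.headers = headers

    def get_current_page_url(self):
        res = requests.request(self.request_type, self.page_url,
                               headers=self.headers, timeout=10)
        html = res.content.decode(self.encode, errors='ignore')
        if self.analyzer_type == 'regex':
            # e.g. "eval\('openBldg\((.*?)\)" in Example #26
            return re.findall(self.current_url_rule, html, re.S | re.M)
        # otherwise treat the rule as an XPath, as in Example #30
        return etree.HTML(html).xpath(self.current_url_rule)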
Example #27
 def start_crawler(self):
     b = AllListUrl(
         first_page_url=url,
         request_method='get',
         analyzer_type='regex',
         encode='gbk',
         page_count_rule='下一页.*?page=(.*?)"',
     )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         all_page_url = 'http://www.sxczfdc.com/pubinfo/More_xm.aspx?page=' + str(
             i)
         response = requests.get(all_page_url, headers=self.headers)
         html = response.text
         comm_url_list = re.findall(
             'style="background-color: .*?(Pub_lpxx.aspx\?DevProjectId=.*?)"',
             html, re.S | re.M)
         area_list = re.findall(
             'style="background-color: .*?center">(.*?)<', html,
             re.S | re.M)
         self.get_comm_info(comm_url_list, area_list)
Example #28
 def start(self):
     b = AllListUrl(first_page_url=self.url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='gbk',
                    page_count_rule='共(.*?)页', )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         all_page_url = self.url + '&page=' + str(i)
         response = requests.get(url=all_page_url, headers=self.headers)
         html = response.text
         tree = etree.HTML(html)
         comm_url_list = tree.xpath('//dt[@class="name"]/a/@href')
         area_list = tree.xpath('//dl[@class="houseList_n"]/dd[3]/text()')
         for i in range(len(comm_url_list)):
             url = 'http://www.fzfgj.cn/' + comm_url_list[i]
             try:
                 comm = Comm(11)
                 comm.area = area_list[i].replace('所属区域:', '')
                 self.get_comm_info(url, comm)
             except BaseException as e:
                 print('小区错误,co_index={},url={}'.format(co_index, url), e)
Example #29
    def start_crawler(self):
        b = AllListUrl(
            first_page_url=self.url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='1/(\d+)页',
        )
        page = b.get_page_count()

        formdata = {}
        for i in range(1, int(page) + 1):

            res = requests.post(self.url, data=formdata, headers=self.headers)
            html = etree.HTML(res.text)
            formdata["__EVENTTARGET"] = "Pager"
            formdata["__EVENTARGUMENT"] = str(i + 1)
            formdata["__VIEWSTATEGENERATOR"] = "1D9D200C"
            formdata["__VIEWSTATE"] = html.xpath(
                "//input[@id='__VIEWSTATE']/@value")[0]

            comm_url_list = html.xpath("//h3/a/@href")
            self.get_comm_info(comm_url_list)
Example #30
 def start_crawler(self):
     b = AllListUrl(
         first_page_url=self.start_url,
         request_method='get',
         analyzer_type='regex',
         encode='utf-8',
         page_count_rule='<cite>共.*?/(\d+)页',
     )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         url = "http://www.f0795.cn/house/index-htm-page-" + str(
             i) + ".html"
         p = ProducerListUrl(
             page_url=url,
             request_type='get',
             encode='utf-8',
             analyzer_rules_dict=None,
             current_url_rule=
             "//ul[@class='list']//div[@class='text']/h3/a/@href",
             analyzer_type='xpath',
             headers=self.headers)
         comm_url_list = p.get_current_page_url()
         self.get_comm_info(comm_url_list)