Example #1
 def start_crawler(self):
     b = AllListUrl(
         first_page_url=self.start_url,
         request_method='get',
         analyzer_type='regex',
         encode='utf-8',
         page_count_rule='共.*?Count">(\d+)</span>页',
     )
     page = b.get_page_count()
     data = {'__EVENTTARGET': 'navigate$LnkBtnGoto'}
     for i in range(1, int(page) + 1):
         if i == 1:
             res = requests.get(self.start_url, headers=self.headers)
         else:
             data['navigate$txtNewPageIndex'] = i
             res = requests.post(self.start_url,
                                 data=data,
                                 headers=self.headers)
         con = res.content.decode()
         html = etree.HTML(con)
         # carry the WebForms hidden fields forward as the next POST's payload
         data['__VIEWSTATE'] = html.xpath(
             "//input[@id='__VIEWSTATE']/@value")[0]
         data['__EVENTVALIDATION'] = html.xpath(
             "//input[@id='__EVENTVALIDATION']/@value")[0]
         self.comm_list(html)
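
All of the examples on this page lean on the same `AllListUrl` helper, whose source is not shown. From the call sites alone (the keyword arguments and a single `get_page_count()` call whose result the callers pass to `int()`), its interface presumably looks like the sketch below; the body is an assumption inferred from usage, not the real implementation:

    import re
    import requests

    class AllListUrl:
        # Sketch inferred from usage only: fetch the first listing page and
        # pull the total page count out with the caller-supplied regex.
        def __init__(self, first_page_url, page_count_rule, request_method='get',
                     analyzer_type='regex', encode='utf-8', headers=None):
            self.first_page_url = first_page_url
            self.page_count_rule = page_count_rule
            self.request_method = request_method
            self.analyzer_type = analyzer_type  # only 'regex' appears in these examples
            self.encode = encode
            self.headers = headers

        def get_page_count(self):
            res = requests.request(self.request_method, self.first_page_url,
                                   headers=self.headers)
            con = res.content.decode(self.encode, errors='ignore')
            # returns a string; the callers wrap it in int()
            return re.search(self.page_count_rule, con, re.S | re.M).group(1)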
Example #2
 def start_crawler(self):
     b = AllListUrl(
         first_page_url=url,
         request_method='get',
         analyzer_type='regex',
         encode='utf-8',
         page_count_rule='green1">1/(.*?)<',
     )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         all_url = 'http://www.tmsf.com/newhouse/property_searchall.htm?&page=' + str(
             i)
         while True:
             try:
                 response = requests.get(all_url,
                                         headers=self.headers,
                                         timeout=10)
                 if response.status_code == 200:
                     break
             except Exception as e:
                 print(
                     'community list page failed to load, co_index={}, url={}'.format(
                         co_index, all_url), e)
         html = response.text
         comm_url_list = re.findall(
             'build_word01" onclick="toPropertyInfo\((.*?)\);', html,
             re.S | re.M)
         self.get_comm_info(comm_url_list)
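
The `while True` retry above never gives up: if the page keeps failing, or keeps answering with a non-200 status, the loop spins forever and hammers the server. A bounded variant is sketched below; the attempt limit and the pause are assumptions, not values from the original:

    import time
    import requests

    def fetch_with_retry(url, headers, retries=3, timeout=10):
        # Hypothetical helper: give up after a fixed number of attempts
        # instead of looping forever.
        for attempt in range(1, retries + 1):
            try:
                response = requests.get(url, headers=headers, timeout=timeout)
                if response.status_code == 200:
                    return response
            except Exception as e:
                print('request failed, attempt={}, url={}'.format(attempt, url), e)
            time.sleep(1)  # brief pause before the next attempt
        return None  # caller decides what to do when all attempts fail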
Example #3
    def get_comm_info(self, all_url_list):
        for i in all_url_list:
            res = requests.get(i, headers=self.headers)
            con = res.content.decode('gbk')
            current_url_list = re.findall(
                '<a href="(.*?)"  target="_blank">查看详细', con)
            for current_url in current_url_list:
                co_id = re.search('id=(\d+)', current_url).group(1)
                res = requests.get(current_url, headers=self.headers)
                con = res.content.decode('gbk')
                if '尾页' in con:
                    b = AllListUrl(first_page_url=current_url,
                                   page_count_rule='总页数.*?<b>(\d+)</b>',
                                   analyzer_type='regex',
                                   request_method='get',
                                   headers=self.headers,
                                   encode='gbk')

                    page_count = b.get_page_count()
                    for page_index in range(1, int(page_count) + 1):
                        url = current_url + "&page=" + str(page_index)
                        comm_page = requests.get(url, headers=self.headers)
                        comm_con = comm_page.content.decode('gbk')
                        self.comm_info_parse(comm_con, co_id)
                else:
                    self.comm_info_parse(con, co_id)
Example #4
    def start_crawler(self):
        b = AllListUrl(first_page_url=self.start_url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='gbk',
                       page_count_rule='总共<b>(\d+)<',
                       )
        page = b.get_page_count()

        formdata = {}
        comm_url_list = []
        for i in range(1, int(page) + 1):

            res = requests.post(self.start_url, data=formdata)
            con = res.content.decode('gbk')
            con = etree.HTML(con)
            view_state = con.xpath("//input[@name='__VIEWSTATE']/@value")[0]
            valid = con.xpath("//input[@name='__EVENTVALIDATION']/@value")[0]
            view_state = parse.quote_plus(view_state, encoding='gbk')
            valid = parse.quote_plus(valid, encoding='gbk')
            formdata["__VIEWSTATE"] = view_state  # save this page's state as the next page's POST parameters
            formdata["__EVENTVALIDATION"] = valid
            formdata["__EVENTTARGET"] = 'AspNetPager1'
            formdata["__VIEWSTATEGENERATOR"] = "248CD702"
            formdata["__EVENTARGUMENT"] = str(i + 1)
            formdata["AspNetPager1_input"] = str(i)

            url_list = con.xpath("//tr[@bgcolor='#F5F9FC']/td[@bgcolor='white']/a/@href")
            comm_url_list.extend(url_list)
        self.comm_info(comm_url_list)
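
Examples #1, #4, #15, and #30 all page through ASP.NET WebForms sites the same way: read the `__VIEWSTATE` and `__EVENTVALIDATION` hidden inputs from the current response and echo them back in the next POST. That state handling could be factored into one helper; the sketch below is hypothetical, and the XPath covers both the `@id` and `@name` lookups seen in the examples:

    from lxml import etree

    def extract_webforms_state(html_text):
        # Hypothetical helper: collect the ASP.NET hidden fields that must be
        # posted back to request the next page.
        html = etree.HTML(html_text)
        state = {}
        for field in ('__VIEWSTATE', '__EVENTVALIDATION'):
            values = html.xpath("//input[@name='{0}' or @id='{0}']/@value".format(field))
            if values:
                state[field] = values[0]
        return state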
Example #5
    def start(self):
        b = AllListUrl(
            first_page_url=self.url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='pageTotal = (.*?);',
        )

        page = b.get_page_count()
        for i in range(1, int(page) + 1):
            url = 'http://fsfc.fsjw.gov.cn/search/index.do?p=' + str(i)
            response = requests.get(url, headers=self.headers)
            html = response.text
            tree = etree.HTML(html)
            comm_url_list = tree.xpath(
                '//*[@id="content"]/div[2]/div[1]/dl/dd/h3/a/@value')
            for comm_id in comm_url_list:
                comm = Comm(co_index)
                url = 'http://fsfc.fsjw.gov.cn/hpms_project/roomView.jhtml?id=' + comm_id
                try:
                    response = requests.get(url, headers=self.headers)
                except Exception as e:
                    print(e)
                    print("co_index={}, community detail page {} request failed".format(co_index, url))
                    continue
                self.get_comm_info(url, response, comm)
Example #6
    def start_crawler(self):
        for region_code, region_name in self.region.items():
            url = self.start_url + region_code + '.html'
            b = AllListUrl(
                first_page_url=url,
                request_method='get',
                analyzer_type='regex',
                encode='utf-8',
                page_count_rule='共(\d+)页>',
            )
            page = b.get_page_count()
            for i in range(1, int(page) + 1):
                new_url = url + "?page=" + str(i)
                res = requests.get(new_url, headers=self.headers)
                html = etree.HTML(res.text)
                co_list = html.xpath("//dl[@class='spf_lp_searchlist bg1']")
                for co in co_list:
                    comm = Comm(co_index)
                    co_url = co.xpath("./dt/h4/a/@href")[0]
                    comm.co_name = co.xpath("./dt/h4/a/text()")[0]
                    comm.co_address = co.xpath(".//address/text()")[0]
                    comm.co_id = re.search('\d+', co_url).group(0)
                    comm.co_develops = co.xpath(
                        "./dd[@class='dev']/a/text()")[0]
                    comm.co_plan_pro = co.xpath("./dt/h4/span/text()")[0]
                    comm.co_type = co.xpath(".//p/span[2]/text()")[0]
                    comm.area = region_name
                    comm.insert_db()

                    detail_url = "http://www.zstmsf.com" + co_url
                    self.bu_parse(detail_url, comm.co_id)
Example #7
    def start_crawler(self):
        b = AllListUrl(first_page_url=self.start_url,
                       request_method='get',
                       analyzer_type='regex',
                       encode='utf-8',
                       page_count_rule='共(\d+)页',
                       )
        page = b.get_page_count()
        for i in range(1, int(page) + 1):
            url = self.start_url + '?pageIndex=' + str(i)
            page_res = requests.get(url, headers=self.headers)

            html = etree.HTML(page_res.text)
            comm_info_list = html.xpath("//ul/li/div")
            for comm_info in comm_info_list:
                try:
                    co = Comm(co_index)
                    co.co_name = comm_info.xpath("./p/a/text()")[0]
                    deve = comm_info.xpath("./p[2]/text()")[0]
                    addr = comm_info.xpath("./p[3]/text()")[0]
                    co.co_develops = re.search('开发商:(.*)', deve).group(1)
                    co.co_address = re.search('楼盘地址.*?:(.*)', addr).group(1)
                    comm_url = comm_info.xpath("./p/a/@href")[0]
                    co.co_id = re.search('projectId=(\d+)', comm_url).group(1)
                    co.insert_db()
                    co_url = 'http://www.bdfdc.net' + comm_url
                    co_res = requests.get(co_url, headers=self.headers)
                    time.sleep(5)
                    bu_html = etree.HTML(co_res.text)
                    bu_url_list = bu_html.xpath("//div[@style]/a")[1:]
                except Exception as e:
                    print("community info error {}".format(e))
                    continue
                self.bu_info(bu_url_list, co.co_id)
Example #8
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='utf-8',
                    page_count_rule='1/(.*?)页',
                    headers=self.headers
                    )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         page_url = 'http://www.fjlyfdc.com.cn/House/Link/YSXXCX.cshtml?pagenumber=' + str(i)
         response = requests.get(page_url)
         html = response.text
         comm_url_list = re.findall('class="c".*?href="(.*?)"', html, re.S | re.M)
         self.get_comm_info(comm_url_list)
Example #9
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='gbk',
                    page_count_rule='>>>.*?page=(.*?)"',
                    headers=self.headers)
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         all_page_url = url + '&page=' + str(i)
         response = requests.get(all_page_url, headers=self.headers)
         html = response.text
         tree = etree.HTML(html)
         comm_url_list = tree.xpath('//div[@class="info"]/h3/a/@href')
         self.get_comm_info(comm_url_list)
Example #10
 def start_crawler(self):
     b = AllListUrl(
         first_page_url=self.start_url,
         request_method='get',
         analyzer_type='regex',
         encode='gbk',
         page_count_rule='共(\d+)页',
     )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         url = self.start_url + '?page2=' + str(i)
         res = requests.get(url, headers=self.headers)
         html = etree.HTML(res.content.decode('gbk'))
         comm_url_list = html.xpath("//td/a[@target]/@href")
         self.comm_info(comm_url_list)
Example #11
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='utf-8',
                    page_count_rule=' 1/(.*?)页',
                    headers=self.headers
                    )
     page = b.get_page_count()
     for i in range(int(page)):
         all_page_url = url + '?page=' + str(i)
         response = requests.get(all_page_url, headers=self.headers)
         html = response.text
         comm_detail_url_list = re.findall('(/House/ProjectInfo\?ProjectId=.*?)"', html, re.S | re.M)
         self.get_comm_info(comm_detail_url_list)
Example #12
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='gbk',
                    page_count_rule=' 1/(.*?)页',
                    headers=self.headers)
     page = b.get_page_count()
     for i in range(int(page)):
         all_page_url = 'http://www.fjnpfdc.com/House/ListCanSell?page=' + str(
             i)
         response = requests.get(all_page_url)
         html = response.text
         comm_url_list = re.findall('<tr align="center">.*?<a href="(.*?)"',
                                    html, re.S | re.M)
         self.get_comm_info(comm_url_list)
Example #13
    def start_crawler(self):
        b = AllListUrl(
            first_page_url=self.url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='" >(\d+)</a>&nbsp',
        )
        page = b.get_page_count()

        for i in range(1, int(page) + 1):  # paginate
            url = "http://www.hufdc.com/presell.jspx?pageno=" + str(i)
            response = requests.get(url, headers=self.headers)

            url_html = etree.HTML(response.text)
            self.comm_parse(url_html)
Example #14
    def start_crawler(self):
        all_url = AllListUrl(first_page_url=url,
                             page_count_rule='总页数.*?<b>(.*?)</b>',
                             analyzer_type='regex',
                             request_method='get',
                             headers=self.headers,
                             encode='gbk')
        # total number of list pages
        page_count = all_url.get_page_count()
        all_url_list = []
        for i in range(1, int(page_count) + 1):
            all_url_list.append(
                'http://www.gyfc.net.cn/2_proInfo/index.aspx/?page=' + str(i))
        print(all_url_list)

        self.get_comm_info(all_url_list)
Example #15
    def start_crawler(self):
        b = AllListUrl(
            first_page_url=self.url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='共.*?Count">(\d+)</span>页',
        )
        page = b.get_page_count()
        formdata = {}
        for i in range(1, int(page) + 1):
            formdata["__EVENTTARGET"] = "navigate$LnkBtnGoto"
            formdata["navigate$txtNewPageIndex"] = i
            try:
                res = requests.post(self.url,
                                    data=formdata,
                                    headers=self.headers)
            except Exception as e:
                print("co_index={}, paging to page {} failed".format(co_index, i))
                print(e)
                continue
            con = etree.HTML(res.text)
            formdata["__VIEWSTATE"] = con.xpath(
                "//input[@id='__VIEWSTATE']/@value")[0]
            formdata["__EVENTVALIDATION"] = con.xpath(
                "//input[@id='__EVENTVALIDATION']/@value")[0]

            bu_url_list = con.xpath("//td[@style='width:13%']/a/@href")
            bu_pre = con.xpath("//td[@style='width:13%']/a/text()")
            bu_dev = con.xpath("//td[@style='width:24%']/text()")
            co_name = con.xpath("//td[@style='width:15%']/text()")
            for index in range(len(bu_url_list)):
                bu_detail = "http://www.hcsfcglj.com/Templets/BoZhou/aspx/" + bu_url_list[
                    index]
                bu_pre_sale = bu_pre[index]
                bo_develops = bu_dev[index]
                bu_co_name = co_name[index]
                try:
                    bu_res = requests.get(bu_detail, headers=self.headers)
                except Exception as e:
                    print("co_index={}, building {} is unreachable".format(co_index, bu_detail))
                    print(e)
                    continue
                bu_con = bu_res.text

                self.get_build_info(bu_pre_sale, bo_develops, bu_co_name,
                                    bu_con)
                self.get_house_info(bu_con)
Example #16
    def start_crawler(self):
        b = AllListUrl(
            first_page_url=self.start_url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='cite>.*?/(.*?)页<',
        )
        page = b.get_page_count()

        for i in range(1, int(page) + 1):

            url = self.start_url + '&page=' + str(i)
            res = requests.get(url, headers=self.headers)
            comm_url_list = re.findall("window.open\('(.*?)'\)", res.text,
                                       re.S | re.M)
            self.comm_info(comm_url_list)
Example #17
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='utf-8',
                    page_count_rule='> ..<a.*?>(.*?)<',
                    headers=self.headers)
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         page_url = "http://zfbzj.baotou.gov.cn/index.php?m=content&c=permit&a=init&page=" + str(
             i)
         response = requests.get(page_url, headers=self.headers)
         html = response.text
         comm_url_list = re.findall(
             'href="(http://zfbzj.baotou\.gov\.cn/index\.php\?m=content&c=permit&a=show&id=.*?)".*?http://zfbzj',
             html, re.S | re.M)
         self.get_comm_info(comm_url_list)
Example #18
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='utf-8',
                    page_count_rule='> ..<.*?>(.*?)<',
                    headers=self.headers)
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         page_all_url = 'http://www.dzfgj.com/index.php?m=content&c=index&a=lists&catid=61&page=' + str(
             i)
         response = requests.get(page_all_url, headers=self.headers)
         html = response.text
         comm_html = re.search('<tbody>.*?</tbody>', html,
                               re.S | re.M).group()
         comm_info_list = re.findall('<tr>.*?</tr>', comm_html, re.S | re.M)
         self.get_comm_info(comm_info_list)
Example #19
 def start_crawler(self):
     for region in self.region_list:
         region_url = self.start_url + region
         b = AllListUrl(
             first_page_url=region_url,
             request_method='get',
             analyzer_type='regex',
             encode='utf-8',
             page_count_rule='1/(\d+)页',
         )
         page = b.get_page_count()
         for i in range(1, int(page) + 1):
             url = region_url + "&pagenumber=" + str(i)
             res = requests.get(url, headers=self.headers)
             html = etree.HTML(res.text)
             url_list = html.xpath("//tr/td/a/@href")
             self.comm_parse(url_list, region)
Example #20
 def start_crawler(self):
     start_url = self.start_url + "searchSpf.jsp?nowPage=1"
     b = AllListUrl(
         first_page_url=start_url,
         request_method='get',
         analyzer_type='regex',
         encode='utf-8',
         page_count_rule='/(\d+)页',
     )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         url = self.start_url + "searchSpf.jsp?nowPage=" + str(i)
         res = requests.get(url, headers=self.headers)
         html = etree.HTML(res.content.decode())
         url_list = html.xpath("//b/a/@href")
         for comm_temp in url_list:
             try:
                 comm_url = self.start_url + comm_temp.replace(
                     "./xmxxmainNew", 'xmxx/xmjbxx')
                 com_res = requests.get(comm_url, headers=self.headers)
                 con = com_res.content.decode('gbk')
                 co = Comm(co_index)
                 co.co_id = re.search('Id_xmxq=(.*)', comm_temp).group(1)
                 co.co_name = re.search('3a3a3a">(.*?)</b', con).group(1)
                 co.co_address = re.search('项目地址.*?">(.*?)</td', con,
                                           re.S | re.M).group(1)
                 co.co_develops = re.search('开 发 商.*?">(.*?)</td', con,
                                            re.S | re.M).group(1)
                 co.co_all_house = re.search('总 套 数.*?<td>(.*?)</td', con,
                                             re.S | re.M).group(1)
                 co.co_green = re.search('绿 化 率.*?<td>(.*?)</td', con,
                                         re.S | re.M).group(1)
                 co.co_volumetric = re.search('容 积 率.*?<td>(.*?)</td', con,
                                              re.S | re.M).group(1)
                 try:
                     co.co_build_size = re.search('建设规模.*?" >(.*?)平', con,
                                                  re.S | re.M).group(1)
                 except Exception:
                     co.co_build_size = None
                 co.insert_db()
                 self.build_parse(co.co_id)
             except Exception as e:
                 log.error('{} community error {}'.format(comm_temp, e))
Example #21
    def start_crawler(self):
        b = AllListUrl(
            first_page_url=url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='green1">1/(.*?)<',
        )
        page = b.get_page_count()
        for i in range(1, int(page) + 1):
            all_url = 'http://tz.tmsf.com/newhouse/property_searchall.htm'
            data = {'keytype': 1, 'page': i}
            response = requests.post(all_url, data=data, headers=self.headers)

            html = response.text
            comm_url_list = re.findall(
                '<div class="build_txt">.*?<a href="(.*?)"', html, re.S | re.M)
            for comm_url in comm_url_list:
                self.get_comm_detail(comm_url)
Example #22
 def start_crawler(self):
     b = AllListUrl(
         first_page_url=url,
         request_method='get',
         analyzer_type='regex',
         encode='gbk',
         page_count_rule='页次:<b><font color=red>1</font></b>/<b>(.*?)<',
         headers=self.headers)
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         page_url = 'http://123.131.127.13/xy/dzlist.asp?page=' + str(i)
         time.sleep(5)
         response = requests.get(page_url, headers=self.headers)
         html = response.content.decode('gbk')
         comm_list_html = re.search('项目电子手册列表.*?<table(.*?)</table>', html,
                                    re.S | re.M).group(1)
         comm_html_list = re.findall('<tr>(.*?)</tr>', comm_list_html,
                                     re.S | re.M)[1:]
         self.get_comm_info(comm_html_list)
Example #23
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='utf-8',
                    page_count_rule='共(.*?)页',
                    headers=self.headers)
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         if i == 1:
             all_page_url = 'http://www.xyfcj.com/html/jplp/index.html'
         else:
             all_page_url = 'http://www.xyfcj.com/html/jplp/index_' + str(
                 i) + '.html'
         response = requests.get(all_page_url, headers=self.headers)
         html = response.text
         comm_url_list = re.findall(
             '<a style="COLOR: #000000" target="_blank" href="(.*?)"', html,
             re.S | re.M)
         self.get_comm_info(comm_url_list)
Example #24
 def start_crawler(self):
     b = AllListUrl(first_page_url=url,
                    request_method='get',
                    analyzer_type='regex',
                    encode='gbk',
                    page_count_rule='strong>1/(.*?)<',
                    headers=self.headers)
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         index_url = 'http://www.kmhouse.org/moreHousePriceList.asp?page=' + str(
             i)
         try:
             response = requests.get(url=index_url, headers=self.headers)
             html = response.content.decode('gbk')
             comm_url_list = re.findall("cellspacing='3'.*?<a href='(.*?)'",
                                        html)
             self.get_comm_info(comm_url_list)
         except Exception as e:
             print('page error, co_index={}, url={}'.format(co_index, index_url), e)
Example #25
 def start_crawler(self):
     b = AllListUrl(
         first_page_url=url,
         request_method='get',
         analyzer_type='regex',
         encode='utf-8',
         page_count_rule='pg.pageCount = (.*?);',
     )
     page = b.get_page_count()
     all_url_list = []
     for i in range(1, int(page) + 1):
         all_url = 'http://www.gafdc.cn/newhouse/houselist.aspx?hou=0-0-0-0-0-0-&page=' + str(
             i)
         comm_url_list = self.get_comm_url(all_url)
         all_url_list += comm_url_list
     # iterate over every community URL
     for i in all_url_list:
         comm_url = 'http://www.gafdc.cn/newhouse/' + str(
             i.replace('index', 'base'))
         try:
             self.get_comm_info(comm_url)
         except Exception as e:
             print('community error, co_index={}, url={}'.format(co_index, comm_url), e)
     all_build_url_list = []
     for i in all_url_list:
         build_url = 'http://www.gafdc.cn/newhouse/' + str(
             i.replace('index', 'table'))
         house_url_list = self.get_build_info(build_url)
         if house_url_list:
             all_build_url_list += house_url_list
         else:
             print('building error: this community has no buildings, co_index={}, url={}'.format(
                 co_index, build_url))
     all_house_url_list = []
     form_data_list = []
     for i in all_build_url_list:
         house_url = 'http://www.gafdc.cn/newhouse/GetBuildTableByAjax.ashx'
         data = {'itemRecord': i[0], 'houseCode': i[1]}
         all_house_url_list.append(house_url)
         form_data_list.append(data)
     self.get_house_info(form_data_list)
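
Example #25 ends by handing `form_data_list` to `self.get_house_info`, which is not shown on this page. Presumably each dict is POSTed to the `GetBuildTableByAjax.ashx` endpoint collected above; a hypothetical sketch of that last step:

    import requests

    def get_house_info(form_data_list, headers=None):
        # Hypothetical sketch of the unshown method: POST each form dict to the
        # Ajax endpoint gathered in start_crawler and hand the HTML fragment on.
        house_url = 'http://www.gafdc.cn/newhouse/GetBuildTableByAjax.ashx'
        for data in form_data_list:
            res = requests.post(house_url, data=data, headers=headers)
            html_fragment = res.text
            # ... parse the returned house table here ...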
Example #26
    def start_crawler(self):
        b = AllListUrl(
            first_page_url=url,
            request_method='get',
            analyzer_type='regex',
            encode='gbk',
            page_count_rule='共(.*?)页',
        )
        page = b.get_page_count()

        for i in range(1, int(page) + 1):
            all_page_url = url + '&Page=' + str(i)
            p = ProducerListUrl(page_url=all_page_url,
                                request_type='get',
                                encode='gbk',
                                analyzer_rules_dict=None,
                                current_url_rule="eval\('openBldg\((.*?)\)",
                                analyzer_type='regex',
                                headers=self.headers)
            comm_url_list = p.get_current_page_url()
            self.get_build_info(comm_url_list)
Example #27
    def start_crawler(self):
        b = AllListUrl(
            first_page_url=self.start_url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='页数.*?/(\d+)',
        )
        page = b.get_page_count()

        for i in range(1, int(page) + 1):
            formdata = {
                'page': i,
                'keytype': 1,
            }
            res = requests.post(self.start_url,
                                data=formdata,
                                headers=self.headers)
            html = etree.HTML(res.text)
            url_list = html.xpath("//h3/a")
            self.co_parse(url_list)
Example #28
 def start_crawler(self):
     b = AllListUrl(
         first_page_url=url,
         request_method='get',
         analyzer_type='regex',
         encode='utf-8',
         page_count_rule='下页</a>.*?page=(.*?)"',
     )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         try:
             all_page_url = url + '?page=' + str(i)
             response = requests.get(all_page_url, headers=self.headers)
             html = response.text
             comm_url_list = re.findall('项目名称:.*?href="(.*?)"', html,
                                        re.S | re.M)
             self.get_comm_info(comm_url_list)
         except Exception as e:
             print(
                 'page error, co_index={}, url={}'.format(
                     co_index, all_page_url), e)
Example #29
 def start_crawler(self):
     b = AllListUrl(
         first_page_url=url,
         request_method='get',
         analyzer_type='regex',
         encode='gbk',
         page_count_rule='下一页.*?page=(.*?)"',
     )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         all_page_url = 'http://www.sxczfdc.com/pubinfo/More_xm.aspx?page=' + str(
             i)
         response = requests.get(all_page_url, headers=self.headers)
         html = response.text
         comm_url_list = re.findall(
             'style="background-color: .*?(Pub_lpxx.aspx\?DevProjectId=.*?)"',
             html, re.S | re.M)
         area_list = re.findall(
             'style="background-color: .*?center">(.*?)<', html,
             re.S | re.M)
         self.get_comm_info(comm_url_list, area_list)
Example #30
    def start_crawler(self):
        b = AllListUrl(
            first_page_url=self.url,
            request_method='get',
            analyzer_type='regex',
            encode='utf-8',
            page_count_rule='1/(\d+)页',
        )
        page = b.get_page_count()

        formdata = {}
        for i in range(1, int(page) + 1):

            res = requests.post(self.url, data=formdata, headers=self.headers)
            html = etree.HTML(res.text)
            formdata["__EVENTTARGET"] = "Pager"
            formdata["__EVENTARGUMENT"] = str(i + 1)
            formdata["__VIEWSTATEGENERATOR"] = "1D9D200C"
            formdata["__VIEWSTATE"] = html.xpath(
                "//input[@id='__VIEWSTATE']/@value")[0]

            comm_url_list = html.xpath("//h3/a/@href")
            self.get_comm_info(comm_url_list)
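
Several names these methods rely on (`url`, `co_index`, `log`, `Comm`, `self.headers`, `self.start_url`) are defined elsewhere in each original module and are not shown on this page. A minimal, hypothetical harness that would let one of the snippets run looks like this; every value in it is an assumption standing in for the missing context:

    import logging

    url = 'http://example.com/list.aspx'  # module-level start URL several examples read
    co_index = 0                          # site index used in the log messages
    log = logging.getLogger(__name__)     # logger used by Example #20

    class Spider:
        # Host class sketch: the attributes mirror what the methods above
        # access through `self`.
        def __init__(self, start_url, headers=None):
            self.start_url = start_url
            self.url = start_url
            self.headers = headers or {'User-Agent': 'Mozilla/5.0'}

        # start_crawler(self) would be any one of the methods shown above.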