Ejemplo n.º 1
0
 def get_comm_url(self, all_url_list):
     """Extract community-page URLs from the given listing page(s).

     Fetches *all_url_list* through ProducerListUrl (GET, UTF-8) and pulls
     href values out of ``<a class='anone' ...>`` anchors via a regex rule.

     :param all_url_list: listing page URL(s) handed to ProducerListUrl.
     :return: whatever ProducerListUrl.get_current_page_url() yields
              (presumably a list of URLs — depends on the helper class).
     """
     producer = ProducerListUrl(
         page_url=all_url_list,
         request_type='get',
         encode='utf-8',
         analyzer_type='regex',
         analyzer_rules_dict=None,
         current_url_rule="<a class='anone' href='(.*?)'",
         headers=self.headers,
     )
     return producer.get_current_page_url()
Ejemplo n.º 2
0
 def start_crawler(self):
     """Walk every paginated 'More_xm.aspx' listing page and process it.

     For pages 1..self.page, builds the page URL, extracts detail-page
     links with an XPath rule via ProducerListUrl, and feeds the result
     to self.get_comm_info().
     """
     for page_no in range(1, self.page + 1):
         page_url = "{}More_xm.aspx?page={}".format(self.url, page_no)
         producer = ProducerListUrl(
             page_url=page_url,
             request_type='get',
             encode='utf-8',
             analyzer_type='xpath',
             analyzer_rules_dict=None,
             current_url_rule="//td[@align='left']/a/@href",
             headers=self.headers,
         )
         self.get_comm_info(producer.get_current_page_url())
Ejemplo n.º 3
0
    def start_crawler(self):
        """Iterate every listing page and hand building links to get_build_info.

        First asks AllListUrl for the total page count (regex '共(.*?)页' on a
        GBK-encoded page), then for each page extracts the openBldg(...) link
        arguments with a regex rule and passes them to self.get_build_info().

        NOTE(review): ``url`` is a free variable here — it must be defined at
        module/class level outside this snippet; verify against the full file.
        """
        b = AllListUrl(
            first_page_url=url,
            request_method='get',
            analyzer_type='regex',
            encode='gbk',
            page_count_rule='共(.*?)页',
        )
        page = b.get_page_count()

        for i in range(1, int(page) + 1):
            all_page_url = url + '&Page=' + str(i)
            p = ProducerListUrl(page_url=all_page_url,
                                request_type='get',
                                encode='gbk',
                                analyzer_rules_dict=None,
                                # Raw string: \( in a non-raw literal is an
                                # invalid escape sequence (deprecated, a
                                # SyntaxWarning/error in newer Python); the
                                # pattern value itself is unchanged.
                                current_url_rule=r"eval\('openBldg\((.*?)\)",
                                analyzer_type='regex',
                                headers=self.headers)
            comm_url_list = p.get_current_page_url()
            self.get_build_info(comm_url_list)
Ejemplo n.º 4
0
 def start_crawler(self):
     """Crawl all jmfc.com.cn listing pages and process each community link.

     Determines the last page number from the '>>>' pager link (regex on a
     GBK page), then for each page extracts community detail hrefs with an
     XPath rule and passes them to self.get_comm_info().

     NOTE(review): ``url`` is a free variable — defined outside this
     snippet; confirm against the enclosing module/class.
     """
     b = AllListUrl(
         first_page_url=url,
         request_method='get',
         analyzer_type='regex',
         encode='gbk',
         # Raw string: \. in a non-raw literal is an invalid escape
         # sequence (deprecated); the pattern value is unchanged.
         page_count_rule=r'>>></a>.*?href=".*?page-(.*?)\.html',
     )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         all_url = 'http://www.jmfc.com.cn/index/caid-2/addno-1/page-' + str(
             i) + '.html'
         p = ProducerListUrl(
             page_url=all_url,
             request_type='get',
             encode='gbk',
             analyzer_rules_dict=None,
             current_url_rule=
             "/html/body/div[5]/div[6]/div/div[2]/h3/a/@href",
             analyzer_type='xpath',
             headers=self.headers)
         comm_url_list = p.get_current_page_url()
         self.get_comm_info(comm_url_list)
Ejemplo n.º 5
0
 def start_crawler(self):
     """Crawl all f0795.cn house listing pages and process each entry.

     Reads the total page count from '<cite>共.../N页' on self.start_url
     (UTF-8), then for each page extracts detail-page hrefs with an XPath
     rule and passes them to self.get_comm_info().
     """
     b = AllListUrl(
         first_page_url=self.start_url,
         request_method='get',
         analyzer_type='regex',
         encode='utf-8',
         # Raw string: \d in a non-raw literal is an invalid escape
         # sequence (deprecated, SyntaxWarning/error in newer Python);
         # the pattern value itself is unchanged.
         page_count_rule=r'<cite>共.*?/(\d+)页',
     )
     page = b.get_page_count()
     for i in range(1, int(page) + 1):
         url = "http://www.f0795.cn/house/index-htm-page-" + str(
             i) + ".html"
         p = ProducerListUrl(
             page_url=url,
             request_type='get',
             encode='utf-8',
             analyzer_rules_dict=None,
             current_url_rule=
             "//ul[@class='list']//div[@class='text']/h3/a/@href",
             analyzer_type='xpath',
             headers=self.headers)
         comm_url_list = p.get_current_page_url()
         self.get_comm_info(comm_url_list)