Example 1
    def crawler(self,_dep=None):
        '''
        Main crawler function
        '''
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the spider pool
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth+1
                link_generator = HtmlAnalyzer.extract_links(url_data.html,url_data.url,self.crawl_tags)
                link_list = [ url for url in link_generator]
                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([ url for url in link_generator])
                link_list = list(set(link_list))
                for index,link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    # URL similarity check; see urlfilter.py
                    if not self.check_url_similar(link):
                        continue
                    # URL duplicate check; see urlfilter.py
                    if not self.check_url_repeat(link):
                        continue
                    if curr_depth > self.depth:   # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    if len(self.fetcher_cache) == self.max_url_num:   # maximum collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    link = to_unicode(link)
                    url = UrlData(link,depth=curr_depth)
                    self.fetcher_cache.insert(url)
                    self.fetcher_queue.put(url,block=True)

                for plugin_name in self.plugin_handler:  # dynamically invoke each plugin registered at initialization
                    try:
                        plugin_obj = eval(plugin_name)()
                        plugin_obj.start(url_data)
                    except Exception:
                        import traceback
                        traceback.print_exc()

                self.crawler_queue.task_done()
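
The plugin loop above looks each registered plugin up by name, instantiates it with no arguments via eval(plugin_name)(), and calls its start() method with the crawled page object. A minimal sketch of a class satisfying that interface, assuming only what is visible in Example 1; the name LoggingPlugin and its body are hypothetical:

class LoggingPlugin(object):
    """Hypothetical plugin: records the URL of every page handed to it."""

    def __init__(self):
        # No-argument constructor, to match eval(plugin_name)() above.
        self.visited = []

    def start(self, url_data):
        # url_data is the crawled page object (it exposes .url, .html and .depth).
        self.visited.append(url_data.url)
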
Example 2
    def crawler(self, _dep=None):
        '''
        Main crawler function
        '''
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the spider pool
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth + 1
                #link_generator = HtmlAnalyzer.extract_links(url_data.html,url_data.url,self.crawl_tags)
                link_generator = HtmlAnalyzer.extract_links_ithome(url_data.html)
                link_list = [url for url in link_generator]
                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))
                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    if curr_depth > self.depth:  # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    if len(self.fetcher_cache) == self.max_url_num:  # maximum collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    self.fetcher_cache.insert(url)
                    self.fetcher_queue.put(url, block=True)

                for plugin_name in self.plugin_handler:  # dynamically invoke each plugin registered at initialization
                    try:
                        plugin_obj = eval(plugin_name)()
                        plugin_obj.start(url_data)
                    except Exception:
                        import traceback
                        traceback.print_exc()

                self.crawler_queue.task_done()
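
These crawler examples construct UrlData(link, depth=curr_depth) and later read .url, .html and .depth from it. The real class is not part of the snippets; a minimal stand-in inferred only from that usage might look like this (the html attribute is assumed to be filled in by the fetcher):

class UrlData(object):
    """Minimal stand-in inferred from the usage above; the project's real class may differ."""

    def __init__(self, url, html=None, depth=0):
        self.url = url      # link string passed in by the crawler
        self.html = html    # page content, set once the fetcher has downloaded it
        self.depth = depth  # crawl depth, one more than the parent's depth

    def __str__(self):
        return self.url
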
Example 3
    def crawler(self, _dep=None):
        """
        Main crawler function
        """
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()  # maintain the spider pool
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                pre_depth = url_data.depth
                curr_depth = pre_depth + 1
                link_generator = HtmlAnalyzer.extract_links(url_data.html, url_data.url, self.crawl_tags)
                link_list = [url for url in link_generator]
                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([url for url in link_generator])
                link_list = list(set(link_list))
                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    if curr_depth > self.depth:  # maximum crawl depth check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break

                    if len(self.fetcher_cache) == self.max_url_num:  # maximum collected URL count check
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    link = to_unicode(link)
                    url = UrlData(link, depth=curr_depth)
                    self.fetcher_cache.insert(url)
                    self.fetcher_queue.put(url, block=True)
                self.crawler_queue.task_done()
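
Examples 1-3 also rely on self.fetcher_cache supporting insert() and len(), which drive the max_url_num check; its implementation is not shown in these snippets. A minimal stand-in under that assumption could be a thin wrapper over a set (the class name is hypothetical):

class FetcherCache(object):
    """Hypothetical stand-in: only the insert()/len() behaviour used above."""

    def __init__(self):
        self._urls = set()

    def insert(self, url_data):
        # Key on the URL string so repeated links count only once.
        self._urls.add(str(url_data))

    def __len__(self):
        return len(self._urls)
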
Example 4
    def crawler(self, _dep=None):
        '''Main crawler function'''
        while not self.stopped.isSet() and not self.crawler_stopped.isSet():
            try:
                self._maintain_spider()
                url_data = self.crawler_queue.get(block=False)
            except queue.Empty as e:
                if self.crawler_queue.unfinished_tasks == 0 and self.fetcher_queue.unfinished_tasks == 0:
                    self.stop()
                else:
                    if self.crawler_mode == 1:
                        gevent.sleep()
            else:
                curr_depth = len(str(url_data).split('/')) - 2  # depth estimated from the number of '/'-separated segments in the URL
                link_generator = HtmlAnalyzer.extract_links(url_data.html, url_data.url, self.crawl_tags)
                link_list = list(link_generator)
                if self.dynamic_parse:
                    link_generator = self.webkit.extract_links(url_data.url)
                    link_list.extend([ url for url in link_generator])
                link_list = list(set(link_list))
                for index, link in enumerate(link_list):
                    if not self.check_url_usable(link):
                        continue
                    if curr_depth > self.depth:
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    if self.fetched_url == self.max_url_num:
                        if self.crawler_stopped.isSet():
                            break
                        else:
                            self.crawler_stopped.set()
                            break
                    url = UrlData(link, depth=curr_depth)
                    self.fetcher_bf.insert_element(str(url))
                    self.fetched_url += 1
                    self.fetcher_queue.put(url, block=True)

                self.crawler_queue.task_done()
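
Unlike the earlier examples, Example 4 derives the depth from the URL itself rather than from the parent's depth: assuming str(url_data) returns the URL, len(url.split('/')) - 2 approximates the number of path segments. A small illustration of that arithmetic with made-up URLs:

def path_depth(url):
    # Mirrors the expression used in Example 4.
    return len(url.split('/')) - 2

print(path_depth('http://example.com'))      # ['http:', '', 'example.com']            -> 1
print(path_depth('http://example.com/a'))    # ['http:', '', 'example.com', 'a']       -> 2
print(path_depth('http://example.com/a/b'))  # ['http:', '', 'example.com', 'a', 'b']  -> 3
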
Example 5
    def feed(self, html):
        return HtmlAnalyzer.extractLinks(html, self.urlobj.url, self.charset)
Example 6
            return

        for link in self.feed(html):
            url = UrlObj(link, depth)
            self.spider.put(url)
        self.spider.remove_thread(self.thread_id)

    def open(self, url):
        strategy = self.spider.strategy
        try:
            resp = requests.get(url, timeout=strategy.timeout)
        except requests.exceptions.RequestException:
            raise
        if resp.status_code != requests.codes.ok:
            resp.raise_for_status()
        charset = HtmlAnalyzer.detectCharSet(resp.text)
        if charset is not None:
            self.charset = charset
            resp.encoding = charset
        return resp.text

    def feed(self, html):
        return HtmlAnalyzer.extractLinks(html, self.urlobj.url, self.charset)


class MySpider(object):
    def __init__(self, max_depth, max_count, root_url):
        self.spider = GeventSpider(max_depth=max_depth, max_count=max_count, root_url=root_url)

    def run(self):
        self.spider.run()
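
A short usage sketch for the MySpider wrapper in Example 6, assuming only the constructor and run() shown above; the depth, count and URL values are illustrative:

if __name__ == '__main__':
    spider = MySpider(max_depth=3, max_count=100, root_url='http://example.com')
    spider.run()
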
Example 7
            self.logger.debug(
                "successfully crawled '%s', <%d> urls", url, len(urltable))

        self.stop()

    def open(self, url):
        strategy = self.spider.strategy
        try:
            resp = requests.get(url, headers=strategy.headers,
                                cookies=strategy.cookies, timeout=strategy.timeout,
                                verify=strategy.ssl_verify)
        except requests.exceptions.RequestException:
            raise
        if resp.status_code != requests.codes.ok:
            resp.raise_for_status()
        charset = HtmlAnalyzer.detectCharSet(resp.text)
        if charset is not None:
            self.charset = charset
            resp.encoding = charset
        return resp.text

    def feed(self, html):
        return HtmlAnalyzer.extractLinks(html, self.urlobj.url, self.charset)


    def stop(self):
        self.spider.greenlet_finished.set()
        self.kill(block=False)


class TestSpider(unittest.TestCase):