Example #1
0
    def _create_browser_creator(self):
        """
        Factory hook that builds the browser creator.

        Subclasses may override this method in order to supply a different
        C{AbstractWebBrowserCreator} implementation.

        @return: a newly constructed C{MechanizeBrowserCreator}
        @rtype: C{AbstractWebBrowserCreator}
        """
        creator = MechanizeBrowserCreator()
        return creator
Example #2
0
    def __check_download(self,
                         threads_no,
                         address,
                         max_page_opens_per_second=None):
        """
        Crawl C{address} into a temporary directory and verify the result.

        The downloaded tree is compared with the C{data/expected_download}
        resource directory, and the final state of the node tree rooted at
        the sentinel's C{"root"} child is checked.

        @param threads_no: number of navigators to create; one
            C{HTMLMultipageNavigator} is built per requested thread
        @param address: address passed to every navigator
        @param max_page_opens_per_second: when not C{None}, browsers are
            created through a C{ThrottledWebBrowserCreator} backed by a
            token bucket built from this value; when C{None}, a plain
            C{MechanizeBrowserCreator} is used
        @return: run time in seconds (wall-clock time of C{crawler.run()}
            only; the verification steps are not included)
        """
        with TempDir() as temp_dir:
            token_bucket = None
            token_filler = None
            if max_page_opens_per_second is not None:
                token_bucket = StandardTokenBucket(max_page_opens_per_second)
                token_filler = TokenBucketFiller(token_bucket, 1,
                                                 max_page_opens_per_second)
                token_filler.start()
            try:
                browser_creator = MechanizeBrowserCreator()
                if token_bucket is not None:
                    browser_creator = ThrottledWebBrowserCreator(
                        browser_creator, token_bucket)

                # Each navigator gets its own freshly created levels list.
                navigators = [
                    HTMLMultipageNavigator(
                        address,
                        LevelsCreator(temp_dir.get_path()).create(),
                        browser_creator)
                    for _ in xrange(threads_no)
                ]
                sentinel = _StandardNodeExtended()
                crawler = _MultithreadedCrawlerExtended(navigators, sentinel)

                start = time.time()
                crawler.run()
                end = time.time()

                expected_dir = Resources.path(
                    __file__, "data/expected_download")
                actual_dir = temp_dir.get_path()
                self.assert_(
                    are_dir_trees_equal(expected_dir,
                                        actual_dir,
                                        ignore=[".gitignore"]))
                root = sentinel.get_child("root")
                self.__check_tree_final_state(root)
                self.__check_if_each_node_is_processed_once(
                    root, {"/root/2011-07-16/06": 0})
                return end - start
            finally:
                # Stop the filler even when the crawl or one of the checks
                # above fails; otherwise a started filler would be left
                # running after a failed test.
                if token_filler is not None:
                    token_filler.stop()
Example #3
0
    def __init__(self, address, levels, browser_creator=None):
        """
		@param browser_creator: a creator of browsers that will be used
			while crawling the web site. The default browser used here 
			is L{MechanizeBrowser}.
		@type browser_creator: L{AbstractWebBrowserCreator}
		@param levels: list of L{Level} objects. The first element is a level 
			corresponding to the root node, the last one corresponds to
			leafs level.
		@param address: URL address string
		"""
        self.__address = address
        self.__browser_creator = browser_creator
        if browser_creator is None:
            self.__browser_creator = MechanizeBrowserCreator()
        self.__br = None
        self.__levels = levels
        self.__path = None
        self.__children_history = None
        self.__current_children = None
        """