def __get_browser_creator_and_start_token_filler(self, max_pages_per_second):
    """Return a browser creator, throttled if a rate limit is given.

    When max_pages_per_second is set, the plain browser creator is wrapped
    in a ThrottledWebBrowserCreator backed by a token bucket, and a filler
    thread topping up that bucket is started and kept in self.__token_filler.
    """
    self.__token_filler = None
    if max_pages_per_second is None:
        return self._create_browser_creator()
    token_bucket = StandardTokenBucket(max_pages_per_second)
    browser_creator = ThrottledWebBrowserCreator(
        self._create_browser_creator(), token_bucket)
    # Run the filler as a daemon thread so it never blocks interpreter exit.
    self.__token_filler = TokenBucketFiller(
        token_bucket, 1, max_pages_per_second)
    self.__token_filler.daemon = True
    self.__token_filler.start()
    return browser_creator
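# The sketch below illustrates the token-bucket contract the method above
# relies on. StandardTokenBucket and TokenBucketFiller are project classes
# whose exact interfaces are not shown here; the names and semantics in this
# sketch (a blocking get_token(), a periodic fill()) are assumptions made
# only for illustration: the filler adds tokens at a fixed rate, and each
# page open consumes one token, blocking while the bucket is empty.
import threading
import time


class _SketchTokenBucket(object):
    """Illustrative token bucket; capacity bounds the burst size."""

    def __init__(self, capacity):
        self._capacity = capacity
        self._tokens = capacity
        self._condition = threading.Condition()

    def get_token(self):
        # Block until a token is available, then consume it.
        with self._condition:
            while self._tokens == 0:
                self._condition.wait()
            self._tokens -= 1

    def fill(self, count):
        # Top up the bucket, never above capacity, and wake waiters.
        with self._condition:
            self._tokens = min(self._capacity, self._tokens + count)
            self._condition.notify_all()


class _SketchTokenFiller(threading.Thread):
    """Illustrative filler thread: adds `tokens` tokens, `rate` times/s."""

    def __init__(self, bucket, tokens, rate):
        threading.Thread.__init__(self)
        self._bucket = bucket
        self._tokens = tokens
        self._period = 1.0 / rate
        self._stop_event = threading.Event()

    def run(self):
        while not self._stop_event.is_set():
            self._bucket.fill(self._tokens)
            time.sleep(self._period)

    def stop(self):
        self._stop_event.set()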
def __check_download(self, threads_no, address,
                     max_page_opens_per_second=None):
    """@return: run time in seconds"""
    with TempDir() as temp_dir:
        # Optionally throttle page opens with a token bucket refilled by
        # a background filler thread.
        token_filler = None
        if max_page_opens_per_second is not None:
            token_bucket = StandardTokenBucket(max_page_opens_per_second)
            token_filler = TokenBucketFiller(
                token_bucket, 1, max_page_opens_per_second)
            token_filler.start()
            browser_creator = ThrottledWebBrowserCreator(
                MechanizeBrowserCreator(), token_bucket)
        else:
            browser_creator = MechanizeBrowserCreator()
        # One navigator per crawler thread, all writing under temp_dir.
        navigators = []
        for _ in xrange(threads_no):
            navigators.append(
                HTMLMultipageNavigator(
                    address,
                    LevelsCreator(temp_dir.get_path()).create(),
                    browser_creator))
        sentinel = _StandardNodeExtended()
        crawler = _MultithreadedCrawlerExtended(navigators, sentinel)
        start = time.time()
        crawler.run()
        end = time.time()
        # The downloaded tree must match the expected reference tree.
        expected_dir = Resources.path(__file__, "data/expected_download")
        actual_dir = temp_dir.get_path()
        self.assert_(
            are_dir_trees_equal(
                expected_dir, actual_dir, ignore=[".gitignore"]))
        self.__check_tree_final_state(sentinel.get_child("root"))
        self.__check_if_each_node_is_processed_once(
            sentinel.get_child("root"), {"/root/2011-07-16/06": 0})
        if token_filler is not None:
            token_filler.stop()
        return end - start
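# Hypothetical usage of the helper above, sketched under the assumption that
# it lives in a unittest.TestCase backed by a local test server; the address,
# thread count, and rate are placeholders, not values from the original
# suite, and the timing comparison is only illustrative.
def test_throttled_download_is_not_faster(self):
    address = "http://localhost:8000/root"  # placeholder test-server URL
    # A crawl capped at one page open per second should take at least as
    # long as an uncapped crawl of the same tree.
    unthrottled_time = self.__check_download(2, address)
    throttled_time = self.__check_download(
        2, address, max_page_opens_per_second=1)
    self.assert_(throttled_time >= unthrottled_time)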