Example #1
0
class CmdLnNavigatorsCreator(AbstractCmdLnNavigatorsCreator):
    """Creates C{HTMLMultipageNavigator}s based on command-line options.

    When the user passes C{--max_pages_per_second}, downloads are
    throttled through a token bucket refilled by a daemon thread.
    """

    def __init__(self, levels_creator):
        """@type levels_creator: L{AbstractCmdLnLevelsCreator}"""
        self.__token_filler = None
        self.__levels_creator = levels_creator

    def fill_parser(self, parser):
        """Register this creator's command-line options on C{parser}."""
        parser.add_argument(
            "source_address",
            help="the address of the web site to crawl.")
        parser.add_argument(
            "--max_pages_per_second", type=float,
            help="Maximal number of web pages downloads per second "
                 "(a real number). By default no limit is imposed.")
        self.__levels_creator.fill_parser(parser)

    def create(self, args, navigators_count):
        """Return a list of C{navigators_count} navigators configured
        according to the parsed C{args}."""
        browser_creator = self.__get_browser_creator_and_start_token_filler(
            args.max_pages_per_second)
        return [
            HTMLMultipageNavigator(args.source_address,
                                   self.__levels_creator.create(args),
                                   browser_creator)
            for _ in range(navigators_count)]

    def __get_browser_creator_and_start_token_filler(
            self, max_pages_per_second):
        # Reset so on_exit() never stops a filler from a previous create().
        self.__token_filler = None
        if max_pages_per_second is None:
            # No rate limit requested: plain, unthrottled browser creator.
            return self._create_browser_creator()
        token_bucket = StandardTokenBucket(max_pages_per_second)
        browser_creator = ThrottledWebBrowserCreator(
            self._create_browser_creator(), token_bucket)
        self.__token_filler = TokenBucketFiller(token_bucket, 1,
                                                max_pages_per_second)
        self.__token_filler.daemon = True
        self.__token_filler.start()
        return browser_creator

    def _create_browser_creator(self):
        """
        It is possible to override this function to use a different
        C{AbstractWebBrowserCreator}.

        @rtype: C{AbstractWebBrowserCreator}
        """
        return MechanizeBrowserCreator()

    def on_exit(self):
        """Stop the filler thread (if one was started) and let the
        levels creator clean up."""
        if self.__token_filler is not None:
            self.__token_filler.stop()
        self.__levels_creator.on_exit()
class CmdLnNavigatorsCreator(AbstractCmdLnNavigatorsCreator):
	"""Command-line driven factory of C{HTMLMultipageNavigator} objects.

	An optional download-rate limit is enforced with a shared token
	bucket that a daemon thread keeps refilling.
	"""

	def __init__(self, levels_creator):
		"""@type levels_creator: L{AbstractCmdLnLevelsCreator}"""
		self.__token_filler = None
		self.__levels_creator = levels_creator

	def fill_parser(self, parser):
		"""Add the crawler's arguments to C{parser}."""
		parser.add_argument("source_address",
			help="the address of the web site to crawl.")
		parser.add_argument("--max_pages_per_second", type=float,
			help=("Maximal number of web pages downloads per second "
				"(a real number). By default no limit is imposed."))
		self.__levels_creator.fill_parser(parser)

	def create(self, args, navigators_count):
		"""Build and return C{navigators_count} navigators for C{args}."""
		browser_creator = \
			self.__get_browser_creator_and_start_token_filler(
				args.max_pages_per_second)
		result = []
		while len(result) < navigators_count:
			navigator = HTMLMultipageNavigator(
				args.source_address,
				self.__levels_creator.create(args),
				browser_creator)
			result.append(navigator)
		return result

	def __get_browser_creator_and_start_token_filler(
			self, max_pages_per_second):
		# Cleared first so a stale filler is never stopped by on_exit().
		self.__token_filler = None
		if max_pages_per_second is not None:
			bucket = StandardTokenBucket(max_pages_per_second)
			creator = ThrottledWebBrowserCreator(
				self._create_browser_creator(), bucket)
			self.__token_filler = TokenBucketFiller(
				bucket, 1, max_pages_per_second)
			self.__token_filler.daemon = True
			self.__token_filler.start()
			return creator
		return self._create_browser_creator()

	def _create_browser_creator(self):
		"""
		It is possible to override this function to use a different
		C{AbstractWebBrowserCreator}.

		@rtype: C{AbstractWebBrowserCreator}
		"""
		return MechanizeBrowserCreator()

	def on_exit(self):
		"""Shut down the filler thread and the levels creator."""
		if self.__token_filler is not None:
			self.__token_filler.stop()
		self.__levels_creator.on_exit()
Example #3
0
 def __get_browser_creator_and_start_token_filler(self,
                                                  max_pages_per_second):
     # Return the browser creator to use; when a rate limit is given,
     # also start a daemon thread that refills the shared token bucket.
     self.__token_filler = None
     if max_pages_per_second is None:
         # Unthrottled case: nothing else to set up.
         return self._create_browser_creator()
     bucket = StandardTokenBucket(max_pages_per_second)
     throttled_creator = ThrottledWebBrowserCreator(
         self._create_browser_creator(), bucket)
     self.__token_filler = TokenBucketFiller(bucket, 1,
                                             max_pages_per_second)
     self.__token_filler.daemon = True
     self.__token_filler.start()
     return throttled_creator
 def test_get(self):
     """Two incrementor threads consume tokens for ~3 seconds from a
     bucket fed by a C{TokenBucketFiller}; together they must have
     drawn exactly 8 tokens when everything is stopped."""
     bucket = StandardTokenBucket(1000)
     filler = TokenBucketFiller(bucket, 2, 3)
     threads_no = 2
     incrementors = []
     for _ in xrange(threads_no):
         incrementor = _Incrementor(bucket)
         # Start each consumer as soon as it is created.
         incrementor.start()
         incrementors.append(incrementor)
     filler.start()
     time.sleep(3)
     # Ask all consumers to stop before joining any of them.
     for incrementor in incrementors:
         incrementor.order_stop()
     for incrementor in incrementors:
         incrementor.join()
     filler.stop()
     # `total` instead of the original `sum`, which shadowed the builtin.
     total = sum(inc.get_result() for inc in incrementors)
     self.assertEqual(8, total)
Example #5
0
	def __check_download(self,
			threads_no, address, max_page_opens_per_second=None):
		"""Crawl C{address} with C{threads_no} navigator threads into a
		temporary directory and verify the downloaded tree against the
		expected data.

		@param max_page_opens_per_second: optional download rate limit;
			when given, page opens are throttled via a token bucket
			refilled by a filler thread.
		@return: run time in seconds
		"""
		with TempDir() as temp_dir:
			token_filler = None
			if max_page_opens_per_second is not None:
				token_bucket = StandardTokenBucket(max_page_opens_per_second)
				token_filler = TokenBucketFiller(token_bucket, 1, 
					max_page_opens_per_second)
				token_filler.start()
				browser_creator = ThrottledWebBrowserCreator(
					MechanizeBrowserCreator(), token_bucket)
			else:
				browser_creator = MechanizeBrowserCreator()
			try:
				navigators = []
				for _ in xrange(threads_no):
					navigators.append(HTMLMultipageNavigator(address,
						LevelsCreator(temp_dir.get_path()).create(), 
						browser_creator))
				sentinel = _StandardNodeExtended()
				crawler = _MultithreadedCrawlerExtended(navigators, sentinel)
				start = time.time()
				crawler.run()
				end = time.time()
				expected_dir = Resources.path(__file__,
					"data/expected_download")
				actual_dir = temp_dir.get_path()
				# assertTrue is the non-deprecated spelling of assert_.
				self.assertTrue(are_dir_trees_equal(expected_dir, actual_dir, 
						ignore=[".gitignore"]))
				self.__check_tree_final_state(sentinel.get_child("root"))
				self.__check_if_each_node_is_processed_once(
					sentinel.get_child("root"), {"/root/2011-07-16/06": 0})
				return end - start
			finally:
				# Stop the filler thread even if the crawl or an assertion
				# fails; the original leaked it on any exception.
				if token_filler is not None:
					token_filler.stop()
	def __get_browser_creator_and_start_token_filler(self,
			max_pages_per_second):
		# Pick the browser creator and, when a rate limit is requested,
		# start the daemon thread that refills the shared token bucket.
		self.__token_filler = None
		if max_pages_per_second is None:
			return self._create_browser_creator()
		bucket = StandardTokenBucket(max_pages_per_second)
		creator = ThrottledWebBrowserCreator(
			self._create_browser_creator(), bucket)
		self.__token_filler = TokenBucketFiller(bucket, 1,
			max_pages_per_second)
		self.__token_filler.daemon = True
		self.__token_filler.start()
		return creator
    def __check_download(self,
                         threads_no,
                         address,
                         max_page_opens_per_second=None):
        """Crawl C{address} with C{threads_no} navigator threads into a
        temporary directory and verify the downloaded tree and the final
        node states.

        @param max_page_opens_per_second: optional page-download rate
            limit; when set, a token-bucket filler thread is started.
        @return: run time in seconds
        """
        with TempDir() as temp_dir:
            token_filler = None
            if max_page_opens_per_second is not None:
                token_bucket = StandardTokenBucket(max_page_opens_per_second)
                token_filler = TokenBucketFiller(token_bucket, 1,
                                                 max_page_opens_per_second)
                token_filler.start()
                browser_creator = ThrottledWebBrowserCreator(
                    MechanizeBrowserCreator(), token_bucket)
            else:
                browser_creator = MechanizeBrowserCreator()
            try:
                navigators = []
                for _ in xrange(threads_no):
                    navigators.append(
                        HTMLMultipageNavigator(
                            address,
                            LevelsCreator(temp_dir.get_path()).create(),
                            browser_creator))
                sentinel = _StandardNodeExtended()
                crawler = _MultithreadedCrawlerExtended(navigators, sentinel)
                start = time.time()
                crawler.run()
                end = time.time()
                expected_dir = Resources.path(__file__,
                                              "data/expected_download")
                actual_dir = temp_dir.get_path()
                # assertTrue replaces the deprecated assert_ alias.
                self.assertTrue(
                    are_dir_trees_equal(expected_dir,
                                        actual_dir,
                                        ignore=[".gitignore"]))
                self.__check_tree_final_state(sentinel.get_child("root"))
                self.__check_if_each_node_is_processed_once(
                    sentinel.get_child("root"), {"/root/2011-07-16/06": 0})
                return end - start
            finally:
                # Stop the filler thread even when the crawl or an
                # assertion raises; the original only stopped it on the
                # success path.
                if token_filler is not None:
                    token_filler.stop()