def test_website_download(self):
		with TempDir() as temp_dir:
			levels = LevelsCreator(temp_dir.get_path()).create()
			address = "file:"+Resources.path(__file__, 
				"data/original_site-without_broken_links/issues_1.html",
				convert_to_url=True)
			navigator = HTMLMultipageNavigator(address, levels)
			crawler = SimpleDFSCrawler(navigator)
			crawler.run()
			expected_dir = Resources.path(__file__, 
				"data/expected_download-without_broken_links")
			actual_dir = temp_dir.get_path()
			self.assertTrue(are_dir_trees_equal(expected_dir, actual_dir,
				ignore=[".gitignore"]))
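
are_dir_trees_equal is a helper from this project's test utilities, not a standard-library function. A minimal sketch of such a comparison, built on the standard filecmp module (the signature mirrors the usage above; the project's actual implementation may differ):

import filecmp
import os

def are_dir_trees_equal(dir1, dir2, ignore=None):
	"""Return True if both directory trees contain the same files
	with identical contents, skipping names listed in `ignore`."""
	ignore = list(ignore or [])
	comparison = filecmp.dircmp(dir1, dir2, ignore=ignore)
	if (comparison.left_only or comparison.right_only
			or comparison.diff_files or comparison.funny_files):
		return False
	# dircmp compares os.stat() info only; recheck contents byte-by-byte.
	_, mismatch, errors = filecmp.cmpfiles(
		dir1, dir2, comparison.common_files, shallow=False)
	if mismatch or errors:
		return False
	# Recurse into subdirectories present in both trees.
	for common_dir in comparison.common_dirs:
		if not are_dir_trees_equal(os.path.join(dir1, common_dir),
				os.path.join(dir2, common_dir), ignore=ignore):
			return False
	return True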
 def test_simple_browsing(self):
     navigator = HTMLMultipageNavigator(
         "file:" +
         Resources.path(__file__,
                        "../../test/data/original_site/issues_1.html",
                        convert_to_url=True),
         LevelsCreator(None).create())
     navigator.start_in_root()
     root_name = navigator.get_path()[0]
     children1 = navigator.get_children()
     self.assertEqual([
         "2011-07-12", "2011-07-13", "2011-07-14", "2011-07-16",
         "2011-07-16-repetition_1", "2011-07-17"
     ], children1)
     navigator.move_to_child(children1[0])
     self.assertEqual([root_name, "2011-07-12"], navigator.get_path())
     children2 = navigator.get_children()
     self.assertEqual(["01", "02", "03", "04", "05", "06", "07", "08"],
                      children2)
     navigator.move_to_child("05")
     self.assertEqual([root_name, "2011-07-12", "05"], navigator.get_path())
     navigator.move_to_parent()
     self.assertEqual([root_name, "2011-07-12"], navigator.get_path())
     navigator.move_to_parent()
     self.assertEqual([root_name], navigator.get_path())
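
Taken together, these calls outline the navigator's tree-browsing interface. A hypothetical summary inferred from this usage alone (not the project's actual declaration):

class TreeNavigator(object):
    """Hypothetical interface implied by the calls in the test above."""

    def start_in_root(self):
        """Open the root page; the path becomes a one-element list."""
        raise NotImplementedError

    def get_path(self):
        """Return the list of node names from the root to the current node."""
        raise NotImplementedError

    def get_children(self):
        """Return the names of the current node's children, in page order."""
        raise NotImplementedError

    def move_to_child(self, name):
        """Descend into the child with the given name."""
        raise NotImplementedError

    def move_to_parent(self):
        """Go back up one level."""
        raise NotImplementedError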
	def __check(self, dir_name, should_be_equal):
		path = Resources.path(__file__, 
			os.path.join("data/dir_tree_comparer", dir_name))
		ret = are_dir_trees_equal(
			os.path.join(path, "01"), 
			os.path.join(path, "02"), ignore=[".gitignore"])
		if should_be_equal:
			self.assertTrue(ret)
		else:
			self.assertFalse(ret)
Example #5
	def test_single_threaded_download_without_manager(self):
#		temp_dir = TempDir(os.path.expanduser("~/tmp"), prefix="dfs_crawler-")
#		try:
		with TempDir() as temp_dir:
			levels = LevelsCreator(temp_dir.get_path()).create()
			address = "file:"+\
				Resources.path(__file__, "data/original_site/issues_1.html",
							convert_to_url=True)
			tree = TreeAccessor(_StandardNodeExtended())
			navigator = HTMLMultipageNavigator(address, levels)
			navigator_wrapper = _NavigatorTreeWrapperExtended(navigator, tree)
			crawler = CrawlerThread(navigator_wrapper, tree)
			crawler.run()
			expected_dir = Resources.path(__file__, "data/expected_download")
			actual_dir = temp_dir.get_path()
			self.assertTrue(are_dir_trees_equal(expected_dir, actual_dir,
					ignore=[".gitignore"]))
			self.__check_tree_final_state(tree.get_root())
			self.__check_if_each_node_is_processed_once(
				tree.get_root(), {"/root/2011-07-16/06": 0})
Example #6
	def test_throttled_download(self):
#		Logger.start(logging_level=logging.DEBUG)
		address = "file:"+\
			Resources.path(__file__, "data/original_site/issues_1.html",
						convert_to_url=True)
		web_pages_no = 34
		max_page_opens_per_second = 15
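		# Lower bound: 34 pages at no more than 15 page opens per second
		# cannot finish in under 34/15 ~= 2.27 s, whatever the thread count.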
		min_seconds_taken = float(web_pages_no)/max_page_opens_per_second
		for threads_no in [1, 3]:
			seconds_taken = self.__check_download(
				threads_no, address, max_page_opens_per_second)
#			print >>sys.stderr, "seconds_taken={}".format(seconds_taken)
			self.assertGreaterEqual(seconds_taken, min_seconds_taken)
Example #9
 def test_throttled_download_with_HTTP_server(self):
     #		Logger.start(logging_level=logging.DEBUG)
     with DelayedHTTPFilesServer(
             Resources.path(__file__, "data/original_site"), 0) as server:
         (address, ip_number) = server.start()
         root_address = "http://{}:{}/issues_1.html".format(
             address, ip_number)
         web_pages_no = 34
         max_page_opens_per_second = 15
         min_seconds_taken = float(web_pages_no) / max_page_opens_per_second
         for threads_no in [1, 3]:
             seconds_taken = self.__check_download(
                 threads_no, root_address, max_page_opens_per_second)
             #				print >>sys.stderr, "seconds_taken={}".format(seconds_taken)
             self.assertGreaterEqual(seconds_taken, min_seconds_taken)
Example #11
    def __check_download(self,
                         threads_no,
                         address,
                         max_page_opens_per_second=None):
        """@return: run time in seconds"""
        #		temp_dir = TempDir(os.path.expanduser("~/tmp"), prefix="dfs_crawler-")
        #		try:
        with TempDir() as temp_dir:
            token_filler = None
            browser_creator = None
            if max_page_opens_per_second is not None:
                token_bucket = StandardTokenBucket(max_page_opens_per_second)
                token_filler = TokenBucketFiller(token_bucket, 1,
                                                 max_page_opens_per_second)
                token_filler.start()
                browser_creator = ThrottledWebBrowserCreator(
                    MechanizeBrowserCreator(), token_bucket)
            else:
                browser_creator = MechanizeBrowserCreator()

            navigators = []
            for _ in xrange(threads_no):
                navigators.append(
                    HTMLMultipageNavigator(
                        address,
                        LevelsCreator(temp_dir.get_path()).create(),
                        browser_creator))
            sentinel = _StandardNodeExtended()
            crawler = _MultithreadedCrawlerExtended(navigators, sentinel)
            start = time.time()
            crawler.run()
            end = time.time()
            expected_dir = Resources.path(__file__, "data/expected_download")
            actual_dir = temp_dir.get_path()
            self.assertTrue(
                are_dir_trees_equal(expected_dir,
                                    actual_dir,
                                    ignore=[".gitignore"]))
            self.__check_tree_final_state(sentinel.get_child("root"))
            self.__check_if_each_node_is_processed_once(
                sentinel.get_child("root"), {"/root/2011-07-16/06": 0})
            if max_page_opens_per_second is not None:
                token_filler.stop()
            return end - start
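
The throttling machinery above (StandardTokenBucket, TokenBucketFiller, ThrottledWebBrowserCreator) is project code not shown in these examples. The idea is a classic token bucket: a filler thread tops the bucket up at a fixed rate, and every page open must first take a token, so sustained page opens cannot exceed the fill rate. A minimal thread-safe sketch under those assumptions (class and method names here are illustrative, not the project's):

import threading
import time


class SimpleTokenBucket(object):
    """Hypothetical stand-in for StandardTokenBucket: holds at most
    `capacity` tokens; get_token() blocks until one is available."""

    def __init__(self, capacity):
        self._capacity = capacity
        self._tokens = capacity
        self._condition = threading.Condition()

    def get_token(self):
        with self._condition:
            while self._tokens == 0:
                self._condition.wait()
            self._tokens -= 1

    def add_tokens(self, count):
        with self._condition:
            self._tokens = min(self._capacity, self._tokens + count)
            self._condition.notify_all()


class SimpleTokenBucketFiller(threading.Thread):
    """Hypothetical stand-in for TokenBucketFiller: adds `tokens_per_fill`
    tokens `fills_per_second` times per second. The parameter meaning is
    guessed from TokenBucketFiller(token_bucket, 1,
    max_page_opens_per_second) in the test above."""

    def __init__(self, bucket, tokens_per_fill, fills_per_second):
        threading.Thread.__init__(self)
        self.daemon = True
        self._bucket = bucket
        self._tokens_per_fill = tokens_per_fill
        self._fill_interval = 1.0 / fills_per_second
        self._stopped = threading.Event()

    def run(self):
        while not self._stopped.is_set():
            time.sleep(self._fill_interval)
            self._bucket.add_tokens(self._tokens_per_fill)

    def stop(self):
        self._stopped.set()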
Example #13
	def test_multithreaded_download_speedup_with_slow_HTTP_server(self):
#		Logger.start(logging_level=logging.DEBUG)
		with DelayedHTTPFilesServer(
				Resources.path(__file__, "data/original_site"), 0.1) as server:
			(address, ip_number) = server.start()
			root_address = "http://{}:{}/issues_1.html".format(
				address, ip_number)
			time_taken = []
			threads_no_list = [1, 4]
			for threads_no in threads_no_list:
				run_time = self.__check_download(threads_no, root_address)
				time_taken.append(run_time)
			assert_str = "{} threads time taken: {}s while "\
				"{} threads time taken: {}s".format(
					threads_no_list[0], time_taken[0],
					threads_no_list[1], time_taken[1])
			min_speedup = 1
			## We're expecting at least some speedup. The speedup
			## is not fully deterministic and depends e.g. on processor load.
			self.assertTrue(time_taken[0] > min_speedup*time_taken[1], assert_str)
Example #16
 def test_multithreaded_download(self):
     address = "file:"+\
      Resources.path(__file__, "data/original_site/issues_1.html",
         convert_to_url=True)
     for threads_no in [1, 2, 3, 4, 50]:
         self.__check_download(threads_no, address)