Example #1
    def process(self, tree_path, page_file):
        assert len(tree_path) > 0
        dir_path = os.path.join(self.__dst_dir_path,
                                _convert_tree_path_to_dir_path(tree_path))
        lenient_makedir(dir_path)
        error_page_path = os.path.join(dir_path, "error.txt")
        _handle_error_page(page_file, error_page_path)
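The helper lenient_makedir itself is not shown on this page. A minimal sketch of what it is assumed to do, judging only from how the examples call it (create a directory and tolerate the case where it already exists), could look like the following; the body is an illustration, not the original implementation:

import errno
import os

def lenient_makedir(dir_path):
    # Create dir_path (and any missing parents), ignoring an already-existing directory.
    try:
        os.makedirs(dir_path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise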
Example #2
	def process(self, tree_path, page_file):
		assert len(tree_path) > 0
		dir_path = os.path.join(self.__dst_dir_path,
			_convert_tree_path_to_dir_path(tree_path))
		lenient_makedir(dir_path)
		error_page_path = os.path.join(dir_path, "error.txt")
		_handle_error_page(page_file, error_page_path)
Example #3
    def process(self, tree_path, page_file):
        assert len(tree_path) > 0
        dir_path = os.path.join(self.__dst_dir_path,
                                _convert_tree_path_to_dir_path(tree_path[:-1]))
        lenient_makedir(dir_path)
        error_page_path = os.path.join(dir_path, tree_path[-1] + "-error.txt")
        _handle_error_page(page_file, error_page_path)
        file_path = os.path.join(dir_path, tree_path[-1] + ".html")
        self.__download_page(page_file, file_path)
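Unlike Examples #1 and #2, the last element of tree_path is used here as the file name, while the leading elements become the directory. The helper _convert_tree_path_to_dir_path is also not shown; a hedged sketch, assuming tree_path is a list of path segments, might be:

import os

def _convert_tree_path_to_dir_path(tree_path):
    # Join tree node names into a relative directory path; an empty path maps to "".
    return os.path.join(*tree_path) if tree_path else ""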
Example #4
	def process(self, tree_path, page_file):
		assert len(tree_path) > 0
		dir_path = os.path.join(self.__dst_dir_path,
			_convert_tree_path_to_dir_path(tree_path[:-1]))
		lenient_makedir(dir_path)
		error_page_path = os.path.join(dir_path, tree_path[-1]+"-error.txt")
		_handle_error_page(page_file, error_page_path)
		file_path = os.path.join(dir_path, tree_path[-1]+".html")
		self.__download_page(page_file, file_path)
Example #5
    def __init__(self,
                 navigators,
                 sentinel,
                 activity_schedule=None,
                 log_file_path=None,
                 state_file_path=None,
                 save_period=None,
                 logging_level=logging.ERROR):
        """
		@param navigators: list of navigators to be used by the crawler.
			Each navigator will be run in a separate thread, thus the
			number of the threads is equal to the number of navigators.
		@type navigators: list of L{AbstractTreeNavigator}s
		@param sentinel: a technical node which will be made parent of the 
			root node.
		@type sentinel: L{AbstractNode}
		@param activity_schedule: if C{None}, no schedule is used and the 
			program works until it finishes crawling.
		@type activity_schedule: L{AbstractActivitySchedule} 
		@param log_file_path: path to the log file. If C{None}, no log file
			will be used.
		@param state_file_path: path to the file where the state of the
			program will be saved. If C{None}, the state will not be saved.
		@param save_period: time between saving the tree state. If
			C{state_file_path} is C{None}, this value is ignored.
		@param logging_level: one of the logging level constants from C{logging}
		"""
        if log_file_path is not None:
            lenient_makedir(os.path.dirname(log_file_path))
        if state_file_path is not None:
            if os.path.exists(state_file_path):
                print "State file already exists. Loading the tree from this "\
                 "file and changing nodes with state PROCESSING to OPEN ... ",
                self.__load_state_file(state_file_path, sentinel)
                print "Done."
            else:
                lenient_makedir(os.path.dirname(state_file_path))
        self.__tree = RWLockTreeAccessor(sentinel)
        self.__navigators = navigators
        self.__manager = None
        self.__state_file_path = state_file_path
        self.__save_period = save_period
        self.__activity_schedule = activity_schedule
        if activity_schedule is None:
            self.__activity_schedule = AlwaysActiveSchedule()
        self.__logging_level = logging_level
        self.__log_file_path = log_file_path
Example #6
	def __init__(self, navigators, sentinel, activity_schedule=None,  
			log_file_path=None, state_file_path=None, save_period=None,
			logging_level=logging.ERROR):
		"""
		@param navigators: list of navigators to be used by the crawler.
			Each navigator will be run in a separate thread, thus the
			number of the threads is equal to the number of navigators.
		@type navigators: list of L{AbstractTreeNavigator}s
		@param sentinel: a technical node which will be made parent of the 
			root node.
		@type sentinel: L{AbstractNode}
		@param activity_schedule: if C{None}, no schedule is used and the 
			program works until it finishes crawling.
		@type activity_schedule: L{AbstractActivitySchedule} 
		@param log_file_path: path to the log file. If C{None}, no log file
			will be used.
		@param state_file_path: path to the file where the state of the
			program will be saved. If C{None}, the state will not be saved.
		@param save_period: time between saving the tree state. If
			C{state_file_path} is C{None}, this value is ignored.
		@param logging_level: one of the logging level constants from C{logging}
		"""
		if log_file_path is not None:
			lenient_makedir(os.path.dirname(log_file_path))
		if state_file_path is not None:
			if os.path.exists(state_file_path):
				print "State file already exists. Loading the tree from this "\
					"file and changing nodes with state PROCESSING to OPEN ... ",
				self.__load_state_file(state_file_path, sentinel)
				print "Done."
			else:
				lenient_makedir(os.path.dirname(state_file_path))
		self.__tree = RWLockTreeAccessor(sentinel)
		self.__navigators = navigators
		self.__manager = None
		self.__state_file_path = state_file_path
		self.__save_period = save_period
		self.__activity_schedule = activity_schedule
		if activity_schedule is None:
			self.__activity_schedule = AlwaysActiveSchedule()
		self.__logging_level = logging_level
		self.__log_file_path = log_file_path
Example #7
	def create(self, args):
		lenient_makedir(args.destination_dir)
		return LevelsCreator(args.destination_dir).create()
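Example #7 receives an argparse-style args object with a destination_dir attribute. A hedged sketch of how such an object could be produced follows; the parser and the positional argument name are assumptions for illustration, not taken from the original project:

import argparse

def parse_args(argv=None):
    parser = argparse.ArgumentParser(description="Create the level files.")
    parser.add_argument("destination_dir",
                        help="directory where the generated files will be written")
    return parser.parse_args(argv)

# e.g. parse_args(["output/levels"]).destination_dir == "output/levels"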