def __init__(self, original_domain: str, link: str,
             external_stop_event: multiprocessing.Event,
             max_thread: int = 1, download_content=True,
             download_base_dir=None, max_level=2, max_page=200):
    """Set up an archive-exploration session rooted at *link*.

    Args:
        original_domain: Domain name of the site being archived; also used
            as the file name for the site file manager.
        link: Root archive URL to explore; seeded as the level-0 entry of
            the internal download list.
        external_stop_event: Event an outside process can set to request
            that exploration stop.
        max_thread: Worker thread count; clamped to a minimum of 1.
        download_content: When True, page content is downloaded, which
            requires ``download_base_dir`` to be provided.
        download_base_dir: Base directory for downloads (see NOTE below).
        max_level: Maximum link depth to follow from the root.
        max_page: Maximum number of pages to process.

    Raises:
        ValueError: If ``download_content`` is True but
            ``download_base_dir`` is None.
    """
    self._original_domain = original_domain
    self._archive_link = link
    self._external_stop_event = external_stop_event
    # PageAttrs collections used for page comparison.
    self._internal_pages = []
    self._external_ref_page = []
    # LinkAttrs records: pending download list and broken-resource log.
    self._internal_list = []
    self._broken_res_list = []
    # Decompose the root link and register it as the level-0 download target.
    inner_link, domain, path, link_class, ext, fragment = \
        LinkUtility.get_link_detail(link)
    file_path, ref_path = LinkUtility.make_valid_web_res_path(path, fragment)
    self._internal_list.append(
        LinkAttrs(link=link, path=file_path, ref_link=ref_path,
                  shadow_ref_link=ref_path, source=file_path,
                  res_type=LinkUtility.EXT_WEBPAGE, level=0))
    # Clamp to at least one worker thread (replaces assign-then-override).
    self._max_thread = max(1, max_thread)
    self._max_level = max_level
    self._current_level = 0
    self._max_page = max_page
    self._download_content = download_content
    if self._download_content and download_base_dir is None:
        raise ValueError(
            "ArchiveExplorer.__init__: download_base_dir cannot be None.")
    # NOTE(review): download_base_dir is validated above but never used —
    # the file manager is always rooted at the default archive dir below.
    # Confirm whether base_dir_path should be download_base_dir instead.
    self._file_manager = SiteFileManager(
        base_dir_path=FilePath.get_default_archive_dir(),
        file_name=original_domain)
    # Write the column headers for the error log up front.
    self._file_manager.write_to_error_log(LinkAttrs.get_titles())
    self._max_redirect = 10
    self._max_retries = 2
    self._pool = None
    self._sync_lock = threading.RLock()
    # Per-resource-type broken counters, updated during exploration.
    self._broken_webpage_count = 0
    self._broken_image_count = 0
    self._broken_css_count = 0
    self._broken_js_count = 0
    self._broken_others_count = 0
    # Per-resource-type totals seen, updated during exploration.
    self._total_webpage_count = 0
    self._total_image_count = 0
    self._total_css_count = 0
    self._total_js_count = 0
    self._total_others_count = 0
    self._total_res_done = 0
    # Network timeout in seconds for resource requests.
    self._timeout = 10