Esempio n. 1
0
    def __init__(self,
                 original_domain: str,
                 link: str,
                 external_stop_event: multiprocessing.Event,
                 max_thread: int = 1,
                 download_content=True,
                 download_base_dir=None,
                 max_level=2,
                 max_page=200):
        """Set up the explorer state and queue ``link`` as the level-0 page.

        Args:
            original_domain: Domain being explored; also used as the file
                name handed to the ``SiteFileManager``.
            link: Archive link of the starting page.
            external_stop_event: Event object stored for later use.
            max_thread: Requested thread count; values below 1 are stored
                as 1.
            download_content: Stored flag; when truthy, ``download_base_dir``
                must be provided.
            download_base_dir: Only checked for ``None`` in this method.
            max_level: Stored level limit.
            max_page: Stored page limit.

        Raises:
            ValueError: If ``download_content`` is truthy and
                ``download_base_dir`` is ``None``.
        """
        self._original_domain = original_domain
        self._archive_link = link
        self._external_stop_event = external_stop_event

        # Bookkeeping collections, all empty to begin with.
        self._internal_pages = []  # PageAttrs entries, for page comparison
        self._external_ref_page = []  # PageAttrs entries, for page comparison
        self._internal_list = []  # LinkAttrs entries, the download check list
        self._broken_res_list = []

        # Seed the download list with the starting page itself.
        _, _, path, _, _, fragment = LinkUtility.get_link_detail(link)
        file_path, ref_path = LinkUtility.make_valid_web_res_path(
            path, fragment)
        root_entry = LinkAttrs(link=link,
                               path=file_path,
                               ref_link=ref_path,
                               shadow_ref_link=ref_path,
                               source=file_path,
                               res_type=LinkUtility.EXT_WEBPAGE,
                               level=0)
        self._internal_list.append(root_entry)

        # Crawl limits; at least one thread is always used.
        self._max_thread = 1 if max_thread < 1 else max_thread
        self._max_level = max_level
        self._current_level = 0
        self._max_page = max_page

        self._download_content = download_content
        if download_base_dir is None and self._download_content:
            raise ValueError(
                "ArchiveExplorer.__init__: download_base_dir cannot be None.")

        # The error log starts with the LinkAttrs column titles.
        self._file_manager = SiteFileManager(
            base_dir_path=FilePath.get_default_archive_dir(),
            file_name=original_domain)
        self._file_manager.write_to_error_log(LinkAttrs.get_titles())

        self._max_redirect = 10
        self._max_retries = 2
        self._timeout = 10
        self._pool = None
        self._sync_lock = threading.RLock()

        # Per-resource-type counters.
        self._broken_webpage_count = self._broken_image_count = 0
        self._broken_css_count = self._broken_js_count = 0
        self._broken_others_count = 0

        self._total_webpage_count = self._total_image_count = 0
        self._total_css_count = self._total_js_count = 0
        self._total_others_count = 0

        self._total_res_done = 0
Esempio n. 2
0
 def _map_res_str(captured: list, root_domain: str, page: LinkAttrs,
                  current_match) -> str:
     """Return the replacement text for one matched link found in ``page``.

     Group 0 of ``current_match`` is the whole matched text; groups 1 and 2
     concatenated form the link. The text of group 0 before its first "/"
     and the text after the link are kept around the rewritten link.

     A link whose domain contains ``root_domain``, or whose class is not
     ``LinkUtility.EXT_WEBPAGE``, is appended to ``captured`` as a
     ``LinkAttrs`` one level deeper than ``page`` and is replaced by its
     local reference path. Any other link is left in place, URL-unquoted.

     Args:
         captured: List that receives a ``LinkAttrs`` for every link that
             will be saved locally; mutated in place.
         root_domain: Domain string tested for containment in the link's
             domain.
         page: The page the link was found in; its ``level`` and ``path``
             are read.
         current_match: Match object exposing ``group(0)``, ``group(1)``
             and ``group(2)``.

     Returns:
         The replacement string, or "" when an ordinary error occurred
         while mapping (the error is printed).
     """
     returned = None
     level = page.level
     try:
         link = current_match.group(0)
         match2 = current_match.group(2)
         current_link = current_match.group(1) + match2
         begin_index = link.index("/")
         begin_mark = link[:begin_index].strip()
         # Slicing past the end yields "", so no explicit bounds check.
         end_mark = link[begin_index + len(current_link):].strip()
         inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(
             current_link)
         if len(inner_link) > 0:
             is_internal = root_domain in domain
             if is_internal or link_class != LinkUtility.EXT_WEBPAGE:
                 # This resource will be saved in the file system.
                 if len(parse.unquote(path)) > ArchiveExplorer.MAX_PATH_LEN:
                     short_path, ext = LinkChecker.get_shorter_url_path(
                         path)
                     short_path += ext
                 else:
                     short_path = path
                 if (link_class == LinkUtility.EXT_WEBPAGE
                         and len(ext) > 0 and ext != ".html"):
                     # NOTE(review): str.replace swaps every occurrence of
                     # ext in the path, not only the trailing extension --
                     # confirm this is intended.
                     valid_short_path = short_path.replace(ext, ".html")
                 else:
                     valid_short_path = short_path
                 _, ref_path = LinkUtility.make_valid_web_res_path(
                     path, fragment)
                 short_file_path, short_ref_path = LinkUtility.make_valid_web_res_path(
                     valid_short_path, fragment)
                 current_link = current_link.replace("\\/", "/")
                 captured.append(
                     LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN +
                               current_link,
                               short_file_path,
                               short_ref_path,
                               ref_path,
                               page.path,
                               link_class,
                               level + 1,
                               is_internal=is_internal))
                 returned = begin_mark + short_ref_path + end_mark
             else:
                 # External web page: keep the link, only unquote it.
                 returned = begin_mark + parse.unquote(match2) + end_mark
         else:
             returned = begin_mark + parse.unquote(current_link) + end_mark
     except Exception as ex:
         print("ex in mapping:", ex)
     # Return after the try statement rather than from a finally clause: a
     # return inside finally would also discard KeyboardInterrupt and
     # SystemExit, which the except clause above lets through on purpose.
     return returned if isinstance(returned, str) else ""