def _map_res_str(captured: list, root_domain: str, page: LinkAttrs, current_match) -> str:
    """Build the replacement text for one matched link and record it for download.

    Used (through ``functools.partial``) as the replacement callback passed to
    ``re.sub`` in ``_parse_text_res``.

    :param captured: list that receives a ``LinkAttrs`` for every link whose
        data will be saved locally (mutated in place).
    :param root_domain: domain being archived; a link counts as internal when
        this string is contained in the link's domain.
    :param page: the page currently being rewritten; supplies ``level`` and
        ``path`` for the new ``LinkAttrs``.
    :param current_match: regex match; group 0 is the whole match and
        groups 1 and 2 concatenated form the link itself.
    :return: the text to substitute for the match, or ``""`` when processing
        the match raised an ordinary exception (which is printed).
    """
    returned = None
    level = page.level
    try:
        link = current_match.group(0)
        match2 = current_match.group(2)
        current_link = current_match.group(1) + match2
        # Everything in the match before the first "/" is kept as a prefix.
        begin_index = str(link).index("/")
        begin_mark = str(link[:begin_index]).strip()
        # Everything after the link is kept as a suffix; slicing past the end
        # of a string yields "", so no explicit bounds check is needed.
        end_mark = str(link[begin_index + len(current_link):]).strip()
        inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(current_link)
        if len(inner_link) == 0:
            returned = begin_mark + parse.unquote(current_link) + end_mark
        elif root_domain in domain or link_class != LinkUtility.EXT_WEBPAGE:
            # Data will be saved in the file system: internal links of any
            # kind, plus every non-webpage resource.
            is_internal = root_domain in domain
            if len(parse.unquote(path)) > ArchiveExplorer.MAX_PATH_LEN:
                short_path, ext = LinkChecker.get_shorter_url_path(path)
                short_path += ext
            else:
                short_path = path
            if link_class == LinkUtility.EXT_WEBPAGE and len(ext) > 0 and ext != ".html":
                # NOTE(review): str.replace rewrites every occurrence of ext in
                # the path, not only a trailing extension -- confirm intended.
                valid_short_path = short_path.replace(ext, ".html")
            else:
                valid_short_path = short_path
            _, ref_path = LinkUtility.make_valid_web_res_path(path, fragment)
            short_file_path, short_ref_path = LinkUtility.make_valid_web_res_path(valid_short_path, fragment)
            current_link = current_link.replace("\\/", "/")
            captured.append(LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN+current_link, short_file_path,
                                      short_ref_path, ref_path,
                                      page.path, link_class, level+1, is_internal=is_internal))
            returned = begin_mark + short_ref_path + end_mark
        else:
            # External webpage: keep pointing at the (unquoted) original target.
            returned = begin_mark + parse.unquote(match2) + end_mark
    except Exception as ex:
        print("ex in mapping:", ex)
    # Returning here rather than inside a ``finally`` clause lets
    # KeyboardInterrupt / SystemExit propagate instead of being swallowed.
    return returned if isinstance(returned, str) else ""
Exemple #2
0
    def _parse_text_res(self, page: LinkAttrs) -> str:
        """Fetch a text resource, rewrite its links, and queue newly found resources.

        ``page.link`` is normalised in place (``\\/`` becomes ``/``) before the
        request. Every link matched by ``link_pattern`` is passed to
        ``ArchiveExplorer._map_res_str``, which returns the replacement text and
        collects ``LinkAttrs`` for discovered resources. Collected items that
        are not already in ``self._internal_list`` and pass
        ``is_downloadable_content`` are appended to that list; when an item's
        ``shadow_ref_link`` differs from its ``ref_link`` a redirect entry is
        written first.

        :param page: the resource to fetch and rewrite.
        :return: the resource text with matched links replaced.
        """
        page.link = page.link.replace("\\/", "/")  # in case of javascript
        response = LinkChecker.get_common_web_resource(
            page.link,
            timeout=self._timeout,
            redirect=self._max_redirect,
            retries=self._max_retries)
        groups = []
        parse_str_sp = functools.partial(ArchiveExplorer._map_res_str, groups,
                                         self._original_domain, page)
        if page.res_type == LinkUtility.EXT_WEBPAGE:
            text = str(LinkUtility.remove_archive_org_footprint(response.text))
        else:
            text = response.text
        result = re.sub(link_pattern, parse_str_sp, text)
        for item in groups:
            if not isinstance(item, LinkAttrs):
                continue
            if ArchiveExplorer._is_in_list(item.path, self._internal_list) or \
                    not ArchiveExplorer.is_downloadable_content(item, self._max_level):
                continue
            with self._sync_lock:
                # Re-check under the lock: another thread may have queued the
                # same path between the unlocked check above and this point.
                if ArchiveExplorer._is_in_list(item.path, self._internal_list):
                    continue
                if not item.shadow_ref_link == item.ref_link:
                    self._file_manager.write_to_redirect(
                        item.shadow_ref_link, item.ref_link)
                self._internal_list.append(item)

        return result
    def __init__(self, original_domain: str, link: str, external_stop_event: multiprocessing.Event, max_thread: int=1,
                 download_content=True, download_base_dir=None, max_level=2, max_page=200):
        """Initialise crawl state and seed the download list with ``link``.

        :param original_domain: domain being explored; also used as the file
            name handed to ``SiteFileManager``.
        :param link: starting link; stored and queued as the level-0 webpage.
        :param external_stop_event: event object stored for later use.
        :param max_thread: stored as given, except values below 1 become 1.
        :param download_content: stored flag; when truthy,
            ``download_base_dir`` must not be ``None``.
        :param download_base_dir: only checked against ``None`` here.
        :param max_level: stored maximum level.
        :param max_page: stored maximum page count.
        :raises ValueError: if ``download_content`` is truthy and
            ``download_base_dir`` is ``None``.
        """
        # Caller-supplied configuration.
        self._original_domain = original_domain
        self._archive_link = link
        self._external_stop_event = external_stop_event
        self._download_content = download_content
        self._max_level = max_level
        self._max_page = max_page

        # Fixed request settings.
        self._timeout = 10
        self._max_redirect = 10
        self._max_retries = 2

        # Progress bookkeeping.
        self._internal_pages = []  # an array of PageAttrs  for page comparison
        self._external_ref_page = []  # an array of PageAttrs for page comparison
        self._broken_res_list = []
        self._current_level = 0
        self._pool = None
        self._total_res_done = 0

        # Per-category counters of broken resources.
        self._broken_webpage_count = 0
        self._broken_image_count = 0
        self._broken_css_count = 0
        self._broken_js_count = 0
        self._broken_others_count = 0

        # Per-category counters of all resources.
        self._total_webpage_count = 0
        self._total_image_count = 0
        self._total_css_count = 0
        self._total_js_count = 0
        self._total_others_count = 0

        # Seed the download list (an array of LinkAttrs) with the start link.
        _, _, path, _, _, fragment = LinkUtility.get_link_detail(link)
        seed_file_path, seed_ref_path = LinkUtility.make_valid_web_res_path(path, fragment)
        self._internal_list = [
            LinkAttrs(link=link, path=seed_file_path, ref_link=seed_ref_path, shadow_ref_link=seed_ref_path,
                      source=seed_file_path,
                      res_type=LinkUtility.EXT_WEBPAGE, level=0)
        ]

        self._max_thread = 1 if max_thread < 1 else max_thread
        if download_content and download_base_dir is None:
            raise ValueError("ArchiveExplorer.__init__: download_base_dir cannot be None.")
        self._file_manager = SiteFileManager(base_dir_path=FilePath.get_default_archive_dir(), file_name=original_domain)
        self._file_manager.write_to_error_log(LinkAttrs.get_titles())
        self._sync_lock = threading.RLock()
    def _parse_text_res(self, page: LinkAttrs) -> str:
        """Fetch a text resource, rewrite its links, and queue newly found resources.

        ``page.link`` is normalised in place (``\\/`` becomes ``/``) before the
        request. Every link matched by ``link_pattern`` is passed to
        ``ArchiveExplorer._map_res_str``, which returns the replacement text and
        collects ``LinkAttrs`` for discovered resources. Collected items that
        are not already in ``self._internal_list`` and pass
        ``is_downloadable_content`` are appended to that list; when an item's
        ``shadow_ref_link`` differs from its ``ref_link`` a redirect entry is
        written first.

        :param page: the resource to fetch and rewrite.
        :return: the resource text with matched links replaced.
        """
        page.link = page.link.replace("\\/", "/")  # in case of javascript
        response = LinkChecker.get_common_web_resource(page.link, timeout=self._timeout,
                                                       redirect=self._max_redirect, retries=self._max_retries)
        groups = []
        parse_str_sp = functools.partial(ArchiveExplorer._map_res_str, groups, self._original_domain, page)
        if page.res_type == LinkUtility.EXT_WEBPAGE:
            text = str(LinkUtility.remove_archive_org_footprint(response.text))
        else:
            text = response.text
        result = re.sub(link_pattern, parse_str_sp, text)
        for item in groups:
            if not isinstance(item, LinkAttrs):
                continue
            if ArchiveExplorer._is_in_list(item.path, self._internal_list) or \
                    not ArchiveExplorer.is_downloadable_content(item, self._max_level):
                continue
            with self._sync_lock:
                # Re-check under the lock: another thread may have queued the
                # same path between the unlocked check above and this point.
                if ArchiveExplorer._is_in_list(item.path, self._internal_list):
                    continue
                if not item.shadow_ref_link == item.ref_link:
                    self._file_manager.write_to_redirect(item.shadow_ref_link, item.ref_link)
                self._internal_list.append(item)

        return result
Exemple #5
0
    def __init__(self,
                 original_domain: str,
                 link: str,
                 external_stop_event: multiprocessing.Event,
                 max_thread: int = 1,
                 download_content=True,
                 download_base_dir=None,
                 max_level=2,
                 max_page=200):
        """Initialise crawl state and seed the download list with ``link``.

        :param original_domain: domain being explored; also used as the file
            name handed to ``SiteFileManager``.
        :param link: starting link; stored and queued as the level-0 webpage.
        :param external_stop_event: event object stored for later use.
        :param max_thread: stored as given, except values below 1 become 1.
        :param download_content: stored flag; when truthy,
            ``download_base_dir`` must not be ``None``.
        :param download_base_dir: only checked against ``None`` here.
        :param max_level: stored maximum level.
        :param max_page: stored maximum page count.
        :raises ValueError: if ``download_content`` is truthy and
            ``download_base_dir`` is ``None``.
        """
        # Caller-supplied configuration.
        self._original_domain = original_domain
        self._archive_link = link
        self._external_stop_event = external_stop_event
        self._download_content = download_content
        self._max_level = max_level
        self._max_page = max_page

        # Fixed request settings.
        self._timeout = 10
        self._max_redirect = 10
        self._max_retries = 2

        # Progress bookkeeping.
        self._internal_pages = []  # an array of PageAttrs  for page comparison
        self._external_ref_page = []  # an array of PageAttrs for page comparison
        self._broken_res_list = []
        self._current_level = 0
        self._pool = None
        self._total_res_done = 0

        # Per-category counters of broken resources.
        self._broken_webpage_count = 0
        self._broken_image_count = 0
        self._broken_css_count = 0
        self._broken_js_count = 0
        self._broken_others_count = 0

        # Per-category counters of all resources.
        self._total_webpage_count = 0
        self._total_image_count = 0
        self._total_css_count = 0
        self._total_js_count = 0
        self._total_others_count = 0

        # Seed the download list (an array of LinkAttrs) with the start link.
        _, _, path, _, _, fragment = LinkUtility.get_link_detail(link)
        seed_file_path, seed_ref_path = LinkUtility.make_valid_web_res_path(
            path, fragment)
        self._internal_list = [
            LinkAttrs(link=link,
                      path=seed_file_path,
                      ref_link=seed_ref_path,
                      shadow_ref_link=seed_ref_path,
                      source=seed_file_path,
                      res_type=LinkUtility.EXT_WEBPAGE,
                      level=0)
        ]

        self._max_thread = 1 if max_thread < 1 else max_thread
        if download_content and download_base_dir is None:
            raise ValueError(
                "ArchiveExplorer.__init__: download_base_dir cannot be None.")
        self._file_manager = SiteFileManager(
            base_dir_path=FilePath.get_default_archive_dir(),
            file_name=original_domain)
        self._file_manager.write_to_error_log(LinkAttrs.get_titles())
        self._sync_lock = threading.RLock()
Exemple #6
0
 def _map_res_str(captured: list, root_domain: str, page: LinkAttrs,
                  current_match) -> str:
     """Build the replacement text for one matched link and record it for download.

     The argument order (collector list, root domain, page, then the match)
     allows the first three to be pre-bound so the result can serve as a
     one-argument regex replacement callback.

     :param captured: list that receives a ``LinkAttrs`` for every link whose
         data will be saved locally (mutated in place).
     :param root_domain: domain being archived; a link counts as internal when
         this string is contained in the link's domain.
     :param page: the page currently being rewritten; supplies ``level`` and
         ``path`` for the new ``LinkAttrs``.
     :param current_match: regex match; group 0 is the whole match and
         groups 1 and 2 concatenated form the link itself.
     :return: the text to substitute for the match, or ``""`` when processing
         the match raised an ordinary exception (which is printed).
     """
     returned = None
     level = page.level
     try:
         link = current_match.group(0)
         match2 = current_match.group(2)
         current_link = current_match.group(1) + match2
         # Everything in the match before the first "/" is kept as a prefix.
         begin_index = str(link).index("/")
         begin_mark = str(link[:begin_index]).strip()
         # Everything after the link is kept as a suffix; slicing past the end
         # of a string yields "", so no explicit bounds check is needed.
         end_mark = str(link[begin_index + len(current_link):]).strip()
         inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(
             current_link)
         if len(inner_link) == 0:
             returned = begin_mark + parse.unquote(current_link) + end_mark
         elif root_domain in domain or link_class != LinkUtility.EXT_WEBPAGE:
             # Data will be saved in the file system: internal links of any
             # kind, plus every non-webpage resource.
             is_internal = root_domain in domain
             if len(parse.unquote(path)) > ArchiveExplorer.MAX_PATH_LEN:
                 short_path, ext = LinkChecker.get_shorter_url_path(
                     path)
                 short_path += ext
             else:
                 short_path = path
             if link_class == LinkUtility.EXT_WEBPAGE and len(ext) > 0 and ext != ".html":
                 # NOTE(review): str.replace rewrites every occurrence of ext in
                 # the path, not only a trailing extension -- confirm intended.
                 valid_short_path = short_path.replace(ext, ".html")
             else:
                 valid_short_path = short_path
             _, ref_path = LinkUtility.make_valid_web_res_path(
                 path, fragment)
             short_file_path, short_ref_path = LinkUtility.make_valid_web_res_path(
                 valid_short_path, fragment)
             current_link = current_link.replace("\\/", "/")
             captured.append(
                 LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN +
                           current_link,
                           short_file_path,
                           short_ref_path,
                           ref_path,
                           page.path,
                           link_class,
                           level + 1,
                           is_internal=is_internal))
             returned = begin_mark + short_ref_path + end_mark
         else:
             # External webpage: keep pointing at the (unquoted) original target.
             returned = begin_mark + parse.unquote(match2) + end_mark
     except Exception as ex:
         print("ex in mapping:", ex)
     # Returning here rather than inside a ``finally`` clause lets
     # KeyboardInterrupt / SystemExit propagate instead of being swallowed.
     return returned if isinstance(returned, str) else ""