예제 #1
0
    def _parse_text_res(self, page: LinkAttrs) -> str:
        """Fetch the text resource behind ``page``, rewrite the links it
        contains to local reference paths, and queue any newly discovered
        downloadable links onto ``self._internal_list``.

        :param page: resource descriptor whose ``link`` is fetched.
        :return: the resource text with matched links rewritten.
        """
        # JavaScript sources often escape slashes ("\/"); normalise first.
        page.link = page.link.replace("\\/", "/")
        response = LinkChecker.get_common_web_resource(
            page.link,
            timeout=self._timeout,
            redirect=self._max_redirect,
            retries=self._max_retries)
        groups = []
        # _map_res_str appends discovered LinkAttrs into `groups` as a side
        # effect of being used as the re.sub replacement callback below.
        parse_str_sp = functools.partial(ArchiveExplorer._map_res_str, groups,
                                         self._original_domain, page)
        if page.res_type == LinkUtility.EXT_WEBPAGE:
            # Strip the archive.org toolbar markup before rewriting links.
            text = str(LinkUtility.remove_archive_org_footprint(response.text))
        else:
            text = response.text
        result = re.sub(link_pattern, parse_str_sp, text)
        for item in groups:
            if isinstance(item, LinkAttrs):
                if not ArchiveExplorer._is_in_list(item.path, self._internal_list) and\
                        ArchiveExplorer.is_downloadable_content(item, self._max_level):
                    with self._sync_lock:
                        # Record a redirect when the shortened ref path
                        # differs from the real one.
                        if item.shadow_ref_link != item.ref_link:
                            self._file_manager.write_to_redirect(
                                item.shadow_ref_link, item.ref_link)
                        self._internal_list.append(item)

        return result
    def scrape_web_res(self, page: LinkAttrs):
        """Download one resource and record per-type success/failure stats.

        HTML/CSS/JS resources are parsed via ``_parse_text_res`` so their
        links are rewritten before the text is written to disk; everything
        else is downloaded verbatim.  Failures are written to the error log
        and counted instead of propagated, so one broken resource cannot
        abort the crawl.

        :param page: resource descriptor to fetch and persist.
        """
        print("look:", page.link, "level: ", page.level)
        try:
            if len(page.path) > ArchiveExplorer.MAX_PATH_LEN:  # max file path in any file system
                raise OSError("file path is too long:" + page.path)
            # Only the status code matters here; the content type is unused.
            response_code, _ = LinkChecker.get_response(page.link)
            if response_code not in [ResponseCode.LinkOK, ResponseCode.LinkFound, ResponseCode.LinkRedirect]:
                raise ConnectionError("res is not available: " + page.link)
            if page.res_type in [LinkUtility.EXT_WEBPAGE, LinkUtility.EXT_CSS, LinkUtility.EXT_JS]:  # parse a webpage
                save_text = self._parse_text_res(page)
                self._file_manager.write_to_file(page.path, save_text)
            else:
                # TODO: image/font resources could be fetched explicitly in
                # binary mode; for now everything non-text uses download_file.
                self._file_manager.download_file(sub_path=page.path, url=page.link, timeout=self._timeout,
                                                 redirect=self._max_redirect, retries=self._max_retries)
        except Exception as ex:
            # Boundary handler: log and count, never propagate.
            print("exception:", ex)
            print("broken res:", page)
            with self._sync_lock:
                self._file_manager.write_to_error_log(page.to_tuple(str(ex)))
                if page.res_type == LinkUtility.EXT_WEBPAGE:
                    self._broken_webpage_count += 1
                elif page.res_type == LinkUtility.EXT_CSS:
                    self._broken_css_count += 1
                elif page.res_type == LinkUtility.EXT_IMAGE:
                    self._broken_image_count += 1
                elif page.res_type == LinkUtility.EXT_JS:
                    self._broken_js_count += 1
                else:
                    self._broken_others_count += 1

                self._broken_res_list.append(page)
        finally:
            # Count every attempt, successful or not.
            with self._sync_lock:
                self._total_res_done += 1
                if page.res_type == LinkUtility.EXT_WEBPAGE:
                    self._total_webpage_count += 1
                elif page.res_type == LinkUtility.EXT_CSS:
                    self._total_css_count += 1
                elif page.res_type == LinkUtility.EXT_IMAGE:
                    self._total_image_count += 1
                elif page.res_type == LinkUtility.EXT_JS:
                    self._total_js_count += 1
                else:
                    self._total_others_count += 1
    def __init__(self, original_domain: str, link: str, external_stop_event: multiprocessing.Event, max_thread: int=1,
                 download_content=True, download_base_dir=None, max_level=2, max_page=200):
        """Build an explorer that mirrors the archive.org snapshot at ``link``
        for the site ``original_domain``.

        :param original_domain: domain of the original (non-archived) site.
        :param link: archive.org URL to start crawling from.
        :param external_stop_event: event external code sets to request a stop.
        :param max_thread: worker count; values below 1 are clamped to 1.
        :param download_content: when True, resources are written to disk.
        :param download_base_dir: must be non-None when download_content is
            True.  NOTE(review): it is validated below but never used — the
            file manager is built with FilePath.get_default_archive_dir();
            confirm whether download_base_dir should be passed instead.
        :param max_level: maximum link depth to follow from the start page.
        :param max_page: maximum number of pages to process.
        :raises ValueError: if download_content is True and
            download_base_dir is None.
        """
        self._original_domain = original_domain
        self._archive_link = link
        self._external_stop_event = external_stop_event
        self._internal_pages = []  # an array of PageAttrs  for page comparison
        self._external_ref_page = []  # an array of PageAttrs for page comparison
        self._internal_list = []  # an array of LinkAttrs for checking download list
        self._broken_res_list = []  # resources that failed to download
        inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
        file_path, ref_path = LinkUtility.make_valid_web_res_path(path, fragment)
        # Seed the download queue with the start page itself (level 0).
        self._internal_list.append(LinkAttrs(link=link, path=file_path, ref_link=ref_path, shadow_ref_link=ref_path,
                                             source=file_path,
                                             res_type=LinkUtility.EXT_WEBPAGE, level=0))
        self._max_thread = max_thread
        self._max_level = max_level
        self._current_level = 0
        self._max_page = max_page
        if max_thread < 1:
            self._max_thread = 1
        self._download_content = download_content
        if self._download_content and download_base_dir is None:
            raise ValueError("ArchiveExplorer.__init__: download_base_dir cannot be None.")
        self._file_manager = SiteFileManager(base_dir_path=FilePath.get_default_archive_dir(), file_name=original_domain)
        # First row of the error log holds the column titles.
        self._file_manager.write_to_error_log(LinkAttrs.get_titles())
        self._max_redirect = 10
        self._max_retries = 2
        self._pool = None
        # Guards shared counters/lists against concurrent worker threads.
        self._sync_lock = threading.RLock()

        # Per-type failure counters.
        self._broken_webpage_count = 0
        self._broken_image_count = 0
        self._broken_css_count = 0
        self._broken_js_count = 0
        self._broken_others_count = 0

        # Per-type attempt counters (incremented whether or not it failed).
        self._total_webpage_count = 0
        self._total_image_count = 0
        self._total_css_count = 0
        self._total_js_count = 0
        self._total_others_count = 0

        self._total_res_done = 0

        self._timeout = 10
    def _parse_text_res(self, page: LinkAttrs) -> str:
        """Fetch the text resource behind ``page``, rewrite the links it
        contains to local reference paths, and queue any newly discovered
        downloadable links onto ``self._internal_list``.

        :param page: resource descriptor whose ``link`` is fetched.
        :return: the resource text with matched links rewritten.
        """
        # JavaScript sources often escape slashes ("\/"); normalise first.
        page.link = page.link.replace("\\/", "/")
        response = LinkChecker.get_common_web_resource(page.link, timeout=self._timeout,
                                                       redirect=self._max_redirect, retries=self._max_retries)
        groups = []
        # _map_res_str appends discovered LinkAttrs into `groups` as a side
        # effect of being used as the re.sub replacement callback below.
        parse_str_sp = functools.partial(ArchiveExplorer._map_res_str, groups, self._original_domain, page)
        if page.res_type == LinkUtility.EXT_WEBPAGE:
            # Strip the archive.org toolbar markup before rewriting links.
            text = str(LinkUtility.remove_archive_org_footprint(response.text))
        else:
            text = response.text
        result = re.sub(link_pattern, parse_str_sp, text)
        for item in groups:
            if isinstance(item, LinkAttrs):
                if not ArchiveExplorer._is_in_list(item.path, self._internal_list) and\
                        ArchiveExplorer.is_downloadable_content(item, self._max_level):
                    with self._sync_lock:
                        # Record a redirect when the shortened ref path
                        # differs from the real one.
                        if item.shadow_ref_link != item.ref_link:
                            self._file_manager.write_to_redirect(item.shadow_ref_link, item.ref_link)
                        self._internal_list.append(item)

        return result
예제 #5
0
    def scrape_web_res(self, page: LinkAttrs):
        """Download one resource and record per-type success/failure stats.

        HTML/CSS/JS resources are parsed via ``_parse_text_res`` so their
        links are rewritten before the text is written to disk; everything
        else is downloaded verbatim.  Failures are written to the error log
        and counted instead of propagated, so one broken resource cannot
        abort the crawl.

        :param page: resource descriptor to fetch and persist.
        """
        print("look:", page.link, "level: ", page.level)
        try:
            if len(
                    page.path
            ) > ArchiveExplorer.MAX_PATH_LEN:  # max file path in any file system
                raise OSError("file path is too long:" + page.path)
            # Only the status code matters here; the content type is unused.
            response_code, _ = LinkChecker.get_response(page.link)
            if response_code not in [
                    ResponseCode.LinkOK, ResponseCode.LinkFound,
                    ResponseCode.LinkRedirect
            ]:
                raise ConnectionError("res is not available: " + page.link)
            if page.res_type in [
                    LinkUtility.EXT_WEBPAGE, LinkUtility.EXT_CSS,
                    LinkUtility.EXT_JS
            ]:  # parse a webpage
                save_text = self._parse_text_res(page)
                self._file_manager.write_to_file(page.path, save_text)
            else:
                # TODO: image/font resources could be fetched explicitly in
                # binary mode; for now everything non-text uses download_file.
                self._file_manager.download_file(sub_path=page.path,
                                                 url=page.link,
                                                 timeout=self._timeout,
                                                 redirect=self._max_redirect,
                                                 retries=self._max_retries)
        except Exception as ex:
            # Boundary handler: log and count, never propagate.
            print("exception:", ex)
            print("broken res:", page)
            with self._sync_lock:
                self._file_manager.write_to_error_log(page.to_tuple(str(ex)))
                if page.res_type == LinkUtility.EXT_WEBPAGE:
                    self._broken_webpage_count += 1
                elif page.res_type == LinkUtility.EXT_CSS:
                    self._broken_css_count += 1
                elif page.res_type == LinkUtility.EXT_IMAGE:
                    self._broken_image_count += 1
                elif page.res_type == LinkUtility.EXT_JS:
                    self._broken_js_count += 1
                else:
                    self._broken_others_count += 1

                self._broken_res_list.append(page)
        finally:
            # Count every attempt, successful or not.
            with self._sync_lock:
                self._total_res_done += 1
                if page.res_type == LinkUtility.EXT_WEBPAGE:
                    self._total_webpage_count += 1
                elif page.res_type == LinkUtility.EXT_CSS:
                    self._total_css_count += 1
                elif page.res_type == LinkUtility.EXT_IMAGE:
                    self._total_image_count += 1
                elif page.res_type == LinkUtility.EXT_JS:
                    self._total_js_count += 1
                else:
                    self._total_others_count += 1
예제 #6
0
    def __init__(self,
                 original_domain: str,
                 link: str,
                 external_stop_event: multiprocessing.Event,
                 max_thread: int = 1,
                 download_content=True,
                 download_base_dir=None,
                 max_level=2,
                 max_page=200):
        """Build an explorer that mirrors the archive.org snapshot at ``link``
        for the site ``original_domain``.

        :param original_domain: domain of the original (non-archived) site.
        :param link: archive.org URL to start crawling from.
        :param external_stop_event: event external code sets to request a stop.
        :param max_thread: worker count; values below 1 are clamped to 1.
        :param download_content: when True, resources are written to disk.
        :param download_base_dir: must be non-None when download_content is
            True.  NOTE(review): it is validated below but never used — the
            file manager is built with FilePath.get_default_archive_dir();
            confirm whether download_base_dir should be passed instead.
        :param max_level: maximum link depth to follow from the start page.
        :param max_page: maximum number of pages to process.
        :raises ValueError: if download_content is True and
            download_base_dir is None.
        """
        self._original_domain = original_domain
        self._archive_link = link
        self._external_stop_event = external_stop_event
        self._internal_pages = []  # an array of PageAttrs  for page comparison
        self._external_ref_page = [
        ]  # an array of PageAttrs for page comparison
        self._internal_list = [
        ]  # an array of LinkAttrs for checking download list
        self._broken_res_list = []  # resources that failed to download
        inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(
            link)
        file_path, ref_path = LinkUtility.make_valid_web_res_path(
            path, fragment)
        # Seed the download queue with the start page itself (level 0).
        self._internal_list.append(
            LinkAttrs(link=link,
                      path=file_path,
                      ref_link=ref_path,
                      shadow_ref_link=ref_path,
                      source=file_path,
                      res_type=LinkUtility.EXT_WEBPAGE,
                      level=0))
        self._max_thread = max_thread
        self._max_level = max_level
        self._current_level = 0
        self._max_page = max_page
        if max_thread < 1:
            self._max_thread = 1
        self._download_content = download_content
        if self._download_content and download_base_dir is None:
            raise ValueError(
                "ArchiveExplorer.__init__: download_base_dir cannot be None.")
        self._file_manager = SiteFileManager(
            base_dir_path=FilePath.get_default_archive_dir(),
            file_name=original_domain)
        # First row of the error log holds the column titles.
        self._file_manager.write_to_error_log(LinkAttrs.get_titles())
        self._max_redirect = 10
        self._max_retries = 2
        self._pool = None
        # Guards shared counters/lists against concurrent worker threads.
        self._sync_lock = threading.RLock()

        # Per-type failure counters.
        self._broken_webpage_count = 0
        self._broken_image_count = 0
        self._broken_css_count = 0
        self._broken_js_count = 0
        self._broken_others_count = 0

        # Per-type attempt counters (incremented whether or not it failed).
        self._total_webpage_count = 0
        self._total_image_count = 0
        self._total_css_count = 0
        self._total_js_count = 0
        self._total_others_count = 0

        self._total_res_done = 0

        self._timeout = 10
예제 #7
0
 def _map_res_str(captured: list, root_domain: str, page: LinkAttrs,
                  current_match) -> str:
     """re.sub replacement callback: rewrite one matched link.

     The first three parameters are pre-bound with functools.partial (see
     _parse_text_res); re.sub supplies `current_match`.  Links that should
     be downloaded are appended to `captured` as LinkAttrs (side effect),
     and the replacement text for the match is returned.  On any error the
     match is replaced with "" (see the finally block).
     """
     returned = None
     level = page.level
     try:
         link = current_match.group(0)
         # print("cap:", link)
         # Groups 1+2 form the link proper; the text of the full match
         # around it is preserved as begin/end marks.
         match2 = current_match.group(2)
         current_link = current_match.group(1) + match2
         begin_index = str(link).index("/")
         begin_mark = str(link[:begin_index]).strip()
         end_index = begin_index + len(current_link)
         if end_index >= len(link):
             end_mark = ""
         else:
             end_mark = str(link[end_index:]).strip()
         # if "%3" in current_link:  # transform encoded url
         inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(
             current_link)
         if len(inner_link) > 0:
             if root_domain in domain or link_class != LinkUtility.EXT_WEBPAGE:  # data will be saved in file system
                 if root_domain in domain:
                     is_internal = True
                 else:
                     is_internal = False
                 # Shorten paths that would exceed the file-system limit.
                 path_decoded = parse.unquote(path)
                 if len(path_decoded) > ArchiveExplorer.MAX_PATH_LEN:
                     short_path, ext = LinkChecker.get_shorter_url_path(
                         path)
                     short_path += ext
                 else:
                     short_path = path
                 # Web pages are saved with a .html extension.
                 if link_class == LinkUtility.EXT_WEBPAGE:
                     if len(ext) > 0 and not ext == ".html":
                         valid_short_path = short_path.replace(ext, ".html")
                     else:
                         valid_short_path = short_path
                 else:
                     valid_short_path = short_path
                 file_path, ref_path = LinkUtility.make_valid_web_res_path(
                     path, fragment)
                 # NOTE(review): short_file_path is never used below.
                 short_file_path, short_ref_path = LinkUtility.make_valid_web_res_path(
                     valid_short_path, fragment)
                 # Undo javascript-style slash escaping before queueing.
                 current_link = current_link.replace("\\/", "/")
                 captured.append(
                     LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN +
                               current_link,
                               short_file_path,
                               short_ref_path,
                               ref_path,
                               page.path,
                               link_class,
                               level + 1,
                               is_internal=is_internal))
                 returned = begin_mark + short_ref_path + end_mark
             else:  # root_domain not in domain and ext == LinkUtility.EXT_WEBPAGE
                 returned = begin_mark + parse.unquote(match2) + end_mark
             # else:  # capture other resources except external webpage
             #     file_path, ref_path = LinkUtility.make_valid_web_res_path(path)
             #     captured.append(LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN+current_link, file_path, ref_path, file_path, ext, level+1))
             #     returned = begin_mark + ref_path + end_mark
         else:
             returned = begin_mark + parse.unquote(current_link) + end_mark
     except Exception as ex:
         print("ex in mapping:", ex)
     finally:
         # re.sub requires a str return even on failure, hence "".
         if isinstance(returned, str):
             # print("sub:", returned)
             return returned
         else:
             return ""