Ejemplo n.º 1
0
    def _parse_text_res(self, page: LinkAttrs) -> str:
        """Fetch a text resource, rewrite its links and queue new ones.

        The content behind ``page.link`` is downloaded, every match of the
        module-level ``link_pattern`` is replaced through
        ``ArchiveExplorer._map_res_str``, and each ``LinkAttrs`` found in
        ``groups`` afterwards is appended to ``self._internal_list`` when it
        is not already listed and is downloadable within ``self._max_level``.

        Args:
            page: Resource to fetch. Its ``link`` attribute is normalised in
                place (escaped slashes are unescaped) before the request.

        Returns:
            The fetched text after link substitution.
        """
        page.link = page.link.replace("\\/", "/")  # in case of javascript
        response = LinkChecker.get_common_web_resource(
            page.link,
            timeout=self._timeout,
            redirect=self._max_redirect,
            retries=self._max_retries)
        # ``groups`` is bound into the substitution callback; presumably
        # ``_map_res_str`` appends every link it discovers -- confirm there.
        groups = []
        parse_str_sp = functools.partial(ArchiveExplorer._map_res_str, groups,
                                         self._original_domain, page)
        if page.res_type == LinkUtility.EXT_WEBPAGE:
            # Web pages get the archive.org footprint removed before parsing.
            text = str(LinkUtility.remove_archive_org_footprint(response.text))
        else:
            text = response.text
        result = re.sub(link_pattern, parse_str_sp, text)

        for item in groups:
            if not isinstance(item, LinkAttrs):
                continue
            # Unlocked pre-check: skips known paths without taking the lock.
            if ArchiveExplorer._is_in_list(item.path, self._internal_list):
                continue
            if not ArchiveExplorer.is_downloadable_content(item,
                                                           self._max_level):
                continue
            with self._sync_lock:
                # Re-check under the lock: another thread may have queued the
                # same path between the pre-check above and this point, which
                # would otherwise leave a duplicate in the list.
                if ArchiveExplorer._is_in_list(item.path, self._internal_list):
                    continue
                if item.shadow_ref_link != item.ref_link:
                    self._file_manager.write_to_redirect(
                        item.shadow_ref_link, item.ref_link)
                self._internal_list.append(item)

        return result
    def _parse_text_res(self, page: LinkAttrs) -> str:
        """Download ``page`` as text, substitute its links, and enqueue new resources.

        Matches of the module-level ``link_pattern`` in the downloaded text are
        replaced via ``ArchiveExplorer._map_res_str``. Every ``LinkAttrs`` left in
        ``groups`` after the substitution is added to ``self._internal_list`` if it
        is not listed yet and ``is_downloadable_content`` accepts it.

        Args:
            page: Resource to download; ``page.link`` is rewritten in place with
                escaped slashes unescaped.

        Returns:
            The downloaded text with the link substitution applied.
        """
        page.link = page.link.replace("\\/", "/")  # in case of javascript
        response = LinkChecker.get_common_web_resource(page.link, timeout=self._timeout,
                                                       redirect=self._max_redirect, retries=self._max_retries)
        # NOTE(review): ``groups`` is presumably filled by ``_map_res_str`` while
        # ``re.sub`` runs -- verify against that helper.
        groups = []
        parse_str_sp = functools.partial(ArchiveExplorer._map_res_str, groups, self._original_domain, page)
        if page.res_type == LinkUtility.EXT_WEBPAGE:
            # Only web pages have the archive.org footprint stripped first.
            text = str(LinkUtility.remove_archive_org_footprint(response.text))
        else:
            text = response.text
        result = re.sub(link_pattern, parse_str_sp, text)

        for item in groups:
            if not isinstance(item, LinkAttrs):
                continue
            # Fast path without the lock; the authoritative check is repeated below.
            if ArchiveExplorer._is_in_list(item.path, self._internal_list):
                continue
            if not ArchiveExplorer.is_downloadable_content(item, self._max_level):
                continue
            with self._sync_lock:
                # The membership test must also hold while the lock is owned,
                # otherwise two threads can both pass it and append the same path.
                if ArchiveExplorer._is_in_list(item.path, self._internal_list):
                    continue
                if item.shadow_ref_link != item.ref_link:
                    self._file_manager.write_to_redirect(item.shadow_ref_link, item.ref_link)
                self._internal_list.append(item)

        return result