def _parse_text_res(self, page: LinkAttrs) -> str:
    """Fetch a text resource, rewrite its links, and queue newly found resources.

    The resource at ``page.link`` is downloaded, every match of
    ``link_pattern`` in its text is passed through
    ``ArchiveExplorer._map_res_str``, and the rewritten text is returned.
    Items collected in ``groups`` during that pass that are ``LinkAttrs``,
    are not yet in ``self._internal_list`` and pass
    ``ArchiveExplorer.is_downloadable_content`` are appended to
    ``self._internal_list``.

    NOTE(review): a second, identical definition of this method follows
    this one in the file; if both live in the same scope the later one
    takes precedence -- consider removing the duplicate.

    :param page: resource to fetch; ``page.link`` is normalised in place.
    :return: the resource text after link substitution.
    """
    # JavaScript sources may escape slashes as "\/"; undo that before fetching.
    page.link = page.link.replace("\\/", "/")
    response = LinkChecker.get_common_web_resource(
        page.link,
        timeout=self._timeout,
        redirect=self._max_redirect,
        retries=self._max_retries,
    )

    # ``groups`` is handed to the substitution callback; presumably the
    # callback appends every resource it discovers -- verify against
    # ArchiveExplorer._map_res_str.
    groups = []
    parse_str_sp = functools.partial(
        ArchiveExplorer._map_res_str, groups, self._original_domain, page)

    if page.res_type == LinkUtility.EXT_WEBPAGE:
        text = str(LinkUtility.remove_archive_org_footprint(response.text))
    else:
        text = response.text
    result = re.sub(link_pattern, parse_str_sp, text)

    for item in groups:
        if not isinstance(item, LinkAttrs):
            continue
        # Cheap pre-filter outside the lock (same order as before: the
        # duplicate test short-circuits the downloadable test).
        if ArchiveExplorer._is_in_list(item.path, self._internal_list):
            continue
        if not ArchiveExplorer.is_downloadable_content(item, self._max_level):
            continue
        with self._sync_lock:
            # Re-check under the lock: another thread may have appended the
            # same path between the test above and acquiring the lock.
            if ArchiveExplorer._is_in_list(item.path, self._internal_list):
                continue
            if item.shadow_ref_link != item.ref_link:
                self._file_manager.write_to_redirect(
                    item.shadow_ref_link, item.ref_link)
            self._internal_list.append(item)
    return result
def _parse_text_res(self, page: LinkAttrs) -> str:
    """Fetch a text resource, rewrite its links, and queue newly found resources.

    The resource at ``page.link`` is downloaded, every match of
    ``link_pattern`` in its text is passed through
    ``ArchiveExplorer._map_res_str``, and the rewritten text is returned.
    Items collected in ``groups`` during that pass that are ``LinkAttrs``,
    are not yet in ``self._internal_list`` and pass
    ``ArchiveExplorer.is_downloadable_content`` are appended to
    ``self._internal_list``.

    NOTE(review): an identical definition of this method appears directly
    before this one in the file; if both live in the same scope this later
    one is the one that takes effect -- consider removing the duplicate.

    :param page: resource to fetch; ``page.link`` is normalised in place.
    :return: the resource text after link substitution.
    """
    # JavaScript sources may escape slashes as "\/"; undo that before fetching.
    page.link = page.link.replace("\\/", "/")
    response = LinkChecker.get_common_web_resource(
        page.link,
        timeout=self._timeout,
        redirect=self._max_redirect,
        retries=self._max_retries,
    )

    # ``groups`` is handed to the substitution callback; presumably the
    # callback appends every resource it discovers -- verify against
    # ArchiveExplorer._map_res_str.
    groups = []
    parse_str_sp = functools.partial(
        ArchiveExplorer._map_res_str, groups, self._original_domain, page)

    if page.res_type == LinkUtility.EXT_WEBPAGE:
        text = str(LinkUtility.remove_archive_org_footprint(response.text))
    else:
        text = response.text
    result = re.sub(link_pattern, parse_str_sp, text)

    for item in groups:
        if not isinstance(item, LinkAttrs):
            continue
        # Cheap pre-filter outside the lock (same order as before: the
        # duplicate test short-circuits the downloadable test).
        if ArchiveExplorer._is_in_list(item.path, self._internal_list):
            continue
        if not ArchiveExplorer.is_downloadable_content(item, self._max_level):
            continue
        with self._sync_lock:
            # Re-check under the lock: another thread may have appended the
            # same path between the test above and acquiring the lock.
            if ArchiveExplorer._is_in_list(item.path, self._internal_list):
                continue
            if item.shadow_ref_link != item.ref_link:
                self._file_manager.write_to_redirect(
                    item.shadow_ref_link, item.ref_link)
            self._internal_list.append(item)
    return result