def test_response(link: str) -> bool:
    """Fetch *link* and report whether it answered with HTTP 200.

    Prints a one-line verdict (status code and content type) either way.
    """
    status_code, content_type = LinkChecker.get_response(link)
    ok = status_code == 200
    verdict = "status good:" if ok else "status bad:"
    print(link, verdict, status_code, " content: ", content_type)
    return ok
def scrape_web_res(self, page: LinkAttrs):
    """Fetch one resource described by *page* and persist it locally.

    Text resources (webpage/CSS/JS) are parsed via ``_parse_text_res`` and
    written out; everything else is downloaded verbatim.  Failures are
    logged to the error log and counted per resource type; the ``finally``
    block always bumps the per-type totals, success or not.

    NOTE(review): the exact nesting of the counter updates inside
    ``with self._sync_lock:`` is reconstructed from mangled formatting —
    confirm against the original file.
    """
    print("look:", page.link, "level: ", page.level)
    try:
        if len(page.path) > ArchiveExplorer.MAX_PATH_LEN:  # max file path in any file system
            raise OSError("file path is too long:" + page.path)
        response_code, content_type = LinkChecker.get_response(page.link)
        # Only OK / Found / Redirect responses are treated as reachable.
        if response_code not in [ResponseCode.LinkOK, ResponseCode.LinkFound, ResponseCode.LinkRedirect]:
            raise ConnectionError("res is not available: " + page.link)
        if page.res_type in [LinkUtility.EXT_WEBPAGE, LinkUtility.EXT_CSS, LinkUtility.EXT_JS]:  # parse a webpage
            # Textual resource: rewrite/parse its contents before saving.
            save_text = self._parse_text_res(page)
            self._file_manager.write_to_file(page.path, save_text)
        # elif page.res_type != LinkUtility.EXT_OTHER:  # TODO: download normal resources
        #     response = LinkChecker.get_common_web_resource(page.link)
        #     if page.res_type == LinkUtility.EXT_IMAGE or page.res_type == LinkUtility.EXT_FONT:
        #         self._downloader.write_to_file(page.path, response.content, mode="b")
        #     else:
        #         self._downloader.write_to_file(page.path, response.text, mode="t")
        else:
            # Binary/other resource: stream straight to disk.
            # response = LinkChecker.get_common_web_resource(page.link)
            # self._downloader.write_to_file(page.path, response.content, mode="b")
            self._file_manager.download_file(sub_path=page.path, url=page.link,
                                             timeout=self._timeout,
                                             redirect=self._max_redirect,
                                             retries=self._max_retries)
    except Exception as ex:
        # Record the failure and bump the matching broken-resource counter.
        print("exception:", ex)
        print("broken res:", page)
        with self._sync_lock:  # counters/log are shared across worker threads
            self._file_manager.write_to_error_log(page.to_tuple(str(ex)))
            if page.res_type == LinkUtility.EXT_WEBPAGE:
                self._broken_webpage_count += 1
            elif page.res_type == LinkUtility.EXT_CSS:
                self._broken_css_count += 1
            elif page.res_type == LinkUtility.EXT_IMAGE:
                self._broken_image_count += 1
            elif page.res_type == LinkUtility.EXT_JS:
                self._broken_js_count += 1
            else:
                self._broken_others_count += 1
            self._broken_res_list.append(page)
    finally:
        # Always account for the attempt, broken or not.
        with self._sync_lock:
            self._total_res_done += 1
            if page.res_type == LinkUtility.EXT_WEBPAGE:
                self._total_webpage_count += 1
            elif page.res_type == LinkUtility.EXT_CSS:
                self._total_css_count += 1
            elif page.res_type == LinkUtility.EXT_IMAGE:
                self._total_image_count += 1
            elif page.res_type == LinkUtility.EXT_JS:
                self._total_js_count += 1
            else:
                self._total_others_count += 1
def test_response(link: str) -> (bool, str):
    """Probe *link* and classify it.

    Returns a pair ``(reachable, link_cls)`` where ``reachable`` is True
    only for an HTTP 200 response and ``link_cls`` is the resource class
    derived from the link's extension (trailing slash stripped first).
    """
    status_code, _content_type = LinkChecker.get_response(link)
    link_cls, _ext = LinkUtility.get_link_class(link.rstrip('/'))
    return status_code == 200, link_cls
def scrape_web_res(self, page: LinkAttrs):
    """Download and store the single resource referenced by *page*.

    Webpage/CSS/JS resources go through ``_parse_text_res`` before being
    written; other resource types are downloaded directly.  Any exception
    is logged, the per-type broken counter is incremented, and the page is
    appended to the broken-resource list.  The ``finally`` clause updates
    the per-type completion totals regardless of outcome.

    NOTE(review): statement nesting under ``with self._sync_lock:`` was
    reconstructed from collapsed one-line source — verify it matches the
    original layout.
    """
    print("look:", page.link, "level: ", page.level)
    try:
        if len(page.path) > ArchiveExplorer.MAX_PATH_LEN:  # max file path in any file system
            raise OSError("file path is too long:" + page.path)
        response_code, content_type = LinkChecker.get_response(page.link)
        # Anything other than OK/Found/Redirect is considered unavailable.
        if response_code not in [ResponseCode.LinkOK, ResponseCode.LinkFound, ResponseCode.LinkRedirect]:
            raise ConnectionError("res is not available: " + page.link)
        if page.res_type in [LinkUtility.EXT_WEBPAGE, LinkUtility.EXT_CSS, LinkUtility.EXT_JS]:  # parse a webpage
            # Text resource: parse first, then persist the rewritten text.
            save_text = self._parse_text_res(page)
            self._file_manager.write_to_file(page.path, save_text)
        # elif page.res_type != LinkUtility.EXT_OTHER:  # TODO: download normal resources
        #     response = LinkChecker.get_common_web_resource(page.link)
        #     if page.res_type == LinkUtility.EXT_IMAGE or page.res_type == LinkUtility.EXT_FONT:
        #         self._downloader.write_to_file(page.path, response.content, mode="b")
        #     else:
        #         self._downloader.write_to_file(page.path, response.text, mode="t")
        else:
            # Non-text resource: delegate to the file manager's downloader.
            # response = LinkChecker.get_common_web_resource(page.link)
            # self._downloader.write_to_file(page.path, response.content, mode="b")
            self._file_manager.download_file(sub_path=page.path, url=page.link,
                                             timeout=self._timeout,
                                             redirect=self._max_redirect,
                                             retries=self._max_retries)
    except Exception as ex:
        # Log the failure and update the broken-resource bookkeeping.
        print("exception:", ex)
        print("broken res:", page)
        with self._sync_lock:  # shared state is mutated by multiple workers
            self._file_manager.write_to_error_log(page.to_tuple(str(ex)))
            if page.res_type == LinkUtility.EXT_WEBPAGE:
                self._broken_webpage_count += 1
            elif page.res_type == LinkUtility.EXT_CSS:
                self._broken_css_count += 1
            elif page.res_type == LinkUtility.EXT_IMAGE:
                self._broken_image_count += 1
            elif page.res_type == LinkUtility.EXT_JS:
                self._broken_js_count += 1
            else:
                self._broken_others_count += 1
            self._broken_res_list.append(page)
    finally:
        # Completion totals are updated for every attempt.
        with self._sync_lock:
            self._total_res_done += 1
            if page.res_type == LinkUtility.EXT_WEBPAGE:
                self._total_webpage_count += 1
            elif page.res_type == LinkUtility.EXT_CSS:
                self._total_css_count += 1
            elif page.res_type == LinkUtility.EXT_IMAGE:
                self._total_image_count += 1
            elif page.res_type == LinkUtility.EXT_JS:
                self._total_js_count += 1
            else:
                self._total_others_count += 1