def testScrapePageBatch(self):
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
    file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    domains_links = FileHandler.read_lines_from_file(file_path)
    for link in domains_links:
        # link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
        # link = "http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
        stop_event = multiprocessing.Event()
        inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
        root_domain = LinkChecker.get_root_domain(domain)[1]
        path = "/index.html"
        link_s = LinkAttrs(link=link, path=path, ref_link="/", shadow_ref_link="/",
                           source=path, res_type=LinkUtility.EXT_WEBPAGE, level=0)
        explorer = ArchiveExplorer(original_domain=root_domain, link=link,
                                   external_stop_event=stop_event,
                                   download_base_dir=FilePath.get_default_archive_dir(),
                                   max_thread=10, max_level=2)
        explorer.run()
        archive_detail = explorer.get_archive_detail()
        CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
def testGetBestProfileBatch(self):
    file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
    domains = FileHandler.read_lines_from_file(file_path)
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    for domain in domains:
        print("begin domain:", domain)
        try:
            archive = ArchiveOrg.get_best_archive(root_domain=domain, thread_size=100,
                                                  profile_check=10, pass_threshold=0.9,
                                                  res_limit=2000)
            CsvLogger.log_to_file_path(save_path, [archive.to_tuple()])
        except Exception as ex:
            print(ex)
def testScrapePage(self):
    # link = "http://web.archive.org/web/20111102054835/http://www.agfdh.org:80/"
    link = "http://web.archive.org/web/20150425143742/http://susodigital.com/"
    # link = "http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
    stop_event = multiprocessing.Event()
    inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
    root_domain = LinkChecker.get_root_domain(domain)[1]
    path = "/index.html"
    link_s = LinkAttrs(link=link, path=path, ref_link="/", shadow_ref_link="/",
                       source=path, res_type=LinkUtility.EXT_WEBPAGE, level=0)
    explorer = ArchiveExplorer(original_domain=root_domain, link=link,
                               external_stop_event=stop_event,
                               download_base_dir=FilePath.get_default_archive_dir(),
                               max_thread=10, max_level=2)
    explorer.run()
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    archive_detail = explorer.get_archive_detail()
    CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
def get_archive_detail(self) -> ArchiveDetail:
    good_webpage_percent = 0 if self._total_webpage_count == 0 \
        else 1 - self._broken_webpage_count / self._total_webpage_count
    good_image_percent = 0 if self._total_image_count == 0 \
        else 1 - self._broken_image_count / self._total_image_count
    good_js_percent = 0 if self._total_js_count == 0 \
        else 1 - self._broken_js_count / self._total_js_count
    good_css_percent = 0 if self._total_css_count == 0 \
        else 1 - self._broken_css_count / self._total_css_count
    good_others_percent = 0 if self._total_others_count == 0 \
        else 1 - self._broken_others_count / self._total_others_count
    all_broken = (self._broken_js_count + self._broken_css_count + self._broken_image_count
                  + self._broken_others_count + self._broken_webpage_count)
    good_overall_percent = 0 if self._total_res_done == 0 else 1 - all_broken / self._total_res_done
    return ArchiveDetail(self._original_domain, self._archive_link, self._total_res_done,
                         good_res_rate=good_overall_percent,
                         total_web_page=self._total_webpage_count, good_webpage_rate=good_webpage_percent,
                         total_css=self._total_css_count, good_css_rate=good_css_percent,
                         total_js=self._total_js_count, good_js_rate=good_js_percent,
                         total_image=self._total_image_count, good_image_rate=good_image_percent,
                         total_other=self._total_others_count, good_other_rate=good_others_percent)
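# The rate computations in get_archive_detail() all repeat one zero-guarded
# pattern. A minimal sketch of a helper that could collapse them; good_rate is
# a hypothetical name, not part of the original code base.
def good_rate(broken: int, total: int) -> float:
    # a category with no resources reports a rate of 0, matching get_archive_detail()
    return 0 if total == 0 else 1 - broken / total

# usage sketch: good_css_percent = good_rate(self._broken_css_count, self._total_css_count)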
def get_best_archive(root_domain: str, thread_size=100, profile_check=10,
                     pass_threshold=0.8, res_limit=2000) -> ArchiveDetail:
    """
    Get the best profile from archive.org for a given root domain by doing profile
    spectrum analysis: comparing the resources of each candidate profile against all
    historic resources of the domain.
    :param root_domain: root domain as a str, e.g. "google.co.uk"
    :param thread_size: number of threads used to check resource links simultaneously
    :param profile_check: max number of profiles to check
    :param pass_threshold: threshold defining whether a profile is good enough
    :param res_limit: number of resource links in the domain resource spectrum,
        including css, js, html, etc.
    :return: an ArchiveDetail describing the best profile found
    """
    url = LinkChecker.get_valid_link(root_domain, link="")
    profiles = ArchiveOrg.get_url_info(url, min_size=1, limit=-profile_check)
    info = ArchiveOrg.get_domain_urls(url, limit=res_limit)
    res_count = len(info)
    archive = None
    best_good_rate = 0.0
    good_rate_web_page = 0
    good_rate_image = 0
    good_rate_css = 0
    good_rate_js = 0
    good_rate_other = 0
    total_web_page_min = 0
    total_js_min = 0
    total_css_min = 0
    total_image_min = 0
    total_other_min = 0
    if res_count > 0:
        for profile in profiles:
            if not isinstance(profile, ArchiveStruct):
                continue
            total_web_page = 0
            total_js = 0
            total_css = 0
            total_image = 0
            total_other = 0
            broken_web_page = 0
            broken_js = 0
            broken_css = 0
            broken_image = 0
            broken_other = 0
            test_pool = pool.ThreadPool(processes=thread_size)
            timestamp = profile.date_stamp
            print("checking:", str(profile))
            # rewrite every resource link onto this profile's timestamp
            links = []
            for item in info:
                item.date_stamp = timestamp
                links.append(ArchiveOrg.get_archive_link(item))
            results = [test_pool.apply_async(func=test_response, args=(x,)) for x in links]
            returned = [y.get() for y in results]
            test_pool.terminate()
            # tally totals and broken counts per resource category
            for result_good, link_cls in returned:
                if link_cls == LinkUtility.EXT_WEBPAGE:
                    total_web_page += 1
                    if not result_good:
                        broken_web_page += 1
                elif link_cls == LinkUtility.EXT_CSS:
                    total_css += 1
                    if not result_good:
                        broken_css += 1
                elif link_cls == LinkUtility.EXT_JS:
                    total_js += 1
                    if not result_good:
                        broken_js += 1
                elif link_cls == LinkUtility.EXT_IMAGE:
                    total_image += 1
                    if not result_good:
                        broken_image += 1
                else:
                    total_other += 1
                    if not result_good:
                        broken_other += 1
            broken_res_count = (broken_web_page + broken_js + broken_css
                                + broken_image + broken_other)
            good_rate = 1 - broken_res_count / res_count
            passed = good_rate >= pass_threshold
            # keep the profile with the highest good-resource rate seen so far
            if good_rate > best_good_rate:
                best_good_rate = good_rate
                archive = profile
                good_rate_web_page = 0 if total_web_page == 0 else 1 - broken_web_page / total_web_page
                good_rate_image = 0 if total_image == 0 else 1 - broken_image / total_image
                good_rate_css = 0 if total_css == 0 else 1 - broken_css / total_css
                good_rate_js = 0 if total_js == 0 else 1 - broken_js / total_js
                good_rate_other = 0 if total_other == 0 else 1 - broken_other / total_other
                total_web_page_min = total_web_page
                total_js_min = total_js
                total_css_min = total_css
                total_image_min = total_image
                total_other_min = total_other
            print("total:", res_count, " broken res:", broken_res_count,
                  " stamp:", profile.date_stamp, " pass?", passed, " rate:", good_rate)
    # note: archive can still be None here if no profile qualified
    return ArchiveDetail(root_domain, archive_link=ArchiveOrg.get_archive_link(archive),
                         total_res=res_count, good_res_rate=best_good_rate,
                         total_web_page=total_web_page_min, good_webpage_rate=good_rate_web_page,
                         total_css=total_css_min, good_css_rate=good_rate_css,
                         total_js=total_js_min, good_js_rate=good_rate_js,
                         total_image=total_image_min, good_image_rate=good_rate_image,
                         total_other=total_other_min, good_other_rate=good_rate_other)