def check_whois_with_dns(page: OnSiteLink):
    real_response_code = ResponseCode.DNSError
    skip_whois_check = False
    try:
        root_result = LinkChecker.get_root_domain(page.link)
        root_domain = root_result[1]
        sub_domain = root_result[4]
        suffix = root_result[5]

        if len(sub_domain) == 0 or suffix not in TldUtility.TOP_TLD_LIST:
            skip_whois_check = True
        else:
            if LinkChecker.is_domain_DNS_OK(sub_domain):  # check DNS first
                real_response_code = ResponseCode.NoDNSError
                skip_whois_check = True
            elif not sub_domain.startswith("www."):
                if LinkChecker.is_domain_DNS_OK("www." + root_domain):
                    real_response_code = ResponseCode.NoDNSError
                    skip_whois_check = True
        # response = LinkChecker.get_response(page.link, timeout)  # check 404 error
        page.response_code = real_response_code
        page.link_type = OnSiteLink.TypeOutbound
        page.link = root_domain
    except Exception as ex:
        # ErrorLogger.log_error("WhoisChecker", ex, "_check_whois_with_dns() " + page.link)
        skip_whois_check = True
    finally:
        if not skip_whois_check and real_response_code == ResponseCode.DNSError:
            return check_whois(page)
        else:
            return page.link, page.response_code
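# Hedged usage sketch for check_whois_with_dns. The OnSiteLink constructor
# arguments are inferred from how the class is used elsewhere in this section,
# and the domain is a placeholder.
def _example_check_whois_with_dns():
    page = OnSiteLink("expired-example.com", ResponseCode.DNSError)  # hypothetical domain
    link, response_code = check_whois_with_dns(page)
    if ResponseCode.domain_might_be_expired(response_code):
        print(link, "might be expired or in redemption")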
def _check_whois(self, domain_data: OnSiteLink):
    root_domain = domain_data.link.lower()
    try:
        if not self._is_debug:
            if root_domain.startswith("http"):
                root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
            is_available, is_redemption = LinkChecker.is_domain_available_whois(root_domain)  # check whois record
            if is_available or is_redemption:
                if is_available:
                    real_response_code = ResponseCode.Expired
                else:
                    real_response_code = ResponseCode.MightBeExpired
                domain_data.link = root_domain
                domain_data.response_code = real_response_code
                # return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
                self._put_output_result_in_queue(domain_data)
        else:
            self._put_output_result_in_queue(domain_data)
    except Exception as ex:
        ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex, "_check_whois() " + root_domain)
    finally:
        self._add_job_done_one()
def _check_whois_v1(self, domain_data: OnSiteLink):
    root_domain = domain_data.link
    try:
        if root_domain.startswith("http"):
            root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
        real_response_code = domain_data.response_code
        whois = LinkChecker.check_whois(root_domain)  # check whois record
        if whois[0]:
            if whois[2]:  # domain is expired
                real_response_code = ResponseCode.Expired
            else:
                real_response_code = ResponseCode.MightBeExpired
        if real_response_code == ResponseCode.Expired:
            # if ResponseCode.domain_might_be_expired(real_response_code):
            domain_data.link = root_domain
            domain_data.response_code = real_response_code
            # return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
            # if isinstance(self._queue_lock, multiprocessing.RLock):
            with self._queue_lock:
                self._output_q.put((domain_data.link, domain_data.response_code))
    except Exception as ex:
        ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex, "_check_whois_v1() " + root_domain)
    finally:
        self._add_job_done_one()
def get_sites_by_seed_sites(account: MajesticCom, seed_domains: [], catagories: [], fresh_data=False, index=0,
                            iteration=1, loop_count=0, count_per_domain=100, callback=None,
                            current_count=0, max_count=-1, tf=20) -> []:
    if iteration < 0:
        raise ValueError("get_sites_by_seed_sites: iteration should >= 0.")
    sub_domains = [LinkChecker.get_root_domain(x, use_www=False)[4] for x in seed_domains[index:]]
    if len(sub_domains) == 0:
        return []
    backlinks = []
    # counter = index
    if max_count > 0 and current_count >= max_count:
        return backlinks
    temp_sub_domains = []
    temp = []
    # target_catagories = []
    # for catagory in catagories:
    #     target_catagories.append(str(CategoryManager.decode_sub_category(catagory, False)))
    for sub_domain in sub_domains:
        print("doing backlinks of domain:", sub_domain, "seed len:", len(temp_sub_domains))
        try:
            temp = account.get_backlinks(sub_domain, count_per_domain, topic="", is_dev=False, fresh_data=fresh_data)
            current_count += 1
        except Exception as ex:
            print(ex)
        for item in temp:
            if isinstance(item, MajesticBacklinkDataStruct):
                # item_catagory = str(CategoryManager.decode_sub_category(item.src_topic, False))
                domain = LinkChecker.get_root_domain(item.backlink, use_www=False)[4]
                item.ref_domain = domain
                # if len(target_catagories) > 0 and item_catagory not in target_catagories:
                #     continue
                if domain not in sub_domains and domain not in temp_sub_domains:
                    if len(catagories) > 0:
                        is_in = False
                        if len(item.src_topic) > 0:
                            decoded = str(CategoryManager.decode_sub_category(item.src_topic, False))
                            for cate in catagories:
                                if cate in decoded:
                                    is_in = True
                                    break
                        if is_in and item.src_tf >= tf:
                            temp_sub_domains.append(domain)
                    elif item.src_tf >= tf:
                        temp_sub_domains.append(domain)
                if callback is not None:
                    callback(item)
        if max_count > 0 and current_count >= max_count:
            break
    if loop_count >= iteration:
        return backlinks
    else:
        return backlinks + GoogleMajestic.get_sites_by_seed_sites(account, sub_domains + temp_sub_domains,
                                                                  catagories, fresh_data, len(seed_domains),
                                                                  iteration, loop_count + 1, count_per_domain,
                                                                  callback, current_count, max_count, tf)
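# Hedged usage sketch for get_sites_by_seed_sites. Since the returned list is
# populated via the callback, the callback is the practical output channel
# here; the seed domains are placeholders.
def _example_seed_crawl(account: MajesticCom):
    found = []
    GoogleMajestic.get_sites_by_seed_sites(account, ["example.com", "example.org"], catagories=[],
                                           iteration=1, count_per_domain=50, tf=25,
                                           callback=lambda item: found.append(item))
    print("collected", len(found), "backlink records")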
def testRequestAllLink(self):
    url = "http://www.jehovahs-witness.com"
    agent = "VegeBot-Careful"
    source = LinkChecker.get_page_source(url, agent=agent, from_src="*****@*****.**", retries=0)
    links = LinkChecker.get_all_links_from_source(source)
    for link in links:
        paras = urlsplit(link)
        page_scheme, page_domain = paras[0], paras[1]
        print(LinkChecker.get_valid_link(page_domain, link.strip(), page_scheme))
def get_sites(keyword: str, page_number: int=1, result_per_page: int=100, index: int=0, length: int=100,
              use_browser=False, source_type="", filter_list=[], country_code='',
              return_domain_home_only=True, days_ago=0, **kwargs) -> []:
    # def get_sites(keyword: str, page: int=1, index: int=0, length: int=100,
    #               history=SeedSiteSettings.TIME_NOW, blog=False) -> []:
    assert page_number > 0, "page number should be greater than 0"
    assert index >= 0, "index should be greater than or equal to 0"
    assert length > 0, "length should be greater than 0"
    search_query = BingConst.SearchLink.format(quote(keyword), quote(keyword),
                                               (page_number - 1) * length + index + 1, length)
    user_agent = WebRequestCommonHeader.webpage_agent
    try:
        req = BingCom._get_response(request_link=search_query, user_agent=user_agent, **kwargs)
        # req = requests.get(search_query, timeout=30, headers=WebRequestCommonHeader.get_html_header())
        result = req.text
        soup = bs4.BeautifulSoup(result)
        tags = soup.select(BingConst.SitePath)
        domains = []
        for tag in tags:
            try:
                domain = tag.attrs["href"].strip().replace(" ", "")
                if return_domain_home_only:
                    domain = LinkChecker.get_root_domain(domain, use_www=False)[2]  # get the link
                else:
                    domain = LinkChecker.get_root_domain(domain, use_www=False)[3]
                if len(domain) > 0:
                    domains.append(domain)
            except:
                pass
        new_list = []
        if isinstance(domains, list):
            if len(filter_list) > 0:
                for domain in domains:
                    if isinstance(domain, str):
                        temp = domain.lower().strip()
                        if not any(x in temp for x in filter_list):
                            new_list.append(temp)
            else:
                new_list = domains
        end = index + length
        data_len = len(new_list)
        if domains is not None and index < data_len:
            if data_len >= end:
                return new_list[index:end]
            else:
                return new_list[index:]
        else:
            return []
    except Exception as ex:
        print(ex)
        return None
def get_search_results(keyword: str, page_number: int, proxy: ProxyStruct=None,
                       result_per_page: int=GoogleConst.Result100, timeout=5, return_domain_home_only=True,
                       use_forbidden_filter=True, days_ago=0, addtional_query_parameter: str="",
                       country_code="us", use_browser=False) -> list:
    """
    Generic normal search: get a list of domains from a result page.
    :param keyword:
    :param page_number: > 0
    :param result_per_page:
    :param timeout:
    :param return_domain_home_only: return the root domain name if True, else return protocol + suffix + domain name
    :param use_forbidden_filter:
    :param days_ago: only include results indexed within this many days.
    :return:
    """
    assert page_number > 0, "page number should be greater than 0."
    page_range = GoogleCom.get_result_per_page_range()
    assert result_per_page in page_range, "result per page should be one of these values:" + str(page_range)
    sub_domain = "www"
    request_link = GoogleUtility.get_local_endpoint(country_code, sub_domain) \
        + GoogleConst.CommonSearchPath.format(quote(keyword), result_per_page,
                                              (page_number - 1) * result_per_page, country_code) \
        + addtional_query_parameter + GoogleUtility.get_query_for_days(days_ago)
    try:
        user_agent = WebRequestCommonHeader.webpage_agent
        if not use_browser:
            response = GoogleCom._get_response(request_link, proxy=proxy, timeout=timeout, user_agent=user_agent)
            if not response.status_code == 200:
                # if response.status_code == 503:
                #     print(response.text)
                raise ConnectionRefusedError("error getting result, with status code:", response.status_code)
            result = response.text
        else:
            result = GoogleCom._get_response_browser(request_link, proxy=proxy, timeout=timeout,
                                                     user_agent=user_agent)
        soup = bs4.BeautifulSoup(result)
        tags = soup.select(GoogleConst.SitePath)
        domains = []
        for tag in tags:
            try:
                domain = tag.text.strip().replace(" ", "")
                if return_domain_home_only:
                    domain = LinkChecker.get_root_domain(domain, use_www=False)[2]  # get the link
                else:
                    domain = LinkChecker.get_root_domain(domain, use_www=False)[3]
                if use_forbidden_filter and LinkChecker.is_domain_forbidden(domain):
                    continue
                if len(domain) > 0:
                    domains.append(domain)
            except:
                pass
        return domains
    except Exception as ex:
        print(ex)
        return None
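# Hedged usage sketch for get_search_results; proxy settings are omitted and
# the keyword is a placeholder. get_search_results is assumed to be exposed on
# GoogleCom, since it calls GoogleCom helpers internally.
def _example_google_search():
    domains = GoogleCom.get_search_results("garden furniture", page_number=1,
                                           result_per_page=GoogleConst.Result100,
                                           days_ago=30, country_code="uk")
    if domains is not None:
        for domain in domains:
            print(domain)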
def populate_with_state(self, state):
    if state is not None and isinstance(state, SiteCheckerState):
        self._status = "Restarted"
        self.page_count = state.page_count
        self.page_allocated = state.page_count
        self.internal_page_count = state.internal_page_count
        self.internal_page_last_count = state.internal_page_count
        self.external_links_checked = state.external_page_count
        self._external_db_buffer.set_progress(state.external_page_count)
        self.page_need_look_up = state.page_need_look_up
        self.current_level = state.current_level
        self.progress_logger.set_reference(state.log_sample_index, state.log_started_time)
        counter = 0
        if self.data_source is not None:
            try:
                for item in self.data_source.get_next():
                    if counter >= self.cache_size:
                        break
                    if isinstance(item, OnSiteLink) and not LinkChecker.is_external_link(self.root_domain, item.link):
                        self.cache_list.append(item.link)
                        # print("--restore: ", item)
                        counter += 1
            except Exception as ex:
                msg = "error in SiteChecker.populate_with_state(), trying to populate cache, " + self.root_domain
                ErrorLogger.log_error("SiteChecker", ex, msg)
            self.data_source.ref = state.datasource_ref
            self.data_source.output_c = state.datasource_output_c
            self.data_source.set_progress(state.datasource_index if state.datasource_index < state.page_count
                                          else state.page_count)
            self.data_source.set_continue_lock(True)
def testScrapePageBatch(self):
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
    file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    domains_links = FileHandler.read_lines_from_file(file_path)
    for link in domains_links:
        # link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
        # link = "http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
        stop_event = multiprocessing.Event()
        inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
        root_domain = LinkChecker.get_root_domain(domain)[1]
        path = "/index.html"
        link_s = LinkAttrs(link=link, path=path, ref_link="/", shadow_ref_link="/", source=path,
                           res_type=LinkUtility.EXT_WEBPAGE, level=0)
        explorer = ArchiveExplorer(original_domain=root_domain, link=link,
                                   external_stop_event=stop_event,
                                   download_base_dir=FilePath.get_default_archive_dir(),
                                   max_thread=10, max_level=2)
        explorer.run()
        archive_detail = explorer.get_archive_detail()
        CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
def testRobot2(self):
    rp = LinkChecker.get_robot_agent("http://pointshound.com/robots.txt")
    if rp is not None:
        for i in range(1, 1000):
            print("count:", i, "can fetch:",
                  rp.can_fetch("*", "http://www.bbc.co.uk/fafdjiaofpadpvhagaarga/news/agqrgfv/y"))
    else:
        print("domain is not available.")
def testRobot4(self):
    # rules = LinkChecker.get_robot_agent("sbnet.se")
    rules = LinkChecker.get_robot_agent("realwire.com")
    crawl_delay = rules.delay("idiot")
    print("delay is:", crawl_delay)
    for i in range(1, 1000):
        print(rules.allowed("http://api.google.com/search/", agent="idiot"))
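# The robots tests above go through the project's LinkChecker.get_robot_agent
# wrapper. For reference, a self-contained equivalent using only the standard
# library's urllib.robotparser (the URL and agent name are placeholders):
from urllib.robotparser import RobotFileParser

def _demo_robotparser():
    rp = RobotFileParser("http://example.com/robots.txt")
    rp.read()  # fetch and parse robots.txt
    print("can fetch /search/:", rp.can_fetch("VegeBot", "http://example.com/search/"))
    print("crawl delay:", rp.crawl_delay("VegeBot"))  # None if not specified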
def _parse_text_res(self, page: LinkAttrs) -> str:
    page.link = page.link.replace("\\/", "/")  # in case of javascript-escaped slashes
    response = LinkChecker.get_common_web_resource(page.link, timeout=self._timeout,
                                                   redirect=self._max_redirect, retries=self._max_retries)
    groups = []
    parse_str_sp = functools.partial(ArchiveExplorer._map_res_str, groups, self._original_domain, page)
    if page.res_type == LinkUtility.EXT_WEBPAGE:
        text = str(LinkUtility.remove_archive_org_footprint(response.text))
    else:
        text = response.text
    result = re.sub(link_pattern, parse_str_sp, text)
    for item in groups:
        if isinstance(item, LinkAttrs):
            if not ArchiveExplorer._is_in_list(item.path, self._internal_list) and \
                    ArchiveExplorer.is_downloadable_content(item, self._max_level):
                with self._sync_lock:
                    # print("adding to list:", item.link, "level: ", item.level)
                    if not item.shadow_ref_link == item.ref_link:
                        self._file_manager.write_to_redirect(item.shadow_ref_link, item.ref_link)
                    self._internal_list.append(item)
    return result
def _map_res_str(captured: [], root_domain: str, page: LinkAttrs, current_match) -> str:
    returned = None
    level = page.level
    try:
        link = current_match.group(0)
        # print("cap:", link)
        match2 = current_match.group(2)
        current_link = current_match.group(1) + match2
        begin_index = str(link).index("/")
        begin_mark = str(link[:begin_index]).strip()
        end_index = begin_index + len(current_link)
        if end_index >= len(link):
            end_mark = ""
        else:
            end_mark = str(link[end_index:]).strip()
        # if "%3" in current_link:  # transform encoded url
        inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(current_link)
        if len(inner_link) > 0:
            if root_domain in domain or link_class != LinkUtility.EXT_WEBPAGE:
                # data will be saved in the file system
                is_internal = root_domain in domain
                path_decoded = parse.unquote(path)
                if len(path_decoded) > ArchiveExplorer.MAX_PATH_LEN:
                    short_path, ext = LinkChecker.get_shorter_url_path(path)
                    short_path += ext
                else:
                    short_path = path
                if link_class == LinkUtility.EXT_WEBPAGE:
                    if len(ext) > 0 and not ext == ".html":
                        valid_short_path = short_path.replace(ext, ".html")
                    else:
                        valid_short_path = short_path
                else:
                    valid_short_path = short_path
                file_path, ref_path = LinkUtility.make_valid_web_res_path(path, fragment)
                short_file_path, short_ref_path = LinkUtility.make_valid_web_res_path(valid_short_path, fragment)
                current_link = current_link.replace("\\/", "/")
                captured.append(LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN + current_link,
                                          short_file_path, short_ref_path, ref_path, page.path,
                                          link_class, level + 1, is_internal=is_internal))
                returned = begin_mark + short_ref_path + end_mark
            else:  # root_domain not in domain and link_class == LinkUtility.EXT_WEBPAGE
                returned = begin_mark + parse.unquote(match2) + end_mark
            # else:  # capture other resources except external webpage
            #     file_path, ref_path = LinkUtility.make_valid_web_res_path(path)
            #     captured.append(LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN + current_link, file_path, ref_path, file_path, ext, level + 1))
            #     returned = begin_mark + ref_path + end_mark
        else:
            returned = begin_mark + parse.unquote(current_link) + end_mark
    except Exception as ex:
        print("ex in mapping:", ex)
    finally:
        if isinstance(returned, str):
            # print("sub:", returned)
            return returned
        else:
            return ""
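# _map_res_str above is used as a re.sub() callback via functools.partial, with
# "captured" pre-bound so each substitution can also record the links it saw.
# Below is a self-contained sketch of that pattern; the pattern string and
# names are illustrative, not the module's real link_pattern.
import functools
import re

def _rewrite_match(captured: list, match) -> str:
    captured.append(match.group(0))   # side effect: remember every URL we rewrote
    return "/local" + match.group(1)  # rewrite the URL to a local path

def _demo_partial_sub():
    pattern = r"https?://example\.com(/\S*)"  # hypothetical link pattern
    text = "see http://example.com/a and https://example.com/b.css"
    seen = []
    rewriter = functools.partial(_rewrite_match, seen)  # pre-bind the capture list
    result = re.sub(pattern, rewriter, text)
    print(result)  # "see /local/a and /local/b.css"
    print(seen)    # the original URLs, collected during substitution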
def test_response(link: str) -> bool:
    status_code, content_type = LinkChecker.get_response(link)
    if status_code != 200:
        print(link, "status bad:", status_code, " content: ", content_type)
        return False
    else:
        print(link, "status good:", status_code, " content: ", content_type)
        return True
def test_get_sub_domains(self):
    full_link = "http://blogspot.co.uk/"
    domain_data = LinkChecker.get_root_domain(full_link, False)
    root_domain = domain_data[1]
    sub_domain = domain_data[4]
    domain_suffix = domain_data[5]
    # note: str.strip() removes any of the characters in domain_suffix from both
    # ends, not the suffix substring itself, so this can over-strip.
    sub_domain_no_local = sub_domain.strip(domain_suffix)
    print(sub_domain_no_local)
def check_external_page(checker: SiteChecker, page: OnSiteLink, timeout=10):
    """
    Check for DNS errors only.
    :param checker:
    :param page:
    :param timeout:
    :return:
    """
    # response = LinkChecker.get_response(page.link, timeout)
    # real_response_code = response[0]
    # real_response_code = ResponseCode.LinkOK
    # print("-------checking external " + page.link)
    try:
        root_result = LinkChecker.get_root_domain(page.link)
        root_domain = root_result[1]
        sub_domain = root_result[4]
        if len(sub_domain) == 0 or root_domain in checker.external_cache_list:
            return
        else:
            if len(checker.external_cache_list) < checker.external_cache_size:
                checker.external_cache_list.append(root_domain)
        real_response_code = page.response_code
        if LinkChecker.is_domain_DNS_OK(sub_domain):  # check DNS first
            real_response_code = ResponseCode.NoDNSError
        elif not sub_domain.startswith("www."):
            if LinkChecker.is_domain_DNS_OK("www." + root_domain):
                real_response_code = ResponseCode.NoDNSError
        # response = LinkChecker.get_response(page.link, timeout)  # check 404 error
        page.response_code = real_response_code
        page.link_type = OnSiteLink.TypeOutbound
        page.link = root_domain
        # print(" ready to output external:", str(page))
        if checker.output_all_external or ResponseCode.domain_might_be_expired(real_response_code):
            # if checker.delegate is not None:
            #     checker.delegate(new_page)
            if checker.output_queue is not None:
                with checker._queue_lock:
                    checker.output_queue.put(page)
    except Exception as ex:
        PrintLogger.print(ex)
        ErrorLogger.log_error("PageChecker", ex, "check_external_page() " + page.link)
def get_archives_lang(root_domain: str, thread_size=10, profile_check=300) -> list:
    url = LinkChecker.get_valid_link(root_domain, link="")
    profiles = ArchiveOrg.get_url_info(url, min_size=1, limit=-profile_check)
    today_stamp = datetime.utcnow().timestamp()
    for item in profiles:
        if isinstance(item, ArchiveStruct):
            timestamp = item.get_datestamp_unix_time()
            print(str(item), " converted:", str(timestamp))
    return []
def testRobot5(self):
    base_link = "http://pointshound.com"
    test_sub_paths = ["/", "/why", "/about", "/privacy", "/howitworks", "/help", "/press", "/terms",
                      "/guarantee", "/contact_form", "/something-else"]
    rules = LinkChecker.get_robot_agent("pointshound.com", protocol="https")
    for item in test_sub_paths:
        path = base_link + item
        is_allowed = rules.allowed(path, agent="VegeBot Test")
        print("sub_path:", item, " is allowed:", is_allowed)
def check_whois(domain_data: OnSiteLink):
    root_domain = domain_data.link.lower()
    try:
        if root_domain.startswith("http"):
            root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
        is_available, is_redemption = LinkChecker.is_domain_available_whois(root_domain)  # check whois record
        if is_available or is_redemption:
            if is_available:
                real_response_code = ResponseCode.Expired
            else:
                real_response_code = ResponseCode.MightBeExpired
            domain_data.link = root_domain
            domain_data.response_code = real_response_code
            # return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
            # self._output_q.put((domain_data.link, domain_data.response_code))
    except Exception as ex:
        print(ex)
    finally:
        return domain_data.link, domain_data.response_code
def testShortUrl2(self):
    urls = [
        "http://gamblingaddiction.cc/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%94%e0%b9%8c%e0%b8%a1%e0%b8%b2%e0%b8%a3%e0%b9%8c%e0%b8%8a%e0%b8%82%e0%b9%88%e0%b8%b2%e0%b8%a7%e0%b8%a3%e0%b8%b4%e0%b8%9f.html",
        "/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%.html",
        "/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%",
        "/中国人民解放军/中国人民解放军/中国人民解放军.html",
        "strongholeqp4tfq;eafak;faf",
    ]
    for url in urls:
        short_path, ext = LinkChecker.get_shorter_url_path(url)
        print("doing:", url)
        print("new path:", short_path)
        print("extension:", ext)
def scrape_web_res(self, page: LinkAttrs):
    print("look:", page.link, "level: ", page.level)
    try:
        if len(page.path) > ArchiveExplorer.MAX_PATH_LEN:  # max file path length in common file systems
            raise OSError("file path is too long:" + page.path)
        response_code, content_type = LinkChecker.get_response(page.link)
        if response_code not in [ResponseCode.LinkOK, ResponseCode.LinkFound, ResponseCode.LinkRedirect]:
            raise ConnectionError("res is not available: " + page.link)
        if page.res_type in [LinkUtility.EXT_WEBPAGE, LinkUtility.EXT_CSS, LinkUtility.EXT_JS]:
            # parse a webpage
            save_text = self._parse_text_res(page)
            self._file_manager.write_to_file(page.path, save_text)
        # elif page.res_type != LinkUtility.EXT_OTHER:  # TODO: download normal resources
        #     response = LinkChecker.get_common_web_resource(page.link)
        #     if page.res_type == LinkUtility.EXT_IMAGE or page.res_type == LinkUtility.EXT_FONT:
        #         self._downloader.write_to_file(page.path, response.content, mode="b")
        #     else:
        #         self._downloader.write_to_file(page.path, response.text, mode="t")
        else:
            # response = LinkChecker.get_common_web_resource(page.link)
            # self._downloader.write_to_file(page.path, response.content, mode="b")
            self._file_manager.download_file(sub_path=page.path, url=page.link, timeout=self._timeout,
                                             redirect=self._max_redirect, retries=self._max_retries)
    except Exception as ex:
        print("exception:", ex)
        print("broken res:", page)
        with self._sync_lock:
            self._file_manager.write_to_error_log(page.to_tuple(str(ex)))
            if page.res_type == LinkUtility.EXT_WEBPAGE:
                self._broken_webpage_count += 1
            elif page.res_type == LinkUtility.EXT_CSS:
                self._broken_css_count += 1
            elif page.res_type == LinkUtility.EXT_IMAGE:
                self._broken_image_count += 1
            elif page.res_type == LinkUtility.EXT_JS:
                self._broken_js_count += 1
            else:
                self._broken_others_count += 1
            self._broken_res_list.append(page)
    finally:
        with self._sync_lock:
            self._total_res_done += 1
            if page.res_type == LinkUtility.EXT_WEBPAGE:
                self._total_webpage_count += 1
            elif page.res_type == LinkUtility.EXT_CSS:
                self._total_css_count += 1
            elif page.res_type == LinkUtility.EXT_IMAGE:
                self._total_image_count += 1
            elif page.res_type == LinkUtility.EXT_JS:
                self._total_js_count += 1
            else:
                self._total_others_count += 1
def testRemoveFootprint2(self):
    link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
    page_source = LinkChecker.get_page_source(link)
    bs4_tree = LinkUtility.remove_archive_org_footprint(page_source.text)
    link_list = []
    for child in bs4_tree.find_all():
        if isinstance(child, bs4.Tag):
            if "href" in child.attrs:
                link_list.append(child["href"])
            elif "src" in child.attrs:
                link_list.append(child["src"])
    for item in link_list:
        print(item)
def get_link_class(link: str) -> (str, str):
    ext = LinkChecker.get_link_extension(link).lower()
    if len(ext) == 0 or ext in LinkChecker.common_html_page_ex:
        return LinkUtility.EXT_WEBPAGE, ext
    elif ext in LinkChecker.common_img_ex:
        return LinkUtility.EXT_IMAGE, ext
    elif ext in LinkChecker.common_font_ex:
        return LinkUtility.EXT_FONT, ext
    elif ext.endswith("css"):
        return LinkUtility.EXT_CSS, ext
    elif ext.endswith("js"):
        return LinkUtility.EXT_JS, ext
    else:
        return LinkUtility.EXT_OTHER, ext
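# Hedged usage sketch for get_link_class (accessed through LinkUtility, as the
# callers in this section do); the example links are illustrative.
def _example_get_link_class():
    for link in ("http://example.com/", "/static/app.js", "/img/logo.png", "/doc.pdf"):
        link_cls, ext = LinkUtility.get_link_class(link)
        print(link, "->", link_cls, ext)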
def testScrapePage(self):
    # link = "http://web.archive.org/web/20111102054835/http://www.agfdh.org:80/"
    link = "http://web.archive.org/web/20150425143742/http://susodigital.com/"
    # link = "http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
    stop_event = multiprocessing.Event()
    inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
    root_domain = LinkChecker.get_root_domain(domain)[1]
    path = "/index.html"
    link_s = LinkAttrs(link=link, path=path, ref_link="/", shadow_ref_link="/", source=path,
                       res_type=LinkUtility.EXT_WEBPAGE, level=0)
    explorer = ArchiveExplorer(original_domain=root_domain, link=link,
                               external_stop_event=stop_event,
                               download_base_dir=FilePath.get_default_archive_dir(),
                               max_thread=10, max_level=2)
    explorer.run()
    save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
    CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
    archive_detail = explorer.get_archive_detail()
    CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
def download_file(self, sub_path: str, url: str, timeout=5, retries=1, redirect=5):
    full_path = self._dir_path + sub_path
    full_path = full_path.replace("//", "/")
    print("download to file:", full_path)
    bytes_count = 0
    s = LinkChecker.get_common_request_session(retries=retries, redirect=redirect)
    r = s.get(url, stream=True, timeout=timeout)  # NOTE the stream=True parameter
    FileIO.FileHandler.create_file_if_not_exist(full_path)
    with open(full_path, 'wb') as f:
        for chunk in r.iter_content(chunk_size=1024):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                f.flush()
                bytes_count += len(chunk)
    if bytes_count == 0:
        raise ConnectionError("URL broken: " + url)
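# A self-contained sketch of the streaming-download pattern used above, with
# plain requests instead of the project's session helper; the URL and file
# name are placeholders.
import requests

def _demo_stream_download(url: str = "https://example.com/file.bin",
                          dest: str = "file.bin") -> int:
    bytes_count = 0
    with requests.get(url, stream=True, timeout=5) as r:  # stream=True defers the body download
        r.raise_for_status()
        with open(dest, "wb") as f:
            for chunk in r.iter_content(chunk_size=1024):
                if chunk:  # skip keep-alive chunks
                    f.write(chunk)
                    bytes_count += len(chunk)
    return bytes_count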
def test_response(link: str) -> (bool, str):
    # link_cls, ext = LinkUtility.get_link_class(link)
    status_code, content_type = LinkChecker.get_response(link)
    link_cls, ext = LinkUtility.get_link_class(link.rstrip('/'))
    # print("checking link:", link, " link cls:", link_cls, " ext:", ext)
    # if "image" in content_type:
    #     link_cls = LinkUtility.EXT_WEBPAGE
    # elif "html" in content_type:
    #     link_cls = LinkUtility.EXT_WEBPAGE
    # elif "css" in content_type:
    #     link_cls = LinkUtility.EXT_CSS
    # elif "javascript" in content_type:
    #     link_cls = LinkUtility.EXT_JS
    # else:
    #     link_cls = LinkUtility.EXT_OTHER
    if status_code != 200:
        # print(link, "status bad:", status_code, " content: ", content_type)
        return False, link_cls
    else:
        # print(link, "status good:", status_code, " content: ", content_type)
        return True, link_cls
def _get_back_link_thread(account: MajesticCom, sub_domain: str, count_per_domain: int, fresh_data: bool,
                          sub_domains: [], temp_sub_domains: [], categories: [], callback, tf=20,
                          bad_country_list=[]):
    temp = []
    print("doing backlinks of domain:", sub_domain, " domain len:", len(temp_sub_domains))
    try:
        temp = account.get_backlinks(sub_domain, count_per_domain, topic="", is_dev=False, fresh_data=fresh_data)
    except Exception as ex:
        print(ex)
    for item in temp:
        if isinstance(item, MajesticBacklinkDataStruct):
            # item_catagory = str(CategoryManager.decode_sub_category(item.src_topic, False))
            domain = LinkChecker.get_root_domain(item.backlink, use_www=False)[4]
            item.ref_domain = domain
            # if len(target_catagories) > 0 and item_catagory not in target_catagories:
            #     continue
            if domain not in sub_domains and domain not in temp_sub_domains:
                if len(categories) > 0:
                    is_in = False
                    if len(item.src_topic) > 0:
                        decoded = str(CategoryManager.decode_sub_category(item.src_topic, False))
                        for cate in categories:
                            if cate in decoded:
                                is_in = True
                                break
                    if is_in and item.src_tf >= tf:
                        temp_sub_domains.append(domain)
                elif item.src_tf >= tf:
                    temp_sub_domains.append(domain)
            if callback is not None:
                callback(item)
    time.sleep(1)
def test_from_url(self):
    response = LinkChecker.get_page_source(
        link="http://www.frenchweb.fr/sisense-decroche-50-millions-de-dollars-pour-accelerer-dans-lanalyse-de-donnees/221848")
    print(langid.classify(response.text))
def get_sites_by_seed_sites_muti_threads(account: MajesticCom, seed_domains: [], catagories: [], fresh_data=False,
                                         index=0, iteration=1, loop_count=0, count_per_domain=100, callback=None,
                                         current_count=0, max_count=-1, tf=20, thread_pool_size=20,
                                         get_backlinks=True, bad_country_list=[]):
    """
    :param account:
    :param seed_domains:
    :param catagories:
    :param fresh_data:
    :param index:
    :param iteration:
    :param loop_count:
    :param count_per_domain:
    :param callback:
    :param current_count:
    :param max_count:
    :param tf:
    :param thread_pool_size:
    :param get_backlinks: if True, get the backlinks of each domain; else get referring domains instead, which is cheaper.
    :return:
    """
    target_func = GoogleMajestic._get_back_link_thread if get_backlinks else GoogleMajestic._get_ref_domain_thread
    if iteration < 0:
        raise ValueError("get_sites_by_seed_sites: iteration should >= 0.")
    sub_domains = [LinkChecker.get_root_domain(x, use_www=False)[4] for x in seed_domains[index:]]
    if len(sub_domains) == 0:
        print("sub_domains is len 0.")
        return
    # counter = index
    process_len = len(sub_domains)
    if max_count > 0:
        if current_count >= max_count:
            print("exceeded seed len.")
            return
        elif current_count + process_len > max_count:
            process_len = max_count - current_count
    # target_catagories = []
    # for catagory in catagories:
    #     target_catagories.append(str(CategoryManager.decode_sub_category(catagory, False)))
    temp_sub_domains = []
    thread_pool = ThreadPool(processes=thread_pool_size)
    processes = [thread_pool.apply_async(target_func,
                                         args=(account, x, count_per_domain, fresh_data, sub_domains,
                                               temp_sub_domains, catagories, callback, tf, bad_country_list))
                 for x in sub_domains[0:process_len]]
    results = [y.get() for y in processes]
    thread_pool.terminate()
    current_count += process_len
    if loop_count >= iteration:
        return
    else:
        new_seeds = sub_domains + temp_sub_domains
        print("going to next level with seeds:", len(new_seeds))
        return GoogleMajestic.get_sites_by_seed_sites_muti_threads(account, new_seeds, catagories, fresh_data,
                                                                   len(seed_domains), iteration, loop_count + 1,
                                                                   count_per_domain, callback, current_count,
                                                                   max_count, thread_pool_size=10, tf=tf,
                                                                   get_backlinks=get_backlinks,
                                                                   bad_country_list=bad_country_list)
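# A self-contained sketch of the ThreadPool fan-out/join pattern used above:
# apply_async schedules one task per item, and .get() joins the results. The
# worker body and names are illustrative only.
from multiprocessing.pool import ThreadPool

def _demo_thread_pool_fanout():
    collected = []  # shared list appended to by workers, like temp_sub_domains above

    def worker(item: str):
        collected.append(item.upper())  # stand-in for a per-domain API call
        return len(item)

    pool = ThreadPool(processes=4)
    async_results = [pool.apply_async(worker, args=(x,)) for x in ["alpha", "beta", "gamma"]]
    lengths = [r.get() for r in async_results]  # block until every task finishes
    pool.terminate()
    print(collected, lengths)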
def get_best_archive(root_domain: str, thread_size=100, profile_check=10, pass_threshold=0.8,
                     res_limit=2000) -> ArchiveDetail:
    """
    Get the best profile from archive.org for a root domain by doing profile spectrum analysis.
    Spectrum analysis: compare the resources of each candidate profile against all historic resources.
    :param root_domain: root domain in str, e.g: "google.co.uk"
    :param thread_size: number of threads used to check resource links simultaneously
    :param profile_check: max number of profiles to check
    :param pass_threshold: threshold defining whether a profile is good enough.
    :param res_limit: number of resource links in the domain resource spectrum, including css, js, html etc.
    :return: an ArchiveDetail describing the best archive found.
    """
    url = LinkChecker.get_valid_link(root_domain, link="")
    profiles = ArchiveOrg.get_url_info(url, min_size=1, limit=-profile_check)
    timestamp = ""
    info = ArchiveOrg.get_domain_urls(url, limit=res_limit)
    res_count = len(info)
    archive = None
    current_rate = 0.0
    best_good_res_rate = 0
    good_rate_web_page = 0
    good_rate_image = 0
    good_rate_css = 0
    good_rate_js = 0
    good_rate_other = 0
    total_web_page_min = 0
    total_js_min = 0
    total_css_min = 0
    total_image_min = 0
    total_other_min = 0
    if res_count > 0:
        for profile in profiles:
            if isinstance(profile, ArchiveStruct):
                total_web_page = 0
                total_js = 0
                total_css = 0
                total_image = 0
                total_other = 0
                broken_web_page = 0
                broken_js = 0
                broken_css = 0
                broken_image = 0
                broken_other = 0
                test_pool = pool.ThreadPool(processes=thread_size)
                timestamp = profile.date_stamp
                print("checking:", str(profile))
                links = []
                for item in info:
                    item.date_stamp = timestamp
                    links.append(ArchiveOrg.get_archive_link(item))
                results = [test_pool.apply_async(func=test_response, args=(x,)) for x in links]
                returned = [y.get() for y in results]
                test_pool.terminate()
                for result_good, link_cls in returned:
                    if link_cls == LinkUtility.EXT_WEBPAGE:
                        total_web_page += 1
                        if not result_good:
                            broken_web_page += 1
                    elif link_cls == LinkUtility.EXT_CSS:
                        total_css += 1
                        if not result_good:
                            broken_css += 1
                    elif link_cls == LinkUtility.EXT_JS:
                        total_js += 1
                        if not result_good:
                            broken_js += 1
                    elif link_cls == LinkUtility.EXT_IMAGE:
                        total_image += 1
                        if not result_good:
                            broken_image += 1
                    else:
                        total_other += 1
                        if not result_good:
                            broken_other += 1
                # fixed: the original sum counted broken_image twice and omitted broken_css
                broken_res_count = broken_web_page + broken_css + broken_js + broken_image + broken_other
                total_good_rate = 1 - broken_res_count / res_count
                passed = total_good_rate >= pass_threshold
                if total_good_rate > current_rate:
                    current_rate = total_good_rate
                    archive = profile
                    good_rate_web_page = 0 if total_web_page == 0 else 1 - broken_web_page / total_web_page
                    good_rate_image = 0 if total_image == 0 else 1 - broken_image / total_image
                    good_rate_css = 0 if total_css == 0 else 1 - broken_css / total_css
                    good_rate_js = 0 if total_js == 0 else 1 - broken_js / total_js
                    good_rate_other = 0 if total_other == 0 else 1 - broken_other / total_other
                    total_web_page_min = total_web_page
                    total_js_min = total_js
                    total_css_min = total_css
                    total_image_min = total_image
                    total_other_min = total_other
                    best_good_res_rate = total_good_rate
                print("total:", res_count, " broken res:", broken_res_count, " stamp: ", profile.date_stamp,
                      " pass? ", passed, " rate:", total_good_rate)
    return ArchiveDetail(root_domain, archive_link=ArchiveOrg.get_archive_link(archive), total_res=res_count,
                         good_res_rate=best_good_res_rate, total_web_page=total_web_page_min,
                         good_webpage_rate=good_rate_web_page, total_css=total_css_min, good_css_rate=good_rate_css,
                         total_js=total_js_min, good_js_rate=good_rate_js, total_image=total_image_min,
                         good_image_rate=good_rate_image, total_other=total_other_min,
                         good_other_rate=good_rate_other)
def __init__(self, full_link: str="", data_source: SiteTempDataSrcInterface=None,
             controller: SiteCheckerController=None, max_level=10, max_page=1000, delegate=None,
             output_buff_size=2000, output_queue=None, output_all_external=False, result_delegate=None,
             memory_control_terminate_event=None, check_robot_text=True, **kwargs):
    """
    :param full_link: the full link of a domain, e.g: https://www.google.co.uk
    :param data_source: data source for the crawl
    :param max_level: stop crawling when this level is reached
    :param max_page: maximum number of pages to check within a site; crawling also stops at this limit
    :param delegate: if not None, receives the latest result for external domains with ResponseCode == 404 or 999
    :param result_delegate: receives site_info upon finish
    :param memory_control_terminate_event: if not None and set, it can terminate an externally
           memory-controlled process.
    :return:
    """
    FeedbackInterface.__init__(self, **kwargs)
    # super(SiteChecker, self).__init__(**kwargs)
    if full_link is None or len(full_link) == 0:
        raise ValueError()

    original_path = ""
    try:
        paras = urlsplit(full_link)
        self.scheme, self.domain, original_path = paras[0], paras[1], paras[2]
    except:
        pass

    domain_data = LinkChecker.get_root_domain(full_link, False)
    self.root_domain = domain_data[1]
    self.sub_domain = domain_data[4]
    self.domain_suffix = domain_data[5]
    # note: str.strip() removes any of the characters in domain_suffix from both
    # ends, not the suffix substring itself, so this can over-strip.
    self.sub_domain_no_local = self.sub_domain.strip(self.domain_suffix)
    if self.scheme == "":
        self.scheme = "http"
    if self.domain == "":
        self.domain = self.root_domain
    self.orginal_link = full_link
    self.domain_link = LinkChecker.get_valid_link(self.root_domain, full_link, self.scheme)
    self.max_level = max_level
    self.max_page = max_page
    self.page_count = 0  # keep track of pages done
    self._page_count_shadow = 0  # track the previous count
    self._all_page_count_shadow = 0  # track the previous count in the data source
    self.internal_page_count = 0
    self.internal_page_last_count = 0
    self.page_allocated = 0
    self.current_level = 0  # if this == 0, it is the root domain/home page
    self._stop_event = Event()
    valid_file_name = SiteTempDataSrcInterface.get_valid_file_name(self.domain_link)
    self._external_db_buffer = ExternalTempDataDiskBuffer(valid_file_name + ".ext.db", self,
                                                          stop_event=self._stop_event,
                                                          buf_size=int(output_buff_size / 2),
                                                          dir_path=get_db_buffer_default_dir(),
                                                          convert_output=False)
    self._external_db_buffer.append_to_buffer([(self.root_domain, ResponseCode.DNSError), ],
                                              convert_tuple=False)
    self._memory_control_terminate_event = memory_control_terminate_event
    self.task_control_lock = threading.RLock()
    if data_source is None:
        # self.data_source = SiteTempDataDisk(self.root_domain, ref_obj=self)
        self.data_source = SiteTempDataDiskWithBuff(ref=self.domain_link,
                                                    output_buff_size=output_buff_size, ref_obj=self)
    else:
        self.data_source = data_source  # a list of OnSiteLink
    self.delegate = delegate
    if LinkChecker.might_be_link_html_page(original_path):
        self.data_source.append(OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1))
    # add the root domain as a starting point
    self.data_source.append(OnSiteLink(self.scheme + "://www." + self.sub_domain, ResponseCode.LinkOK, link_level=1))
    self.data_source.append(OnSiteLink(self.scheme + "://" + self.domain, ResponseCode.LinkOK, link_level=1))
    self.cache_list = []  # internal page cache
    self.page_need_look_up_temp = 0
    self.cache_list.append(self.domain_link)
    if "www." not in self.sub_domain:
        self.cache_list.append(self.scheme + "://www." + self.sub_domain)
    self.cache_list.append(self.scheme + "://" + self.domain)
    self.page_need_look_up = self.data_source.count_all()
    self.cache_size = 500  # a small cache to avoid checking links in the file system with lots of reads and writes
    self._double_check_cache_lock = threading.RLock()
    self._double_check_cache = deque(maxlen=self.cache_size)
    self.external_cache_list = []
    self.external_cache_size = 500  # cache that holds external sites
    self.external_links_checked = 0
    self.add_internal_page_OK_only = True
    self.output_queue = output_queue
    self.output_all_external = output_all_external
    self.controller = controller
    self.result_delegate = result_delegate
    self.page_count_lock = threading.RLock()
    self.internal_page_count_lock = threading.RLock()
    self.level_lock = threading.RLock()
    self.page_look_up_lock = threading.RLock()
    self.external_link_check_lock = threading.RLock()
    self._finihsed = False
    self.task_control_max = 1
    self.agent = "VegeBot (we follow your robots.txt settings before crawling; you can slow down the bot by " \
                 "changing the Crawl-Delay parameter in the settings. " \
                 "If you have an enquiry, please email: [email protected])"
    self.agent_from = "*****@*****.**"
    if check_robot_text:
        self.robot_agent = LinkChecker.get_robot_agent(self.sub_domain, protocol=self.scheme)
    else:
        self.robot_agent = None
    self.site_crawl_delay = 0.60
    if isinstance(self.robot_agent, Rules):
        delay_temp = self.robot_agent.delay(self.agent)
        if delay_temp is not None and delay_temp != self.site_crawl_delay:
            self.site_crawl_delay = delay_temp
    self.task_control_counter = 1
    self._speed_penalty_count = 0
    self._speed_penalty_threshold = 10
    self._progress_logging_speed = 120
    self._output_period = 120
    self._output_batch_size = 100
    self._death_wish_sent = False
    SiteChecker._is_lxml_parser_exist()
    self._output_thread = None
    self._output_queue = None
    self.progress_logger = ProgressLogger(self._progress_logging_speed, self, self._stop_event)
    self._status = "Start"
    self._populate_with_state()  # restore last known state
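# Hedged usage sketch for SiteChecker; the queue type and crawl settings are
# illustrative, and the surrounding process wiring in this project is omitted.
def _example_site_checker():
    output_q = multiprocessing.Queue()
    checker = SiteChecker(full_link="http://www.example.co.uk", max_level=2, max_page=100,
                          output_queue=output_q, output_all_external=True)
    # the method that starts the crawl is defined elsewhere in the class and is
    # not shown in this section.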
def test_get_all_links(self):
    link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
    source = LinkChecker.get_page_source(link)
    all_links = LinkChecker.get_all_links_from_source(source)
    for link in all_links:
        print(link)
def testRequest(self):
    url = "http://127.0.0.1:8000/"
    agent = "VegeBot"
    source = LinkChecker.get_page_source(url, agent=agent, from_src="*****@*****.**")
    print(source)
def testGetAgent(self):
    root_domain = "halifaxnational.com"
    agent = LinkChecker.get_robot_agent(root_domain)
    can_fetch = agent.can_fetch("*", "http://halifaxnational.com/somethin")
    print(agent, "can fetch:", can_fetch)