Example #3
 def get_archives_lang(root_domain: str,
                       thread_size=10,
                       profile_check=300) -> list:
     url = LinkChecker.get_valid_link(root_domain, link="")
     profiles = ArchiveOrg.get_url_info(url, min_size=1, limit=-profile_check)
     today_stamp = datetime.utcnow().timestamp()  # computed but not used below
     for item in profiles:
         if isinstance(item, ArchiveStruct):
             timestamp = item.get_datestamp_unix_time()
             print(str(item), " converted:", str(timestamp))
     return []  # the converted timestamps are only printed, nothing is collected
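
The get_datestamp_unix_time conversion is not shown in this listing; below is a hedged, stand-alone sketch of the same idea, assuming the date stamp uses archive.org's usual 14-digit YYYYMMDDhhmmss format (the function name is illustrative, not part of the library):

from datetime import datetime, timezone

def datestamp_to_unix(date_stamp: str) -> float:
    # parse a 14-digit archive.org date stamp, e.g. "20150301123000", as UTC
    dt = datetime.strptime(date_stamp, "%Y%m%d%H%M%S").replace(tzinfo=timezone.utc)
    return dt.timestamp()

# datestamp_to_unix("20150301123000") -> 1425213000.0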
Example #4
 def testRequestAllLink(self):
     url = "http://www.jehovahs-witness.com"
     agent = "VegeBot-Careful"
     source = LinkChecker.get_page_source(
         url,
         agent=agent,
         from_src="*****@*****.**",
         retries=0)
     links = LinkChecker.get_all_links_from_source(source)
     for link in links:
         paras = urlsplit(link)
         page_scheme, page_domain = paras[0], paras[1]
         print(
             LinkChecker.get_valid_link(page_domain, link.strip(),
                                        page_scheme))
Example #5
    def get_best_archive(root_domain: str,
                         thread_size=100,
                         profile_check=10,
                         pass_threshold=0.8,
                         res_limit=2000) -> ArchiveDetail:
        """
        get the best profile from archive.org by doing profile spectrum analysis, given a root domain name.
        spectrum analysis: comparison between resources of current profile to all historic resources.
        :param root_domain: root domain in str, e.g: "google.co.uk"
        :param thread_size: number of thread to check resource link simultaneously
        :param profile_check: max number of profile to check
        :param pass_threshold: threshold define if a profile is good enough.
        :param res_limit: number of resource links in domain resource spectrum, including css, js, html etc.
        :return: tuple (archive in ArchiveStruct, spectrum value)
        """
        url = LinkChecker.get_valid_link(root_domain, link="")
        profiles = ArchiveOrg.get_url_info(url,
                                           min_size=1,
                                           limit=-profile_check)
        timestamp = ""
        info = ArchiveOrg.get_domain_urls(url, limit=res_limit)
        res_count = len(info)
        archive = None
        current_rate = 0.0
        min_broken_res_count = 0
        good_rate_web_page = 0
        good_rate_image = 0
        good_rate_css = 0
        good_rate_js = 0
        good_rate_other = 0

        total_web_page_min = 0
        total_js_min = 0
        total_css_min = 0
        total_image_min = 0
        total_other_min = 0
        if res_count > 0:
            for profile in profiles:
                if isinstance(profile, ArchiveStruct):
                    total_web_page = 0
                    total_js = 0
                    total_css = 0
                    total_image = 0
                    total_other = 0

                    broken_web_page = 0
                    broken_js = 0
                    broken_css = 0
                    broken_image = 0
                    broken_other = 0

                    test_pool = pool.ThreadPool(processes=thread_size)
                    timestamp = profile.date_stamp
                    print("checking:", str(profile))
                    links = []
                    for item in info:
                        item.date_stamp = timestamp
                        links.append(ArchiveOrg.get_archive_link(item))
                    # check every archived resource of this profile concurrently;
                    # each worker returns an (is_good, resource_type) pair
                    results = [
                        test_pool.apply_async(func=test_response, args=(x, ))
                        for x in links
                    ]
                    returned = [y.get() for y in results]
                    test_pool.terminate()
                    for result_good, link_cls in returned:
                        if link_cls == LinkUtility.EXT_WEBPAGE:
                            total_web_page += 1
                            if not result_good:
                                broken_web_page += 1
                        elif link_cls == LinkUtility.EXT_CSS:
                            total_css += 1
                            if not result_good:
                                broken_css += 1
                        elif link_cls == LinkUtility.EXT_JS:
                            total_js += 1
                            if not result_good:
                                broken_js += 1
                        elif link_cls == LinkUtility.EXT_IMAGE:
                            total_image += 1
                            if not result_good:
                                broken_image += 1
                        else:
                            total_other += 1
                            if not result_good:
                                broken_other += 1
                    # count every broken resource exactly once across all five types
                    broken_res_count = broken_web_page + broken_css + broken_js + broken_image + broken_other
                    passed = False
                    # despite its name, this is the fraction of *good* resources
                    total_broken_rate = 1 - broken_res_count / res_count
                    if total_broken_rate >= pass_threshold:
                        passed = True
                    if total_broken_rate > current_rate:
                        current_rate = total_broken_rate
                        archive = profile
                        good_rate_web_page = 0 if total_web_page == 0 else 1 - broken_web_page / total_web_page
                        good_rate_image = 0 if total_image == 0 else 1 - broken_image / total_image
                        good_rate_css = 0 if total_css == 0 else 1 - broken_css / total_css
                        good_rate_js = 0 if total_js == 0 else 1 - broken_js / total_js
                        good_rate_other = 0 if total_other == 0 else 1 - broken_other / total_other

                        total_web_page_min = total_web_page
                        total_js_min = total_js
                        total_css_min = total_css
                        total_image_min = total_image
                        total_other_min = total_other
                        min_broken_res_count = total_broken_rate  # best good-resource rate so far, reported as good_res_rate
                    print("total:", res_count, " broken res:",
                          broken_res_count, " stamp: ", profile.date_stamp,
                          " pass? ", passed, " rate:", total_broken_rate)
        # note: archive may still be None here if res_count == 0 or no profile returned any good resources
        return ArchiveDetail(root_domain,
                             archive_link=ArchiveOrg.get_archive_link(archive),
                             total_res=res_count,
                             good_res_rate=min_broken_res_count,
                             total_web_page=total_web_page_min,
                             good_webpage_rate=good_rate_web_page,
                             total_css=total_css_min,
                             good_css_rate=good_rate_css,
                             total_js=total_js_min,
                             good_js_rate=good_rate_js,
                             total_image=total_image_min,
                             good_image_rate=good_rate_image,
                             total_other=total_other_min,
                             good_other_rate=good_rate_other)
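
The selection logic above boils down to per-resource-type good rates plus an overall pass check; below is a condensed, self-contained sketch of that scoring arithmetic (it assumes a list of (is_good, resource_type) pairs such as the ones test_response returns; score_profile is an illustrative name):

from collections import Counter

def score_profile(results, pass_threshold=0.8):
    # results: iterable of (is_good, resource_type) pairs for one archived profile
    totals, broken = Counter(), Counter()
    for is_good, res_type in results:
        totals[res_type] += 1
        if not is_good:
            broken[res_type] += 1
    res_count = sum(totals.values())
    good_rate = 1 - sum(broken.values()) / res_count if res_count else 0.0
    per_type = {t: 1 - broken[t] / totals[t] for t in totals}
    return good_rate, good_rate >= pass_threshold, per_type

# score_profile([(True, "css"), (False, "js"), (True, "js")]) -> (0.666..., False, {"css": 1.0, "js": 0.5})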
    def __init__(self, full_link: str="", data_source: SiteTempDataSrcInterface=None,
                 controller: SiteCheckerController=None,
                 max_level=10, max_page=1000, delegate=None, output_buff_size=2000,
                 output_queue=None, output_all_external=False, result_delegate=None,
                 memory_control_terminate_event=None, check_robot_text=True,
                 **kwargs):
        """
        :param full_link: The full link of a domain, e.g: https://www.google.co.uk
        :param domain: domain to crawl
        :param max_level: stop crawling if it reaches this level
        :param max_page: maximum pages to check within a site, also stop crawling
        :param delegate: if this is not None, then it will send the latest result of external domain of ResponseCode==404 or 999
        :param result_delegate: send site_info upon finish
        :param memory_control_terminate_event: if this is not None and being set, it will be able to terminate an external memory controlled process.
        :return:
        """
        FeedbackInterface.__init__(self, **kwargs)
        #super(SiteChecker, self).__init__(**kwargs)
        if full_link is None or len(full_link) == 0:
            raise ValueError("full_link must be a non-empty str")

        original_path = ""
        try:
            paras = urlsplit(full_link)
            self.scheme, self.domain, original_path = paras[0], paras[1], paras[2]
        except:
            pass

        domain_data = LinkChecker.get_root_domain(full_link, False)
        self.root_domain = domain_data[1]
        self.sub_domain = domain_data[4]
        self.domain_suffix = domain_data[5]
        # note: str.strip() removes a set of characters, not a suffix; this trims the domain-suffix characters from both ends
        self.sub_domain_no_local = self.sub_domain.strip(self.domain_suffix)
        if self.scheme == "":
            self.scheme = "http"
        if self.domain == "":
            self.domain = self.root_domain
        self.orginal_link = full_link
        self.domain_link = LinkChecker.get_valid_link(self.root_domain, full_link, self.scheme)
        self.max_level = max_level
        self.max_page = max_page
        self.page_count = 0  # keeps track of pages done
        self._page_count_shadow = 0  # previous page count
        self._all_page_count_shadow = 0  # previous count in the data source
        self.internal_page_count = 0
        self.internal_page_last_count = 0
        self.page_allocated = 0
        self.current_level = 0  # level 0 is the root domain / home page
        self._stop_event = Event()
        valid_file_name = SiteTempDataSrcInterface.get_valid_file_name(self.domain_link)
        self._external_db_buffer = ExternalTempDataDiskBuffer(valid_file_name+".ext.db", self,
                                                              stop_event=self._stop_event,
                                                              buf_size=int(output_buff_size/2),
                                                              dir_path=get_db_buffer_default_dir(),
                                                              convert_output=False)
        self._external_db_buffer.append_to_buffer([(self.root_domain, ResponseCode.DNSError),], convert_tuple=False)
        self._memory_control_terminate_event = memory_control_terminate_event
        self.task_control_lock = threading.RLock()
        if data_source is None:
            #self.data_source = SiteTempDataDisk(self.root_domain, ref_obj=self)
            self.data_source = SiteTempDataDiskWithBuff(ref=self.domain_link, output_buff_size=output_buff_size, ref_obj=self)
        else:
            self.data_source = data_source  # a list of OnSiteLink
        self.delegate = delegate
        if LinkChecker.might_be_link_html_page(original_path):
            self.data_source.append(OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1)) # add the root domain as a starting point
        self.data_source.append(OnSiteLink(self.scheme + "://www."+self.sub_domain, ResponseCode.LinkOK, link_level=1))
        self.data_source.append(OnSiteLink(self.scheme + "://" + self.domain, ResponseCode.LinkOK, link_level=1))
        self.cache_list = []  # internal page cache
        self.page_need_look_up_temp = 0
        self.cache_list.append(self.domain_link)
        if "www." not in self.sub_domain:
            self.cache_list.append(self.scheme + "://www."+self.sub_domain)
        self.cache_list.append(self.scheme + "://" + self.domain)
        self.page_need_look_up = self.data_source.count_all()
        self.cache_size = 500  # small cache to avoid repeatedly checking links in the file system (lots of reads and writes)
        self._double_check_cache_lock = threading.RLock()
        self._double_check_cache = deque(maxlen=self.cache_size)
        self.external_cache_list = []
        self.external_cache_size = 500  # cache that hold external sites
        self.external_links_checked = 0
        self.add_internal_page_OK_only = True
        self.output_queue = output_queue
        self.output_all_external = output_all_external
        self.controller = controller
        self.result_delegate = result_delegate
        self.page_count_lock = threading.RLock()
        self.internal_page_count_lock = threading.RLock()
        self.level_lock = threading.RLock()
        self.page_look_up_lock = threading.RLock()
        self.external_link_check_lock = threading.RLock()
        self._finihsed = False
        self.task_control_max = 1
        self.agent = "VegeBot (we follow your robots.txt settings before crawling, you can slow down the bot by change the Crawl-Delay parameter in the settings." \
                     "if you have an enquiry, please email to: [email protected])"
        self.agent_from = "*****@*****.**"
        if check_robot_text:
            self.robot_agent = LinkChecker.get_robot_agent(self.sub_domain, protocol=self.scheme)
        else:
            self.robot_agent = None
        self.site_crawl_delay = 0.60

        if isinstance(self.robot_agent, Rules):
            delay_temp = self.robot_agent.delay(self.agent)
            if delay_temp is not None and delay_temp != self.site_crawl_delay:
                self.site_crawl_delay = delay_temp

        self.task_control_counter = 1
        self._speed_penalty_count = 0
        self._speed_penalty_threshold = 10
        self._progress_logging_speed = 120
        self._output_period = 120
        self._output_batch_size = 100
        self._death_wish_sent = False
        SiteChecker._is_lxml_parser_exist()
        self._output_thread = None
        self._output_queue = None
        self.progress_logger = ProgressLogger(self._progress_logging_speed, self, self._stop_event)
        self._status = "Start"
        self._populate_with_state()  # restore last known state
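
The Rules-based robots.txt handling above relies on an external library; below is a stdlib-only sketch of the same Crawl-Delay lookup (get_crawl_delay is an illustrative helper, and it assumes the site serves /robots.txt over the detected scheme):

from urllib import robotparser

def get_crawl_delay(sub_domain: str, agent: str, scheme: str = "http", default_delay: float = 0.60) -> float:
    # read Crawl-Delay from robots.txt, falling back to the default if absent or unreadable
    rp = robotparser.RobotFileParser()
    rp.set_url("{}://{}/robots.txt".format(scheme, sub_domain))
    try:
        rp.read()
    except Exception:
        return default_delay
    delay = rp.crawl_delay(agent)  # crawl_delay() is available since Python 3.6
    return delay if delay is not None else default_delay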
    def check_internal_page(checker: SiteChecker, page: OnSiteLink, timeout=10) -> ([], []):
        """Fetch one on-site page (respecting robots.txt and the crawl delay) and split its links into internal and external lists."""
        internal_pages = []
        external_pages = []
        #
        # if isinstance(checker.robot_agent, robotparser.RobotFileParser):
        #     if not checker.robot_agent.can_fetch(useragent=checker.agent, url=page.link):
        #         return [], []
        # print("checking internal_page", page)

        if isinstance(checker.robot_agent, Rules):
            try:
                if not checker.robot_agent.allowed(page.link, agent=checker.agent):
                    return [], []
            except:
                return [], []

        use_lxml_parser = checker.use_lxml_parser()
        with checker.task_control_lock:
            time.sleep(checker.site_crawl_delay)
            response = LinkChecker.get_page_source(page.link, timeout, agent=checker.agent, from_src=checker.agent_from)
        if response is None or response.status_code == ResponseCode.LinkError:
            return [], []
        paras = urlsplit(page.link)
        page_scheme, page_domain = paras[0], paras[1]

        links = LinkChecker.get_webpage_links_from_source(response, use_lxml_parser)

        for link in links:
            link_type = OnSiteLink.TypeOutbound
            valid_link = LinkChecker.get_valid_link(page_domain, link.strip(), page_scheme)
            # if PageChecker.is_link_in_list(valid_link, new_pages):
            #     continue
            try:
                link_paras = urlsplit(valid_link)
                link_scheme, link_domain, link_path = link_paras[0], link_paras[1], link_paras[2]
                if link_domain.lower().startswith("mailto:"):
                    continue
                if not LinkChecker.might_be_link_html_page(link_path):
                    continue
            except:
                continue
            # if str(link_domain).endswith(checker.root_domain):
            if checker.sub_domain_no_local in link_domain:  # internal if the link's domain contains the sub-domain (minus suffix)
                if checker.data_source.all_record > checker.max_page:
                    continue
                link_type = OnSiteLink.TypeOnSite
            else: # external
                valid_link = link_scheme + "://" + link_domain
            if link_type == OnSiteLink.TypeOnSite:
                if checker.is_link_in_cache(valid_link):
                    continue
                else:
                    checker.add_link_to_cache(valid_link)
                    internal_page = (valid_link, ResponseCode.LinkOK, page.link_level+1, OnSiteLink.TypeOnSite)
                    internal_pages.append(internal_page)
            else:
                stripped = str(link_domain).lower().strip()
                if stripped in checker.external_cache_list:
                    continue
                if len(checker.external_cache_list) < checker.external_cache_size:
                    checker.external_cache_list.append(stripped)
                external_page = (stripped, ResponseCode.DNSError)
                external_pages.append(external_page)
        return internal_pages, external_pages
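
Below is a compact sketch of the internal/external split performed in the loop above, using only urlsplit (classify_link is an illustrative helper; the containment test mirrors the sub_domain_no_local check in the code):

from urllib.parse import urlsplit

def classify_link(valid_link: str, sub_domain_no_local: str):
    # return ("internal", link), ("external", scheme://domain) or None for links to skip
    scheme, domain = urlsplit(valid_link)[:2]
    if scheme == "mailto" or domain.lower().startswith("mailto:"):
        return None
    if sub_domain_no_local in domain:
        return "internal", valid_link
    return "external", scheme + "://" + domain

# classify_link("https://www.example.com/about", "www.example") -> ("internal", "https://www.example.com/about")
# classify_link("https://other.org/x", "www.example")           -> ("external", "https://other.org")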