def _check_whois_v1(self, domain_data: OnSiteLink):
    """Legacy whois pass: re-check one candidate domain's whois record and
    queue it for output only when the record says it is definitely expired.

    :param domain_data: candidate outbound link; mutated in place on a hit.
    """
    target = domain_data.link
    try:
        # Reduce a full URL to its registrable root domain first.
        if target.startswith("http"):
            target = LinkChecker.get_root_domain(domain_data.link)[1]
        status = domain_data.response_code
        record = LinkChecker.check_whois(target)  # check whois record
        if record[0]:
            # record[2] flags a fully expired domain; otherwise only "maybe".
            status = ResponseCode.Expired if record[2] else ResponseCode.MightBeExpired
        # This version forwards definite expirations only.
        if status == ResponseCode.Expired:
            domain_data.link = target
            domain_data.response_code = status
            with self._queue_lock:
                self._output_q.put((domain_data.link, domain_data.response_code))
    except Exception as ex:
        ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex,
                              "_check_whois() " + target)
    finally:
        self._add_job_done_one()
 def _check_whois(self, domain_data: OnSiteLink):
     """Whois availability check: tag the domain Expired/MightBeExpired and
     forward it to the output queue. In debug mode the input is forwarded
     untouched.

     :param domain_data: candidate outbound link; mutated in place on a hit.
     """
     domain = domain_data.link.lower()
     try:
         if self._is_debug:
             # Debug short-circuit: pass the item straight through.
             self._put_output_result_in_queue(domain_data)
         else:
             if domain.startswith("http"):
                 domain = LinkChecker.get_root_domain(domain_data.link)[1]
             available, redemption = LinkChecker.is_domain_available_whois(domain)  # check whois record
             if available or redemption:
                 domain_data.link = domain
                 # Available -> definitely expired; redemption period -> maybe.
                 domain_data.response_code = (ResponseCode.Expired if available
                                              else ResponseCode.MightBeExpired)
                 self._put_output_result_in_queue(domain_data)
     except Exception as ex:
         ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex,
                               "_check_whois() " + domain)
     finally:
         self._add_job_done_one()
def check_whois_with_dns(page: OnSiteLink):
    """Cheap DNS probe before the expensive whois lookup.

    If the domain resolves (directly or with a "www." prefix) it is tagged
    NoDNSError and returned; if DNS fails, fall through to check_whois().

    :param page: candidate link; may be mutated (link, response_code, link_type).
    :return: (link, response_code) tuple, possibly from check_whois().
    """
    response = ResponseCode.DNSError
    done = False
    try:
        parts = LinkChecker.get_root_domain(page.link)
        root, sub, tld = parts[1], parts[4], parts[5]

        # Skip anything without a host part or outside the common-TLD list.
        if not sub or tld not in TldUtility.TOP_TLD_LIST:
            done = True
        else:
            if LinkChecker.is_domain_DNS_OK(sub):  # check DNS first
                response = ResponseCode.NoDNSError
                done = True
            elif not sub.startswith("www.") and LinkChecker.is_domain_DNS_OK("www." + root):
                response = ResponseCode.NoDNSError
                done = True
            page.response_code = response
            page.link_type = OnSiteLink.TypeOutbound
            page.link = root
    except Exception:
        done = True
    finally:
        # Only domains that failed DNS (and were not skipped) go to whois.
        if not done and response == ResponseCode.DNSError:
            return check_whois(page)
        return page.link, page.response_code
Example #4
0
def check_whois_with_dns(page: OnSiteLink):
    """DNS-first screen for a candidate domain.

    Resolvable domains (with or without a "www." prefix) are marked
    NoDNSError and returned immediately; unresolvable ones are handed to
    check_whois() for an authoritative answer.

    :param page: candidate link; may be mutated (link, response_code, link_type).
    :return: (link, response_code) tuple, possibly from check_whois().
    """
    code = ResponseCode.DNSError
    resolved_or_skipped = False
    try:
        pieces = LinkChecker.get_root_domain(page.link)
        root_domain = pieces[1]
        host = pieces[4]
        suffix = pieces[5]

        # No host part, or an uncommon TLD: nothing worth checking.
        if len(host) == 0 or suffix not in TldUtility.TOP_TLD_LIST:
            resolved_or_skipped = True
        else:
            dns_ok = LinkChecker.is_domain_DNS_OK(host)  # check DNS first
            if not dns_ok and not host.startswith("www."):
                dns_ok = LinkChecker.is_domain_DNS_OK("www." + root_domain)
            if dns_ok:
                code = ResponseCode.NoDNSError
                resolved_or_skipped = True
            page.response_code = code
            page.link_type = OnSiteLink.TypeOutbound
            page.link = root_domain
    except Exception:
        resolved_or_skipped = True
    finally:
        if resolved_or_skipped or code != ResponseCode.DNSError:
            return page.link, page.response_code
        return check_whois(page)
 def format_output(self, data):
     """Return *data* unchanged, or wrap the (link, response_code) pair in an
     outbound OnSiteLink when output conversion is enabled."""
     if not self._convert_output:
         return data
     return OnSiteLink(data[0], data[1], link_type=OnSiteLink.TypeOutbound)
    def check_external_page(checker: SiteChecker, page: OnSiteLink, timeout=10):
        """
        Check an external (outbound) domain for DNS errors only, and push it
        onto the checker's output queue when it might be expired (or when the
        checker is configured to output all externals).

        :param checker: SiteChecker owning the external-domain cache and queue.
        :param page: outbound link to examine; mutated in place (link is
                     reduced to the root domain, response_code/link_type set).
        :param timeout: unused here; kept for the commented-out 404 check below.
        :return: None (result is delivered via checker.output_queue, if any).
        """
        # response = LinkChecker.get_response(page.link, timeout)
        #real_response_code = response[0]
        #real_response_code = ResponseCode.LinkOK

        #print("-------checking external " + page.link)
        try:
            root_result = LinkChecker.get_root_domain(page.link)
            root_domain = root_result[1]
            sub_domain = root_result[4]

            # Skip domains with no host part or already seen in the cache.
            if len(sub_domain) == 0 or root_domain in checker.external_cache_list:
                return
            else:
                # Remember this domain, but never let the cache grow unbounded.
                if len(checker.external_cache_list) < checker.external_cache_size:
                    checker.external_cache_list.append(root_domain)

            real_response_code = page.response_code
            if LinkChecker.is_domain_DNS_OK(sub_domain):  # check DNS first
                real_response_code = ResponseCode.NoDNSError
            elif not sub_domain.startswith("www."):
                # Retry with a "www." prefix before declaring DNS failure.
                if LinkChecker.is_domain_DNS_OK("www." + root_domain):
                    real_response_code = ResponseCode.NoDNSError
                # response = LinkChecker.get_response(page.link, timeout)  # check 404 error

            page.response_code = real_response_code
            page.link_type = OnSiteLink.TypeOutbound
            page.link = root_domain
            #print(" ready to output external:", str(page))
            if checker.output_all_external or ResponseCode.domain_might_be_expired(real_response_code):
                    # if checker.delegate is not None:
                    #     checker.delegate(new_page)
                # NOTE(review): reaches into checker._queue_lock (private) —
                # works, but a public accessor on SiteChecker would be cleaner.
                if checker.output_queue is not None:
                    with checker._queue_lock:
                        checker.output_queue.put(page)
        except Exception as ex:
            PrintLogger.print(ex)
            ErrorLogger.log_error("PageChecker", ex, "check_external_page() " + page.link)
def check_whois(domain_data: OnSiteLink):
    """Classify a domain via a whois availability lookup.

    Marks the link Expired (available) or MightBeExpired (in redemption);
    otherwise leaves it untouched. Errors are printed and swallowed.

    :param domain_data: candidate link; mutated in place on a hit.
    :return: (link, response_code) tuple, always (even after an error).
    """
    name = domain_data.link.lower()
    try:
        if name.startswith("http"):
            name = LinkChecker.get_root_domain(domain_data.link)[1]
        available, redemption = LinkChecker.is_domain_available_whois(name)  # check whois record
        if available:
            domain_data.link = name
            domain_data.response_code = ResponseCode.Expired
        elif redemption:
            domain_data.link = name
            domain_data.response_code = ResponseCode.MightBeExpired
    except Exception as ex:
        print(ex)
    finally:
        return domain_data.link, domain_data.response_code
Example #8
0
 def testWhoisDNS(self):
     """Smoke test: run the DNS+whois pipeline over a fixed domain list and
     print each (link, response_code) result."""
     test_domains = [
         'abacouncensored.com', 'sexy-chat-rooms.org', 'girlxxxfree.info',
         'yourptr.us', 'nick-rees-enterprises.net', 'unibag.co.nz'
     ]
     for domain in test_domains:
         seed = OnSiteLink(link=domain, response_code=ResponseCode.DNSError)
         print(check_whois_with_dns(seed))
 def reset_as(self, domain: str, link: str=""):  # reset the target domain
     """Re-point this crawler at *domain* and restart its bookkeeping.

     Resets counters/levels, clears state, seeds the cache and data source
     with a single starting link, then runs the startup hooks.

     :param domain: new target domain (bare host, no scheme).
     :param link: optional explicit starting URL; when empty, the domain's
                  root link (scheme + "://" + domain) is used instead.
     """
     PrintLogger.print("crawl reset as: "+domain)
     self.domain = domain
     self.domain_link = self.scheme + "://" + self.domain
     self.page_count = 0
     self.current_level = 0
     self.set_page_need_look_up(1)
    # self.set_page_looked_up(0)
     self.clear()
     # Seed the crawl: bare domain link unless an explicit start link is given.
     if len(link) == 0:
         self.cache_list.append(self.domain_link)
         self.data_source.re_target(self.domain_link, OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1))
         #self.data_source.append(OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1))
     else:
         self.cache_list.append(link)
         self.data_source.re_target(link, OnSiteLink(link, response_code=ResponseCode.LinkOK, link_level=1))
         #self.data_source.append(OnSiteLink(link, response_code=ResponseCode.LinkOK, link_level=1))
     self.additional_reset()
     self.data_source.additional_startup_procedures()
Example #10
0
def check_whois(domain_data: OnSiteLink):
    """Whois availability probe for one domain.

    On a hit, rewrites the link to the bare root domain and sets the
    response code (Expired when available, MightBeExpired when in
    redemption). Exceptions are printed and suppressed.

    :param domain_data: candidate link; mutated in place on a hit.
    :return: (link, response_code) tuple, always.
    """
    root_domain = domain_data.link.lower()
    try:
        if root_domain.startswith("http"):
            root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
        is_available, is_redemption = LinkChecker.is_domain_available_whois(root_domain)  # check whois record
        if is_available or is_redemption:
            domain_data.link = root_domain
            domain_data.response_code = (
                ResponseCode.Expired if is_available else ResponseCode.MightBeExpired
            )
    except Exception as ex:
        print(ex)
    finally:
        return domain_data.link, domain_data.response_code
Example #11
0
 def default_delegate(self, result):
     """Collect a worker result into self.temp_results under the lock.

     Accepts an OnSiteLink or str as-is; a 2-tuple is converted to an
     OnSiteLink first. Anything else is silently ignored.
     """
     with self.temp_result_lock:
         if isinstance(result, (OnSiteLink, str)):
             self.temp_results.append(result)  # make no difference
             #CsvLogger.log_to_file("ExternalSiteTemp", [(result.link, result.response_code), ])
         elif isinstance(result, tuple) and len(result) == 2:
             converted = OnSiteLink(result[0], result[1])
             print("new domain:", converted)
             self.temp_results.append(converted)
 def _check_whois(self, domain_data: OnSiteLink):
     """Check a domain's whois availability and queue Expired/MightBeExpired
     hits for output; in debug mode the input is forwarded unchanged.

     :param domain_data: candidate outbound link; mutated in place on a hit.
     """
     root_domain = domain_data.link.lower()
     try:
         if not self._is_debug:
             # Reduce a full URL to its registrable root domain first.
             if root_domain.startswith("http"):
                 root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
             is_available, is_redemption = LinkChecker.is_domain_available_whois(root_domain)  # check whois record
             if is_available or is_redemption:
                 # Available -> definitely expired; redemption period -> maybe.
                 if is_available:
                     real_response_code = ResponseCode.Expired
                 else:
                     real_response_code = ResponseCode.MightBeExpired
                 domain_data.link = root_domain
                 domain_data.response_code = real_response_code
             #return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
                 self._put_output_result_in_queue(domain_data)
         else:
             # Debug short-circuit: pass the item straight through.
             self._put_output_result_in_queue(domain_data)
     except Exception as ex:
         ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex, "_check_whois() " + root_domain)
     finally:
         # Always account for this job, success or failure.
         self._add_job_done_one()
 def _check_whois_v1(self, domain_data: OnSiteLink):
     """Legacy whois pass: re-check a domain's whois record and queue it for
     output only when the record says it is definitely expired.

     :param domain_data: candidate outbound link; mutated in place on a hit.
     """
     root_domain = domain_data.link
     try:
         # Reduce a full URL to its registrable root domain first.
         if root_domain.startswith("http"):
             root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
         real_response_code = domain_data.response_code
         whois = LinkChecker.check_whois(root_domain)  # check whois record
         if whois[0]:
             if whois[2]:  # domain is expired
                 real_response_code = ResponseCode.Expired
             else:
                 real_response_code = ResponseCode.MightBeExpired
         # Only definite expirations are forwarded in this version.
         if real_response_code == ResponseCode.Expired:
         #if ResponseCode.domain_might_be_expired(real_response_code):
             domain_data.link = root_domain
             domain_data.response_code = real_response_code
             #return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
             # if isinstance(self._queue_lock, multiprocessing.RLock):
             with self._queue_lock:
                 self._output_q.put((domain_data.link, domain_data.response_code))
     except Exception as ex:
         ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex, "_check_whois() " + root_domain)
     finally:
         # Always account for this job, success or failure.
         self._add_job_done_one()
 def format_output(self, data):
     """Convert a 5-field row into (ref_obj, OnSiteLink); the trailing rowid
     is deliberately dropped."""
     # Row layout: (link, response_code, link_level, link_type, rowid).
     row_link, row_code, row_level, row_type, _rowid = data
     return self.ref_obj, OnSiteLink(row_link, row_code, row_level, row_type)
    def __init__(self, full_link: str="", data_source: SiteTempDataSrcInterface=None,
                 controller: SiteCheckerController=None,
                 max_level=10, max_page=1000, delegate=None, output_buff_size=2000,
                 output_queue=None, output_all_external=False, result_delegate=None,
                 memory_control_terminate_event=None, check_robot_text=True,
                 **kwargs):
        """
        :param full_link: The full link of a domain, e.g: https://www.google.co.uk
        :param data_source: temp data source for on-site links; a disk-backed default is created when None
        :param controller: external controller for this checker
        :param max_level: stop crawling if it reaches this level
        :param max_page: maximum pages to check within a site, also stop crawling
        :param delegate: if this is not None, then it will send the latest result of external domain of ResponseCode==404 or 999
        :param output_buff_size: buffer size for the output/external-domain disk buffers
        :param output_queue: queue that receives external-domain results
        :param output_all_external: when True, output every external domain, not just likely-expired ones
        :param result_delegate: send site_info upon finish
        :param memory_control_terminate_event: if this is not None and being set, it will be able to terminate an external memory controlled process.
        :param check_robot_text: when True, fetch and honour the site's robots.txt
        :raises ValueError: if full_link is None or empty
        :return:
        """
        FeedbackInterface.__init__(self, **kwargs)
        #super(SiteChecker, self).__init__(**kwargs)
        if full_link is None or len(full_link) == 0:
            raise ValueError()

        # --- URL decomposition ----------------------------------------------
        original_path = ""
        try:
            paras = urlsplit(full_link)
            self.scheme, self.domain, original_path = paras[0], paras[1], paras[2]
        except:
            pass

        domain_data = LinkChecker.get_root_domain(full_link, False)
        self.root_domain = domain_data[1]
        self.sub_domain = domain_data[4]
        self.domain_suffix = domain_data[5]
        # NOTE(review): str.strip() removes a *character set*, not a suffix
        # string — this can over-strip (e.g. trailing chars that happen to be
        # in the suffix). Probably meant suffix removal; confirm intent.
        self.sub_domain_no_local = self.sub_domain.strip(self.domain_suffix)
        if self.scheme == "":
            self.scheme = "http"
        if self.domain == "":
            self.domain = self.root_domain
        self.orginal_link = full_link
        self.domain_link = LinkChecker.get_valid_link(self.root_domain, full_link, self.scheme)
        # --- crawl limits and counters --------------------------------------
        self.max_level = max_level
        self.max_page = max_page
        self.page_count = 0  # keep track page done
        self._page_count_shadow = 0 # track previous count
        self._all_page_count_shadow = 0 #track previous count in datasource
        self.internal_page_count = 0
        self.internal_page_last_count = 0
        self.page_allocated = 0
        self.current_level = 0  # if this = 0, it is root domain/home_page
        self._stop_event = Event()
        # --- external-domain disk buffer ------------------------------------
        valid_file_name = SiteTempDataSrcInterface.get_valid_file_name(self.domain_link)
        self._external_db_buffer = ExternalTempDataDiskBuffer(valid_file_name+".ext.db", self,
                                                              stop_event=self._stop_event,
                                                              buf_size=int(output_buff_size/2),
                                                              dir_path=get_db_buffer_default_dir(),
                                                              convert_output=False)
        self._external_db_buffer.append_to_buffer([(self.root_domain, ResponseCode.DNSError),], convert_tuple=False)
        self._memory_control_terminate_event = memory_control_terminate_event
        self.task_control_lock = threading.RLock()
        # --- on-site data source and seed links -----------------------------
        if data_source is None:
            #self.data_source = SiteTempDataDisk(self.root_domain, ref_obj=self)
            self.data_source = SiteTempDataDiskWithBuff(ref=self.domain_link, output_buff_size=output_buff_size, ref_obj=self)
        else:
            self.data_source = data_source  # a list of OnSiteLink
        self.delegate = delegate
        if LinkChecker.might_be_link_html_page(original_path):
            self.data_source.append(OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1)) # add the root domain as a starting point
        self.data_source.append(OnSiteLink(self.scheme + "://www."+self.sub_domain, ResponseCode.LinkOK, link_level=1))
        self.data_source.append(OnSiteLink(self.scheme + "://" + self.domain, ResponseCode.LinkOK, link_level=1))
        # --- caches ----------------------------------------------------------
        self.cache_list = []  # internal page cache
        self.page_need_look_up_temp = 0
        self.cache_list.append(self.domain_link)
        if "www." not in self.sub_domain:
            self.cache_list.append(self.scheme + "://www."+self.sub_domain)
        self.cache_list.append(self.scheme + "://" + self.domain)
        self.page_need_look_up = self.data_source.count_all()
        self.cache_size = 500  # create a small cache list to avoid going to check link in file system with lots of read and write
        self._double_check_cache_lock = threading.RLock()
        self._double_check_cache = deque(maxlen=self.cache_size)
        self.external_cache_list = []
        self.external_cache_size = 500  # cache that hold external sites
        self.external_links_checked = 0
        self.add_internal_page_OK_only = True
        # --- output wiring and locks ----------------------------------------
        self.output_queue = output_queue
        self.output_all_external = output_all_external
        self.controller = controller
        self.result_delegate = result_delegate
        self.page_count_lock = threading.RLock()
        self.internal_page_count_lock = threading.RLock()
        self.level_lock = threading.RLock()
        self.page_look_up_lock = threading.RLock()
        self.external_link_check_lock = threading.RLock()
        # NOTE(review): "_finihsed" is a typo but kept — other code may
        # reference this attribute name; verify before renaming.
        self._finihsed = False
        self.task_control_max = 1
        # --- robots.txt / politeness ----------------------------------------
        self.agent = "VegeBot (we follow your robots.txt settings before crawling, you can slow down the bot by change the Crawl-Delay parameter in the settings." \
                     "if you have an enquiry, please email to: [email protected])"
        self.agent_from = "*****@*****.**"
        if check_robot_text:
            self.robot_agent = LinkChecker.get_robot_agent(self.sub_domain, protocol=self.scheme)
        else:
            self.robot_agent = None
        self.site_crawl_delay = 0.60

        # Honour a robots.txt Crawl-Delay when one is published for our agent.
        if isinstance(self.robot_agent, Rules):
            delay_temp = self.robot_agent.delay(self.agent)
            if delay_temp is not None and delay_temp != self.site_crawl_delay:
                self.site_crawl_delay = delay_temp

        # --- misc runtime state ---------------------------------------------
        self.task_control_counter = 1
        self._speed_penalty_count = 0
        self._speed_penalty_threshold = 10
        self._progress_logging_speed = 120
        self._output_period = 120
        self._output_batch_size = 100
        self._death_wish_sent = False
        SiteChecker._is_lxml_parser_exist()
        self._output_thread = None
        self._output_queue = None
        self.progress_logger = ProgressLogger(self._progress_logging_speed, self, self._stop_event)
        self._status = "Start"
        self._populate_with_state()  # restore last known state