def _check_whois_v1(self, domain_data: OnSiteLink):
    """Legacy whois pass: re-check a domain's whois record and, when the
    domain is confirmed expired, push (link, response_code) to the output queue.

    Always marks one job done, even on failure.
    """
    bare_domain = domain_data.link
    try:
        # Strip scheme/path down to the registrable root domain if a full URL came in.
        if bare_domain.startswith("http"):
            bare_domain = LinkChecker.get_root_domain(domain_data.link)[1]
        status_code = domain_data.response_code
        whois_record = LinkChecker.check_whois(bare_domain)  # check whois record
        if whois_record[0]:
            # whois lookup succeeded; whois_record[2] flags an expired domain
            status_code = ResponseCode.Expired if whois_record[2] else ResponseCode.MightBeExpired
        if status_code == ResponseCode.Expired:
            domain_data.link = bare_domain
            domain_data.response_code = status_code
            # Queue is shared across processes; guard the put with the lock.
            with self._queue_lock:
                self._output_q.put((domain_data.link, domain_data.response_code))
    except Exception as ex:
        ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex, "_check_whois() " + bare_domain)
    finally:
        self._add_job_done_one()
def _check_whois(self, domain_data: OnSiteLink):
    """Check domain availability via whois and forward expired/redemption
    domains to the output queue; in debug mode forward the item untouched.

    Always marks one job done, even on failure.
    """
    root_domain = domain_data.link.lower()
    try:
        if self._is_debug:
            # Debug mode skips the whois lookup entirely.
            self._put_output_result_in_queue(domain_data)
        else:
            if root_domain.startswith("http"):
                # Reduce a full URL to its registrable root domain first.
                root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
            is_available, is_redemption = LinkChecker.is_domain_available_whois(root_domain)  # check whois record
            if is_available or is_redemption:
                # available -> definitely expired; redemption period -> maybe expired
                domain_data.link = root_domain
                domain_data.response_code = (ResponseCode.Expired if is_available
                                             else ResponseCode.MightBeExpired)
                self._put_output_result_in_queue(domain_data)
    except Exception as ex:
        ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex, "_check_whois() " + root_domain)
    finally:
        self._add_job_done_one()
def check_whois_with_dns(page: OnSiteLink):
    """Check a page's domain via DNS first, falling back to a whois lookup.

    If the sub-domain has DNS (directly or via its "www." variant) the link is
    marked NoDNSError and whois is skipped. Whois is also skipped for empty
    sub-domains, non-top TLDs, and on any parsing/DNS error.

    :param page: link to check; mutated in place (link, response_code, link_type)
    :return: result of check_whois(page) when whois is needed,
             otherwise (page.link, page.response_code)
    """
    real_response_code = ResponseCode.DNSError
    skip_whois_check = False
    try:
        root_result = LinkChecker.get_root_domain(page.link)
        root_domain = root_result[1]
        sub_domain = root_result[4]
        suffix = root_result[5]
        if len(sub_domain) == 0 or suffix not in TldUtility.TOP_TLD_LIST:
            # Nothing resolvable, or a TLD we do not track: no point in whois.
            skip_whois_check = True
        else:
            if LinkChecker.is_domain_DNS_OK(sub_domain):  # check DNS first
                real_response_code = ResponseCode.NoDNSError
                skip_whois_check = True
            elif not sub_domain.startswith("www."):
                # Retry with the www. prefix before declaring DNS dead.
                if LinkChecker.is_domain_DNS_OK("www." + root_domain):
                    real_response_code = ResponseCode.NoDNSError
                    skip_whois_check = True
        page.response_code = real_response_code
        page.link_type = OnSiteLink.TypeOutbound
        page.link = root_domain
    except Exception as ex:
        # Best-effort: treat any parsing/DNS failure as "do not whois-check".
        # ErrorLogger.log_error("WhoisChecker", ex, "_check_whois_with_dns() " + page.link)
        skip_whois_check = True
    # BUGFIX: the return used to live in a `finally:` block, which silently
    # discards any in-flight exception (including KeyboardInterrupt/SystemExit).
    # The except-clause above already handles Exception, so returning here is
    # behaviorally identical on the normal and handled-error paths.
    if not skip_whois_check and real_response_code == ResponseCode.DNSError:
        return check_whois(page)
    return page.link, page.response_code
def format_output(self, data):
    """Return the raw (link, response_code) pair, or wrap it into an
    OnSiteLink (outbound type) when output conversion is enabled."""
    if not self._convert_output:
        return data
    link, response_code = data[0], data[1]
    return OnSiteLink(link, response_code, link_type=OnSiteLink.TypeOutbound)
def check_external_page(checker: SiteChecker, page: OnSiteLink, timeout=10):
    """
    check DNS Error Only
    :param checker:
    :param page:
    :param timeout:
    :return:
    """
    try:
        parsed = LinkChecker.get_root_domain(page.link)
        root_domain, sub_domain = parsed[1], parsed[4]
        # Guard: skip links without a host part and domains already cached.
        if len(sub_domain) == 0 or root_domain in checker.external_cache_list:
            return
        if len(checker.external_cache_list) < checker.external_cache_size:
            checker.external_cache_list.append(root_domain)
        result_code = page.response_code
        # DNS resolves either on the sub-domain itself or on its www. variant.
        if LinkChecker.is_domain_DNS_OK(sub_domain):  # check DNS first
            result_code = ResponseCode.NoDNSError
        elif not sub_domain.startswith("www.") and LinkChecker.is_domain_DNS_OK("www." + root_domain):
            result_code = ResponseCode.NoDNSError
        page.response_code = result_code
        page.link_type = OnSiteLink.TypeOutbound
        page.link = root_domain
        wanted = checker.output_all_external or ResponseCode.domain_might_be_expired(result_code)
        if wanted and checker.output_queue is not None:
            with checker._queue_lock:
                checker.output_queue.put(page)
    except Exception as ex:
        PrintLogger.print(ex)
        ErrorLogger.log_error("PageChecker", ex, "check_external_page() " + page.link)
def check_whois(domain_data: OnSiteLink):
    """Check whether a domain is available (expired) via a whois lookup.

    On an available/redemption-period domain, mutate *domain_data* in place:
    link becomes the lowered root domain and response_code becomes
    Expired (available) or MightBeExpired (redemption).

    :param domain_data: link to check; may be mutated
    :return: (domain_data.link, domain_data.response_code)
    """
    root_domain = domain_data.link.lower()
    try:
        if root_domain.startswith("http"):
            # Reduce a full URL to its registrable root domain first.
            root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
        is_available, is_redemption = LinkChecker.is_domain_available_whois(root_domain)  # check whois record
        if is_available or is_redemption:
            if is_available:
                real_response_code = ResponseCode.Expired
            else:
                real_response_code = ResponseCode.MightBeExpired
            domain_data.link = root_domain
            domain_data.response_code = real_response_code
    except Exception as ex:
        # Best-effort logging only; the caller still receives the current state.
        print(ex)
    # BUGFIX: the return used to live in a `finally:` block, which silently
    # swallows any exception not caught above (e.g. KeyboardInterrupt).
    # except-Exception already covers the error path, so this is equivalent.
    return domain_data.link, domain_data.response_code
def testWhoisDNS(self):
    """Smoke-test check_whois_with_dns against a fixed set of domains,
    printing each (link, response_code) result."""
    samples = (
        'abacouncensored.com',
        'sexy-chat-rooms.org',
        'girlxxxfree.info',
        'yourptr.us',
        'nick-rees-enterprises.net',
        'unibag.co.nz',
    )
    for domain in samples:
        candidate = OnSiteLink(link=domain, response_code=ResponseCode.DNSError)
        print(check_whois_with_dns(candidate))
def reset_as(self, domain: str, link: str=""):
    """Re-target this crawler at *domain*, resetting counters, cache and the
    data source. When *link* is given it becomes the seed; otherwise the
    scheme://domain root link is used."""
    PrintLogger.print("crawl reset as: "+domain)
    self.domain = domain
    self.domain_link = self.scheme + "://" + self.domain
    self.page_count = 0
    self.current_level = 0
    self.set_page_need_look_up(1)
    self.clear()
    # Seed with the explicit link when provided, else the root domain link.
    seed = link if len(link) > 0 else self.domain_link
    self.cache_list.append(seed)
    self.data_source.re_target(seed, OnSiteLink(seed, response_code=ResponseCode.LinkOK, link_level=1))
    self.additional_reset()
    self.data_source.additional_startup_procedures()
def check_whois(domain_data: OnSiteLink):
    """Check whether a domain is available (expired) via a whois lookup.

    On an available/redemption-period domain, mutate *domain_data* in place:
    link becomes the lowered root domain and response_code becomes
    Expired (available) or MightBeExpired (redemption).

    :param domain_data: link to check; may be mutated
    :return: (domain_data.link, domain_data.response_code)
    """
    root_domain = domain_data.link.lower()
    try:
        if root_domain.startswith("http"):
            # Reduce a full URL to its registrable root domain first.
            root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
        is_available, is_redemption = LinkChecker.is_domain_available_whois(root_domain)  # check whois record
        if is_available or is_redemption:
            if is_available:
                real_response_code = ResponseCode.Expired
            else:
                real_response_code = ResponseCode.MightBeExpired
            domain_data.link = root_domain
            domain_data.response_code = real_response_code
    except Exception as ex:
        # Best-effort logging only; the caller still receives the current state.
        print(ex)
    # BUGFIX: the return used to live in a `finally:` block, which silently
    # swallows any exception not caught above (e.g. KeyboardInterrupt).
    # except-Exception already covers the error path, so this is equivalent.
    return domain_data.link, domain_data.response_code
def default_delegate(self, result):
    """Collect a checker result into temp_results under the shared lock.

    Accepts an OnSiteLink or str as-is; a (link, response_code) 2-tuple is
    converted to an OnSiteLink. Anything else is ignored.
    """
    with self.temp_result_lock:
        if isinstance(result, OnSiteLink):
            # make no difference
            #CsvLogger.log_to_file("ExternalSiteTemp", [(result.link, result.response_code), ])
            self.temp_results.append(result)
        elif isinstance(result, str):
            self.temp_results.append(result)
        elif isinstance(result, tuple) and len(result) == 2:
            converted = OnSiteLink(result[0], result[1])
            print("new domain:", converted)
            self.temp_results.append(converted)
def _check_whois(self, domain_data: OnSiteLink):
    """Whois-check a domain and forward expired/redemption results to the
    output queue; debug mode forwards the item without any lookup.

    Always marks one job done, even on failure.
    """
    lowered = domain_data.link.lower()
    try:
        if not self._is_debug:
            if lowered.startswith("http"):
                # Full URL supplied: extract the registrable root domain.
                lowered = LinkChecker.get_root_domain(domain_data.link)[1]
            is_available, is_redemption = LinkChecker.is_domain_available_whois(lowered)  # check whois record
            if is_available or is_redemption:
                if is_available:
                    verdict = ResponseCode.Expired
                else:
                    verdict = ResponseCode.MightBeExpired
                domain_data.link = lowered
                domain_data.response_code = verdict
                self._put_output_result_in_queue(domain_data)
        else:
            # Debug: pass the item straight through.
            self._put_output_result_in_queue(domain_data)
    except Exception as ex:
        ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex, "_check_whois() " + lowered)
    finally:
        self._add_job_done_one()
def _check_whois_v1(self, domain_data: OnSiteLink):
    """Legacy whois pass: consult the whois record and, for a confirmed
    expired domain, enqueue (link, response_code) under the queue lock.

    Always marks one job done, even on failure.
    """
    target = domain_data.link
    try:
        if target.startswith("http"):
            # Full URL supplied: extract the registrable root domain.
            target = LinkChecker.get_root_domain(domain_data.link)[1]
        verdict = domain_data.response_code
        record = LinkChecker.check_whois(target)  # check whois record
        if record[0]:
            if record[2]:  # domain is expired
                verdict = ResponseCode.Expired
            else:
                verdict = ResponseCode.MightBeExpired
        if verdict == ResponseCode.Expired:
            #if ResponseCode.domain_might_be_expired(verdict):
            domain_data.link = target
            domain_data.response_code = verdict
            # if isinstance(self._queue_lock, multiprocessing.RLock):
            with self._queue_lock:
                self._output_q.put((domain_data.link, domain_data.response_code))
    except Exception as ex:
        ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex, "_check_whois() " + target)
    finally:
        self._add_job_done_one()
def format_output(self, data):
    """Rehydrate a buffered row (link, code, level, type, rowid) into
    (ref_obj, OnSiteLink); the rowid column is discarded."""
    link, response_code, link_level, link_type, _rowid = data
    rebuilt = OnSiteLink(link, response_code, link_level, link_type)
    return self.ref_obj, rebuilt
def __init__(self, full_link: str="", data_source: SiteTempDataSrcInterface=None,
             controller: SiteCheckerController=None,
             max_level=10, max_page=1000, delegate=None,
             output_buff_size=2000, output_queue=None, output_all_external=False,
             result_delegate=None, memory_control_terminate_event=None, check_robot_text=True,
             **kwargs):
    """
    Build a SiteChecker targeting *full_link* and prime its data source,
    caches, locks and robots.txt agent.

    :param full_link: The full link of a domain, e.g: https://www.google.co.uk
    :param data_source: temp data source to use; a SiteTempDataDiskWithBuff is created when None
    :param controller: external controller object, stored as-is
    :param max_level: stop crawling if it reaches this level
    :param max_page: maximum pages to check within a site, also stop crawling
    :param delegate: if this is not None, then it will send the latest result of external domain of ResponseCode==404 or 999
    :param output_buff_size: sizes the data-source buffer; half of it sizes the external db buffer
    :param output_queue: queue that receives external-link results
    :param output_all_external: when True, output every external link, not only expired-looking ones
    :param result_delegate: send site_info upon finish
    :param memory_control_terminate_event: if this is not None and being set, it will be able to terminate an external memory controlled process.
    :param check_robot_text: when True, fetch the robots.txt agent for the sub-domain
    :return:
    """
    FeedbackInterface.__init__(self, **kwargs)
    #super(SiteChecker, self).__init__(**kwargs)
    if full_link is None or len(full_link) == 0:
        raise ValueError()

    original_path = ""
    try:
        # Split the link into scheme/netloc/path; failures fall through with defaults.
        paras = urlsplit(full_link)
        self.scheme, self.domain, original_path = paras[0], paras[1], paras[2]
    except:
        pass

    domain_data = LinkChecker.get_root_domain(full_link, False)
    self.root_domain = domain_data[1]
    self.sub_domain = domain_data[4]
    self.domain_suffix = domain_data[5]
    # NOTE(review): str.strip() removes *characters* from both ends, not a
    # suffix substring — this looks like it intended suffix removal; confirm.
    self.sub_domain_no_local = self.sub_domain.strip(self.domain_suffix)
    if self.scheme == "":
        self.scheme = "http"
    if self.domain == "":
        self.domain = self.root_domain
    self.orginal_link = full_link  # NOTE(review): attribute name typo ("orginal") kept for compatibility
    self.domain_link = LinkChecker.get_valid_link(self.root_domain, full_link, self.scheme)
    self.max_level = max_level
    self.max_page = max_page
    self.page_count = 0  # keep track page done
    self._page_count_shadow = 0  # track previous count
    self._all_page_count_shadow = 0  # track previous count in datasource
    self.internal_page_count = 0
    self.internal_page_last_count = 0
    self.page_allocated = 0
    self.current_level = 0  # if this = 0, it is root domain/home_page
    self._stop_event = Event()

    # Disk-backed buffer for external-link results; seeded with the root
    # domain marked DNSError so it is always re-checked.
    valid_file_name = SiteTempDataSrcInterface.get_valid_file_name(self.domain_link)
    self._external_db_buffer = ExternalTempDataDiskBuffer(valid_file_name+".ext.db",
                                                          self, stop_event=self._stop_event,
                                                          buf_size=int(output_buff_size/2),
                                                          dir_path=get_db_buffer_default_dir(),
                                                          convert_output=False)
    self._external_db_buffer.append_to_buffer([(self.root_domain, ResponseCode.DNSError),], convert_tuple=False)
    self._memory_control_terminate_event = memory_control_terminate_event
    self.task_control_lock = threading.RLock()

    if data_source is None:
        #self.data_source = SiteTempDataDisk(self.root_domain, ref_obj=self)
        self.data_source = SiteTempDataDiskWithBuff(ref=self.domain_link, output_buff_size=output_buff_size, ref_obj=self)
    else:
        self.data_source = data_source  # a list of OnSiteLink
    self.delegate = delegate

    # Seed the crawl frontier: the original link (only when it looks like an
    # HTML page), plus the www. variant and the bare domain.
    if LinkChecker.might_be_link_html_page(original_path):
        self.data_source.append(OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1))  # add the root domain as a starting point
    self.data_source.append(OnSiteLink(self.scheme + "://www."+self.sub_domain, ResponseCode.LinkOK, link_level=1))
    self.data_source.append(OnSiteLink(self.scheme + "://" + self.domain, ResponseCode.LinkOK, link_level=1))

    self.cache_list = []  # internal page cache
    self.page_need_look_up_temp = 0
    self.cache_list.append(self.domain_link)
    if "www." not in self.sub_domain:
        self.cache_list.append(self.scheme + "://www."+self.sub_domain)
    self.cache_list.append(self.scheme + "://" + self.domain)
    self.page_need_look_up = self.data_source.count_all()
    self.cache_size = 500  # create a small cache list to avoid going to check link in file system with lots of read and write
    self._double_check_cache_lock = threading.RLock()
    self._double_check_cache = deque(maxlen=self.cache_size)
    self.external_cache_list = []
    self.external_cache_size = 500  # cache that hold external sites
    self.external_links_checked = 0
    self.add_internal_page_OK_only = True
    self.output_queue = output_queue
    self.output_all_external = output_all_external
    self.controller = controller
    self.result_delegate = result_delegate

    # Fine-grained locks: each counter/state group gets its own RLock.
    self.page_count_lock = threading.RLock()
    self.internal_page_count_lock = threading.RLock()
    self.level_lock = threading.RLock()
    self.page_look_up_lock = threading.RLock()
    self.external_link_check_lock = threading.RLock()
    self._finihsed = False  # NOTE(review): attribute name typo ("finihsed") kept for compatibility

    self.task_control_max = 1
    self.agent = "VegeBot (we follow your robots.txt settings before crawling, you can slow down the bot by change the Crawl-Delay parameter in the settings." \
                 "if you have an enquiry, please email to: [email protected])"
    self.agent_from = "*****@*****.**"
    if check_robot_text:
        self.robot_agent = LinkChecker.get_robot_agent(self.sub_domain, protocol=self.scheme)
    else:
        self.robot_agent = None
    # Default crawl delay; a robots.txt Crawl-Delay for our agent overrides it.
    self.site_crawl_delay = 0.60
    if isinstance(self.robot_agent, Rules):
        delay_temp = self.robot_agent.delay(self.agent)
        if delay_temp is not None and delay_temp != self.site_crawl_delay:
            self.site_crawl_delay = delay_temp

    self.task_control_counter = 1
    self._speed_penalty_count = 0
    self._speed_penalty_threshold = 10
    self._progress_logging_speed = 120  # seconds between progress log entries
    self._output_period = 120
    self._output_batch_size = 100
    self._death_wish_sent = False
    SiteChecker._is_lxml_parser_exist()
    self._output_thread = None
    self._output_queue = None
    self.progress_logger = ProgressLogger(self._progress_logging_speed, self, self._stop_event)
    self._status = "Start"
    self._populate_with_state()  # restore last known state (typo "laste" fixed in comment only)