def __init__(self, stop_event: Event, input_queue: Queue = None, output_queue: Queue = None,
             max_worker: int = 10, dir_path="", is_debug=False, **kwargs):
     self._is_debug = is_debug
     FeedbackInterface.__init__(self, **kwargs)
     ExternalTempInterface.__init__(self)
     # do not use predefined queue here
     # self._input_q = input_queue
     # self._output_q = output_queue
     self._stop_event = stop_event
     self._internal_stop_event = Event()
     self._max_worker = max_worker
     self._job_done = 0
     self._job_done_shadow = 0
     self._job_done_lock = RLock()
     self._input_period = 0.0001  # sampling interval (seconds) for feeding data into the buffer
     self._max_sample_results = 100000
     self._min_sampling_duration = 0.0001
     self._sample_batch_size = 5000
     self._sample_batch_timeout = 60
     if is_debug:
         self._min_buff_delete_threshold = 10000  # default is 100000
     else:
         self._min_buff_delete_threshold = 100000
     self._speed_penalty_count = 0
     self._finished = False
     manager, self._output_q = get_queue_client(QueueManager.MachineSettingCrawler, QueueManager.Method_Whois_Output)
     self._db_buffer = ExternalTempDataDiskBuffer("whois_check.db", self, self._internal_stop_event, buf_size=self._max_worker*50,
                                                  terminate_callback=WhoisChecker.terminate_callback, dir_path=dir_path)
     self._populate_with_state()  # FeedbackInterface
     if not is_debug:
         log_period = 120
     else:
         log_period = 10
     self._progress_logger = ProgressLogger(log_period, self, self._internal_stop_event)
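# --- Illustrative usage (not part of the original source) ---
# A minimal, hedged sketch of constructing and stopping a WhoisChecker; the
# dir_path value and the start/stop flow are assumptions, not this project's
# documented API.
#
#     from threading import Event
#
#     stop = Event()
#     checker = WhoisChecker(stop, max_worker=20, dir_path="/tmp/whois")
#     # ... start the checker's worker loop per this project's conventions ...
#     stop.set()  # request shutdown; the internal stop event drives cleanup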
def __init__(self, db_ref: str, db_dir: str, input_queue: Queue, output_queue: Queue, stop_event: Event,
             matrix: CrawlMatrix, accounts: list, force_mode=False, force_mode_offset=0, force_mode_total=0, **kwargs):
     FeedbackInterface.__init__(self, **kwargs)
     self._stop_event = stop_event
     self._matrix = matrix
     self._db_ref = db_ref
     self._input_queue = input_queue
     self._output_queue = output_queue
     self._pool_input = Queue()
     self._pool = FilterPool(self._pool_input, self._output_queue, self._queue_lock, self._stop_event, self._matrix,
                             accounts=accounts)
     self._db_buffer = ExternalTempDataDiskBuffer(self._db_ref, self, self._stop_event, dir_path=db_dir,
                                                  buf_size=2500, output_f=5000)
     # buf_size and output_f control the data flow rate; these values keep the
     # input:output ratio at 1:1 for up to roughly 10 million data rows per hour.
     ExternalTempInterface.__init__(self)
     self._populate_with_state()
     if force_mode:
         new_state = _FilterState(progress=force_mode_offset, all_data=force_mode_total)
         self.populate_with_state(new_state)
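# --- Illustrative sketch (not part of the original source) ---
# force_mode lets a run resume from an explicit offset by overwriting the
# state restored by _populate_with_state(). The class name and argument
# values below are hypothetical placeholders:
#
#     flt = SomeFilter(db_ref, db_dir, in_q, out_q, stop_event, matrix, accounts,
#                      force_mode=True, force_mode_offset=500000,
#                      force_mode_total=2000000)
#     # equivalent to calling:
#     # flt.populate_with_state(_FilterState(progress=500000, all_data=2000000))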
    def __init__(self, full_link: str = "", data_source: SiteTempDataSrcInterface = None,
                 controller: SiteCheckerController=None,
                 max_level=10, max_page=1000, delegate=None, output_buff_size=2000,
                 output_queue=None, output_all_external=False, result_delegate=None,
                 memory_control_terminate_event=None, check_robot_text=True,
                 **kwargs):
        """
        :param full_link: The full link of a domain, e.g: https://www.google.co.uk
        :param domain: domain to crawl
        :param max_level: stop crawling if it reaches this level
        :param max_page: maximum pages to check within a site, also stop crawling
        :param delegate: if this is not None, then it will send the latest result of external domain of ResponseCode==404 or 999
        :param result_delegate: send site_info upon finish
        :param memory_control_terminate_event: if this is not None and being set, it will be able to terminate an external memory controlled process.
        :return:
        """
        FeedbackInterface.__init__(self, **kwargs)
        #super(SiteChecker, self).__init__(**kwargs)
        if not full_link:
            raise ValueError("full_link must be a non-empty string")

        original_path = ""
        try:
            paras = urlsplit(full_link)
            self.scheme, self.domain, original_path = paras[0], paras[1], paras[2]
        except ValueError:  # malformed URL; fall back to the defaults assigned below
            pass
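        # For reference, urlsplit("https://www.google.co.uk/search?q=x") returns
        # SplitResult(scheme='https', netloc='www.google.co.uk', path='/search',
        #             query='q=x', fragment=''), so paras[0..2] are the scheme,
        # network location, and path.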

        domain_data = LinkChecker.get_root_domain(full_link, False)
        self.root_domain = domain_data[1]
        self.sub_domain = domain_data[4]
        self.domain_suffix = domain_data[5]
        # str.strip() removes a set of characters, not a suffix, so remove the
        # registered suffix explicitly instead
        if self.domain_suffix and self.sub_domain.endswith(self.domain_suffix):
            self.sub_domain_no_local = self.sub_domain[:-len(self.domain_suffix)].rstrip(".")
        else:
            self.sub_domain_no_local = self.sub_domain
        if self.scheme == "":
            self.scheme = "http"
        if self.domain == "":
            self.domain = self.root_domain
        self.original_link = full_link
        self.domain_link = LinkChecker.get_valid_link(self.root_domain, full_link, self.scheme)
        self.max_level = max_level
        self.max_page = max_page
        self.page_count = 0  # tracks pages completed
        self._page_count_shadow = 0  # previous count, for progress diffing
        self._all_page_count_shadow = 0  # previous count in the data source
        self.internal_page_count = 0
        self.internal_page_last_count = 0
        self.page_allocated = 0
        self.current_level = 0  # level 0 means the root domain / home page
        self._stop_event = Event()
        valid_file_name = SiteTempDataSrcInterface.get_valid_file_name(self.domain_link)
        self._external_db_buffer = ExternalTempDataDiskBuffer(valid_file_name+".ext.db", self,
                                                              stop_event=self._stop_event,
                                                              buf_size=int(output_buff_size/2),
                                                              dir_path=get_db_buffer_default_dir(),
                                                              convert_output=False)
        self._external_db_buffer.append_to_buffer([(self.root_domain, ResponseCode.DNSError),], convert_tuple=False)
        self._memory_control_terminate_event = memory_control_terminate_event
        self.task_control_lock = threading.RLock()
        if data_source is None:
            #self.data_source = SiteTempDataDisk(self.root_domain, ref_obj=self)
            self.data_source = SiteTempDataDiskWithBuff(ref=self.domain_link, output_buff_size=output_buff_size, ref_obj=self)
        else:
            self.data_source = data_source  # a list of OnSiteLink
        self.delegate = delegate
        if LinkChecker.might_be_link_html_page(original_path):
            self.data_source.append(OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1)) # add the root domain as a starting point
        self.data_source.append(OnSiteLink(self.scheme + "://www."+self.sub_domain, ResponseCode.LinkOK, link_level=1))
        self.data_source.append(OnSiteLink(self.scheme + "://" + self.domain, ResponseCode.LinkOK, link_level=1))
        self.cache_list = []  # internal page cache
        self.page_need_look_up_temp = 0
        self.cache_list.append(self.domain_link)
        if "www." not in self.sub_domain:
            self.cache_list.append(self.scheme + "://www."+self.sub_domain)
        self.cache_list.append(self.scheme + "://" + self.domain)
        self.page_need_look_up = self.data_source.count_all()
        self.cache_size = 500  # small in-memory cache to avoid repeated file-system reads/writes when checking links
        self._double_check_cache_lock = threading.RLock()
        self._double_check_cache = deque(maxlen=self.cache_size)
        self.external_cache_list = []
        self.external_cache_size = 500  # cache that hold external sites
        self.external_links_checked = 0
        self.add_internal_page_OK_only = True
        self.output_queue = output_queue
        self.output_all_external = output_all_external
        self.controller = controller
        self.result_delegate = result_delegate
        self.page_count_lock = threading.RLock()
        self.internal_page_count_lock = threading.RLock()
        self.level_lock = threading.RLock()
        self.page_look_up_lock = threading.RLock()
        self.external_link_check_lock = threading.RLock()
        self._finished = False
        self.task_control_max = 1
        self.agent = "VegeBot (we follow your robots.txt settings before crawling, you can slow down the bot by change the Crawl-Delay parameter in the settings." \
                     "if you have an enquiry, please email to: [email protected])"
        self.agent_from = "*****@*****.**"
        if check_robot_text:
            self.robot_agent = LinkChecker.get_robot_agent(self.sub_domain, protocol=self.scheme)
        else:
            self.robot_agent = None
        self.site_crawl_delay = 0.60

        if isinstance(self.robot_agent, Rules):
            delay_temp = self.robot_agent.delay(self.agent)
            if delay_temp is not None and delay_temp != self.site_crawl_delay:
                self.site_crawl_delay = delay_temp

        self.task_control_counter = 1
        self._speed_penalty_count = 0
        self._speed_penalty_threshold = 10
        self._progress_logging_speed = 120
        self._output_period = 120
        self._output_batch_size = 100
        self._death_wish_sent = False
        SiteChecker._is_lxml_parser_exist()
        self._output_thread = None
        self._output_queue = None
        self.progress_logger = ProgressLogger(self._progress_logging_speed, self, self._stop_event)
        self._status = "Start"
        self._populate_with_state()  # restore last known state
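    # --- Illustrative usage (not part of the original source) ---
    # A hedged sketch of constructing a checker for one site; the crawl entry
    # point named below is hypothetical:
    #
    #     checker = SiteChecker("https://www.example.co.uk",
    #                           max_level=5, max_page=200)
    #     checker.crawl()  # hypothetical entry point; see this project's runner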