def testExternalDbBuffer(self):
     backup_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/ResultSitesList.db"
     table = "24/10/2015 Gardening"
     backup_db = ExternalSiteDB(table, db_addr=backup_db_addr)
     total = backup_db.site_count(False)
     count = 0
     db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/"
     stop_event = Event()
     db_buffer = ExternalTempDataDiskBuffer("TempDB.db", None, stop_event, dir_path=db_addr)
     db_buffer._input_convert_tuple = False
     while count < total:
         results = backup_db.get_next_patch_no_rollover(count, 5000)
         print(count)
         db_buffer.write(results)
         count += 5000
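
def testExternalDbBufferReadBack(self):
     # Hedged companion sketch (not in the original tests): reads back what the
     # test above wrote, using only the ExternalTempDataDiskBuffer calls seen
     # elsewhere in this module (start_input_output_cycle(), iteration, terminate()).
     db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/"
     stop_event = Event()
     db_buffer = ExternalTempDataDiskBuffer("TempDB.db", None, stop_event, dir_path=db_addr)
     db_buffer.start_input_output_cycle()  # begin moving rows from disk into memory
     read_count = 0
     for row in db_buffer:  # the buffer is iterable, yielding one row at a time
         print(row)
         read_count += 1
         if read_count >= 5000:  # stop after one batch for the example
             break
     stop_event.set()
     db_buffer.terminate()
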
class FilterController(FeedbackInterface, ExternalTempInterface):
    """
    This Controller can be memory controlled since it implements FeedbackInterface.
    """
    def __init__(self,
                 db_ref: str,
                 db_dir: str,
                 input_queue: Queue,
                 output_queue: Queue,
                 stop_event: Event,
                 matrix: CrawlMatrix,
                 accounts: list,
                 force_mode=False,
                 force_mode_offset=0,
                 force_mode_total=0,
                 **kwargs):
        FeedbackInterface.__init__(self, **kwargs)
        self._stop_event = stop_event
        self._matrix = matrix
        self._db_ref = db_ref
        self._input_queue = input_queue
        self._output_queue = output_queue
        self._pool_input = Queue()
        self._pool = FilterPool(self._pool_input,
                                self._output_queue,
                                self._queue_lock,
                                self._stop_event,
                                self._matrix,
                                accounts=accounts)
        self._db_buffer = ExternalTempDataDiskBuffer(
            self._db_ref,
            self,
            self._stop_event,
            dir_path=db_dir,
            buf_size=2500,
            output_f=5000)  # controls the data flow rate: it can keep the
        # input:output ratio at 1:1 for up to 10 million data rows per hour
        ExternalTempInterface.__init__(self)
        self._populate_with_state()
        if force_mode:
            new_state = _FilterState(progress=force_mode_offset,
                                     all_data=force_mode_total)
            self.populate_with_state(new_state)

    @staticmethod
    def get_input_parameters(db_ref: str, db_dir: str, input_queue: Queue,
                             output_queue: Queue, stop_event: Event,
                             matrix: CrawlMatrix, accounts: list,
                             force_mode: bool, force_mode_offset: int,
                             force_mode_total: int):
        return {
            "db_ref": db_ref,
            "db_dir": db_dir,
            "input_queue": input_queue,
            "output_queue": output_queue,
            "stop_event": stop_event,
            "matrix": matrix,
            "accounts": accounts,
            "force_mode": force_mode,
            "force_mode_offset": force_mode_offset,
            "force_mode_total": force_mode_total,
        }

    def _sample_data_buffer_input(self):
        while not self._stop_event.is_set():
            data_list = []
            while not self._input_queue.empty():
                data = self._input_queue.get()
                if isinstance(data, tuple) and len(data) == 2:
                    data_list.append(data)
            if len(data_list) > 0:
                self._db_buffer.append_to_buffer(data_list,
                                                 convert_tuple=False)
                data_list.clear()
            time.sleep(1)

    def _sample_data_buffer_output(self):
        for item in self._db_buffer:
            if self._stop_event.is_set():
                break
            if isinstance(item, OnSiteLink):
                self._pool_input.put(
                    FilteredDomainData(domain=item.link,
                                       found=int(time.time())))

    def begin_filtering(self):
        self._start_sending_feedback()
        self._db_buffer.start_input_output_cycle()  # start input and output data to/from file
        input_t = threading.Thread(target=self._sample_data_buffer_input)
        output_t = threading.Thread(target=self._sample_data_buffer_output)
        input_t.start()  # start sampling data
        output_t.start()
        self._pool.start()
        if self._pool.is_alive():
            self._pool.join()
        self._db_buffer.terminate()
        if input_t.is_alive():
            input_t.join()
        if output_t.is_alive():
            output_t.join()
        self._end_sending_feedback()

    def populate_with_state(self, state):
        """
        FeedbackInterface: subclasses implement this method to restore state
        from the previous iteration.
        :param state: the state from the previous iteration
        :return:
        """
        if isinstance(state, _FilterState):
            self._pool.set_job_done(state.progress)
            self._db_buffer.set_progress(state.progress)

    def get_state(self):
        """
        FeedbackInterface: subclass this so that the controller can gather state info, which in turn feeds back into the next iteration.
        :return:
        """
        return _FilterState(progress=self._pool.get_job_done(),
                            all_data=self._db_buffer.get_total_record())

    def get_callback_data(self):
        """
        FeedbackInterface, subclass this so that any callback data can be gathered by the controller
        :return:
        """
        return None

    def is_programme_finshed(self):
        """
        FeedbackInterface: indicates whether the programme has finished execution; if not, it proceeds to the next iteration.
        :return:
        """
        return self._stop_event.is_set()

    def get_external_count_finished(self) -> int:
        """
        ExternalTempInterface: get the number of jobs done in the ExternalTempDataDiskBuffer.
        :return:
        """
        return self._pool.get_job_done()

    def set_internal_count(self, count: int):
        """
        ExternalTempInterface: set the number of jobs done in the ExternalTempDataDiskBuffer.
        :param count:
        :return:
        """
        self._pool.set_job_done(count)
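
def run_filter_controller_example(matrix: CrawlMatrix, accounts: list, db_dir: str):
    # Hedged usage sketch: wiring mirrors FilterController.get_input_parameters()
    # above; matrix, accounts and db_dir must be supplied by the caller, the
    # "filter_input.db" name is a placeholder, and any FeedbackInterface kwargs
    # are omitted.
    stop_event = Event()
    params = FilterController.get_input_parameters(
        db_ref="filter_input.db", db_dir=db_dir,
        input_queue=Queue(), output_queue=Queue(), stop_event=stop_event,
        matrix=matrix, accounts=accounts,
        force_mode=False, force_mode_offset=0, force_mode_total=0)
    controller = FilterController(**params)
    controller.begin_filtering()  # blocks until FilterPool finishes or stop_event is set
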
class WhoisChecker(FeedbackInterface, ExternalTempInterface,
                   ProgressLogInterface):
    """
    usage:
    checker = WhoisChecker(...)
    checker.run_farm()
    """
    def __init__(self,
                 stop_event: Event,
                 input_queue: Queue = None,
                 output_queue: Queue = None,
                 max_worker: int = 10,
                 dir_path="",
                 is_debug=False,
                 **kwargs):
        self._is_debug = is_debug
        FeedbackInterface.__init__(self, **kwargs)
        ExternalTempInterface.__init__(self)
        # do not use predefined queue here
        # self._input_q = input_queue
        # self._output_q = output_queue
        self._stop_event = stop_event
        self._internal_stop_event = Event()
        self._max_worker = max_worker
        self._job_done = 0
        self._job_done_shadow = 0
        self._job_done_lock = RLock()
        self._input_period = 0.0001  # interval (seconds) between sampling passes into the buffer
        self._max_sample_results = 100000
        self._min_sampling_duration = 0.0001
        self._sample_batch_size = 5000
        self._sample_batch_timeout = 60
        if is_debug:
            self._min_buff_delete_threshold = 10000  # default is 100000
        else:
            self._min_buff_delete_threshold = 100000
        self._speed_penalty_count = 0
        self._finished = False
        manager, self._output_q = get_queue_client(
            QueueManager.MachineSettingCrawler,
            QueueManager.Method_Whois_Output)
        self._db_buffer = ExternalTempDataDiskBuffer(
            "whois_check.db",
            self,
            self._internal_stop_event,
            buf_size=self._max_worker * 50,
            terminate_callback=WhoisChecker.terminate_callback,
            dir_path=dir_path)
        self._populate_with_state()  # FeedbackInterface
        if not is_debug:
            log_period = 120
        else:
            log_period = 10
        self._progress_logger = ProgressLogger(log_period, self,
                                               self._internal_stop_event)

    @staticmethod
    def terminate_callback():
        ErrorLogger.log_error("WhoisChecker", StopIteration("terminated."))

    @staticmethod
    def get_input_parameters(input_queue: Queue,
                             output_queue: Queue,
                             stop_event: Event,
                             max_worker: int = 20) -> dict:
        return {
            "input_queue": input_queue,
            "output_queue": output_queue,
            "stop_event": stop_event,
            "max_worker": max_worker
        }

    def get_job_done_count(self):
        with self._job_done_lock:
            job_done = self._job_done
        return job_done

    def _add_job_done_one(self):
        with self._job_done_lock:
            self._job_done += 1

    def _put_output_result_in_queue(self, domain_data: OnSiteLink):
        if not self._stop_event.is_set() or not self._internal_stop_event.is_set():
            try:
                self._output_q.put((domain_data.link, domain_data.response_code))
            except Exception as inner_ex:
                if self._output_q is None:
                    manager, self._output_q = get_queue_client(
                        QueueManager.MachineSettingCrawler,
                        QueueManager.Method_Whois_Output)
                ErrorLogger.log_error("WhoisChecker", inner_ex,
                                      addtional="failed to put result in queue.")
                time.sleep(0.01)
                self._put_output_result_in_queue(domain_data)

    def _check_whois_v1(self, domain_data: OnSiteLink):
        root_domain = domain_data.link
        try:
            if root_domain.startswith("http"):
                root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
            real_response_code = domain_data.response_code
            whois = LinkChecker.check_whois(root_domain)  # check whois record
            if whois[0]:
                if whois[2]:  # domain is expired
                    real_response_code = ResponseCode.Expired
                else:
                    real_response_code = ResponseCode.MightBeExpired
            if real_response_code == ResponseCode.Expired:
                domain_data.link = root_domain
                domain_data.response_code = real_response_code
                with self._queue_lock:
                    self._output_q.put(
                        (domain_data.link, domain_data.response_code))
        except Exception as ex:
            ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex,
                                  "_check_whois() " + root_domain)
        finally:
            self._add_job_done_one()

    def _check_whois(self, domain_data: OnSiteLink):
        root_domain = domain_data.link.lower()
        try:
            if not self._is_debug:
                if root_domain.startswith("http"):
                    root_domain = LinkChecker.get_root_domain(
                        domain_data.link)[1]
                is_available, is_redemption = LinkChecker.is_domain_available_whois(
                    root_domain)  # check whois record
                if is_available or is_redemption:
                    if is_available:
                        real_response_code = ResponseCode.Expired
                    else:
                        real_response_code = ResponseCode.MightBeExpired
                    domain_data.link = root_domain
                    domain_data.response_code = real_response_code
                    self._put_output_result_in_queue(domain_data)
            else:
                self._put_output_result_in_queue(domain_data)
        except Exception as ex:
            ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex,
                                  "_check_whois() " + root_domain)
        finally:
            self._add_job_done_one()

    def _check_whois_with_dns(self, page: OnSiteLink):

        real_response_code = ResponseCode.DNSError
        skip_whois_check = False
        try:
            if not self._is_debug:
                root_result = LinkChecker.get_root_domain(page.link)
                root_domain = root_result[1]
                sub_domain = root_result[4]
                suffix = root_result[5]

                if len(sub_domain) == 0 or suffix not in TldUtility.TOP_TLD_LIST:
                    skip_whois_check = True
                else:
                    if LinkChecker.is_domain_DNS_OK(sub_domain):  # check DNS first
                        real_response_code = ResponseCode.NoDNSError
                        skip_whois_check = True
                    elif not sub_domain.startswith("www."):
                        if LinkChecker.is_domain_DNS_OK("www." + root_domain):
                            real_response_code = ResponseCode.NoDNSError
                            skip_whois_check = True

                    page.response_code = real_response_code
                    page.link_type = OnSiteLink.TypeOutbound
                    page.link = root_domain
        except Exception as ex:
            ErrorLogger.log_error("WhoisChecker", ex,
                                  "_check_whois_with_dns() " + page.link)
            skip_whois_check = True
        finally:
            if not skip_whois_check and real_response_code == ResponseCode.DNSError:
                self._check_whois(page)
            else:
                self._add_job_done_one()
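
    # Flow summary for _check_whois_with_dns: a positive DNS answer for the sub
    # domain or its "www." variant marks the link NoDNSError and skips the
    # comparatively expensive whois lookup; only links that still look dead
    # (DNSError and not skipped) fall through to _check_whois().
    #
    #     DNS resolves       -> NoDNSError, done
    #     bad TLD / no sub   -> skipped, done
    #     DNS fails          -> _check_whois() -> Expired / MightBeExpired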

    def _sample_data(self):
        ref_time = time.time()
        manager, result_queue = get_queue_client(
            QueueManager.MachineSettingCrawler,
            QueueManager.Method_Whois_Input)
        if result_queue is None:
            ErrorLogger.log_error(
                "ExternalSiteChecker.WhoisChecker._sample_data()",
                ValueError("result queue is None, cannot get data."))
            if not (self._stop_event.is_set()
                    or self._internal_stop_event.is_set()):
                self._sample_data()
        else:
            while not (self._stop_event.is_set()
                       or self._internal_stop_event.is_set()):
                data_list = []
                counter = 0
                while not result_queue.empty():

                    data = None
                    try:
                        data = result_queue.get()
                    except Exception as ex:
                        ErrorLogger.log_error("WhoisChecker._sampling_data",
                                              ex)
                        if result_queue is None:
                            manager, result_queue = get_queue_client(
                                QueueManager.MachineSettingCrawler,
                                QueueManager.Method_Whois_Input)
                    if isinstance(data, OnSiteLink):
                        counter += 1
                        data_list.append((data.link, data.response_code))
                    elif isinstance(data, tuple) and len(data) == 2:
                        counter += 1
                        data_list.append(data)
                    elif isinstance(data, list):
                        data_list += data
                        counter += 1
                    if counter >= self._sample_batch_size:
                        break
                    current_time = time.time()
                    if current_time - ref_time >= self._sample_batch_timeout:
                        break
                    time.sleep(self._min_sampling_duration)
                ref_time = time.time()
                if len(data_list) > 0:
                    self._db_buffer.append_to_buffer(data_list,
                                                     convert_tuple=False)
                    data_list.clear()
                time.sleep(self._input_period)
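
    # Batching policy for _sample_data above: a batch is flushed to the disk
    # buffer once it reaches _sample_batch_size (5000) items or
    # _sample_batch_timeout (60 s) has elapsed, whichever comes first;
    # _min_sampling_duration throttles the inner polling loop.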

    def sample_gen(self):
        while not self._stop_event.is_set() or not self._internal_stop_event.is_set():
            try:
                if not self._db_buffer.reset_event.is_set():
                    for next_item in self._db_buffer:
                        time.sleep(0.0001)
                        return next_item
                print("going to next cycle in sample_gen.")
                time.sleep(1)
            except Exception as ex:
                print("sample_gen error:", ex)

    def run_farm(self):
        try:
            self._start_sending_feedback()
            input_t = threading.Thread(target=self._sample_data)
            input_t.start()  # start sampling data
            self._progress_logger.start()
            self._db_buffer.start_input_output_cycle()  # start input and output data to/from file
            pool = ThreadPool(processes=self._max_worker)
            # pool.imap_unordered(self._check_whois_with_dns, self._db_buffer, chunksize=1)
            pool.imap_unordered(self._check_whois_with_dns,
                                iter(self.sample_gen, None),
                                chunksize=1)
            while not self._stop_event.is_set() or not self._internal_stop_event.is_set():
                time.sleep(1)
            if self._stop_event.is_set():
                self._internal_stop_event.set()
            input_t.join()
            self._progress_logger.join()
            self._db_buffer.terminate()
            if self._stop_event.is_set():
                self._finished = True
            self._end_sending_feedback()
        except Exception as ex:
            if self._stop_event.is_set():
                self._finished = True
            ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex,
                                  "run_farm() index at:" + str(self._job_done))

    def get_external_count_finished(self):
        """
        ExternalTempInterface
        :return:
        """
        return self.get_job_done_count()

    def set_internal_count(self, count: int):
        """
        ExternalTempInterface
        :param count:
        :return:
        """
        self._job_done = count

    def populate_with_state(self, state):
        """
        FeedbackInterface: subclasses implement this method to restore state
        from the previous iteration.
        :param state: the state from the previous iteration
        :return:
        """
        if isinstance(state, WhoisCheckerState):
            self._job_done = state.progress_count
            self._db_buffer.set_progress(state.progress_count)

    def get_state(self):
        """
        FeedbackInterface: subclass this so that the controller can gather state info, which in turn feeds back into the next iteration.
        :return:
        """
        if self._internal_stop_event.is_set():
            return WhoisCheckerState(0, 0)
        else:
            return WhoisCheckerState(self.get_job_done_count(),
                                     self._db_buffer.get_total_record())

    def get_callback_data(self):
        """
        FeedbackInterface, subclass this so that any callback data can be gathered by the controller
        :return:
        """
        return None

    def is_programme_finshed(self):
        """
        FeedbackInterface: indicates whether the programme has finished execution; if not, it proceeds to the next iteration.
        :return:
        """
        return self._finished

    def is_progamme_need_restart(self):
        """
        FeedbackInterface: indicates whether the programme needs to be restarted; if not, it proceeds to the next iteration.
        """
        return self._internal_stop_event.is_set()

    def get_file_name(self) -> str:
        """
        ProgressLogInterface, the file name used to save progress in the file system.
        :return:
        """
        return "whois_check_progress.csv"

    def get_column_names(self) -> []:
        """
        ProgressLogInterface, the column name for each progress entry in get_progress(), all in str format
        :return: array containing the column names; its length should match the length of the progress entries
        """
        return ["Done", "Total"]

    def reset(self):
        self._job_done = 0
        self._job_done_shadow = 0
        self._speed_penalty_count = 0

    def get_progress(self) -> []:
        """
        ProgressLogInterface, get the progress data in tuple format, so that it can be compiled into the standard format
        :return: array containing the progress data, with exactly the length of the column names in get_column_names()
        """
        total_record = self._db_buffer.get_total_record()

        if (self._job_done == self._job_done_shadow and self._job_done > 0) or \
                (self._job_done > self._min_buff_delete_threshold * 0.9 and
                 total_record > self._min_buff_delete_threshold):
            self._speed_penalty_count += 1
            if self._speed_penalty_count >= 2:
                ErrorLogger.log_error(
                    "WhoisChecker.get_progress()",
                    TimeoutError("progress is stuck, restarting internals."),
                    self._db_buffer._file_name)
                print("going to clear cache")
                self._db_buffer.clear_cache()
                self.reset()
                total_record = 0
                self._db_buffer.start_input_output_cycle()
        else:
            print("no need to clear cache.")
            self._job_done_shadow = self._job_done
            self._speed_penalty_count = 0
        return [self._job_done, total_record]

    def get_limit(self) -> int:
        """
        ProgressLogInterface, the number of samples you want to collect.
        :return: max number of samples
        """
        return self._max_sample_results
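
def run_whois_checker_example(dir_path: str):
    # Hedged usage sketch following the "usage" note in the WhoisChecker
    # docstring: queue wiring mirrors get_input_parameters() above, dir_path is
    # caller-supplied, and any FeedbackInterface kwargs are omitted.
    stop_event = Event()
    params = WhoisChecker.get_input_parameters(
        input_queue=Queue(), output_queue=Queue(),
        stop_event=stop_event, max_worker=20)
    checker = WhoisChecker(dir_path=dir_path, **params)
    checker.run_farm()  # blocks; set stop_event from another thread to shut down
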
class SiteChecker(FeedbackInterface, SiteTempDataSrcRefInterface, ProgressLogInterface, ExternalTempInterface):
    full_link_key = "full_link"
    datasource_key = "data_source"
    controller_ley = "controller"
    max_level_key = "max_level"
    max_page_key = "max_page"
    output_queue_key = "output_queue"

    _use_lxml_parser = False

    def __init__(self, full_link: str="", data_source: SiteTempDataSrcInterface=None,
                 controller: SiteCheckerController=None,
                 max_level=10, max_page=1000, delegate=None, output_buff_size=2000,
                 output_queue=None, output_all_external=False, result_delegate=None,
                 memory_control_terminate_event=None, check_robot_text=True,
                 **kwargs):
        """
        :param full_link: The full link of a domain, e.g: https://www.google.co.uk
        :param domain: domain to crawl
        :param max_level: stop crawling if it reaches this level
        :param max_page: maximum pages to check within a site, also stop crawling
        :param delegate: if this is not None, then it will send the latest result of external domain of ResponseCode==404 or 999
        :param result_delegate: send site_info upon finish
        :param memory_control_terminate_event: if this is not None and being set, it will be able to terminate an external memory controlled process.
        :return:
        """
        FeedbackInterface.__init__(self, **kwargs)
        # super(SiteChecker, self).__init__(**kwargs)
        if full_link is None or len(full_link) == 0:
            raise ValueError("full_link must be a non-empty url.")

        original_path = ""
        try:
            paras = urlsplit(full_link)
            self.scheme, self.domain, original_path = paras[0], paras[1], paras[2]
        except Exception:
            pass

        domain_data = LinkChecker.get_root_domain(full_link, False)
        self.root_domain = domain_data[1]
        self.sub_domain = domain_data[4]
        self.domain_suffix = domain_data[5]
        # str.strip() removes a character set rather than a suffix, so cut the suffix off explicitly
        if self.domain_suffix and self.sub_domain.endswith(self.domain_suffix):
            self.sub_domain_no_local = self.sub_domain[:-len(self.domain_suffix)]
        else:
            self.sub_domain_no_local = self.sub_domain
        if self.scheme == "":
            self.scheme = "http"
        if self.domain == "":
            self.domain = self.root_domain
        self.original_link = full_link
        self.domain_link = LinkChecker.get_valid_link(self.root_domain, full_link, self.scheme)
        self.max_level = max_level
        self.max_page = max_page
        self.page_count = 0  # keeps track of pages done
        self._page_count_shadow = 0  # previous page count
        self._all_page_count_shadow = 0  # previous count in the datasource
        self.internal_page_count = 0
        self.internal_page_last_count = 0
        self.page_allocated = 0
        self.current_level = 0  # if this = 0, it is root domain/home_page
        self._stop_event = Event()
        valid_file_name = SiteTempDataSrcInterface.get_valid_file_name(self.domain_link)
        self._external_db_buffer = ExternalTempDataDiskBuffer(valid_file_name+".ext.db", self,
                                                              stop_event=self._stop_event,
                                                              buf_size=int(output_buff_size/2),
                                                              dir_path=get_db_buffer_default_dir(),
                                                              convert_output=False)
        self._external_db_buffer.append_to_buffer([(self.root_domain, ResponseCode.DNSError),], convert_tuple=False)
        self._memory_control_terminate_event = memory_control_terminate_event
        self.task_control_lock = threading.RLock()
        if data_source is None:
            #self.data_source = SiteTempDataDisk(self.root_domain, ref_obj=self)
            self.data_source = SiteTempDataDiskWithBuff(ref=self.domain_link, output_buff_size=output_buff_size, ref_obj=self)
        else:
            self.data_source = data_source  # a list of OnSiteLink
        self.delegate = delegate
        if LinkChecker.might_be_link_html_page(original_path):
            self.data_source.append(OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1)) # add the root domain as a starting point
        self.data_source.append(OnSiteLink(self.scheme + "://www."+self.sub_domain, ResponseCode.LinkOK, link_level=1))
        self.data_source.append(OnSiteLink(self.scheme + "://" + self.domain, ResponseCode.LinkOK, link_level=1))
        self.cache_list = []  # internal page cache
        self.page_need_look_up_temp = 0
        self.cache_list.append(self.domain_link)
        if "www." not in self.sub_domain:
            self.cache_list.append(self.scheme + "://www."+self.sub_domain)
        self.cache_list.append(self.scheme + "://" + self.domain)
        self.page_need_look_up = self.data_source.count_all()
        self.cache_size = 500  # small in-memory cache to avoid checking links in the file system with many reads and writes
        self._double_check_cache_lock = threading.RLock()
        self._double_check_cache = deque(maxlen=self.cache_size)
        self.external_cache_list = []
        self.external_cache_size = 500  # cache that hold external sites
        self.external_links_checked = 0
        self.add_internal_page_OK_only = True
        self.output_queue = output_queue
        self.output_all_external = output_all_external
        self.controller = controller
        self.result_delegate = result_delegate
        self.page_count_lock = threading.RLock()
        self.internal_page_count_lock = threading.RLock()
        self.level_lock = threading.RLock()
        self.page_look_up_lock = threading.RLock()
        self.external_link_check_lock = threading.RLock()
        self._finished = False
        self.task_control_max = 1
        self.agent = "VegeBot (we follow your robots.txt settings before crawling; you can slow down the bot by changing " \
                     "the Crawl-Delay parameter in the settings. If you have an enquiry, please email: [email protected])"
        self.agent_from = "*****@*****.**"
        if check_robot_text:
            self.robot_agent = LinkChecker.get_robot_agent(self.sub_domain, protocol=self.scheme)
        else:
            self.robot_agent = None
        self.site_crawl_delay = 0.60

        if isinstance(self.robot_agent, Rules):
            delay_temp = self.robot_agent.delay(self.agent)
            if delay_temp is not None and delay_temp != self.site_crawl_delay:
                self.site_crawl_delay = delay_temp

        self.task_control_counter = 1
        self._speed_penalty_count = 0
        self._speed_penalty_threshold = 10
        self._progress_logging_speed = 120
        self._output_period = 120
        self._output_batch_size = 100
        self._death_wish_sent = False
        SiteChecker._is_lxml_parser_exist()
        self._output_thread = None
        self._output_queue = None
        self.progress_logger = ProgressLogger(self._progress_logging_speed, self, self._stop_event)
        self._status = "Start"
        self._populate_with_state()  # restore last known state
        # self.data_source.additional_startup_procedures()  # use the data set in self._populate_with_state() to start

    # def _empty_external_links_db(self):
    #     if self.output_queue is not None:
    def _put_result_in_output_queue_loop(self, item_list: list):
        # retry in a loop (rather than by recursion) until the put succeeds or a stop is requested
        while not self._stop_event.is_set():
            try:
                self._output_queue.put(item_list, True, 2)
                break
            except Exception as ex:
                if self._output_queue is None:
                    manager, self._output_queue = get_queue_client(QueueManager.MachineSettingCrawler,
                                                                   QueueManager.Method_Whois_Input)
                time.sleep(0.1)
                ErrorLogger.log_error("SiteChecker._put_result_in_output_queue_loop", self.sub_domain + " " + str(ex))

    def _get_external_links_to_queue(self):
        ref_time = time.time()
        manager, self._output_queue = get_queue_client(QueueManager.MachineSettingCrawler, QueueManager.Method_Whois_Input)
        self.output_queue = self._output_queue  # override output_queue
        batch = list()
        counter = 0
        for item in self._external_db_buffer:
            if self._stop_event.is_set() or self.external_links_checked >= self._external_db_buffer.count_all():
                if len(batch) > 0:  # flush the remaining items before shutting down
                    self._put_result_in_output_queue_loop(batch)
                    self.external_links_checked += len(batch)
                    batch.clear()
                try:
                    manager.shutdown()
                except:
                    pass
                finally:
                    break
            elif isinstance(item, tuple):
                batch.append((item[0], item[1]))
                counter += 1
            if len(batch) > 0:
                current_time = time.time()
                # flush either when the output period has elapsed or when the batch is full
                if current_time - ref_time >= self._output_period or len(batch) >= self._output_batch_size:
                    self._put_result_in_output_queue_loop(batch)
                    self.external_links_checked += len(batch)
                    ref_time = time.time()
                    batch.clear()

            time.sleep(0.0001)

    @staticmethod
    def _is_lxml_parser_exist():
        try:
            import lxml  # availability probe only; the module itself is not used here
        except ImportError:
            SiteChecker._use_lxml_parser = False
        else:
            SiteChecker._use_lxml_parser = True

    def use_lxml_parser(self):
        return SiteChecker._use_lxml_parser
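    # How the flag is consumed is not shown in this class; presumably the page-parsing
    # code selects a parser name based on it. Illustrative sketch only (BeautifulSoup
    # is an assumption, not confirmed by this file):
    #     parser_name = "lxml" if checker.use_lxml_parser() else "html.parser"
    #     soup = BeautifulSoup(markup, parser_name)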

    @staticmethod
    def get_input_parameter_base(full_link: str, max_page: int, max_level: int, output_queue) -> dict:
        return {SiteChecker.full_link_key: full_link, SiteChecker.max_page_key: max_page,
                SiteChecker.max_level_key: max_level, SiteChecker.output_queue_key: output_queue}
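    # Illustrative usage of the helper above (all names are examples only):
    #     params = SiteChecker.get_input_parameter_base("http://example.com",
    #                                                   max_page=1000, max_level=10,
    #                                                   output_queue=some_queue)
    #     checker = SiteChecker(**params)
    #     checker.crawling()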

    def get_external_count_finished(self) -> int:
        """
        ExternalTempInterface, get the number of jobs done in ExternalTempDataDiskBuffer
        :return:
        """
        return self.external_links_checked

    def set_internal_count(self, count: int):
        """
        ExternalTempInterface, restore the number of jobs done in ExternalTempDataDiskBuffer
        :param count:
        :return:
        """
        self.external_links_checked = count

    def _set_task_control_max(self, concurrent_task: int):
        if concurrent_task <= 0:
            raise ValueError("concurrent_task must be greater than 0.")
        self.task_control_max = concurrent_task
        self.task_control_counter = concurrent_task
        min_page_per_s = concurrent_task / 20
        self._speed_penalty_threshold = self._progress_logging_speed * min_page_per_s
        if self.site_crawl_delay > 1 / min_page_per_s:
            ErrorLogger.log_error("SiteChecker._set_task_control_max()",
                                  ValueError("site has crawl delay greater than max delay."), self.domain_link)
            self._status = "Stopped"
            self.sudden_death()

    def get_site_feedback(self) -> SeedSiteFeedback:
        return SeedSiteFeedback(self.original_link, page_count=self.get_page_need_look_up())

    def get_site_info(self) -> SiteInfo:  # keep the original reference when sending back the site information
        info = SiteInfo(self.original_link, self.data_source)
        return info

    def populate_with_state(self, state):
        if state is not None and isinstance(state, SiteCheckerState):
            self._status = "Restarted"
            self.page_count = state.page_count
            self.page_allocated = state.page_count
            self.internal_page_count = state.internal_page_count
            self.internal_page_last_count = state.internal_page_count
            self.external_links_checked = state.external_page_count
            self._external_db_buffer.set_progress(state.external_page_count)
            self.page_need_look_up = state.page_need_look_up
            self.current_level = state.current_level
            self.progress_logger.set_reference(state.log_sample_index, state.log_started_time)
            counter = 0
            if self.data_source is not None:
                try:
                    for item in self.data_source.get_next():
                        if counter >= self.cache_size:
                            break
                        if isinstance(item, OnSiteLink) and not LinkChecker.is_external_link(self.root_domain, item.link):
                            self.cache_list.append(item.link)
                            # print("--restore: ", item)
                            counter += 1
                except Exception as ex:
                    msg = "error in SiteChecker.populate_with_state(), trying to populate cache, " + self.root_domain
                    ErrorLogger.log_error("SiteChecker", ex, msg)

                self.data_source.ref = state.datasource_ref
                self.data_source.output_c = state.datasource_output_c
                self.data_source.set_progress(state.datasource_index if state.datasource_index < state.page_count else state.page_count)
                self.data_source.set_continue_lock(True)

    def get_file_name(self):
        return self.data_source.ref

    def get_limit(self):
        return 100000

    def get_column_names(self):
        return ["Page Index", "External", "All", "Status"]

    def get_progress(self):
        data_source_count = self.data_source.count_all()
        if self.page_count - self._page_count_shadow <= self._speed_penalty_threshold:  # determine if site is slow
            self._speed_penalty_count += 1
            if self._speed_penalty_count > 2:
                self._status = "Stopped"
                self.sudden_death()
        else:
            self._speed_penalty_count = 0

        if self.page_count == self._page_count_shadow and data_source_count == self._all_page_count_shadow:  # determine if the site is stuck
            self._status = "Stopped"
            self.sudden_death()

        self._page_count_shadow = self.page_count
        self._all_page_count_shadow = data_source_count
        return [self.page_count, self.external_links_checked, data_source_count, self._status]
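    # the progress row above lines up with get_column_names():
    #     "Page Index" -> page_count, "External" -> external_links_checked,
    #     "All" -> data_source_count, "Status" -> _status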

    def is_programme_finished(self):
        return self._finished

    def get_callback_data(self):
        with self.page_count_lock:
            gap = self.internal_page_count - self.internal_page_last_count
            self.internal_page_last_count = self.internal_page_count
            seed_feedback = None
            if self._finished:
                seed_feedback = self.get_site_feedback()

        return SiteFeedback(gap, self._finished, seed_feedback=seed_feedback, datasource_ref=self.data_source.ref)

    def get_state(self):
        return SiteCheckerState(page_count=self.page_count, page_need_look_up=self.page_need_look_up,
                                current_level=self.current_level, internal_page_count=self.internal_page_count,
                                external_page_count=self.external_links_checked,
                                datasource_index=self.data_source.temp_counter,
                                datasource_output_c=self.data_source.output_c,
                                datasource_ref=self.data_source.ref, log_started_time=self.progress_logger.begin_time,
                                log_sample_index=self.progress_logger.limit_counter,)
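    # get_state() and populate_with_state() form the save/restore pair: every field
    # written into SiteCheckerState here is read back in populate_with_state() above,
    # which is how _populate_with_state() resumes an interrupted crawl.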

    def additional_reset(self):
        pass

    def additional_clear(self):
        pass

    def stop(self):
        # natural stop
        self._status = "Stopped"
        self.progress_logger.report_progress()
        self._stop_event.set()
        if self.progress_logger.is_alive():
            self.progress_logger.join()

    def clear(self):
        self.cache_list.clear()
        self.additional_clear()

    def acquire_task(self, level: int, link: str):
        task_acquired = True
        if link.endswith('/'):
            temp = link
        else:
            temp = link + '/'
        with self.task_control_lock:
            if len(self._double_check_cache) > 0:
                if temp in self._double_check_cache:
                    print("duplicate link found:", link)
                    task_acquired = False
                else:
                    if len(self._double_check_cache) >= self.cache_size:
                        self._double_check_cache.popleft()
                    self._double_check_cache.append(temp)
            self.task_control_counter -= 1
            self.page_allocated += 1
            if task_acquired:
                if level > self.current_level:
                    self.current_level = level
            # time.sleep(self.site_crawl_delay)
        return task_acquired
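    # acquire_task()/release_task() form the per-page worker handshake; acquire_task()
    # filters duplicates through the double-check cache and takes a worker slot, while
    # release_task() returns the slot and advances the counters. Illustrative flow only
    # (new_links is a hypothetical name):
    #     if checker.acquire_task(level, link):
    #         ... fetch and parse the page ...
    #         checker.release_task(new_page=len(new_links))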

    def release_task(self, new_page: int):
        with self.task_control_lock:
            if self.page_need_look_up == 1 and new_page == 0:
                PrintLogger.print("set to stop data source")
                self.data_source.set_continue_lock(False)
            else:
                self.page_count += 1
                self.page_need_look_up += new_page
                #self.external_links_checked += external_page_count
                self.task_control_counter += 1
                # previously this distinguished internal pages from external ones
                self.internal_page_count += 1
                if self.internal_page_count > self.max_page or self.current_level > self.max_level:
                    if self.data_source.can_continue():
                        PrintLogger.print("set stop: " + str(self.internal_page_count)+" level: "+str(self.current_level))
                        self.data_source.set_continue_lock(False)

    def get_page_count(self):
        with self.page_count_lock:
            page_count = self.page_count
        return page_count

    def set_page_count(self, page_count: int):
        with self.page_count_lock:
            self.page_count = page_count

    def set_internal_page_count(self, count: int):
        with self.internal_page_count_lock:
            self.internal_page_count += count  # note: adds to the count rather than overwriting it

    def get_internal_page_count(self):
        with self.internal_page_count_lock:
            count = self.internal_page_count
        return count

    def get_current_level(self):
        with self.level_lock:
            current_level = self.current_level
        return current_level

    def set_current_level(self, level):
        with self.level_lock:
            self.current_level = level

    def get_page_need_look_up(self):
        with self.page_look_up_lock:
            page_look_up = self.page_need_look_up
        return page_look_up

    def set_page_need_look_up(self, page_count):
        with self.page_look_up_lock:
            self.page_need_look_up = page_count

    def set_page_need_look_up_plus_more(self, count: int):
        with self.page_look_up_lock:
            self.page_need_look_up += count

    def get_internal_page_progress_index(self) -> int:
        return self.get_page_count()

    def set_internal_page_progress_index(self, index: int):
        self.page_count = index
        self.page_allocated = index

    def is_idle(self):
        idle = False
        with self.task_control_lock:
            page_need_look_up = self.get_page_need_look_up()
            new_task_added = page_need_look_up - self.page_need_look_up_temp
            has_new_task = new_task_added > 0
            if has_new_task:
                self.page_need_look_up_temp = page_need_look_up
            else:
                # idle only when no new tasks arrived and all worker slots have been returned
                if self.task_control_counter >= self.task_control_max:
                    idle = True
        return idle

    def add_link_to_cache(self, link):
        if len(self.cache_list) > self.cache_size:
            return
        else:
            if link.endswith('/'):
                self.cache_list.append(link)
            else:
                self.cache_list.append(link+'/')

    def is_link_in_cache(self, link):
        if link.endswith('/'):
            temp = link
        else:
            temp = link + '/'
        return temp in self.cache_list
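    # Both cache methods above normalise links with a trailing '/', so "http://a.com"
    # and "http://a.com/" resolve to the same cache entry.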

    def reset_as(self, domain: str, link: str=""):  # reset the target domain
        PrintLogger.print("crawl reset as: "+domain)
        self.domain = domain
        self.domain_link = self.scheme + "://" + self.domain
        self.page_count = 0
        self.current_level = 0
        self.set_page_need_look_up(1)
        # self.set_page_looked_up(0)
        self.clear()
        if len(link) == 0:
            self.cache_list.append(self.domain_link)
            self.data_source.re_target(self.domain_link, OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1))
            #self.data_source.append(OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1))
        else:
            self.cache_list.append(link)
            self.data_source.re_target(link, OnSiteLink(link, response_code=ResponseCode.LinkOK, link_level=1))
            #self.data_source.append(OnSiteLink(link, response_code=ResponseCode.LinkOK, link_level=1))
        self.additional_reset()
        self.data_source.additional_startup_procedures()

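    # Lifecycle of a crawl, as implemented below: start the feedback channel and the
    # external-link output thread, run begin_crawl() until the data source releases its
    # continue lock, then tear down in order: stop() -> clear() -> data source finish ->
    # external buffer terminate -> join the output thread -> final feedback.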
    def crawling(self):  # call this method to start operation
        self._start_sending_feedback()
        self._output_thread = threading.Thread(target=self._get_external_links_to_queue)
        if self.data_source.can_continue():
            self.data_source.additional_startup_procedures()  # use the data set in self._populate_with_state() to start
            self._external_db_buffer.start_input_output_cycle()
            self._output_thread.start()
            self.progress_logger.start()
            self.progress_logger.report_progress()  # log first row
            self._status = "Work"
            self.begin_crawl()
            # prefix = "www."
            # page_count_limit = 2
            # if self.page_count <= page_count_limit and prefix not in self.domain_link:
            #     new_domain = prefix + self.sub_domain
            #     self.reset_as(new_domain)
            #     self._status = "Work"
            #     self.begin_crawl()
            # print("going to stop all.")
            self.stop()
            self.clear()

            self.data_source.additional_finish_procedures()
            # print("going to finish output buffer.")
            self._external_db_buffer.terminate()
            # print("going to stop output_thread.")
            if self._output_thread.is_alive():
                self._output_thread.join()
        PrintLogger.print("finished naturally: "+self.domain_link)
        # print("finished naturally.")
        self._finihsed = True
            #calling this at the end of operation
        PrintLogger.print("send last response")
        # print("send last response")
        # print("send last response.")
        self._end_sending_feedback()
        if self._memory_control_terminate_event is not None:
            self._memory_control_terminate_event.set()

    def sudden_death(self):
        if not self._finished:
            self._finished = True
            PrintLogger.print("start sudden death: " + self.original_link)
            self.stop()
            self.clear()
            self.data_source.set_continue_lock(False)
            self.data_source.additional_finish_procedures()
            self._external_db_buffer.terminate()
            if isinstance(self._output_thread, threading.Thread):
                if self._output_thread.is_alive():
                    self._output_thread.join()
            # calling this at the end of operation
            PrintLogger.print("send last response")
            self._end_sending_feedback()
            if self._memory_control_terminate_event is not None:
                ErrorLogger.log_error("SiteChecker", TimeoutError("slow processing speed, terminated."), self.original_link)
                self._memory_control_terminate_event.set()

    def begin_crawl(self, level=0):  # subclass this to make different behaviour
        pass
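    # A minimal subclass sketch, assuming the data source yields OnSiteLink items as it
    # does in populate_with_state() (fetch_and_extract_links is a hypothetical helper):
    #
    #     class MySiteChecker(SiteChecker):
    #         def begin_crawl(self, level=0):
    #             for item in self.data_source.get_next():
    #                 if self.acquire_task(item.link_level, item.link):
    #                     new_links = fetch_and_extract_links(item.link)
    #                     self.release_task(new_page=len(new_links))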