def __init__(self, full_link: str="", data_source: SiteTempDataSrcInterface=None,
                 controller: SiteCheckerController=None,
                 max_level=10, max_page=1000, delegate=None, output_buff_size=2000,
                 output_queue=None, output_all_external=False, result_delegate=None,
                 memory_control_terminate_event=None, check_robot_text=True,
                 **kwargs):
        """
        Set up the crawl state, buffers, locks and robots.txt rules for one site.

        :param full_link: The full link of a domain, e.g: https://www.google.co.uk
        :param data_source: storage for on-site links; when None a SiteTempDataDiskWithBuff is created.
        :param controller: stored on the instance as self.controller.
        :param max_level: stop crawling if it reaches this level
        :param max_page: maximum pages to check within a site, also stop crawling
        :param delegate: if this is not None, then it will send the latest result of external domain of ResponseCode==404 or 999
        :param output_buff_size: buffer size for the default data source; half of it is used for the external link buffer.
        :param output_queue: stored on the instance as self.output_queue.
        :param output_all_external: stored on the instance as self.output_all_external.
        :param result_delegate: send site_info upon finish
        :param memory_control_terminate_event: if this is not None and being set, it will be able to terminate an external memory controlled process.
        :param check_robot_text: when True the robots rules are fetched through LinkChecker.get_robot_agent().
        :param kwargs: forwarded to FeedbackInterface.__init__().
        :raises ValueError: if full_link is None or empty.
        :return:
        """
        FeedbackInterface.__init__(self, **kwargs)
        if full_link is None or len(full_link) == 0:
            raise ValueError("full_link must be a non-empty link.")

        # Give scheme/domain a value up front: if urlsplit() rejects the link the fall-backs below
        # still work instead of failing with AttributeError on a missing attribute.
        self.scheme = ""
        self.domain = ""
        original_path = ""
        try:
            paras = urlsplit(full_link)
            self.scheme, self.domain, original_path = paras[0], paras[1], paras[2]
        except Exception:
            pass  # best effort: keep the empty defaults and derive the values from the root domain below

        domain_data = LinkChecker.get_root_domain(full_link, False)
        self.root_domain = domain_data[1]
        self.sub_domain = domain_data[4]
        self.domain_suffix = domain_data[5]
        # NOTE(review): str.strip() removes any of the suffix *characters* from both ends, not the suffix
        # itself - confirm this is what callers of sub_domain_no_local expect.
        self.sub_domain_no_local = self.sub_domain.strip(self.domain_suffix)
        if self.scheme == "":
            self.scheme = "http"
        if self.domain == "":
            self.domain = self.root_domain
        self.orginal_link = full_link
        self.domain_link = LinkChecker.get_valid_link(self.root_domain, full_link, self.scheme)
        self.max_level = max_level
        self.max_page = max_page
        self.page_count = 0  # keep track page done
        self._page_count_shadow = 0  # track previous count
        self._all_page_count_shadow = 0  # track previous count in datasource
        self.internal_page_count = 0
        self.internal_page_last_count = 0
        self.page_allocated = 0
        self.current_level = 0  # if this = 0, it is root domain/home_page
        self._stop_event = Event()
        valid_file_name = SiteTempDataSrcInterface.get_valid_file_name(self.domain_link)
        self._external_db_buffer = ExternalTempDataDiskBuffer(valid_file_name+".ext.db", self,
                                                              stop_event=self._stop_event,
                                                              buf_size=int(output_buff_size/2),
                                                              dir_path=get_db_buffer_default_dir(),
                                                              convert_output=False)
        self._external_db_buffer.append_to_buffer([(self.root_domain, ResponseCode.DNSError),], convert_tuple=False)
        self._memory_control_terminate_event = memory_control_terminate_event
        self.task_control_lock = threading.RLock()
        if data_source is None:
            self.data_source = SiteTempDataDiskWithBuff(ref=self.domain_link, output_buff_size=output_buff_size, ref_obj=self)
        else:
            self.data_source = data_source  # a list of OnSiteLink
        self.delegate = delegate
        if LinkChecker.might_be_link_html_page(original_path):
            self.data_source.append(OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1))  # add the root domain as a starting point
        # NOTE(review): the "www." seed is added even when sub_domain already contains "www."
        # (the cache below guards against that) - confirm this is intended.
        self.data_source.append(OnSiteLink(self.scheme + "://www."+self.sub_domain, ResponseCode.LinkOK, link_level=1))
        self.data_source.append(OnSiteLink(self.scheme + "://" + self.domain, ResponseCode.LinkOK, link_level=1))
        self.cache_list = []  # internal page cache
        self.page_need_look_up_temp = 0
        self.cache_list.append(self.domain_link)
        if "www." not in self.sub_domain:
            self.cache_list.append(self.scheme + "://www."+self.sub_domain)
        self.cache_list.append(self.scheme + "://" + self.domain)
        self.page_need_look_up = self.data_source.count_all()
        self.cache_size = 500  # create a small cache list to avoid going to check link in file system with lots of read and write
        self._double_check_cache_lock = threading.RLock()
        self._double_check_cache = deque(maxlen=self.cache_size)
        self.external_cache_list = []
        self.external_cache_size = 500  # cache that hold external sites
        self.external_links_checked = 0
        self.add_internal_page_OK_only = True
        self.output_queue = output_queue
        self.output_all_external = output_all_external
        self.controller = controller
        self.result_delegate = result_delegate
        self.page_count_lock = threading.RLock()
        self.internal_page_count_lock = threading.RLock()
        self.level_lock = threading.RLock()
        self.page_look_up_lock = threading.RLock()
        self.external_link_check_lock = threading.RLock()
        self._finihsed = False
        self.task_control_max = 1
        self.agent = "VegeBot (we follow your robots.txt settings before crawling, you can slow down the bot by change the Crawl-Delay parameter in the settings." \
                     "if you have an enquiry, please email to: [email protected])"
        self.agent_from = "*****@*****.**"
        if check_robot_text:
            self.robot_agent = LinkChecker.get_robot_agent(self.sub_domain, protocol=self.scheme)
        else:
            self.robot_agent = None
        self.site_crawl_delay = 0.60

        # A crawl delay published in robots.txt overrides the default delay.
        if isinstance(self.robot_agent, Rules):
            delay_temp = self.robot_agent.delay(self.agent)
            if delay_temp is not None and delay_temp != self.site_crawl_delay:
                self.site_crawl_delay = delay_temp

        self.task_control_counter = 1
        self._speed_penalty_count = 0
        self._speed_penalty_threshold = 10
        self._progress_logging_speed = 120
        self._output_period = 120
        self._output_batch_size = 100
        self._death_wish_sent = False
        SiteChecker._is_lxml_parser_exist()
        self._output_thread = None
        self._output_queue = None
        self.progress_logger = ProgressLogger(self._progress_logging_speed, self, self._stop_event)
        self._status = "Start"
        self._populate_with_state()  # restore last known state
class SiteChecker(FeedbackInterface, SiteTempDataSrcRefInterface, ProgressLogInterface, ExternalTempInterface):
    """
    Crawl state for a single site: page counters, link caches, the on-site link data source and
    the buffer of external links that is drained into an output queue.
    """
    # Dictionary keys used when the constructor arguments are passed around as a dict
    # (see get_input_parameter_base()).
    full_link_key = "full_link"
    datasource_key = "data_source"
    controller_ley = "controller"
    max_level_key = "max_level"
    max_page_key = "max_page"
    output_queue_key = "output_queue"

    # Class-wide flag, set by _is_lxml_parser_exist() when the lxml package can be imported.
    _use_lxml_parser = False

    def __init__(self, full_link: str="", data_source: SiteTempDataSrcInterface=None,
                 controller: SiteCheckerController=None,
                 max_level=10, max_page=1000, delegate=None, output_buff_size=2000,
                 output_queue=None, output_all_external=False, result_delegate=None,
                 memory_control_terminate_event=None, check_robot_text=True,
                 **kwargs):
        """
        Set up the crawl state, buffers, locks and robots.txt rules for one site.

        :param full_link: The full link of a domain, e.g: https://www.google.co.uk
        :param data_source: storage for on-site links; when None a SiteTempDataDiskWithBuff is created.
        :param controller: stored on the instance as self.controller.
        :param max_level: stop crawling if it reaches this level
        :param max_page: maximum pages to check within a site, also stop crawling
        :param delegate: if this is not None, then it will send the latest result of external domain of ResponseCode==404 or 999
        :param output_buff_size: buffer size for the default data source; half of it is used for the external link buffer.
        :param output_queue: stored on the instance as self.output_queue.
        :param output_all_external: stored on the instance as self.output_all_external.
        :param result_delegate: send site_info upon finish
        :param memory_control_terminate_event: if this is not None and being set, it will be able to terminate an external memory controlled process.
        :param check_robot_text: when True the robots rules are fetched through LinkChecker.get_robot_agent().
        :param kwargs: forwarded to FeedbackInterface.__init__().
        :raises ValueError: if full_link is None or empty.
        :return:
        """
        FeedbackInterface.__init__(self, **kwargs)
        if full_link is None or len(full_link) == 0:
            raise ValueError("full_link must be a non-empty link.")

        # Give scheme/domain a value up front: if urlsplit() rejects the link the fall-backs below
        # still work instead of failing with AttributeError on a missing attribute.
        self.scheme = ""
        self.domain = ""
        original_path = ""
        try:
            paras = urlsplit(full_link)
            self.scheme, self.domain, original_path = paras[0], paras[1], paras[2]
        except Exception:
            pass  # best effort: keep the empty defaults and derive the values from the root domain below

        domain_data = LinkChecker.get_root_domain(full_link, False)
        self.root_domain = domain_data[1]
        self.sub_domain = domain_data[4]
        self.domain_suffix = domain_data[5]
        # NOTE(review): str.strip() removes any of the suffix *characters* from both ends, not the suffix
        # itself - confirm this is what callers of sub_domain_no_local expect.
        self.sub_domain_no_local = self.sub_domain.strip(self.domain_suffix)
        if self.scheme == "":
            self.scheme = "http"
        if self.domain == "":
            self.domain = self.root_domain
        self.orginal_link = full_link
        self.domain_link = LinkChecker.get_valid_link(self.root_domain, full_link, self.scheme)
        self.max_level = max_level
        self.max_page = max_page
        self.page_count = 0  # keep track page done
        self._page_count_shadow = 0  # track previous count
        self._all_page_count_shadow = 0  # track previous count in datasource
        self.internal_page_count = 0
        self.internal_page_last_count = 0
        self.page_allocated = 0
        self.current_level = 0  # if this = 0, it is root domain/home_page
        self._stop_event = Event()
        valid_file_name = SiteTempDataSrcInterface.get_valid_file_name(self.domain_link)
        self._external_db_buffer = ExternalTempDataDiskBuffer(valid_file_name+".ext.db", self,
                                                              stop_event=self._stop_event,
                                                              buf_size=int(output_buff_size/2),
                                                              dir_path=get_db_buffer_default_dir(),
                                                              convert_output=False)
        self._external_db_buffer.append_to_buffer([(self.root_domain, ResponseCode.DNSError),], convert_tuple=False)
        self._memory_control_terminate_event = memory_control_terminate_event
        self.task_control_lock = threading.RLock()
        if data_source is None:
            self.data_source = SiteTempDataDiskWithBuff(ref=self.domain_link, output_buff_size=output_buff_size, ref_obj=self)
        else:
            self.data_source = data_source  # a list of OnSiteLink
        self.delegate = delegate
        if LinkChecker.might_be_link_html_page(original_path):
            self.data_source.append(OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1))  # add the root domain as a starting point
        # NOTE(review): the "www." seed is added even when sub_domain already contains "www."
        # (the cache below guards against that) - confirm this is intended.
        self.data_source.append(OnSiteLink(self.scheme + "://www."+self.sub_domain, ResponseCode.LinkOK, link_level=1))
        self.data_source.append(OnSiteLink(self.scheme + "://" + self.domain, ResponseCode.LinkOK, link_level=1))
        self.cache_list = []  # internal page cache
        self.page_need_look_up_temp = 0
        self.cache_list.append(self.domain_link)
        if "www." not in self.sub_domain:
            self.cache_list.append(self.scheme + "://www."+self.sub_domain)
        self.cache_list.append(self.scheme + "://" + self.domain)
        self.page_need_look_up = self.data_source.count_all()
        self.cache_size = 500  # create a small cache list to avoid going to check link in file system with lots of read and write
        self._double_check_cache_lock = threading.RLock()
        self._double_check_cache = deque(maxlen=self.cache_size)
        self.external_cache_list = []
        self.external_cache_size = 500  # cache that hold external sites
        self.external_links_checked = 0
        self.add_internal_page_OK_only = True
        self.output_queue = output_queue
        self.output_all_external = output_all_external
        self.controller = controller
        self.result_delegate = result_delegate
        self.page_count_lock = threading.RLock()
        self.internal_page_count_lock = threading.RLock()
        self.level_lock = threading.RLock()
        self.page_look_up_lock = threading.RLock()
        self.external_link_check_lock = threading.RLock()
        self._finihsed = False
        self.task_control_max = 1
        self.agent = "VegeBot (we follow your robots.txt settings before crawling, you can slow down the bot by change the Crawl-Delay parameter in the settings." \
                     "if you have an enquiry, please email to: [email protected])"
        self.agent_from = "*****@*****.**"
        if check_robot_text:
            self.robot_agent = LinkChecker.get_robot_agent(self.sub_domain, protocol=self.scheme)
        else:
            self.robot_agent = None
        self.site_crawl_delay = 0.60

        # A crawl delay published in robots.txt overrides the default delay.
        if isinstance(self.robot_agent, Rules):
            delay_temp = self.robot_agent.delay(self.agent)
            if delay_temp is not None and delay_temp != self.site_crawl_delay:
                self.site_crawl_delay = delay_temp

        self.task_control_counter = 1
        self._speed_penalty_count = 0
        self._speed_penalty_threshold = 10
        self._progress_logging_speed = 120
        self._output_period = 120
        self._output_batch_size = 100
        self._death_wish_sent = False
        SiteChecker._is_lxml_parser_exist()
        self._output_thread = None
        self._output_queue = None
        self.progress_logger = ProgressLogger(self._progress_logging_speed, self, self._stop_event)
        self._status = "Start"
        self._populate_with_state()  # restore last known state
        # self.data_source.additional_startup_procedures()  # use the data set in self._populate_with_state() to start

    # def _empty_external_links_db(self):
    #     if self.output_queue is not None:
    def _put_result_in_output_queue_loop(self, item_list: list):
        if not self._stop_event.is_set():
            try:
                self._output_queue.put(item_list, True, 2)
            except Exception as ex:
                if self._output_queue is None:
                    manager, self._output_queue = get_queue_client(QueueManager.MachineSettingCrawler,
                                                             QueueManager.Method_Whois_Input)
                time.sleep(0.1)
                ErrorLogger.log_error("SiteChecker._get_external_links_to_queue", self.sub_domain+" "+str(ex))
                self._put_result_in_output_queue_loop(item_list)

    def _get_external_links_to_queue(self):
        """
        Drain self._external_db_buffer into the output queue; used as the target of the output thread.

        A queue client is created first and replaces self.output_queue. Tuples read from the buffer
        are collected into a batch and handed to _put_result_in_output_queue_loop(). The loop ends
        when the stop event is set or when every buffered link has been passed on, at which point
        the queue manager is shut down.

        :return:
        """
        ref_time = time.time()
        manager, self._output_queue = get_queue_client(QueueManager.MachineSettingCrawler, QueueManager.Method_Whois_Input)
        self.output_queue = self._output_queue  # override output_queue
        batch = list()
        for item in self._external_db_buffer:
            if self._stop_event.is_set() or self.external_links_checked >= self._external_db_buffer.count_all():
                # Shutting the manager down is best effort. The break sits after the try statement
                # (not in a finally clause) so that it cannot silently discard an exception in flight.
                try:
                    manager.shutdown()
                except Exception:
                    pass
                break
            elif isinstance(item, tuple):
                batch.append((item[0], item[1]))
            if len(batch) > 0:
                current_time = time.time()
                # NOTE(review): the first operand is a bare time difference, which is truthy whenever the
                # clock has advanced, so in practice every item is flushed at once and self._output_period
                # is never consulted. A comparison against self._output_period was presumably intended;
                # changing it also needs a final flush of the pending batch, so it is left as is here.
                if current_time - ref_time or len(batch) >= self._output_batch_size:
                    self._put_result_in_output_queue_loop(batch)
                    self.external_links_checked += len(batch)
                    ref_time = time.time()
                    batch.clear()

            time.sleep(0.0001)

    @staticmethod
    def _is_lxml_parser_exist():
        try:
            import lxml

        except ImportError:
            SiteChecker._use_lxml_parser = False
        else:
            SiteChecker._use_lxml_parser = True

    def use_lxml_parser(self):
        return SiteChecker._use_lxml_parser

    @staticmethod
    def get_input_parameter_base(full_link: str, max_page: int, max_level: int, output_queue) -> dict:
        return {SiteChecker.full_link_key: full_link, SiteChecker.max_page_key: max_page,
                SiteChecker.max_level_key: max_level, SiteChecker.output_queue_key: output_queue}

    def get_external_count_finished(self) -> int:
        """
        ExternalTempInterface, get the number of job done in ExternalTempDataDiskBuffer
        :return:
        """
        return self.external_links_checked

    def set_internal_count(self, count: int):
        """
        ExternalTempInterface, set the number of job done in ExternalTempDataDiskBuffer
        :param count:
        :return:
        """
        self.external_links_checked = count

    def _set_task_control_max(self, concurrent_task: int):
        if concurrent_task <= 0:
            raise ValueError
        self.task_control_max = concurrent_task
        self.task_control_counter = concurrent_task
        min_page_per_s = concurrent_task/20
        self._speed_penalty_threshold = self._progress_logging_speed * min_page_per_s
        if self.site_crawl_delay > 1/min_page_per_s:
            ErrorLogger.log_error("SiteChecker._set_task_control_max()",
                                  ValueError("site has crawl delay greater than mas delay."), self.domain_link)
            self._status = "Stopped"
            self.sudden_death()

    def get_site_feedback(self) -> SeedSiteFeedback:
        """
        :return: a SeedSiteFeedback for the original link carrying the current page_need_look_up count.
        """
        pages_to_look_up = self.get_page_need_look_up()
        return SeedSiteFeedback(self.orginal_link, page_count=pages_to_look_up)

    def get_site_info(self) -> SiteInfo:
        """
        Build the site information to send back; the original link is kept as the reference.

        :return: SiteInfo made from the original link and the data source.
        """
        return SiteInfo(self.orginal_link, self.data_source)

    def populate_with_state(self, state):
        if state is not None and isinstance(state, SiteCheckerState):
            self._status = "Restarted"
            self.page_count = state.page_count
            self.page_allocated = state.page_count
            self.internal_page_count = state.internal_page_count
            self.internal_page_last_count = state.internal_page_count
            self.external_links_checked = state.external_page_count
            self._external_db_buffer.set_progress(state.external_page_count)
            self.page_need_look_up = state.page_need_look_up
            self.current_level = state.current_level
            self.progress_logger.set_reference(state.log_sample_index, state.log_started_time)
            counter = 0
            if self.data_source is not None:
                try:
                    for item in self.data_source.get_next():
                        if counter >= self.cache_size:
                            break
                        if isinstance(item, OnSiteLink) and not LinkChecker.is_external_link(self.root_domain, item.link):
                            self.cache_list.append(item.link)
                            # print("--restore: ", item)
                            counter += 1
                except Exception as ex:
                    msg = "error in SiteChecker.populate_with_state(), trying to populate cache, " + self.root_domain
                    ErrorLogger.log_error("SiteChecker", ex, msg)

                self.data_source.ref = state.datasource_ref
                self.data_source.output_c = state.datasource_output_c
                self.data_source.set_progress(state.datasource_index if state.datasource_index < state.page_count else state.page_count)
                self.data_source.set_continue_lock(True)

    def get_file_name(self):
        return self.data_source.ref

    def get_limit(self):
        return 100000

    def get_column_names(self):
        return ["Page Index", "External", "All", "Status"]

    def get_progress(self):
        data_source_count = self.data_source.count_all()
        if self.page_count - self._page_count_shadow <= self._speed_penalty_threshold:  # determine if site is slow
            self._speed_penalty_count += 1
            if self._speed_penalty_count > 2:
                self._status = "Stopped"
                self.sudden_death()
        else:
            self._speed_penalty_count = 0

        if self.page_count == self._page_count_shadow and data_source_count == self._all_page_count_shadow:  # determine if site is stucked
            self._status = "Stopped"
            self.sudden_death()

        self._page_count_shadow = self.page_count
        self._all_page_count_shadow = data_source_count
        return [self.page_count, self.external_links_checked, data_source_count, self._status]

    def is_programme_finshed(self):
        return self._finihsed

    def get_callback_data(self):
        """
        Build the feedback for the caller: the number of internal pages done since the last call,
        plus the seed feedback once the crawl has finished.

        :return: SiteFeedback
        """
        with self.page_count_lock:
            current_total = self.internal_page_count
            gap = current_total - self.internal_page_last_count
            self.internal_page_last_count = self.internal_page_count
            seed_feedback = self.get_site_feedback() if self._finihsed else None

        return SiteFeedback(gap, self._finihsed, seed_feedback=seed_feedback, datasource_ref=self.data_source.ref)

    def get_state(self):
        """
        Take a snapshot of the counters, data source position and logger reference.

        :return: SiteCheckerState that populate_with_state() can restore from.
        """
        source = self.data_source
        logger = self.progress_logger
        return SiteCheckerState(
            page_count=self.page_count,
            page_need_look_up=self.page_need_look_up,
            current_level=self.current_level,
            internal_page_count=self.internal_page_count,
            external_page_count=self.external_links_checked,
            datasource_index=source.temp_counter,
            datasource_output_c=source.output_c,
            datasource_ref=source.ref,
            log_started_time=logger.begin_time,
            log_sample_index=logger.limit_counter,
        )

    def additional_reset(self):
        pass

    def addtional_clear(self):
        pass

    def stop(self):
        # natural stop
        self._status = "Stopped"
        self.progress_logger.report_progress()
        self._stop_event.set()
        if self.progress_logger.is_alive():
            self.progress_logger.join()

    def clear(self):
        self.cache_list.clear()
        self.addtional_clear()

    def acquire_task(self, level: int, link: str):
        tasked_acquired = True
        if link.endswith('/'):
            temp = link
        else:
            temp = link + '/'
        with self.task_control_lock:
            if len(self._double_check_cache) > 0:
                if temp in self._double_check_cache:
                    print("duplicate link found:", link)
                    tasked_acquired = False
                else:
                    if len(self._double_check_cache) >= self.cache_size:
                        self._double_check_cache.popleft()
                    self._double_check_cache.append(temp)
            self.task_control_counter -= 1
            self.page_allocated += 1
            if tasked_acquired:
                if level > self.current_level:
                    self.current_level = level
            # time.sleep(self.site_crawl_delay)
        return tasked_acquired

    def release_task(self, new_page: int):
        with self.task_control_lock:
            if self.page_need_look_up == 1 and new_page == 0:
                PrintLogger.print("set to stop data source")
                self.data_source.set_continue_lock(False)
            else:
                self.page_count += 1
                self.page_need_look_up += new_page
                #self.external_links_checked += external_page_count
                self.task_control_counter += 1
                # was determine if it is internal or external page
                self.internal_page_count += 1
                if self.internal_page_count > self.max_page or self.current_level > self.max_level:
                    if self.data_source.can_continue():
                        PrintLogger.print("set stop: " + str(self.internal_page_count)+" level: "+str(self.current_level))
                        self.data_source.set_continue_lock(False)

    def get_page_count(self):
        with self.page_count_lock:
            page_count = self.page_count
        return page_count

    def set_page_count(self, page_count: int):
        with self.page_count_lock:
            self.page_count = page_count

    def set_internal_page_count(self, count: int):
        with self.internal_page_count_lock:
            self.internal_page_count += count

    def get_internal_page_count(self):
        with self.internal_page_count_lock:
            count = self.internal_page_count
        return count

    def get_current_level(self):
        with self.level_lock:
            current_level = self.current_level
        return current_level

    def set_current_level(self, level):
        with self.level_lock:
            self.current_level = level

    def get_page_need_look_up(self):
        with self.page_look_up_lock:
            page_look_up = self.page_need_look_up
        #self.page_look_up_lock.release()
        return page_look_up

    def set_page_need_look_up(self, page_count):
        with self.page_look_up_lock:
            #time.sleep(0.1)
            self.page_need_look_up = page_count
        # self.page_look_up_lock.release()

    def set_page_need_look_up_plus_more(self, count: int):
        with self.page_look_up_lock:
            self.page_need_look_up += count

    def get_internal_page_progress_index(self)->int:
        return self.get_page_count()

    def set_internal_page_progress_index(self, index: int):
        self.page_count = index
        self.page_allocated = index

    def is_idle(self):
        idle = False
        with self.task_control_lock:
            page_need_look_up = self.get_page_need_look_up()
            new_task_added = page_need_look_up - self.page_need_look_up_temp
            has_new_task = True if new_task_added > 0 else False
            #page_count = self.get_page_count()
            if has_new_task:
                self.page_need_look_up_temp = page_need_look_up
            else:
                if self.task_control_counter >= self.task_control_max:
                    idle = True
                #     print("is idle")
                # else:
                #     print("is working")
        return idle

    def add_link_to_cache(self, link):
        if len(self.cache_list) > self.cache_size:
            return
        else:
            if link.endswith('/'):
                self.cache_list.append(link)
            else:
                self.cache_list.append(link+'/')

    def is_link_in_cache(self, link):
        if link.endswith('/'):
            temp = link
        else:
            temp = link + '/'
        return True if temp in self.cache_list else False

    def reset_as(self, domain: str, link: str=""):
        """
        Point the crawl at another domain and start over from a single page.

        Counters and the link cache are reset; the data source is re-targeted at link, or at the
        new domain link when link is empty.

        :param domain: the new target domain.
        :param link: optional starting link; the domain link is used when this is empty.
        :return:
        """
        PrintLogger.print("crawl reset as: "+domain)
        self.domain = domain
        self.domain_link = self.scheme + "://" + self.domain
        self.page_count = 0
        self.current_level = 0
        self.set_page_need_look_up(1)
        self.clear()
        start_link = self.domain_link if len(link) == 0 else link
        self.cache_list.append(start_link)
        first_page = OnSiteLink(start_link, response_code=ResponseCode.LinkOK, link_level=1)
        self.data_source.re_target(start_link, first_page)
        self.additional_reset()
        self.data_source.additional_startup_procedures()

    def crawling(self):  # call this method to start operation
        """
        Run the whole crawl for this site and send the final feedback.

        Feedback sending is started first. If the data source can continue, the external link
        buffer, the output thread and the progress logger are started, begin_crawl() is run, and
        everything is shut down again in order. In all cases the finished flag is set, the last
        feedback is sent and the memory control event, if any, is set.

        :return:
        """
        self._start_sending_feedback()
        # The thread is created unconditionally but only started when there is work to do.
        self._output_thread = threading.Thread(target=self._get_external_links_to_queue)
        if self.data_source.can_continue():
            self.data_source.additional_startup_procedures()  # use the data set in self._populate_with_state() to start
            self._external_db_buffer.start_input_output_cycle()
            self._output_thread.start()
            self.progress_logger.start()
            self.progress_logger.report_progress()  # log first row
            self._status = "Work"
            self.begin_crawl()
            # Disabled: crawl again on the "www." host when only a few pages were found.
            # prefix = "www."
            # page_count_limit = 2
            # if self.page_count <= page_count_limit and prefix not in self.domain_link:
            #     new_domain = prefix + self.sub_domain
            #     self.reset_as(new_domain)
            #     self._status = "Work"
            #     self.begin_crawl()
            # print("going to stop all.")
            self.stop()
            self.clear()

            self.data_source.additional_finish_procedures()
            # print("going to finish output buffer.")
            self._external_db_buffer.terminate()
            # print("going to stop output_thread.")
            if self._output_thread.is_alive():
                self._output_thread.join()
        PrintLogger.print("finished naturally: "+self.domain_link)
        # print("finished naturally.")
        self._finihsed = True
        # the calls below are made at the end of the operation
        PrintLogger.print("send last response")
        # print("send last response")
        # print("send last response.")
        self._end_sending_feedback()
        if self._memory_control_terminate_event is not None:
            self._memory_control_terminate_event.set()

    def sudden_death(self):
        """
        Abort the crawl early (slow or stuck site) and release everything crawling() set up.

        Does nothing when the finished flag is already set, so repeated calls are harmless.
        When a memory control event was supplied, a TimeoutError is logged before it is set.

        :return:
        """
        if not self._finihsed:
            self._finihsed = True
            PrintLogger.print("start sudden death: "+self.orginal_link)
            # NOTE(review): get_progress() calls this method; if get_progress() runs on the progress
            # logger thread, the join() inside stop() would target the current thread - confirm.
            self.stop()

            self.clear()
            self.data_source.set_continue_lock(False)
            self.data_source.additional_finish_procedures()
            self._external_db_buffer.terminate()
            # _output_thread is still None when crawling() has not been called yet
            if isinstance(self._output_thread, threading.Thread):
                if self._output_thread.is_alive():
                    self._output_thread.join()
            # the calls below are made at the end of the operation
            PrintLogger.print("send last response")
            self._end_sending_feedback()
            if self._memory_control_terminate_event is not None:
                ErrorLogger.log_error("SiteChecker", TimeoutError("slow processing speed, terminated."), self.orginal_link)
                self._memory_control_terminate_event.set()

    def begin_crawl(self, level=0):  # subclass this to make different behaviour
        pass