Example #1
def site_check_process_wraper(func,
                              func_arg=(),
                              func_kwarg=None,
                              callback=None,
                              Memlimit=200,
                              external_stop=None):
    mem_pro = MemoryControlPs(func, func_arg, func_kwarg, callback, Memlimit,
                              external_stop)
    mem_pro.start()
 def _filtering_process_wrapper(self):
     self.filter_process = MemoryControlPs(
         func=filtering_process,
         func_kwargs=FilterController.get_input_parameters(
             "filtering.db", get_recovery_dir_path(),
             self._filter_input_queue, self._filter_output_queue,
             self._stop_event, self._filter_matrix, self._accounts,
             self._filtering_only, self._filtering_offset,
             self._filtering_total),
         external_stop_event=self._stop_event)
     self.filter_process.start()
def checking_whois():
    # optimal = self.max_prcess * self.concurrent_page / 5
    optimal = 260 * 3 / 5
    if optimal < 10:
        worker_number = 10
    else:
        worker_number = int(optimal)
    mem_limit = 1000
    if mem_limit < 200:
        mem_limit = 200
    stop_event = Event()
    kwargs = {"is_debug": True, "stop_event": stop_event, "max_worker": worker_number}
    whois_process_wrapper = MemoryControlPs(
        whois_process, func_kwargs=kwargs, mem_limit=mem_limit, external_stop_event=stop_event
    )
    whois_process_wrapper.start()
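# Note: all three wrappers above follow the same pattern: wrap a worker callable
# in MemoryControlPs, hand it the worker's kwargs plus a shared stop event, and
# call start(). A minimal sketch of that pattern, assuming only the keyword
# names visible in these examples (the helper name and the 200 MB default floor
# are illustrative, not part of the original API):
def run_memory_guarded(worker, worker_kwargs, mem_limit=200, stop_event=None):
    if mem_limit < 200:  # same 200 MB floor used throughout these examples
        mem_limit = 200
    guarded = MemoryControlPs(worker,
                              func_kwargs=worker_kwargs,
                              mem_limit=mem_limit,
                              external_stop_event=stop_event)
    guarded.start()
    return guarded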
Example #4
 def checking_whois(self):
     optimal = self.max_prcess * self.concurrent_page / 5
     if optimal < 10:
         worker_number = 10
     else:
         worker_number = int(optimal)
     mem_limit = self.memory_limit_per_process / 2
     if mem_limit < 200:
         mem_limit = 200
     self.whois_process = MemoryControlPs(
         whois_process,
         func_kwargs=WhoisChecker.get_input_parameters(
             self._whoisQueue, self.outputQueue, self.stop_event,
             worker_number),
         mem_limit=mem_limit,
         external_stop_event=self.stop_event)
     self.whois_process.start()
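# Note: the method above sizes the whois stage with two clamped values: roughly
# one worker per five concurrent crawl pages (never fewer than 10) and half the
# per-process memory budget (never below 200 MB). The same arithmetic as a
# stand-alone helper (hypothetical name, shown for illustration only):
def whois_sizing(max_process, concurrent_page, memory_limit_per_process):
    worker_number = max(10, int(max_process * concurrent_page / 5))
    mem_limit = max(200, memory_limit_per_process / 2)
    return worker_number, mem_limit

# e.g. whois_sizing(260, 3, 1000) returns (156, 500.0)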
Example #10
class SiteCheckProcessManager(Thread, SiteCheckerController):
    MEM_MINIMUM_REQ = 100

    def __init__(self,
                 job_name: str = "",
                 input_Q: multiprocessing.Queue = None,
                 max_procss=4,
                 concurrent_page=1,
                 page_max_level=10,
                 max_page_per_site=1000,
                 output_delegate=None,
                 memory_limit_per_process=100,
                 **kwargs):
        """

        :param job_name:
        :param input_Q:
        :param max_procss:
        :param concurrent_page:
        :param page_max_level:
        :param max_page_per_site:
        :param output_delegate:
        :param memory_limit_per_process: values below 100 MB are logged as a ValueError
        :param kwargs:
        :return:
        """
        Thread.__init__(self)
        #FeedbackInterface.__init__(**kwargs)
        #super(SiteCheckProcessManager, self).__init__(**kwargs)
        #self.process_queue = multiprocessing.Queue()
        self.name = job_name
        if max_procss <= 0:
            max_procss = 1
        self.max_prcess = max_procss
        if input_Q is None:
            self.inputQueue = multiprocessing.Queue()
        else:
            self.inputQueue = input_Q
        self.outputQueue = multiprocessing.Queue()
        self._whoisQueue = multiprocessing.Queue()
        #self.output_lock = threading.RLock()
        #self.tempList = site_list # if there is a need to add new sites during scripting, add to this list
        self.processPrfix = "Process-"
        self.threadPrfix = "Thread-"
        self.page_max_level = page_max_level
        self.max_page_per_site = max_page_per_site

        if output_delegate is None:
            self.output_delegate = self.default_delegate
        else:
            self.output_delegate = output_delegate  # delegate of type f(x:OnSiteLink)
        self.stop_event = multiprocessing.Event()
        self.finished = False
        self.pool = ThreadPool(processes=self.max_prcess)
        #self.pool = multiprocessing.Pool(processes=self.max_prcess)
        self.output_thread = None
        self.job_all = 0
        self.job_done = 0
        self.job_waiting = 0
        self.total_page_done = 0
        self.page_per_sec = 0  # TODO: pages per second is not computed yet
        self.average_page_per_site = 0
        self.patch_limit = self.max_prcess
        self.temp_results = []
        self.site_info = []  # collects site info after each job is done
        self.db_trash_list = []
        self.concurrent_page = concurrent_page
        self.continue_lock = threading.RLock()
        self.db_trash_lock = threading.RLock()
        self.state_lock = threading.RLock()
        self.temp_result_lock = threading.RLock()
        self.site_info_lock = threading.RLock()
        if memory_limit_per_process < SiteCheckProcessManager.MEM_MINIMUM_REQ:
            ex = ValueError(
                "minimum memory requirement to run the crawler is 100 MB; "
                "anything lower causes excessive memory-control looping.")
            msg = "error in SiteCheckProcessManager.__init__(), with database: " + job_name
            ErrorLogger.log_error("SiteCheckProcessManager", ex, msg)
        self.memory_limit_per_process = memory_limit_per_process
        self.whois_process = None
        self.whois_queue_process = Process(target=run_queue_server)
        #self.input_iter = SiteInputIter(self.inputQueue, self, self.concurrent_page, self.page_max_level,
        #                                self.max_page_per_site, self.outputQueue, self.process_site_info)
        self.input_iter = SiteInputIter(self.inputQueue,
                                        func=site_check_process,
                                        external_stop=self.stop_event)

    def _create_all_file_dirs(self):
        try:
            FileHandler.create_file_if_not_exist(get_log_dir())
            FileHandler.create_file_if_not_exist(get_recovery_dir_path())
            FileHandler.create_file_if_not_exist(get_temp_db_dir())
            FileHandler.create_file_if_not_exist(get_task_backup_dir())
            FileHandler.create_file_if_not_exist(get_db_buffer_default_dir())
        except Exception as ex:
            ErrorLogger.log_error("SiteCheckProcessManager", ex,
                                  "_create_all_file_dirs()")

    def clear_cache(self):
        try:
            FileHandler.clear_dir(get_log_dir())
            FileHandler.clear_dir(get_recovery_dir_path())
            FileHandler.clear_dir(get_temp_db_dir())
            FileHandler.clear_dir(get_task_backup_dir())
            FileHandler.clear_dir(get_db_buffer_default_dir())
        except Exception as ex:
            ErrorLogger.log_error("SiteCheckProcessManager", ex,
                                  "clear_cache()")

    def set_system_limit(self):
        try:
            os.system('sudo -s')
            os.system('ulimit -n 204800')
            # os.system('ulimit -s 1024')
        except Exception as ex:
            print(ex)

    def get_temp_result_count(self):
        #with self.temp_result_lock:
        return len(self.temp_results)

    def get_temp_result_and_clear(self) -> []:
        with self.temp_result_lock:
            copied = self.temp_results.copy()
            self.temp_results.clear()
        return copied

    def default_delegate(self, result):
        with self.temp_result_lock:
            if isinstance(result, OnSiteLink):
                self.temp_results.append(result)  # make no difference
                #CsvLogger.log_to_file("ExternalSiteTemp", [(result.link, result.response_code), ])
            elif isinstance(result, str):
                self.temp_results.append(result)
            elif isinstance(result, tuple) and len(result) == 2:
                temp = OnSiteLink(result[0], result[1])
                print("new domain:", temp)
                self.temp_results.append(temp)
            else:
                pass

    def get_state(self) -> SiteCheckProcessState:
        print("get state from slave crawler")
        with self.state_lock:
            state = SiteCheckProcessState(self.job_all, self.job_done,
                                          self.job_waiting,
                                          self.total_page_done,
                                          self.average_page_per_site,
                                          self.get_temp_result_count())
        print("get state from slave crawler finished")
        return state

    def get_filter_progress(self):
        if isinstance(self.whois_process, MemoryControlPs):
            state = self.whois_process.get_last_state()
            if isinstance(state, WhoisCheckerState):
                return state.progress_count, state.data_total
            else:
                return 0, 0
        else:
            return 0, 0

    def clear_trash(self):  # run with a thread
        while not self.stop_event.is_set():
            with self.db_trash_lock:
                removed_list = []
                trash_len = len(self.db_trash_list)
                if trash_len > 0:
                    for item in self.db_trash_list:
                        if TempDBInterface.force_clear(item):
                            #print("removed trash:", item)
                            removed_list.append(item)
                    for removed_item in removed_list:
                        self.db_trash_list.remove(removed_item)
                    CsvLogger.log_to_file("job_finished",
                                          [(x, str(datetime.datetime.now()))
                                           for x in removed_list],
                                          get_task_backup_dir())
                    removed_list.clear()
            time.sleep(2)

    def put_to_input_queue(self, data: []):
        if data is not None:
            for item in data:
                self.inputQueue.put(item)
                self.job_all += 1

    def get_site_info_list_and_clear(self):
        with self.site_info_lock:
            copied = self.site_info.copy()
            self.site_info.clear()
        return copied

    def get_site_info_list_count(self):
        return len(self.site_info)

    def process_site_info(self, site_info):
        if site_info is not None:
            with self.site_info_lock:
                PrintLogger.print("finished site info: " +
                                  str(site_info.__dict__))
                self.site_info.append(site_info)

    def process_feedback(self, feedback: SiteFeedback):
        self.add_page_done(feedback.page_done)
        if feedback.finished:
            # print("should process feedback!")
            self.site_finished()
            self.process_site_info(feedback.seed_feedback)
            with self.db_trash_lock:
                self.db_trash_list.append(feedback.datasource_ref)
                self.db_trash_list.append(feedback.datasource_ref + ".ext.db")

    def add_page_done(self,
                      number_page_done: int):  # make sure it is thread safe
        with self.state_lock:
            self.total_page_done += number_page_done
        time.sleep(0.001)

    def site_finished(self):
        # print("one more site done")
        with self.state_lock:
            self.job_done += 1
            self.average_page_per_site = self.total_page_done / self.job_done
        time.sleep(0.001)

    def set_stop(self):
        self.stop_event.set()

    def can_continue(self):
        return not self.stop_event.is_set()

    def checking_whois(self):
        optimal = self.max_prcess * self.concurrent_page / 5
        if optimal < 10:
            worker_number = 10
        else:
            worker_number = int(optimal)
        mem_limit = self.memory_limit_per_process / 2
        if mem_limit < 200:
            mem_limit = 200
        self.whois_process = MemoryControlPs(
            whois_process,
            func_kwargs=WhoisChecker.get_input_parameters(
                self._whoisQueue, self.outputQueue, self.stop_event,
                worker_number),
            mem_limit=mem_limit,
            external_stop_event=self.stop_event)
        self.whois_process.start()

    def queue_failure_reset(self):
        manager, self.outputQueue = get_queue_client(
            QueueManager.MachineSettingCrawler,
            QueueManager.Method_Whois_Output)
        return self.outputQueue

    def run(self):
        # self.set_system_limit()
        self._create_all_file_dirs()
        self.whois_queue_process.start()
        whois_thread = Thread(target=self.checking_whois)
        trash_clean_thread = Thread(target=self.clear_trash)
        manager, self.outputQueue = get_queue_client(
            QueueManager.MachineSettingCrawler,
            QueueManager.Method_Whois_Output)
        # self.output_thread = outputThread(0, self.threadPrfix+"Output", self.stop_event, self.outputQueue,
        #                           delegate=self.output_delegate, failsure_reset_queue=self.queue_failure_reset)
        self.output_thread = outputThread(
            threadID=0,
            name=self.threadPrfix + "Output",
            stop_event=self.stop_event,
            inputQ=self.outputQueue,
            delegate=self.output_delegate,
            failsure_reset_queue=self.queue_failure_reset)
        self.output_thread.start()
        trash_clean_thread.start()
        whois_thread.start()
        # self.whois_queue_process.start()
        self.input_iter.func_kwarg = SiteThreadChecker.get_input_parameter(
            full_link="",  # this parameter will be updated in self.input_iter
            max_page=self.max_page_per_site,
            max_level=self.page_max_level,
            output_queue=self._whoisQueue,
            pool_size=self.concurrent_page)
        self.input_iter.callback = self.process_feedback
        self.input_iter.Memlimit = self.memory_limit_per_process
        try:
            #print("monitor process started: pid: ", os.getpid())
            self.pool.imap(site_check_process_iter, self.input_iter, 1)
            #self.pool.imap_unordered(site_check_process_iter, self.input_iter)
            while self.can_continue():
                time.sleep(0.5)
        except Exception as ex:
            msg = "run(), with database: " + self.name
            ErrorLogger.log_error("SiteCheckProcessManager", ex, msg)
        finally:
            print("terminate miner!")
            self.pool.terminate()
            whois_thread.join()
            self.whois_queue_process.terminate()
            self.temp_results.clear()
            self.site_info.clear()
            self.finished = True
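# Note: a hypothetical driver for the class above, using only members that
# appear in it (put_to_input_queue, the Thread start()/run() pair, get_state,
# set_stop, finished). The seed list and polling interval are assumptions; the
# manager keeps running until set_stop() is called.
def run_site_check(seed_sites):
    manager = SiteCheckProcessManager(job_name="demo_job",
                                      max_procss=4,  # parameter name as spelled in __init__
                                      concurrent_page=2,
                                      memory_limit_per_process=200)
    manager.put_to_input_queue(seed_sites)
    manager.start()  # run() wires up the whois, output and trash-clearing threads
    try:
        while not manager.finished:
            print(manager.get_state().__dict__)  # snapshot of job/page counters
            time.sleep(5)
    except KeyboardInterrupt:
        manager.set_stop()  # signals every worker through the shared stop event
        manager.join()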
class MiningMasterController(threading.Thread):
    def __init__(self,
                 accounts: list = [],
                 ref="",
                 cap_slave=0,
                 cap_slave_process=1,
                 cap_concurrent_page=1,
                 all_job=0,
                 offset=0,
                 max_page_level=100,
                 max_page_limit=1000,
                 loopback_database=False,
                 refresh_rate=10,
                 min_page_count=0,
                 filters=DBFilterCollection(),
                 crawl_matrix=CrawlMatrix(),
                 filtering_only_mode=False,
                 filtering_offset=0,
                 filtering_total=0):
        """
        init a master controller
        :param ref: database table reference
        :param cap_slave: max number of slaves
        :param all_job:
        :return:
        """
        print("MiningMasterController.__init__")
        print("setup data:")
        if isinstance(accounts, list):
            print("accounts: ")
            for item in accounts:
                print(item)
        print("ref:", ref)
        print("cap_slave:", cap_slave)
        print("cap_slave_process:", cap_slave_process)
        print("cap_concurrent_page:", cap_concurrent_page)
        if crawl_matrix is not None:
            print("crawl matrix:", crawl_matrix.__dict__)
        threading.Thread.__init__(self)
        self.state = ServerState.State_Init
        self.ref = ref  # database
        self.slaves = []
        self.auto_scale_slaves(cap_slave)
        self.cap_slave_process = cap_slave_process  # how many processes a slave can run; 0 means auto-scale
        self.concurrent_page = cap_concurrent_page
        self.stop_Mining = False
        self.job_done = 0
        self.job_wait = 0
        self.job_allocated = 0
        self.job_all = all_job
        self.offset = offset
        self.max_page_level = max_page_level
        self.max_page_limit = max_page_limit
        self.start_time = time.time()
        self.end_time = time.time()
        self.loopback_database = False  # note: the loopback_database argument is currently ignored here
        self.refresh_rate = refresh_rate
        self.in_progress = False
        self.min_page_count = min_page_count  # only crawl sites with a page count greater than this number

        self.db_seed = None
        if filters is None:
            self.db_filters = DBFilterCollection()
            self.db_filters.external_filter.update_interval = 30
            self.db_filters.filtered_result.update_interval = 30
            self.db_filters.seed_filter.update_interval = 1200
        else:
            self.db_filters = filters
        self.filter_shadow = self.db_filters.copy_attrs()  # use db_filters so a None filters argument cannot crash here
        self.db_stats = []
        self.seed_db_update_time = time.time()
        self.external_db_update_time = time.time()
        self.filtered_db_update_time = time.time()
        self.db_update_lock = threading.RLock()
        self._seed_db_lock = RLock()
        self._external_db_lock = RLock()
        self._result_db_lock = RLock()
        self._result_bad_db_lock = RLock()
        self._redemption_db_lock = RLock()
        self.update_db_stats(force_update=True)
        self._stop_event = Event()

        #this is for filters
        self._filter_input_queue = Queue()
        self._filter_output_queue = Queue()
        self.filter_process = None
        if isinstance(crawl_matrix, CrawlMatrix) and crawl_matrix.tf == 0:
            self._filter_matrix = CrawlMatrix(tf=15,
                                              cf=15,
                                              da=15,
                                              ref_domains=10,
                                              tf_cf_deviation=0.80)
        else:
            self._filter_matrix = crawl_matrix
        self._accounts = accounts
        self._filtering_only = filtering_only_mode
        self._filtering_offset = filtering_offset
        self._filtering_total = filtering_total

    def update_db_stats(self, force_update=False):
        print("update db stats, do not interrupt!")
        if self.filter_shadow is not None:
            names = SiteSource.get_all_table_names(SiteSource.Seed)
            if len(names) > 0:
                databases = []
                fil = self.filter_shadow
                if force_update:
                    for name in names:
                        if name is not None and len(name) > 0:
                            with self._seed_db_lock:
                                seed = SeedSiteDB(name,
                                                  db_filter=fil.seed_filter)
                                seed_count = seed.site_count()
                                seed.close()
                            with self._external_db_lock:
                                external = ExternalSiteDB(
                                    name, db_filter=fil.external_filter)
                                external_count = external.site_count()
                                external.close()
                            with self._result_db_lock:
                                filtered = FilteredResultDB(
                                    name, db_filter=fil.filtered_result)
                                filtered_count = filtered.site_count()
                                filtered.close()
                            with self._result_bad_db_lock:
                                filtered_bad = FilteredResultDB(
                                    name,
                                    bad_db=True,
                                    db_filter=fil.filtered_result)
                                filtered_count_bad = filtered_bad.site_count()
                                filtered_bad.close()
                            x = DatabaseStatus(name, seed_count,
                                               external_count, filtered_count,
                                               filtered_count_bad)
                            databases.append(x)
                    self.seed_db_update_time = time.time()
                    self.external_db_update_time = time.time()
                    self.filtered_db_update_time = time.time()
                    self.db_stats = databases
                    #return databases
                else:
                    time_now = time.time()
                    if len(self.db_stats) == 0:
                        for name in names:
                            self.db_stats.append(DatabaseStatus(name=name))
                    else:
                        dying_db = [
                            x for x in self.db_stats if x.name not in names
                        ]
                        for item in dying_db:
                            self.db_stats.remove(item)
                    external_need_update = (
                        time_now - self.external_db_update_time >
                        fil.external_filter.update_interval)
                    if external_need_update:
                        self.external_db_update_time = time.time()
                    seed_need_update = (
                        time_now - self.seed_db_update_time >
                        fil.seed_filter.update_interval)
                    if seed_need_update:
                        self.seed_db_update_time = time.time()
                    filterd_need_update = (
                        time_now - self.filtered_db_update_time >
                        fil.filtered_result.update_interval)
                    if filterd_need_update:
                        self.filtered_db_update_time = time.time()
                    for name in names:  # update stats
                        db_s = next(
                            (x for x in self.db_stats if name == x.name), None)
                        if db_s is None and len(name) > 0:
                            db_s = DatabaseStatus(name)
                            self.db_stats.append(db_s)
                        if db_s is not None:
                            if seed_need_update:
                                seed = SeedSiteDB(name,
                                                  db_filter=fil.seed_filter)
                                db_s.seeds = seed.site_count()
                                seed.close()
                            if external_need_update:
                                external = ExternalSiteDB(
                                    name, db_filter=fil.external_filter)
                                db_s.results = external.site_count()
                                external.close()
                            if filterd_need_update:
                                filtered = FilteredResultDB(
                                    name, db_filter=fil.filtered_result)
                                db_s.filtered = filtered.site_count()
                                filtered.close()

                                filtered_bad = FilteredResultDB(
                                    name,
                                    bad_db=True,
                                    db_filter=fil.filtered_result)
                                db_s.bad_filtered = filtered_bad.site_count()
                                filtered_bad.close()
                    #return self.db_stats
            else:
                pass
                #return []

        else:
            pass
            #return []
        print("update db stats completed")

    def remove_db(self, db_type: str, db_name: str):
        if db_type == DBType.Type_All:
            with self._seed_db_lock:
                seed = SeedSiteDB(db_name)
                seed.drop_table()
                seed.close()
            with self._external_db_lock:
                external = ExternalSiteDB(db_name)
                external.drop_table()
                external.close()
            with self._result_db_lock:
                filtered = FilteredResultDB(db_name)
                filtered.drop_table()
                filtered.close()
            with self._result_bad_db_lock:
                filtered_bad = FilteredResultDB(db_name, bad_db=True)
                filtered_bad.drop_table()
                filtered_bad.close()

        elif db_type == DBType.Type_External:
            with self._external_db_lock:
                external = ExternalSiteDB(db_name)
                external.drop_table()
                external.close()

        elif db_type == DBType.Type_Filtered_Result:
            with self._result_db_lock:
                filtered = FilteredResultDB(db_name)
                filtered.drop_table()
                filtered.close()
        elif db_type == DBType.Type_Filtered_Result_Bad:
            with self._result_bad_db_lock:
                filtered_bad = FilteredResultDB(db_name, bad_db=True)
                filtered_bad.drop_table()
                filtered_bad.close()

        self.update_db_stats(force_update=True)

    def add_seeds(self, seed):
        if isinstance(seed, MiningList):
            try:
                with self._seed_db_lock:
                    db = SeedSiteDB(seed.ref)
                    db.add_sites(seed.data)
                    db.close()
                self.update_db_stats(force_update=True)
            except Exception as ex:
                ErrorLogger.log_error("MiningMasterController.add_seeds()", ex,
                                      seed.ref)

    def get_db_stats(self):
        #print("copy db stats and send back")
        stats = MiningList(self.ref, self.db_stats)
        stats_copy = stats.copy_attrs()
        print("copy db stats completed")
        return stats_copy

    def get_filter_progress(self):
        if isinstance(self.filter_process, MemoryControlPs):
            state = self.filter_process.get_last_state()
            if isinstance(state, _FilterState):
                return state.progress, state.all_data
            else:
                return 0, 0
        else:
            return 0, 0

    def clear_host_cache(self):
        try:
            FileHandler.clear_dir(get_log_dir())
            FileHandler.clear_dir(get_recovery_dir_path())
            FileHandler.clear_dir(get_task_backup_dir())
            FileHandler.clear_dir(get_db_buffer_default_dir())
        except Exception as ex:
            ErrorLogger.log_error("MiningControllers", ex,
                                  "clear_host_cache()")

    def clear_slave_cache(self):
        if self.state == ServerState.State_Idle:
            threads = []
            for slave in self.slaves:
                if isinstance(slave, Server):
                    threads.append(
                        MiningController(slave,
                                         cmd=ServerCommand.Com_Clear_Cache))
            if len(threads) > 0:
                for thread in threads:
                    thread.start()
                for thread in threads:
                    thread.join(30)
                threads.clear()

    def get_db_seed(self):
        return SeedSiteDB(table=self.ref,
                          offset=0,
                          db_filter=self.db_filters.seed_filter)

    def get_db_external(self):
        return ExternalSiteDB(table=self.ref,
                              offset=0,
                              db_filter=self.db_filters.external_filter)

    def get_db_filtered(self):
        return FilteredResultDB(table=self.ref,
                                offset=0,
                                db_filter=self.db_filters.filtered_result)

    def get_db_filtered_bad(self):
        return FilteredResultDB(table=self.ref,
                                offset=0,
                                bad_db=True,
                                db_filter=self.db_filters.filtered_result)

    def get_db_redemption(self):
        return ExternalSiteDB(table="temp",
                              db_addr=get_temp_db_dir() + "Redemption.db")

    def get_db_results(self, db_type: str, db_name: str, index: int,
                       length: int) -> MiningList:
        try:
            if db_type == DBType.Type_Filtered_Result:
                with self._result_db_lock:
                    db = FilteredResultDB(db_name, offset=index)
                    data = db.get_next_patch(count=length, rollover=False)
                    db.close()
            elif db_type == DBType.Type_Filtered_Result_Bad:
                with self._result_bad_db_lock:
                    db = FilteredResultDB(db_name, bad_db=True, offset=index)
                    data = db.get_next_patch(count=length, rollover=False)
                    db.close()
            elif db_type == DBType.Type_External:
                with self._external_db_lock:
                    db = ExternalSiteDB(db_name, offset=index)
                    data = db.get_next_patch(count=length, rollover=False)
                    db.close()
            elif db_type == DBType.Type_Seed:
                with self._seed_db_lock:
                    db = SeedSiteDB(db_name, offset=index)
                    data = db.get_next_patch(count=length, rollover=False)
                    db.close()
            else:
                data = []
            result = MiningList(db_name, data)
            return result
        except Exception as ex:
            ErrorLogger.log_error("MiningController.get_db_results()", ex,
                                  db_name + " type:" + db_type)
            return MiningList(db_name, [])

    def auto_scale_slaves(self, count: int):
        pass

    def stop_all_slave(self):
        threads = []
        for slave in self.slaves:
            if isinstance(slave, Server):
                threads.append(
                    MiningController(slave, cmd=ServerCommand.Com_Stop_Mining))
        if len(threads) > 0:
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join(30)
            threads.clear()

    def setup_minging_slaves(
            self):  # slaves should restart based on these new settings
        print("setup_minging_slaves....")
        threads = []
        for slave in self.slaves:
            if isinstance(slave, Server):
                threads.append(
                    MiningController(slave,
                                     cmd=ServerCommand.Com_Setup,
                                     in_data=SetupData(
                                         self.name,
                                         cap2=self.cap_slave_process,
                                         cap3=self.concurrent_page,
                                         max_page_level=self.max_page_level,
                                         max_page_limit=self.max_page_limit)))
        if len(threads) > 0:
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join(30)
            threads.clear()
        print("setup_minging_slaves completed")

    def check_slaves_status(self, timeout=15):
        threads = []
        for slave in self.slaves:
            if isinstance(slave, Server):
                threads.append(
                    MiningController(slave, cmd=ServerCommand.Com_Status))
        if len(threads) > 0:
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join(timeout)
            threads.clear()
        total_done = 0  # update number of job done, and job wait
        wait = 0
        for slave in self.slaves:
            if isinstance(slave, Server):
                print(slave.status)
                total_done += slave.status.done_job
                wait += slave.status.wait_job
        if total_done > self.job_done:
            self.job_done = total_done
        self.job_wait = wait

    @staticmethod
    def is_in_list(data, target_list: []):
        if len(target_list):
            target = next((x for x in target_list if data.domain == x.domain),
                          None)
            return True if target is not None else False
        else:
            return False

    def get_slaves_result(self) -> []:
        threads = []
        result = []
        resultList = []
        for slave in self.slaves:
            if isinstance(slave, Server):
                if isinstance(slave.status, dict):
                    print("in get_slaves_result, data type is wrong")
                    print(slave.status)
                    slave.status = Serializable.get_deserialized(slave.status)
                elif slave.status is not None and slave.status.result > 0:
                    result_holder = []
                    threads.append(
                        MiningController(slave,
                                         cmd=ServerCommand.Com_Get_Data,
                                         out_data=result_holder))
                    result.append(result_holder)
        if len(threads) > 0:
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join(30)
            threads.clear()
            if len(result) > 0:
                for item in result:
                    if isinstance(item, list) and len(item) > 0 and hasattr(
                            item[0],
                            "data"):  # data is the attribute of MiningList
                        data = getattr(item[0], "data")
                        resultList += data
        return resultList

    def is_in_slave_list(self, addr: str) -> bool:
        if addr != "":
            match = next((x for x in self.slaves if x.address.address == addr),
                         None)
            return True if match is not None else False
        else:
            return False

    def add_slaves(self, slaves: []):
        print("adding slave...")
        if slaves is not None:
            for slave in slaves:
                if isinstance(slave, ServerAddress):
                    if slave.address == "localhost":
                        slave.address = "127.0.0.1"
                    if self.is_in_slave_list(slave.address):
                        continue
                    ser = Server(server_type=ServerType.ty_MiningSlaveSmall,
                                 address=slave)
                    self.slaves.append(ser)
                elif isinstance(slave, str):
                    print("adding slave:", slave)
                    temp = slave
                    if slave == "localhost":
                        temp = "127.0.0.1"
                    if self.is_in_slave_list(slave):
                        continue
                    ser = Server(server_type=ServerType.ty_MiningSlaveSmall,
                                 address=ServerAddress(
                                     temp, MiningTCPServer.DefaultListenPort))
                    self.slaves.append(ser)
        print("adding slave finished.")

    def remove_slaves(self, slaves: []):
        if slaves is not None:
            for slave in slaves:
                found = None
                if isinstance(slave, ServerAddress):
                    found = next(x for x in self.slaves
                                 if x.address.address == slave.address)
                elif isinstance(slave, str):
                    found = next(x for x in self.slaves
                                 if x.address.address == slave)
                if found is not None:
                    self.slaves.remove(found)

    def get_slaves(self):
        return self.slaves

    def allocate_task(self):

        try:
            threads = []
            with self._seed_db_lock:
                db_seed = SeedSiteDB(offset=self.offset,
                                     table=self.ref,
                                     db_filter=self.db_filters.seed_filter)
                for slave in self.slaves:
                    if isinstance(slave, Server) and slave.status is not None:
                        if isinstance(slave.status, dict):
                            print(
                                "in allocate_task, data type is invalid, the following data was received, need to redo"
                            )
                            print(slave.status)
                            slave.status = Serializable.get_deserialized(
                                slave.status)
                            #raise ValueError("slave status is not valid data type")
                        if slave.status.is_server_down():
                            print("server is down, continue..")
                            continue
                        if slave.status.all_job - slave.status.done_job <= slave.status.cap_process * 4:
                            job_temp = int(
                                slave.status.cap_process
                            )  # give half an 1/4 hour worth of data
                            if job_temp < 5:  # give minimum of 5 jobs
                                job_temp = 5
                            if not self.loopback_database and job_temp + self.job_allocated > self.job_all:
                                job_temp = self.job_all - self.job_allocated
                            sites = db_seed.get_next_patch(
                                count=job_temp,
                                rollover=self.loopback_database)
                            print("allocate task:", len(sites))
                            self.job_allocated += len(sites)
                            ref = db_seed.tab
                            mlist = MiningList(ref, sites)
                            number_sites = len(sites)
                            if number_sites > 0:
                                # try:
                                #     CsvLogger.log_to_file(slave.address.address, [(link,) for link in sites],
                                #                           dir_path=get_task_backup_dir())
                                # except:
                                #     pass
                                # self.offset += number_sites

                                t = MiningController(
                                    slave,
                                    cmd=ServerCommand.Com_Data,
                                    in_data=mlist)
                                try:
                                    t.start()
                                    t.join(30)
                                    self.offset += number_sites
                                except Exception as inner_ex:
                                    print(inner_ex)
                            else:
                                return
                db_seed.close()
            # if len(threads) > 0:
            #     for thread in threads:
            #         thread.start()
            #     for thread in threads:
            #         thread.join()
            # threads.clear()
        except Exception as ex:
            ErrorLogger.log_error("MiningMasterController.allocate_task()", ex,
                                  self.ref)

    def allocate_task_v1(self):
        try:
            threads = []
            with self._seed_db_lock:
                db_seed = SeedSiteDB(offset=self.offset,
                                     table=self.ref,
                                     db_filter=self.db_filters.seed_filter)
                for slave in self.slaves:
                    if isinstance(slave, Server) and slave.status is not None:
                        if isinstance(slave.status, dict):
                            print(
                                "in allocate_task, data type is invalid, the following data was received, need to redo"
                            )
                            print(slave.status)
                            slave.status = Serializable.get_deserialized(
                                slave.status)
                            #raise ValueError("slave status is not valid data type")
                        if slave.status.all_job - slave.status.done_job <= slave.status.cap_process + 2 > 2:
                            job_temp = int(
                                slave.status.cap_process /
                                2)  # give half an 1/4 hour worth of data
                            if job_temp < 5:  # give minimum of 5 jobs
                                job_temp = 5
                            if not self.loopback_database and job_temp + self.job_allocated > self.job_all:
                                job_temp = self.job_all - self.job_allocated
                            sites = db_seed.get_next_patch(
                                count=job_temp,
                                rollover=self.loopback_database)
                            print("allocate task:")
                            print(sites)
                            self.job_allocated += len(sites)
                            ref = db_seed.tab
                            mlist = MiningList(ref, sites)
                            number_sites = len(sites)
                            if number_sites > 0:
                                try:
                                    CsvLogger.log_to_file(
                                        slave.address.address,
                                        [(link, ) for link in sites],
                                        dir_path=get_task_backup_dir())
                                except Exception:
                                    pass
                                self.offset += number_sites
                                threads.append(
                                    MiningController(
                                        slave,
                                        cmd=ServerCommand.Com_Data,
                                        in_data=mlist))
                            else:
                                return
                db_seed.close()
            if len(threads) > 0:
                for thread in threads:
                    thread.start()
                for thread in threads:
                    thread.join(30)
            threads.clear()
        except Exception as ex:
            ErrorLogger.log_error("MiningMasterController.allocate_task()", ex,
                                  self.ref)

    def process_filtering_output_results(self):
        results = []
        bad_results = []
        tuples = []
        while not self._filter_output_queue.empty():
            item = self._filter_output_queue.get()
            if isinstance(item, FilteredDomainData):
                if len(item.exception) > 0:
                    bad_results.append(item)
                else:
                    results.append(item)
                tuples.append(item.to_tuple())
        if len(results) > 0:
            try:
                with self._result_db_lock:
                    db = self.get_db_filtered()
                    db.add_sites(results, skip_check=False)
                    db.close()
            except Exception as ex:
                ErrorLogger.log_error(
                    "MingingMasterController", ex,
                    "process_filtering_output_results() " + self.ref)
            finally:
                CsvLogger.log_to_file("filtered_domains.csv", tuples)
        if len(bad_results) > 0:
            try:
                with self._result_bad_db_lock:
                    bad_db = self.get_db_filtered_bad()
                    bad_db.add_sites(bad_results, skip_check=False)
                    bad_db.close()
            except Exception as ex:
                ErrorLogger.log_error(
                    "MingingMasterController", ex,
                    "process_filtering_output_results() " + self.ref)
            finally:
                CsvLogger.log_to_file("filtered_domains.csv", tuples)

    def process_result(self, result: list):
        if result is not None and len(result) > 0:
            print("processing external site and seeds results")
            external = []
            sitesfeedback = []
            redemption_list = []
            try:
                for item in result:
                    #print("item: ", str(item.__dict__))
                    if isinstance(item, ScrapeDomainData):
                        #print(item)
                        #if not MiningMasterController.is_in_list(item, external) and not all_external.is_domain_in_db(item.domain):
                        raw_data = (item.domain, item.code)
                        if item.code == ResponseCode.MightBeExpired:
                            redemption_list.append(raw_data)
                        else:
                            external.append(raw_data)
                            self._filter_input_queue.put(
                                raw_data)  # also put into filtering queue
                    elif isinstance(item, SeedSiteFeedback):
                        #print("udpate:", str(item.__dict__))
                        sitesfeedback.append(item)
                    else:
                        continue
                with self._external_db_lock:
                    all_external = self.get_db_external()
                    all_external.add_sites(external, True)
                    all_external.close()
                with self._redemption_db_lock:
                    redemption_db = self.get_db_redemption()
                    redemption_db.add_sites(redemption_list, True)
                    redemption_db.close()
                with self._seed_db_lock:
                    seed_sites = self.get_db_seed()
                    seed_sites.update_sites(sitesfeedback)
                    seed_sites.close()
            except Exception as ex:
                ErrorLogger.log_error("MingingMasterController", ex,
                                      "process_result() " + self.ref)

    def pause(self):
        self.in_progress = False
        self.stop_all_slave()
        self.stop_Mining = True

    def stop(self):
        print("external set to stop!")
        self._stop_event.set()
        self.in_progress = False
        self.stop_all_slave()

    def continue_work(self):
        self.stop_Mining = False

    def _filtering_process_wrapper(self):
        self.filter_process = MemoryControlPs(
            func=filtering_process,
            func_kwargs=FilterController.get_input_parameters(
                "filtering.db", get_recovery_dir_path(),
                self._filter_input_queue, self._filter_output_queue,
                self._stop_event, self._filter_matrix, self._accounts,
                self._filtering_only, self._filtering_offset,
                self._filtering_total),
            external_stop_event=self._stop_event)
        self.filter_process.start()

    def run(self):  # this is the normal routine; slaves should be set up before calling this
        filter_t = threading.Thread(target=self._filtering_process_wrapper)
        if len(self.slaves) > 0 or self._filtering_only:
            filter_t.start()

        while not self._stop_event.is_set():
            self.state = ServerState.State_Idle
            print("check status")
            self.check_slaves_status()
            #time.sleep(1)
            if not self.stop_Mining and (len(self.slaves) > 0
                                         or self._filtering_only):
                self.state = ServerState.State_Active
                if not self._filtering_only:
                    print("allocate task")
                    self.allocate_task()
                    #time.sleep(1)
                    print("get and process results")
                    result = self.get_slaves_result()
                    self.process_result(result)
                    result.clear()
                self.process_filtering_output_results()  # get filtered results into the filtered DB
                if (self.loopback_database or
                        self.job_done < self.job_all) and len(self.slaves) > 0:
                    self.end_time = time.time()
                    self.in_progress = True
                elif self._filtering_only:
                    self.end_time = time.time()
                    self.in_progress = True
                else:
                    self.in_progress = False
                print("finished getting results")
            self.update_db_stats()
            print("update db finished")
            if self._stop_event.is_set():
                break

            time.sleep(15)
        print("should finish filtering process!")
        if filter_t.is_alive():
            filter_t.join()
        print("master server shut down!")
class MiningMasterController(threading.Thread):

    def __init__(self, accounts: list = [], ref="", cap_slave=0, cap_slave_process=1, cap_concurrent_page=1, all_job=0, offset=0, max_page_level=100, max_page_limit=1000,
                 loopback_database=False, refresh_rate=10, min_page_count=0, filters=DBFilterCollection(), crawl_matrix=CrawlMatrix(),
                 filtering_only_mode=False, filtering_offset=0, filtering_total=0):
        """
        init a master controller
        :param ref: dataBase Table reference
        :param cap: max number of slaves
        :param all_job:
        :return:
        """
        print("MiningMasterController.__init__")
        print("setup data:")
        if isinstance(accounts, list):
            print("accounts: ")
            for item in accounts:
                print(item)
        print("ref:", ref)
        print("cap_slave:", cap_slave)
        print("cap_slave_process:", cap_slave_process)
        print("cap_concurrent_page:", cap_concurrent_page)
        if crawl_matrix is not None:
            print("crawl matrix:", crawl_matrix.__dict__)
        threading.Thread.__init__(self)
        self.state = ServerState.State_Init
        self.ref = ref  # database
        self.slaves = []
        self.auto_scale_slaves(cap_slave)
        self.cap_slave_process = cap_slave_process  # how many processes a slave can run; 0 means auto-scale
        self.concurrent_page = cap_concurrent_page
        self.stop_Mining = False
        self.job_done = 0
        self.job_wait = 0
        self.job_allocated = 0
        self.job_all = all_job
        self.offset = offset
        self.max_page_level = max_page_level
        self.max_page_limit = max_page_limit
        self.start_time = time.time()
        self.end_time = time.time()
        self.loopback_database = loopback_database
        self.refresh_rate = refresh_rate
        self.in_progress = False
        self.min_page_count = min_page_count  # only crawl sites with a page count greater than this number

        self.db_seed = None
        if filters is None:
            self.db_filters = DBFilterCollection()
            self.db_filters.external_filter.update_interval = 30
            self.db_filters.filtered_result.update_interval = 30
            self.db_filters.seed_filter.update_interval = 1200
        else:
            self.db_filters = filters
        self.filter_shadow = self.db_filters.copy_attrs()
        self.db_stats = []
        self.seed_db_update_time = time.time()
        self.external_db_update_time = time.time()
        self.filtered_db_update_time = time.time()
        self.db_update_lock = threading.RLock()
        self._seed_db_lock = RLock()
        self._external_db_lock = RLock()
        self._result_db_lock = RLock()
        self._result_bad_db_lock = RLock()
        self._redemption_db_lock = RLock()
        self.update_db_stats(force_update=True)
        self._stop_event = Event()

        #this is for filters
        self._filter_input_queue = Queue()
        self._filter_output_queue = Queue()
        self.filter_process = None
        if isinstance(crawl_matrix, CrawlMatrix) and crawl_matrix.tf == 0:
            self._filter_matrix = CrawlMatrix(tf=15, cf=15, da=15, ref_domains=10, tf_cf_deviation=0.80)
        else:
            self._filter_matrix = crawl_matrix
        self._accounts = accounts
        self._filtering_only = filtering_only_mode
        self._filtering_offset = filtering_offset
        self._filtering_total = filtering_total

    def update_db_stats(self, force_update=False):
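        # Refresh per-table row counts. With force_update=True every seed,
        # external and filtered table is re-counted under its lock and
        # self.db_stats is rebuilt; otherwise each count is refreshed only once
        # its filter's update_interval has elapsed since the last refresh.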
        print("update db stats, do not interrupt!")
        if self.filter_shadow is not None:
            names = SiteSource.get_all_table_names(SiteSource.Seed)
            if len(names) > 0:
                databases = []
                fil = self.filter_shadow
                if force_update:
                    for name in names:
                        if name is not None and len(name) > 0:
                            with self._seed_db_lock:
                                seed = SeedSiteDB(name, db_filter=fil.seed_filter)
                                seed_count = seed.site_count()
                                seed.close()
                            with self._external_db_lock:
                                external = ExternalSiteDB(name, db_filter=fil.external_filter)
                                external_count = external.site_count()
                                external.close()
                            with self._result_db_lock:
                                filtered = FilteredResultDB(name, db_filter=fil.filtered_result)
                                filtered_count = filtered.site_count()
                                filtered.close()
                            with self._result_bad_db_lock:
                                filtered_bad = FilteredResultDB(name, bad_db=True, db_filter=fil.filtered_result)
                                filtered_count_bad = filtered_bad.site_count()
                                filtered_bad.close()
                            x = DatabaseStatus(name, seed_count, external_count, filtered_count, filtered_count_bad)
                            databases.append(x)
                    self.seed_db_update_time = time.time()
                    self.external_db_update_time = time.time()
                    self.filtered_db_update_time = time.time()
                    self.db_stats = databases
                    #return databases
                else:
                    time_now = time.time()
                    if len(self.db_stats) == 0:
                        for name in names:
                            self.db_stats.append(DatabaseStatus(name=name))
                    else:
                        dying_db = [x for x in self.db_stats if x.name not in names]
                        for item in dying_db:
                            self.db_stats.remove(item)
                    external_need_update = time_now - self.external_db_update_time > fil.external_filter.update_interval
                    if external_need_update:
                        self.external_db_update_time = time.time()
                    seed_need_update = time_now - self.seed_db_update_time > fil.seed_filter.update_interval
                    if seed_need_update:
                        self.seed_db_update_time = time.time()
                    filtered_need_update = time_now - self.filtered_db_update_time > fil.filtered_result.update_interval
                    if filtered_need_update:
                        self.filtered_db_update_time = time.time()
                    for name in names:  # update stats
                        db_s = next((x for x in self.db_stats if name == x.name), None)
                        if db_s is None and len(name) > 0:
                            db_s = DatabaseStatus(name)
                            self.db_stats.append(db_s)
                        if db_s is not None:
                            if seed_need_update:
                                seed = SeedSiteDB(name, db_filter=fil.seed_filter)
                                db_s.seeds = seed.site_count()
                                seed.close()
                            if external_need_update:
                                external = ExternalSiteDB(name, db_filter=fil.external_filter)
                                db_s.results = external.site_count()
                                external.close()
                            if filtered_need_update:
                                filtered = FilteredResultDB(name, db_filter=fil.filtered_result)
                                db_s.filtered = filtered.site_count()
                                filtered.close()

                                filtered_bad = FilteredResultDB(name, bad_db=True, db_filter=fil.filtered_result)
                                db_s.bad_filtered = filtered_bad.site_count()
                                filtered_bad.close()
                    #return self.db_stats
            else:
                pass
                #return []

        else:
            pass
            #return []
        print("update db stats completed")

    def remove_db(self, db_type: str, db_name: str):
        if db_type == DBType.Type_All:
            with self._seed_db_lock:
                seed = SeedSiteDB(db_name)
                seed.drop_table()
                seed.close()
            with self._external_db_lock:
                external = ExternalSiteDB(db_name)
                external.drop_table()
                external.close()
            with self._result_db_lock:
                filtered = FilteredResultDB(db_name)
                filtered.drop_table()
                filtered.close()
            with self._result_bad_db_lock:
                filtered_bad = FilteredResultDB(db_name, bad_db=True)
                filtered_bad.drop_table()
                filtered_bad.close()

        elif db_type == DBType.Type_External:
            with self._external_db_lock:
                external = ExternalSiteDB(db_name)
                external.drop_table()
                external.close()

        elif db_type == DBType.Type_Filtered_Result:
            with self._result_db_lock:
                filtered = FilteredResultDB(db_name)
                filtered.drop_table()
                filtered.close()
        elif db_type == DBType.Type_Filtered_Result_Bad:
            with self._result_bad_db_lock:
                filtered_bad = FilteredResultDB(db_name, bad_db=True)
                filtered_bad.drop_table()
                filtered_bad.close()

        self.update_db_stats(force_update=True)

    def add_seeds(self, seed):
        if isinstance(seed, MiningList):
            try:
                with self._seed_db_lock:
                    db = SeedSiteDB(seed.ref)
                    db.add_sites(seed.data)
                    db.close()
                self.update_db_stats(force_update=True)
            except Exception as ex:
                ErrorLogger.log_error("MiningMasterController.add_seeds()", ex, seed.ref)

    def get_db_stats(self):
        #print("copy db stats and send back")
        stats = MiningList(self.ref, self.db_stats)
        stats_copy = stats.copy_attrs()
        print("copy db stats completed")
        return stats_copy

    def get_filter_progress(self):
        if isinstance(self.filter_process, MemoryControlPs):
            state = self.filter_process.get_last_state()
            if isinstance(state, _FilterState):
                return state.progress, state.all_data
            else:
                return 0, 0
        else:
            return 0, 0

    def clear_host_cache(self):
        try:
            FileHandler.clear_dir(get_log_dir())
            FileHandler.clear_dir(get_recovery_dir_path())
            FileHandler.clear_dir(get_task_backup_dir())
            FileHandler.clear_dir(get_db_buffer_default_dir())
        except Exception as ex:
            ErrorLogger.log_error("MiningControllers", ex, "clear_host_cache()")

    def clear_slave_cache(self):
        if self.state == ServerState.State_Idle:
            threads = []
            for slave in self.slaves:
                if isinstance(slave, Server):
                    threads.append(MiningController(slave, cmd=ServerCommand.Com_Clear_Cache))
            if len(threads) > 0:
                for thread in threads:
                    thread.start()
                for thread in threads:
                    thread.join(30)
                threads.clear()

    def get_db_seed(self):
        return SeedSiteDB(table=self.ref, offset=0, db_filter=self.db_filters.seed_filter)

    def get_db_external(self):
        return ExternalSiteDB(table=self.ref, offset=0, db_filter=self.db_filters.external_filter)

    def get_db_filtered(self):
        return FilteredResultDB(table=self.ref, offset=0, db_filter=self.db_filters.filtered_result)

    def get_db_filtered_bad(self):
        return FilteredResultDB(table=self.ref, offset=0, bad_db=True, db_filter=self.db_filters.filtered_result)

    def get_db_redemption(self):
        return ExternalSiteDB(table="temp", db_addr=get_temp_db_dir()+"Redemption.db")

    def get_db_results(self, db_type: str, db_name: str, index: int, length: int) -> MiningList:
        try:
            if db_type == DBType.Type_Filtered_Result:
                with self._result_db_lock:
                    db = FilteredResultDB(db_name, offset=index)
                    data = db.get_next_patch(count=length, rollover=False)
                    db.close()
            elif db_type == DBType.Type_Filtered_Result_Bad:
                with self._result_bad_db_lock:
                    db = FilteredResultDB(db_name, bad_db=True, offset=index)
                    data = db.get_next_patch(count=length, rollover=False)
                    db.close()
            elif db_type == DBType.Type_External:
                with self._external_db_lock:
                    db = ExternalSiteDB(db_name, offset=index)
                    data = db.get_next_patch(count=length, rollover=False)
                    db.close()
            elif db_type == DBType.Type_Seed:
                with self._seed_db_lock:
                    db = SeedSiteDB(db_name, offset=index)
                    data = db.get_next_patch(count=length, rollover=False)
                    db.close()
            else:
                data = []
            result = MiningList(db_name, data)
            return result
        except Exception as ex:
            ErrorLogger.log_error("MiningController.get_db_results()", ex, db_name + " type:" + db_type)
            return MiningList(db_name, [])

    def auto_scale_slaves(self, count: int):
        pass

    def stop_all_slave(self):
        threads = []
        for slave in self.slaves:
            if isinstance(slave, Server):
                threads.append(MiningController(slave, cmd=ServerCommand.Com_Stop_Mining))
        if len(threads) > 0:
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join(30)
            threads.clear()

    def setup_minging_slaves(self):  # slaves should restart based on these new settings
        print("setup_minging_slaves....")
        threads = []
        for slave in self.slaves:
            if isinstance(slave, Server):
                threads.append(MiningController(slave, cmd=ServerCommand.Com_Setup,
                                                in_data=SetupData(self.name, cap2=self.cap_slave_process,
                                                                  cap3=self.concurrent_page,
                                                                  max_page_level=self.max_page_level,
                                                                  max_page_limit=self.max_page_limit)))
        if len(threads) > 0:
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join(30)
            threads.clear()
        print("setup_minging_slaves completed")

    def check_slaves_status(self, timeout=15):
        threads = []
        for slave in self.slaves:
            if isinstance(slave, Server):
                threads.append(MiningController(slave, cmd=ServerCommand.Com_Status))
        if len(threads) > 0:
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join(timeout)
            threads.clear()
        total_done = 0  # update number of job done, and job wait
        wait = 0
        for slave in self.slaves:
            if isinstance(slave, Server) and slave.status is not None:
                print(slave.status)
                total_done += slave.status.done_job
                wait += slave.status.wait_job
        if total_done > self.job_done:
            self.job_done = total_done
        self.job_wait = wait

    @staticmethod
    def is_in_list(data, target_list: list):
        if len(target_list):
            target = next((x for x in target_list if data.domain == x.domain), None)
            return target is not None
        else:
            return False

    def get_slaves_result(self) -> list:
        threads = []
        result = []
        resultList = []
        for slave in self.slaves:
            if isinstance(slave, Server):
                if isinstance(slave.status, dict):
                    print("in get_slaves_result, data type is wrong")
                    print(slave.status)
                    slave.status = Serializable.get_deserialized(slave.status)
                elif slave.status is not None and slave.status.result > 0:
                    result_holder = []
                    threads.append(MiningController(slave, cmd=ServerCommand.Com_Get_Data, out_data=result_holder))
                    result.append(result_holder)
        if len(threads) > 0:
            for thread in threads:
                thread.start()
            for thread in threads:
                thread.join(30)
            threads.clear()
            if len(result) > 0:
                for item in result:
                    if isinstance(item, list) and len(item) > 0 and hasattr(item[0], "data"):  # data is the attribute of MiningList
                        data = getattr(item[0], "data")
                        resultList += data
        return resultList

    def is_in_slave_list(self, addr: str) -> bool:
        if addr != "":
            match = next((x for x in self.slaves if x.address.address == addr), None)
            return match is not None
        else:
            return False

    def add_slaves(self, slaves: list):
        print("adding slave...")
        if slaves is not None:
            for slave in slaves:
                if isinstance(slave, ServerAddress):
                    if slave.address == "localhost":
                        slave.address = "127.0.0.1"
                    if self.is_in_slave_list(slave.address):
                        continue
                    ser = Server(server_type=ServerType.ty_MiningSlaveSmall, address=slave)
                    self.slaves.append(ser)
                elif isinstance(slave, str):
                    print("adding slave:", slave)
                    temp = slave
                    if slave == "localhost":
                        temp = "127.0.0.1"
                    if self.is_in_slave_list(temp):
                        continue
                    ser = Server(server_type=ServerType.ty_MiningSlaveSmall, address=ServerAddress(temp, MiningTCPServer.DefaultListenPort))
                    self.slaves.append(ser)
        print("adding slave finished.")

    def remove_slaves(self, slaves: list):
        if slaves is not None:
            for slave in slaves:
                found = None
                if isinstance(slave, ServerAddress):
                    found = next((x for x in self.slaves if x.address.address == slave.address), None)
                elif isinstance(slave, str):
                    found = next((x for x in self.slaves if x.address.address == slave), None)
                if found is not None:
                    self.slaves.remove(found)

    def get_slaves(self):
        return self.slaves

    def allocate_task(self):
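        # Top up any responsive slave whose backlog (all_job - done_job) has
        # dropped to at most four times its process capacity: slaves reporting
        # themselves down are skipped, and each batch read from the seed
        # database is pushed to its slave synchronously with a 30 second join timeout.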

        try:
            threads = []
            with self._seed_db_lock:
                db_seed = SeedSiteDB(offset=self.offset, table=self.ref, db_filter=self.db_filters.seed_filter)
                for slave in self.slaves:
                    if isinstance(slave, Server) and slave.status is not None:
                        if isinstance(slave.status, dict):
                            print("in allocate_task, data type is invalid, the following data was received, need to redo")
                            print(slave.status)
                            slave.status = Serializable.get_deserialized(slave.status)
                            #raise ValueError("slave status is not valid data type")
                        if slave.status.is_server_down():
                            print("server is down, continue..")
                            continue
                        if slave.status.all_job - slave.status.done_job <= slave.status.cap_process * 4:
                            job_temp = int(slave.status.cap_process)  # give roughly a quarter-hour's worth of data
                            if job_temp < 5:  # give minimum of 5 jobs
                                job_temp = 5
                            if not self.loopback_database and job_temp + self.job_allocated > self.job_all:
                                job_temp = self.job_all - self.job_allocated
                            sites = db_seed.get_next_patch(count=job_temp, rollover=self.loopback_database)
                            print("allocate task:", len(sites))
                            self.job_allocated += len(sites)
                            ref = db_seed.tab
                            mlist = MiningList(ref, sites)
                            number_sites = len(sites)
                            if number_sites > 0:
                                # try:
                                #     CsvLogger.log_to_file(slave.address.address, [(link,) for link in sites],
                                #                           dir_path=get_task_backup_dir())
                                # except:
                                #     pass
                                # self.offset += number_sites

                                t = MiningController(slave, cmd=ServerCommand.Com_Data, in_data=mlist)
                                try:
                                    t.start()
                                    t.join(30)
                                    self.offset += number_sites
                                except Exception as inner_ex:
                                    print(inner_ex)
                            else:
                                return
                db_seed.close()
            # if len(threads) > 0:
            #     for thread in threads:
            #         thread.start()
            #     for thread in threads:
            #         thread.join()
            # threads.clear()
        except Exception as ex:
            ErrorLogger.log_error("MiningMasterController.allocate_task()", ex, self.ref)

    def allocate_task_v1(self):
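        # Legacy variant of allocate_task(): batches for all slaves are prepared
        # first and the MiningController threads are started together afterwards,
        # instead of pushing to each slave synchronously; each batch is also
        # backed up to a per-slave CSV in the task backup directory.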
        try:
            threads = []
            with self._seed_db_lock:
                db_seed = SeedSiteDB(offset=self.offset, table=self.ref, db_filter=self.db_filters.seed_filter)
                for slave in self.slaves:
                    if isinstance(slave, Server) and slave.status is not None:
                        if isinstance(slave.status, dict):
                            print("in allocate_task, data type is invalid, the following data was received, need to redo")
                            print(slave.status)
                            slave.status = Serializable.get_deserialized(slave.status)
                            #raise ValueError("slave status is not valid data type")
                        # note: chained comparison (backlog <= cap_process + 2 and cap_process + 2 > 2)
                        if slave.status.all_job - slave.status.done_job <= slave.status.cap_process + 2 > 2:
                            job_temp = int(slave.status.cap_process/2)  # give half of a quarter-hour's worth of data
                            if job_temp < 5:  # give minimum of 5 jobs
                                job_temp = 5
                            if not self.loopback_database and job_temp + self.job_allocated > self.job_all:
                                job_temp = self.job_all - self.job_allocated
                            sites = db_seed.get_next_patch(count=job_temp, rollover=self.loopback_database)
                            print("allocate task:")
                            print(sites)
                            self.job_allocated += len(sites)
                            ref = db_seed.tab
                            mlist = MiningList(ref, sites)
                            number_sites = len(sites)
                            if number_sites > 0:
                                try:
                                    CsvLogger.log_to_file(slave.address.address, [(link,) for link in sites],
                                                          dir_path=get_task_backup_dir())
                                except Exception:
                                    pass
                                self.offset += number_sites
                                threads.append(MiningController(slave, cmd=ServerCommand.Com_Data, in_data=mlist))
                            else:
                                return
                db_seed.close()
            if len(threads) > 0:
                for thread in threads:
                    thread.start()
                for thread in threads:
                    thread.join(30)
            threads.clear()
        except Exception as ex:
            ErrorLogger.log_error("MiningMasterController.allocate_task()", ex, self.ref)

    def process_filtering_output_results(self):
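        # Drain the filter output queue, splitting FilteredDomainData items into
        # good and bad result sets, persist each set to its own database and
        # append the collected tuples to filtered_domains.csv.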
        results = []
        bad_results = []
        tuples = []
        while not self._filter_output_queue.empty():
            item = self._filter_output_queue.get()
            if isinstance(item, FilteredDomainData):
                if len(item.exception) > 0:
                    bad_results.append(item)
                else:
                    results.append(item)
                tuples.append(item.to_tuple())
        if len(results) > 0:
            try:
                with self._result_db_lock:
                    db = self.get_db_filtered()
                    db.add_sites(results, skip_check=False)
                    db.close()
            except Exception as ex:
                ErrorLogger.log_error("MingingMasterController", ex, "process_filtering_output_results() " + self.ref)
            finally:
                CsvLogger.log_to_file("filtered_domains.csv", tuples)
        if len(bad_results) > 0:
            try:
                with self._result_bad_db_lock:
                    bad_db = self.get_db_filtered_bad()
                    bad_db.add_sites(bad_results, skip_check=False)
                    bad_db.close()
            except Exception as ex:
                ErrorLogger.log_error("MingingMasterController", ex, "process_filtering_output_results() " + self.ref)
            finally:
                CsvLogger.log_to_file("filtered_domains.csv", tuples)

    def process_result(self, result: list):
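        # Dispatch slave results by type: ScrapeDomainData with a MightBeExpired
        # response code goes to the redemption DB, anything else is stored in the
        # external DB and queued for filtering; SeedSiteFeedback items update the
        # seed database.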
        if result is not None and len(result) > 0:
            print("processing external site and seeds results")
            external = []
            sitesfeedback = []
            redemption_list = []
            try:
                for item in result:
                    #print("item: ", str(item.__dict__))
                    if isinstance(item, ScrapeDomainData):
                        #print(item)
                        #if not MiningMasterController.is_in_list(item, external) and not all_external.is_domain_in_db(item.domain):
                        raw_data = (item.domain, item.code)
                        if item.code == ResponseCode.MightBeExpired:
                            redemption_list.append(raw_data)
                        else:
                            external.append(raw_data)
                            self._filter_input_queue.put(raw_data)  # also put into filtering queue
                    elif isinstance(item, SeedSiteFeedback):
                        #print("udpate:", str(item.__dict__))
                        sitesfeedback.append(item)
                    else:
                        continue
                with self._external_db_lock:
                    all_external = self.get_db_external()
                    all_external.add_sites(external, True)
                    all_external.close()
                with self._redemption_db_lock:
                    redemption_db = self.get_db_redemption()
                    redemption_db.add_sites(redemption_list, True)
                    redemption_db.close()
                with self._seed_db_lock:
                    seed_sites = self.get_db_seed()
                    seed_sites.update_sites(sitesfeedback)
                    seed_sites.close()
            except Exception as ex:
                ErrorLogger.log_error("MingingMasterController", ex, "process_result() " + self.ref)

    def pause(self):
        self.in_progress = False
        self.stop_all_slave()
        self.stop_Mining = True

    def stop(self):
        print("external set to stop!")
        self._stop_event.set()
        self.in_progress = False
        self.stop_all_slave()

    def continue_work(self):
        self.stop_Mining = False

    def _filtering_process_wrapper(self):
        self.filter_process = MemoryControlPs(
            func=filtering_process,
            func_kwargs=FilterController.get_input_parameters(
                "filtering.db", get_recovery_dir_path(),
                self._filter_input_queue, self._filter_output_queue,
                self._stop_event, self._filter_matrix, self._accounts,
                self._filtering_only, self._filtering_offset,
                self._filtering_total),
            external_stop_event=self._stop_event)
        self.filter_process.start()

    def run(self):  # this is the normal routine; slaves should be set up before calling this
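        # Main loop: poll slave status, allocate seed batches, pull and persist
        # slave results, flush the filter output queue and refresh DB stats,
        # sleeping 15 seconds between passes until the stop event is set.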
        filter_t = threading.Thread(target=self._filtering_process_wrapper)
        if len(self.slaves) > 0 or self._filtering_only:
            filter_t.start()

        while not self._stop_event.is_set():
            self.state = ServerState.State_Idle
            print("check status")
            self.check_slaves_status()
            #time.sleep(1)
            if not self.stop_Mining and (len(self.slaves) > 0 or self._filtering_only):
                self.state = ServerState.State_Active
                if not self._filtering_only:
                    print("allocate task")
                    self.allocate_task()
                    #time.sleep(1)
                    print("get and process results")
                    result = self.get_slaves_result()
                    self.process_result(result)
                    result.clear()
                self.process_filtering_output_results()  # get filtered results into the filtered DB
                if (self.loopback_database or self.job_done < self.job_all) and len(self.slaves) > 0:
                    self.end_time = time.time()
                    self.in_progress = True
                elif self._filtering_only:
                    self.end_time = time.time()
                    self.in_progress = True
                else:
                    self.in_progress = False
                print("finished getting results")
            self.update_db_stats()
            print("update db finished")
            if self._stop_event.is_set():
                break

            time.sleep(15)
        print("should finish filtering process!")
        if filter_t.is_alive():
            filter_t.join()
        print("master server shut down!")