def testExternalDbBuffer(self):
    backup_db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/ResultSitesList.db"
    table = "24/10/2015 Gardening"
    backup_db = ExternalSiteDB(table, db_addr=backup_db_addr)
    total = backup_db.site_count(False)
    count = 0
    db_addr = "/Users/superCat/Desktop/PycharmProjectPortable/sync/"
    stop_event = Event()
    db_buffer = ExternalTempDataDiskBuffer("TempDB.db", None, stop_event, dir_path=db_addr)
    db_buffer._input_convert_tuple = False
    while count < total:
        results = backup_db.get_next_patch_no_rollover(count, 5000)
        print(count)
        db_buffer.write(results)
        count += 5000
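The test above only exercises the write path of ExternalTempDataDiskBuffer. Below is a minimal read-back sketch, assuming the read-side API used by the classes later in this section (start_input_output_cycle(), plain iteration over the buffer, terminate()) and that the buffer class is importable from the project; the helper name read_back_rows and the row limit are hypothetical.

# Hedged read-back sketch; assumes the iteration API shown elsewhere in this section.
from threading import Event


def read_back_rows(dir_path: str, limit: int = 10) -> list:
    stop_event = Event()
    # The test above also passes None where other callers pass an ExternalTempInterface owner.
    db_buffer = ExternalTempDataDiskBuffer("TempDB.db", None, stop_event, dir_path=dir_path)
    db_buffer.start_input_output_cycle()  # begin moving rows from disk to the output side
    rows = []
    try:
        for item in db_buffer:  # the buffer yields rows when iterated, as the classes below do
            rows.append(item)
            if len(rows) >= limit:
                break
    finally:
        stop_event.set()
        db_buffer.terminate()
    return rows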
class FilterController(FeedbackInterface, ExternalTempInterface):
    """
    This controller can be memory-controlled since it implements FeedbackInterface.
    """

    def __init__(self, db_ref: str, db_dir: str, input_queue: Queue, output_queue: Queue,
                 stop_event: Event, matrix: CrawlMatrix, accounts: list,
                 force_mode=False, force_mode_offset=0, force_mode_total=0, **kwargs):
        FeedbackInterface.__init__(self, **kwargs)
        self._stop_event = stop_event
        self._matrix = matrix
        self._db_ref = db_ref
        self._input_queue = input_queue
        self._output_queue = output_queue
        self._pool_input = Queue()
        self._pool = FilterPool(self._pool_input, self._output_queue, self._queue_lock,
                                self._stop_event, self._matrix, accounts=accounts)
        # buf_size and output_f control the data flow speed; they can keep the
        # input:output ratio at 1:1 for up to 10 million data rows per hour.
        self._db_buffer = ExternalTempDataDiskBuffer(self._db_ref, self, self._stop_event,
                                                     dir_path=db_dir, buf_size=2500, output_f=5000)
        # FeedbackInterface.__init__(self, **kwargs)
        ExternalTempInterface.__init__(self)
        self._populate_with_state()
        if force_mode:
            new_state = _FilterState(progress=force_mode_offset, all_data=force_mode_total)
            self.populate_with_state(new_state)

    @staticmethod
    def get_input_parameters(db_ref: str, db_dir: str, input_queue: Queue, output_queue: Queue,
                             stop_event: Event, matrix: CrawlMatrix, accounts: list,
                             force_mode: bool, force_mode_offset: int, force_mode_total: int):
        return {
            "db_ref": db_ref,
            "db_dir": db_dir,
            "input_queue": input_queue,
            "output_queue": output_queue,
            "stop_event": stop_event,
            "matrix": matrix,
            "accounts": accounts,
            "force_mode": force_mode,
            "force_mode_offset": force_mode_offset,
            "force_mode_total": force_mode_total,
        }

    def _sample_data_buffer_input(self):
        while not self._stop_event.is_set():
            data_list = []
            # if isinstance(self._queue_lock, multiprocessing.RLock):
            while not self._input_queue.empty():
                # with self._queue_lock:
                data = self._input_queue.get()
                if isinstance(data, tuple) and len(data) == 2:
                    data_list.append(data)
            if len(data_list) > 0:
                self._db_buffer.append_to_buffer(data_list, convert_tuple=False)
                data_list.clear()
            time.sleep(1)

    def _sample_data_buffer_output(self):
        for item in self._db_buffer:
            if self._stop_event.is_set():
                break
            if isinstance(item, OnSiteLink):
                self._pool_input.put(FilteredDomainData(domain=item.link, found=int(time.time())))

    def begin_filtering(self):
        self._start_sending_feedback()
        self._db_buffer.start_input_output_cycle()  # start input and output of data to/from file
        input_t = threading.Thread(target=self._sample_data_buffer_input)
        output_t = threading.Thread(target=self._sample_data_buffer_output)
        input_t.start()  # start sampling data
        output_t.start()
        self._pool.start()
        if self._pool.is_alive():
            self._pool.join()
        self._db_buffer.terminate()
        if input_t.is_alive():
            input_t.join()
        if output_t.is_alive():
            output_t.join()
        self._end_sending_feedback()

    def populate_with_state(self, state):
        """
        FeedbackInterface, subclasses implement this method to restore state.
        :param state: the state from the previous iteration
        :return:
        """
        if isinstance(state, _FilterState):
            self._pool.set_job_done(state.progress)
            self._db_buffer.set_progress(state.progress)

    def get_state(self):
        """
        FeedbackInterface, subclass this so that the controller can gather state info,
        which in turn feeds into the next iteration.
        :return:
        """
        return _FilterState(progress=self._pool.get_job_done(),
                            all_data=self._db_buffer.get_total_record())

    def get_callback_data(self):
        """
        FeedbackInterface, subclass this so that any callback data can be gathered by the controller.
        :return:
        """
        return None

    def is_programme_finshed(self):
        """
        FeedbackInterface, indicates whether the programme has finished execution;
        if not, it goes to the next iteration.
        :return:
        """
        return self._stop_event.is_set()

    def get_external_count_finished(self) -> int:
        """
        ExternalTempInterface, get the number of jobs done in ExternalTempDataDiskBuffer.
        :return:
        """
        return self._pool.get_job_done()

    def set_internal_count(self, count: int):
        """
        ExternalTempInterface, set the number of jobs done in ExternalTempDataDiskBuffer.
        :param count:
        :return:
        """
        self._pool.set_job_done(count)
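A hedged wiring sketch for FilterController, based only on the constructor and begin_filtering() shown above. The file name, directory, queue/event primitives, account list, and the pre-configured CrawlMatrix instance are placeholders supplied by the surrounding application.

# Hypothetical wiring for FilterController (names and values are placeholders).
from multiprocessing import Queue, Event

stop_event = Event()
params = FilterController.get_input_parameters(
    db_ref="filter_input.db",        # placeholder disk-buffer file name
    db_dir="/tmp/filter_buffers/",   # placeholder directory for the disk buffer
    input_queue=Queue(),             # receives (link, response_code) tuples
    output_queue=Queue(),            # receives FilteredDomainData results
    stop_event=stop_event,
    matrix=crawl_matrix,             # a pre-configured CrawlMatrix instance (assumed to exist)
    accounts=[],                     # account list used by FilterPool (assumed empty here)
    force_mode=False,
    force_mode_offset=0,
    force_mode_total=0,
)
controller = FilterController(**params)
controller.begin_filtering()         # blocks until stop_event is set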
class WhoisChecker(FeedbackInterface, ExternalTempInterface, ProgressLogInterface):
    """
    usage:
        checker = WhoisChecker(...)
        checker.run_farm()
    """

    def __init__(self, stop_event: Event, input_queue: Queue=None, output_queue: Queue=None,
                 max_worker: int=10, dir_path="", is_debug=False, **kwargs):
        self._is_debug = is_debug
        FeedbackInterface.__init__(self, **kwargs)
        ExternalTempInterface.__init__(self)
        # do not use the predefined queues here
        # self._input_q = input_queue
        # self._output_q = output_queue
        self._stop_event = stop_event
        self._internal_stop_event = Event()
        self._max_worker = max_worker
        self._job_done = 0
        self._job_done_shadow = 0
        self._job_done_lock = RLock()
        self._input_period = 0.0001  # time to sample data into the buffer
        self._max_sample_results = 100000
        self._min_sampling_duration = 0.0001
        self._sample_batch_size = 5000
        self._sample_batch_timeout = 60
        if is_debug:
            self._min_buff_delete_threshold = 10000  # default is 100000
        else:
            self._min_buff_delete_threshold = 100000
        self._speed_penalty_count = 0
        self._finished = False
        manager, self._output_q = get_queue_client(QueueManager.MachineSettingCrawler,
                                                   QueueManager.Method_Whois_Output)
        self._db_buffer = ExternalTempDataDiskBuffer("whois_check.db", self, self._internal_stop_event,
                                                     buf_size=self._max_worker * 50,
                                                     terminate_callback=WhoisChecker.terminate_callback,
                                                     dir_path=dir_path)
        self._populate_with_state()  # FeedbackInterface
        if not is_debug:
            log_period = 120
        else:
            log_period = 10
        self._progress_logger = ProgressLogger(log_period, self, self._internal_stop_event)

    @staticmethod
    def terminate_callback():
        ErrorLogger.log_error("WhoisChecker", StopIteration("terminated."))

    @staticmethod
    def get_input_parameters(input_queue: Queue, output_queue: Queue, stop_event: Event,
                             max_worker: int=20) -> dict:
        return {
            "input_queue": input_queue,
            "output_queue": output_queue,
            "stop_event": stop_event,
            "max_worker": max_worker,
        }

    def get_job_done_count(self):
        with self._job_done_lock:
            job_done = self._job_done
        return job_done

    def _add_job_done_one(self):
        with self._job_done_lock:
            self._job_done += 1

    def _put_output_result_in_queue(self, domain_data: OnSiteLink):
        if not self._stop_event.is_set() or not self._internal_stop_event.is_set():
            try:
                self._output_q.put((domain_data.link, domain_data.response_code))
            except Exception as inner_ex:
                if self._output_q is None:
                    manager, self._output_q = get_queue_client(QueueManager.MachineSettingCrawler,
                                                               QueueManager.Method_Whois_Output)
                ErrorLogger.log_error("WhoisChecker", inner_ex,
                                      addtional="failed to put result in queue.")
                time.sleep(0.01)
                self._put_output_result_in_queue(domain_data)

    def _check_whois_v1(self, domain_data: OnSiteLink):
        root_domain = domain_data.link
        try:
            if root_domain.startswith("http"):
                root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
            real_response_code = domain_data.response_code
            whois = LinkChecker.check_whois(root_domain)  # check whois record
            if whois[0]:
                if whois[2]:  # domain is expired
                    real_response_code = ResponseCode.Expired
                else:
                    real_response_code = ResponseCode.MightBeExpired
            if real_response_code == ResponseCode.Expired:
                # if ResponseCode.domain_might_be_expired(real_response_code):
                domain_data.link = root_domain
                domain_data.response_code = real_response_code
                # return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
                # if isinstance(self._queue_lock, multiprocessing.RLock):
                with self._queue_lock:
                    self._output_q.put((domain_data.link, domain_data.response_code))
        except Exception as ex:
            ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex, "_check_whois() " + root_domain)
        finally:
            self._add_job_done_one()

    def _check_whois(self, domain_data: OnSiteLink):
        root_domain = domain_data.link.lower()
        try:
            if not self._is_debug:
                if root_domain.startswith("http"):
                    root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
                is_available, is_redemption = LinkChecker.is_domain_available_whois(root_domain)  # check whois record
                if is_available or is_redemption:
                    if is_available:
                        real_response_code = ResponseCode.Expired
                    else:
                        real_response_code = ResponseCode.MightBeExpired
                    domain_data.link = root_domain
                    domain_data.response_code = real_response_code
                    # return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
                    self._put_output_result_in_queue(domain_data)
                else:
                    self._put_output_result_in_queue(domain_data)
        except Exception as ex:
            ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex, "_check_whois() " + root_domain)
        finally:
            self._add_job_done_one()

    def _check_whois_with_dns(self, page: OnSiteLink):
        real_response_code = ResponseCode.DNSError
        skip_whois_check = False
        try:
            if not self._is_debug:
                root_result = LinkChecker.get_root_domain(page.link)
                root_domain = root_result[1]
                sub_domain = root_result[4]
                suffix = root_result[5]
                if len(sub_domain) == 0 or suffix not in TldUtility.TOP_TLD_LIST:
                    skip_whois_check = True
                else:
                    if LinkChecker.is_domain_DNS_OK(sub_domain):  # check DNS first
                        real_response_code = ResponseCode.NoDNSError
                        skip_whois_check = True
                    elif not sub_domain.startswith("www."):
                        if LinkChecker.is_domain_DNS_OK("www." + root_domain):
                            real_response_code = ResponseCode.NoDNSError
                            skip_whois_check = True
                    page.response_code = real_response_code
                    page.link_type = OnSiteLink.TypeOutbound
                    page.link = root_domain
        except Exception as ex:
            ErrorLogger.log_error("WhoisChecker", ex, "_check_whois_with_dns() " + page.link)
            skip_whois_check = True
        finally:
            if not skip_whois_check and real_response_code == ResponseCode.DNSError:
                self._check_whois(page)
            else:
                self._add_job_done_one()

    def _sample_data(self):
        ref_time = time.time()
        manager, result_queue = get_queue_client(QueueManager.MachineSettingCrawler,
                                                 QueueManager.Method_Whois_Input)
        if result_queue is None:
            ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker._sample_data()",
                                  ValueError("result queue is None, cannot get data."))
            if not (self._stop_event.is_set() or self._internal_stop_event.is_set()):
                self._sample_data()
        else:
            while not (self._stop_event.is_set() or self._internal_stop_event.is_set()):
                data_list = []
                counter = 0
                while not result_queue.empty():
                    data = None
                    try:
                        data = result_queue.get()
                    except Exception as ex:
                        ErrorLogger.log_error("WhoisChecker._sampling_data", ex)
                        if result_queue is None:
                            manager, result_queue = get_queue_client(QueueManager.MachineSettingCrawler,
                                                                     QueueManager.Method_Whois_Input)
                    if isinstance(data, OnSiteLink):
                        counter += 1
                        data_list.append((data.link, data.response_code))
                    elif isinstance(data, tuple) and len(data) == 2:
                        # print("External Site checker: received:", data)
                        counter += 1
                        data_list.append(data)
                    if isinstance(data, list):
                        data_list += data
                        counter += 1
                    if counter >= self._sample_batch_size:
                        break
                    current_time = time.time()
                    if current_time - ref_time >= self._sample_batch_timeout:
                        break
                    time.sleep(self._min_sampling_duration)
                ref_time = time.time()
                if len(data_list) > 0:
                    # print("whois checker input data in db_buff: ", len(data_list))
                    self._db_buffer.append_to_buffer(data_list, convert_tuple=False)
                    data_list.clear()
                else:
                    pass
                time.sleep(self._input_period)

    def sample_gen(self):
        while not self._stop_event.is_set() or not self._internal_stop_event.is_set():
            try:
                if not self._db_buffer.reset_event.is_set():
                    for next_item in self._db_buffer:
                        time.sleep(0.0001)
                        return next_item
                print("going to next cycle in sample_gen.")
                time.sleep(1)
            except Exception as ex:
                print("sample_gen error:", ex)

    def run_farm(self):
        try:
            self._start_sending_feedback()
            input_t = threading.Thread(target=self._sample_data)
            input_t.start()  # start sampling data
            self._progress_logger.start()
            self._db_buffer.start_input_output_cycle()  # start input and output of data to/from file
            pool = ThreadPool(processes=self._max_worker)
            # pool.imap_unordered(self._check_whois_with_dns, self._db_buffer, chunksize=1)
            pool.imap_unordered(self._check_whois_with_dns, iter(self.sample_gen, None), chunksize=1)
            while not self._stop_event.is_set() or not self._internal_stop_event.is_set():
                time.sleep(1)
            if self._stop_event.is_set():
                self._internal_stop_event.set()
            input_t.join()
            self._progress_logger.join()
            self._db_buffer.terminate()
            if self._stop_event.is_set():
                self._finished = True
            self._end_sending_feedback()
        except Exception as ex:
            if self._stop_event.is_set():
                self._finished = True
            ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex,
                                  "run_farm() index at:" + str(self._job_done))

    def get_external_count_finished(self):
        """
        ExternalTempInterface
        :return:
        """
        return self.get_job_done_count()

    def set_internal_count(self, count: int):
        """
        ExternalTempInterface
        :param count:
        :return:
        """
        self._job_done = count

    def populate_with_state(self, state):
        """
        FeedbackInterface, subclasses implement this method to restore state.
        :param state: the state from the previous iteration
        :return:
        """
        if isinstance(state, WhoisCheckerState):
            self._job_done = state.progress_count
            self._db_buffer.set_progress(state.progress_count)

    def get_state(self):
        """
        FeedbackInterface, subclass this so that the controller can gather state info,
        which in turn feeds into the next iteration.
        :return:
        """
        if self._internal_stop_event.is_set():
            return WhoisCheckerState(0, 0)
        else:
            return WhoisCheckerState(self.get_job_done_count(), self._db_buffer.get_total_record())

    def get_callback_data(self):
        """
        FeedbackInterface, subclass this so that any callback data can be gathered by the controller.
        :return:
        """
        return None

    def is_programme_finshed(self):
        """
        FeedbackInterface, indicates whether the programme has finished execution;
        if not, it goes to the next iteration.
        :return:
        """
        return self._finished

    def is_progamme_need_restart(self):
        """
        FeedbackInterface, indicates whether the programme needs to be restarted;
        if not, it goes to the next iteration.
        """
        return self._internal_stop_event.is_set()

    def get_file_name(self) -> str:
        """
        ProgressLogInterface, the file name used to save progress in the file system.
        :return:
        """
        return "whois_check_progress.csv"

    def get_column_names(self) -> []:
        """
        ProgressLogInterface, the column name for each progress entry in get_progress(), all in str format.
        :return: array containing the column names; its length should match the length of the progress entries
        """
        return ["Done", "Total"]

    def reset(self):
        self._job_done = 0
        self._job_done_shadow = 0
        self._speed_penalty_count = 0

    def get_progress(self) -> []:
        """
        ProgressLogInterface, get the progress data in tuple format so that it can be
        compiled into the standard format.
        :return: array containing progress data, with exactly the length of the column names in get_column_names()
        """
        total_record = self._db_buffer.get_total_record()
        if ((self._job_done == self._job_done_shadow and self._job_done > 0)
                or (self._job_done > self._min_buff_delete_threshold * 0.9
                    and total_record > self._min_buff_delete_threshold)):
            self._speed_penalty_count += 1
            if self._speed_penalty_count >= 2:
                ErrorLogger.log_error("WhoisChecker.get_progress()",
                                      TimeoutError("progress is stuck, restarted internals."),
                                      self._db_buffer._file_name)
                print("going to clear cache")
                self._db_buffer.clear_cache()
                self.reset()
                total_record = 0
                self._db_buffer.start_input_output_cycle()
            else:
                print("no need to clear cache.")
        else:
            self._job_done_shadow = self._job_done
            self._speed_penalty_count = 0
        return [self._job_done, total_record]

    def get_limit(self) -> int:
        """
        ProgressLogInterface, the number of samples you want to collect.
        :return: max number of samples
        """
        return self._max_sample_results
class SiteChecker(FeedbackInterface, SiteTempDataSrcRefInterface, ProgressLogInterface, ExternalTempInterface):

    full_link_key = "full_link"
    datasource_key = "data_source"
    controller_ley = "controller"
    max_level_key = "max_level"
    max_page_key = "max_page"
    output_queue_key = "output_queue"

    _use_lxml_parser = False

    def __init__(self, full_link: str="", data_source: SiteTempDataSrcInterface=None,
                 controller: SiteCheckerController=None,
                 max_level=10, max_page=1000, delegate=None, output_buff_size=2000,
                 output_queue=None, output_all_external=False, result_delegate=None,
                 memory_control_terminate_event=None, check_robot_text=True, **kwargs):
        """
        :param full_link: the full link of a domain, e.g. https://www.google.co.uk
        :param max_level: stop crawling when this level is reached
        :param max_page: maximum number of pages to check within a site; also stops crawling
        :param delegate: if this is not None, it is sent the latest results for external domains
                         with ResponseCode == 404 or 999
        :param result_delegate: sent the site_info upon finish
        :param memory_control_terminate_event: if this is not None and is set, it is able to
                                               terminate an externally memory-controlled process
        :return:
        """
        FeedbackInterface.__init__(self, **kwargs)
        # super(SiteChecker, self).__init__(**kwargs)
        if full_link is None or len(full_link) == 0:
            raise ValueError()

        original_path = ""
        try:
            paras = urlsplit(full_link)
            self.scheme, self.domain, original_path = paras[0], paras[1], paras[2]
        except:
            pass

        domain_data = LinkChecker.get_root_domain(full_link, False)
        self.root_domain = domain_data[1]
        self.sub_domain = domain_data[4]
        self.domain_suffix = domain_data[5]
        self.sub_domain_no_local = self.sub_domain.strip(self.domain_suffix)
        if self.scheme == "":
            self.scheme = "http"
        if self.domain == "":
            self.domain = self.root_domain
        self.orginal_link = full_link
        self.domain_link = LinkChecker.get_valid_link(self.root_domain, full_link, self.scheme)
        self.max_level = max_level
        self.max_page = max_page
        self.page_count = 0  # keep track of pages done
        self._page_count_shadow = 0  # track the previous count
        self._all_page_count_shadow = 0  # track the previous count in the data source
        self.internal_page_count = 0
        self.internal_page_last_count = 0
        self.page_allocated = 0
        self.current_level = 0  # if this == 0, it is the root domain/home page
        self._stop_event = Event()
        valid_file_name = SiteTempDataSrcInterface.get_valid_file_name(self.domain_link)
        self._external_db_buffer = ExternalTempDataDiskBuffer(valid_file_name + ".ext.db", self,
                                                              stop_event=self._stop_event,
                                                              buf_size=int(output_buff_size / 2),
                                                              dir_path=get_db_buffer_default_dir(),
                                                              convert_output=False)
        self._external_db_buffer.append_to_buffer([(self.root_domain, ResponseCode.DNSError), ],
                                                  convert_tuple=False)
        self._memory_control_terminate_event = memory_control_terminate_event
        self.task_control_lock = threading.RLock()
        if data_source is None:
            # self.data_source = SiteTempDataDisk(self.root_domain, ref_obj=self)
            self.data_source = SiteTempDataDiskWithBuff(ref=self.domain_link,
                                                        output_buff_size=output_buff_size, ref_obj=self)
        else:
            self.data_source = data_source  # a list of OnSiteLink
        self.delegate = delegate
        if LinkChecker.might_be_link_html_page(original_path):
            self.data_source.append(OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1))
        # add the root domain as a starting point
        self.data_source.append(OnSiteLink(self.scheme + "://www." + self.sub_domain, ResponseCode.LinkOK, link_level=1))
        self.data_source.append(OnSiteLink(self.scheme + "://" + self.domain, ResponseCode.LinkOK, link_level=1))
        self.cache_list = []  # internal page cache
        self.page_need_look_up_temp = 0
        self.cache_list.append(self.domain_link)
        if "www." not in self.sub_domain:
            self.cache_list.append(self.scheme + "://www." + self.sub_domain)
        self.cache_list.append(self.scheme + "://" + self.domain)
        self.page_need_look_up = self.data_source.count_all()
        self.cache_size = 500  # a small cache list to avoid checking links in the file system with lots of reads and writes
        self._double_check_cache_lock = threading.RLock()
        self._double_check_cache = deque(maxlen=self.cache_size)
        self.external_cache_list = []
        self.external_cache_size = 500  # cache that holds external sites
        self.external_links_checked = 0
        self.add_internal_page_OK_only = True
        self.output_queue = output_queue
        self.output_all_external = output_all_external
        self.controller = controller
        self.result_delegate = result_delegate
        self.page_count_lock = threading.RLock()
        self.internal_page_count_lock = threading.RLock()
        self.level_lock = threading.RLock()
        self.page_look_up_lock = threading.RLock()
        self.external_link_check_lock = threading.RLock()
        self._finihsed = False
        self.task_control_max = 1
        self.agent = "VegeBot (we follow your robots.txt settings before crawling, you can slow down the bot by change the Crawl-Delay parameter in the settings." \
                     "if you have an enquiry, please email to: [email protected])"
        self.agent_from = "*****@*****.**"
        if check_robot_text:
            self.robot_agent = LinkChecker.get_robot_agent(self.sub_domain, protocol=self.scheme)
        else:
            self.robot_agent = None
        self.site_crawl_delay = 0.60

        if isinstance(self.robot_agent, Rules):
            delay_temp = self.robot_agent.delay(self.agent)
            if delay_temp is not None and delay_temp != self.site_crawl_delay:
                self.site_crawl_delay = delay_temp

        self.task_control_counter = 1
        self._speed_penalty_count = 0
        self._speed_penalty_threshold = 10
        self._progress_logging_speed = 120
        self._output_period = 120
        self._output_batch_size = 100
        self._death_wish_sent = False
        SiteChecker._is_lxml_parser_exist()
        self._output_thread = None
        self._output_queue = None
        self.progress_logger = ProgressLogger(self._progress_logging_speed, self, self._stop_event)
        self._status = "Start"
        self._populate_with_state()  # restore the last known state
        # self.data_source.additional_startup_procedures()  # use the data set in self._populate_with_state() to start

    # def _empty_external_links_db(self):
    #     if self.output_queue is not None:

    def _put_result_in_output_queue_loop(self, item_list: list):
        if not self._stop_event.is_set():
            try:
                self._output_queue.put(item_list, True, 2)
            except Exception as ex:
                if self._output_queue is None:
                    manager, self._output_queue = get_queue_client(QueueManager.MachineSettingCrawler,
                                                                   QueueManager.Method_Whois_Input)
                time.sleep(0.1)
                ErrorLogger.log_error("SiteChecker._get_external_links_to_queue",
                                      self.sub_domain + " " + str(ex))
                self._put_result_in_output_queue_loop(item_list)

    def _get_external_links_to_queue(self):
        ref_time = time.time()
        manager, self._output_queue = get_queue_client(QueueManager.MachineSettingCrawler,
                                                       QueueManager.Method_Whois_Input)
        self.output_queue = self._output_queue  # override output_queue
        # if result_queue is None:
        #     ErrorLogger.log_error("SiteChecker._get_external_links_to_queue()",
        #                           ValueError("result queue is none, cannot put item in queue."))
        # else:
        batch = list()
        counter = 0
        for item in self._external_db_buffer:
            if self._stop_event.is_set() or self.external_links_checked >= self._external_db_buffer.count_all():
                try:
                    manager.shutdown()
                except:
                    pass
                finally:
                    # print("exist _get_external_links_to_queue")
                    # if self._stop_event.is_set() and self.external_links_checked >= self._external_db_buffer.count_all():
                    break
            elif isinstance(item, tuple):
                # print("outputting item: ", str(item))
                batch.append((item[0], item[1]))
                counter += 1
            if len(batch) > 0:
                current_time = time.time()
                # flush the batch periodically or when it is full
                if current_time - ref_time >= self._output_period or len(batch) >= self._output_batch_size:
                    self._put_result_in_output_queue_loop(batch)
                    self.external_links_checked += len(batch)
                    ref_time = time.time()
                    batch.clear()
            time.sleep(0.0001)

    @staticmethod
    def _is_lxml_parser_exist():
        try:
            import lxml
        except ImportError:
            SiteChecker._use_lxml_parser = False
        else:
            SiteChecker._use_lxml_parser = True

    def use_lxml_parser(self):
        return SiteChecker._use_lxml_parser

    @staticmethod
    def get_input_parameter_base(full_link: str, max_page: int, max_level: int, output_queue) -> dict:
        return {SiteChecker.full_link_key: full_link,
                SiteChecker.max_page_key: max_page,
                SiteChecker.max_level_key: max_level,
                SiteChecker.output_queue_key: output_queue}

    def get_external_count_finished(self) -> int:
        """
        ExternalTempInterface, get the number of jobs done in ExternalTempDataDiskBuffer.
        :return:
        """
        return self.external_links_checked

    def set_internal_count(self, count: int):
        """
        ExternalTempInterface, set the number of jobs done in ExternalTempDataDiskBuffer.
        :param count:
        :return:
        """
        self.external_links_checked = count

    def _set_task_control_max(self, concurrent_task: int):
        if concurrent_task <= 0:
            raise ValueError
        self.task_control_max = concurrent_task
        self.task_control_counter = concurrent_task
        min_page_per_s = concurrent_task / 20
        self._speed_penalty_threshold = self._progress_logging_speed * min_page_per_s
        if self.site_crawl_delay > 1 / min_page_per_s:
            ErrorLogger.log_error("SiteChecker._set_task_control_max()",
                                  ValueError("site has a crawl delay greater than the max delay."),
                                  self.domain_link)
            self._status = "Stopped"
            self.sudden_death()

    def get_site_feedback(self) -> SeedSiteFeedback:
        return SeedSiteFeedback(self.orginal_link, page_count=self.get_page_need_look_up())

    def get_site_info(self) -> SiteInfo:
        # keep the original reference when sending back the site information
        info = SiteInfo(self.orginal_link, self.data_source)
        return info

    def populate_with_state(self, state):
        if state is not None and isinstance(state, SiteCheckerState):
            self._status = "Restarted"
            self.page_count = state.page_count
            self.page_allocated = state.page_count
            self.internal_page_count = state.internal_page_count
            self.internal_page_last_count = state.internal_page_count
            self.external_links_checked = state.external_page_count
            self._external_db_buffer.set_progress(state.external_page_count)
            self.page_need_look_up = state.page_need_look_up
            self.current_level = state.current_level
            self.progress_logger.set_reference(state.log_sample_index, state.log_started_time)
            counter = 0
            if self.data_source is not None:
                try:
                    for item in self.data_source.get_next():
                        if counter >= self.cache_size:
                            break
                        if isinstance(item, OnSiteLink) and not LinkChecker.is_external_link(self.root_domain, item.link):
                            self.cache_list.append(item.link)
                            # print("--restore: ", item)
                            counter += 1
                except Exception as ex:
                    msg = "error in SiteChecker.populate_with_state(), trying to populate cache, " + self.root_domain
                    ErrorLogger.log_error("SiteChecker", ex, msg)
                self.data_source.ref = state.datasource_ref
                self.data_source.output_c = state.datasource_output_c
                self.data_source.set_progress(state.datasource_index
                                              if state.datasource_index < state.page_count
                                              else state.page_count)
                self.data_source.set_continue_lock(True)

    def get_file_name(self):
        return self.data_source.ref

    def get_limit(self):
        return 100000

    def get_column_names(self):
        return ["Page Index", "External", "All", "Status"]

    def get_progress(self):
        data_source_count = self.data_source.count_all()
        if self.page_count - self._page_count_shadow <= self._speed_penalty_threshold:  # determine if the site is slow
            self._speed_penalty_count += 1
            if self._speed_penalty_count > 2:
                self._status = "Stopped"
                self.sudden_death()
        else:
            self._speed_penalty_count = 0
        if self.page_count == self._page_count_shadow and data_source_count == self._all_page_count_shadow:
            # determine if the site is stuck
            self._status = "Stopped"
            self.sudden_death()
        self._page_count_shadow = self.page_count
        self._all_page_count_shadow = data_source_count
        return [self.page_count, self.external_links_checked, data_source_count, self._status]

    def is_programme_finshed(self):
        return self._finihsed

    def get_callback_data(self):
        with self.page_count_lock:
            gap = self.internal_page_count - self.internal_page_last_count
            self.internal_page_last_count = self.internal_page_count
        seed_feedback = None
        if self._finihsed:
            seed_feedback = self.get_site_feedback()
        return SiteFeedback(gap, self._finihsed, seed_feedback=seed_feedback, datasource_ref=self.data_source.ref)

    def get_state(self):
        return SiteCheckerState(page_count=self.page_count,
                                page_need_look_up=self.page_need_look_up,
                                current_level=self.current_level,
                                internal_page_count=self.internal_page_count,
                                external_page_count=self.external_links_checked,
                                datasource_index=self.data_source.temp_counter,
                                datasource_output_c=self.data_source.output_c,
                                datasource_ref=self.data_source.ref,
                                log_started_time=self.progress_logger.begin_time,
                                log_sample_index=self.progress_logger.limit_counter,)

    def additional_reset(self):
        pass

    def addtional_clear(self):
        pass

    def stop(self):
        # natural stop
        self._status = "Stopped"
        self.progress_logger.report_progress()
        self._stop_event.set()
        if self.progress_logger.is_alive():
            self.progress_logger.join()

    def clear(self):
        self.cache_list.clear()
        self.addtional_clear()

    def acquire_task(self, level: int, link: str):
        tasked_acquired = True
        if link.endswith('/'):
            temp = link
        else:
            temp = link + '/'
        with self.task_control_lock:
            if len(self._double_check_cache) > 0:
                if temp in self._double_check_cache:
                    print("duplicate link found:", link)
                    tasked_acquired = False
                else:
                    if len(self._double_check_cache) >= self.cache_size:
                        self._double_check_cache.popleft()
                    self._double_check_cache.append(temp)
            self.task_control_counter -= 1
            self.page_allocated += 1
            if tasked_acquired:
                if level > self.current_level:
                    self.current_level = level
            # time.sleep(self.site_crawl_delay)
        return tasked_acquired

    def release_task(self, new_page: int):
        with self.task_control_lock:
            if self.page_need_look_up == 1 and new_page == 0:
                PrintLogger.print("set to stop data source")
                self.data_source.set_continue_lock(False)
            else:
                self.page_count += 1
                self.page_need_look_up += new_page
                # self.external_links_checked += external_page_count
                self.task_control_counter += 1
                # was: determine if it is an internal or external page
                self.internal_page_count += 1
                if self.internal_page_count > self.max_page or self.current_level > self.max_level:
                    if self.data_source.can_continue():
                        PrintLogger.print("set stop: " + str(self.internal_page_count) +
                                          " level: " + str(self.current_level))
                        self.data_source.set_continue_lock(False)

    def get_page_count(self):
        with self.page_count_lock:
            page_count = self.page_count
        return page_count

    def set_page_count(self, page_count: int):
        with self.page_count_lock:
            self.page_count = page_count

    def set_internal_page_count(self, count: int):
        with self.internal_page_count_lock:
            self.internal_page_count += count

    def get_internal_page_count(self):
        with self.internal_page_count_lock:
            count = self.internal_page_count
        return count

    def get_current_level(self):
        with self.level_lock:
            current_level = self.current_level
        return current_level

    def set_current_level(self, level):
        with self.level_lock:
            self.current_level = level

    def get_page_need_look_up(self):
        with self.page_look_up_lock:
            page_look_up = self.page_need_look_up
            # self.page_look_up_lock.release()
        return page_look_up

    def set_page_need_look_up(self, page_count):
        with self.page_look_up_lock:
            # time.sleep(0.1)
            self.page_need_look_up = page_count
            # self.page_look_up_lock.release()

    def set_page_need_look_up_plus_more(self, count: int):
        with self.page_look_up_lock:
            self.page_need_look_up += count

    def get_internal_page_progress_index(self) -> int:
        return self.get_page_count()

    def set_internal_page_progress_index(self, index: int):
        self.page_count = index
        self.page_allocated = index

    def is_idle(self):
        idle = False
        with self.task_control_lock:
            page_need_look_up = self.get_page_need_look_up()
            new_task_added = page_need_look_up - self.page_need_look_up_temp
            has_new_task = True if new_task_added > 0 else False
            # page_count = self.get_page_count()
            if has_new_task:
                self.page_need_look_up_temp = page_need_look_up
            else:
                if self.task_control_counter >= self.task_control_max:
                    idle = True
                    # print("is idle")
                # else:
                #     print("is working")
        return idle

    def add_link_to_cache(self, link):
        if len(self.cache_list) > self.cache_size:
            return
        else:
            if link.endswith('/'):
                self.cache_list.append(link)
            else:
                self.cache_list.append(link + '/')

    def is_link_in_cache(self, link):
        if link.endswith('/'):
            temp = link
        else:
            temp = link + '/'
        return True if temp in self.cache_list else False

    def reset_as(self, domain: str, link: str=""):
        # reset the target domain
        PrintLogger.print("crawl reset as: " + domain)
        self.domain = domain
        self.domain_link = self.scheme + "://" + self.domain
        self.page_count = 0
        self.current_level = 0
        self.set_page_need_look_up(1)
        # self.set_page_looked_up(0)
        self.clear()
        if len(link) == 0:
            self.cache_list.append(self.domain_link)
            self.data_source.re_target(self.domain_link,
                                       OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1))
            # self.data_source.append(OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1))
        else:
            self.cache_list.append(link)
            self.data_source.re_target(link, OnSiteLink(link, response_code=ResponseCode.LinkOK, link_level=1))
            # self.data_source.append(OnSiteLink(link, response_code=ResponseCode.LinkOK, link_level=1))
        self.additional_reset()
        self.data_source.additional_startup_procedures()

    def crawling(self):
        # call this method to start the operation
        self._start_sending_feedback()
        self._output_thread = threading.Thread(target=self._get_external_links_to_queue)
        if self.data_source.can_continue():
            self.data_source.additional_startup_procedures()  # use the data set in self._populate_with_state() to start
            self._external_db_buffer.start_input_output_cycle()
            self._output_thread.start()
            self.progress_logger.start()
            self.progress_logger.report_progress()  # log the first row
            self._status = "Work"
            self.begin_crawl()
            # prefix = "www."
            # page_count_limit = 2
            # if self.page_count <= page_count_limit and prefix not in self.domain_link:
            #     new_domain = prefix + self.sub_domain
            #     self.reset_as(new_domain)
            #     self._status = "Work"
            #     self.begin_crawl()
        # print("going to stop all.")
        self.stop()
        self.clear()
        self.data_source.additional_finish_procedures()
        # print("going to finish output buffer.")
        self._external_db_buffer.terminate()
        # print("going to stop output_thread.")
        if self._output_thread.is_alive():
            self._output_thread.join()
        PrintLogger.print("finished naturally: " + self.domain_link)
        self._finihsed = True
        # calling this at the end of the operation
        PrintLogger.print("send last response")
        self._end_sending_feedback()
        if self._memory_control_terminate_event is not None:
            self._memory_control_terminate_event.set()

    def sudden_death(self):
        if not self._finihsed:
            self._finihsed = True
            PrintLogger.print("start sudden death: " + self.orginal_link)
            # self.stop()
            self.stop()
            self.clear()
            self.data_source.set_continue_lock(False)
            self.data_source.additional_finish_procedures()
            self._external_db_buffer.terminate()
            if isinstance(self._output_thread, threading.Thread):
                if self._output_thread.is_alive():
                    self._output_thread.join()
            # calling this at the end of the operation
            PrintLogger.print("send last response")
            self._end_sending_feedback()
            if self._memory_control_terminate_event is not None:
                ErrorLogger.log_error("SiteChecker",
                                      TimeoutError("slow processing speed, terminated."),
                                      self.orginal_link)
                self._memory_control_terminate_event.set()

    def begin_crawl(self, level=0):
        # subclass this to produce different crawling behaviour
        pass
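A hedged usage sketch for SiteChecker, again with placeholder arguments: the crawl target is illustrative, the output queue and FeedbackInterface keyword arguments are omitted, and begin_crawl() is expected to be overridden by a subclass that performs the actual page fetching.

# Hypothetical driver for SiteChecker; the URL and limits are placeholders.
checker = SiteChecker(full_link="http://www.example.com",
                      max_level=3, max_page=200, check_robot_text=True)
checker.crawling()                   # runs the crawl; blocks until it finishes or sudden_death() fires
site_info = checker.get_site_info()  # SiteInfo wrapping the original link and the data source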