def run_farm(self):
    try:
        self._start_sending_feedback()
        input_t = threading.Thread(target=self._sample_data)
        input_t.start()  # start sampling data
        self._progress_logger.start()
        self._db_buffer.start_input_output_cycle()  # start input and output data to/from file
        pool = ThreadPool(processes=self._max_worker)
        # pool.imap_unordered(self._check_whois_with_dns, self._db_buffer, chunksize=1)
        pool.imap_unordered(self._check_whois_with_dns, iter(self.sample_gen, None), chunksize=1)
        while not self._stop_event.is_set() or not self._internal_stop_event.is_set():
            time.sleep(1)
            if self._stop_event.is_set():
                self._internal_stop_event.set()
        input_t.join()
        self._progress_logger.join()
        self._db_buffer.terminate()
        if self._stop_event.is_set():
            self._finished = True
        self._end_sending_feedback()
    except Exception as ex:
        if self._stop_event.is_set():
            self._finished = True
        ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex,
                              "run_farm() index at:" + str(self._job_done))
def force_clear(ref: str, dir_path="")->True: """ force to remove database in file :param ref: the file name :return: True if remove successfully, else false """ if len(dir_path) == 0: dir_path = get_db_buffer_default_dir() remove_ok = False filename = dir_path + ref try: time.sleep(1) # print("going to remove: ", filename) if os.path.exists(filename): os.remove(filename) # print("file removed: ", filename) temp_file = filename + TempDBInterface.sqlite_temp_suffix # print("going to remove:", temp_file) if os.path.exists(temp_file): os.remove(temp_file) # print("file removed: ", temp_file) # print("going to remove:", filename + TempDBInterface.sqlite_wal_suffix) if os.path.exists(filename + TempDBInterface.sqlite_wal_suffix): os.remove(filename + TempDBInterface.sqlite_wal_suffix) # print("file removed: ", filename + TempDBInterface.sqlite_wal_suffix) remove_ok = True except Exception as ex: msg = "error in SiteTempDatabase.force_clear(), " + filename ErrorLogger.log_error("SiteTempDatabase", ex, msg) finally: return remove_ok
def process_data(self, data: FilteredDomainData, **kwargs):
    # print("MozFilter processing: ", data)
    account = kwargs.get("Account")
    try:
        if isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount):
            if TldUtility.is_top_tld(data.domain):
                sleep_time = random.randint(self._min_sleep_time, self._max_wait)
                time.sleep(sleep_time)
                moz = MozCom(account)
                if not self._is_throughput_debug:
                    ranking = moz.get_ranking_data(data.domain)
                else:
                    ranking = 100
                data.da = ranking
            else:
                pass
        else:
            raise ValueError("account is none in process_data")
    except Exception as ex:
        ErrorLogger.log_error("MozFilter", ex, "process_data() " + str(data) + " account: " + account.userID)
    finally:
        PrintLogger.print("Moz processed: " + str(data) + " with: " + account.userID)
        if isinstance(data, FilteredDomainData):
            with self._sync_lock:
                self._job_done += 1
                if account is not None:
                    account.Available = True
            if data.da >= self._min_DA_value:
                if not self._is_throughput_debug:
                    CsvLogger.log_to_file(self._log_file, [(data.domain, data.da)])  # log this to file
                self._output_queue.put(data)
def _check_whois(self, domain_data: OnSiteLink):
    root_domain = domain_data.link.lower()
    try:
        if not self._is_debug:
            if root_domain.startswith("http"):
                root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
            is_available, is_redemption = LinkChecker.is_domain_available_whois(root_domain)  # check whois record
            if is_available or is_redemption:
                if is_available:
                    real_response_code = ResponseCode.Expired
                else:
                    real_response_code = ResponseCode.MightBeExpired
                domain_data.link = root_domain
                domain_data.response_code = real_response_code
                # return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
                self._put_output_result_in_queue(domain_data)
        else:
            self._put_output_result_in_queue(domain_data)
    except Exception as ex:
        ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex, "_check_whois() " + root_domain)
    finally:
        self._add_job_done_one()
def _check_whois_v1(self, domain_data: OnSiteLink):
    root_domain = domain_data.link
    try:
        if root_domain.startswith("http"):
            root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
        real_response_code = domain_data.response_code
        whois = LinkChecker.check_whois(root_domain)  # check whois record
        if whois[0]:
            if whois[2]:  # domain is expired
                real_response_code = ResponseCode.Expired
            else:
                real_response_code = ResponseCode.MightBeExpired
        if real_response_code == ResponseCode.Expired:
            # if ResponseCode.domain_might_be_expired(real_response_code):
            domain_data.link = root_domain
            domain_data.response_code = real_response_code
            # return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
            # if isinstance(self._queue_lock, multiprocessing.RLock):
            with self._queue_lock:
                self._output_q.put((domain_data.link, domain_data.response_code))
    except Exception as ex:
        ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex, "_check_whois_v1() " + root_domain)
    finally:
        self._add_job_done_one()
def close(self):
    try:
        # print("close connection: ", self.connection_id)
        self.db.close()
    except Exception as ex:
        msg = "error in SiteTempDatabase.close(): trying to close db but failed, " + self.filename
        ErrorLogger.log_error("SiteTempDatabase", ex, msg)
def force_clear(ref: str, dir_path="") -> True: """ force to remove database in file :param ref: the file name :return: True if remove successfully, else false """ if len(dir_path) == 0: dir_path = get_db_buffer_default_dir() remove_ok = False filename = dir_path + ref try: time.sleep(1) # print("going to remove: ", filename) if os.path.exists(filename): os.remove(filename) # print("file removed: ", filename) temp_file = filename + TempDBInterface.sqlite_temp_suffix # print("going to remove:", temp_file) if os.path.exists(temp_file): os.remove(temp_file) # print("file removed: ", temp_file) # print("going to remove:", filename + TempDBInterface.sqlite_wal_suffix) if os.path.exists(filename + TempDBInterface.sqlite_wal_suffix): os.remove(filename + TempDBInterface.sqlite_wal_suffix) # print("file removed: ", filename + TempDBInterface.sqlite_wal_suffix) remove_ok = True except Exception as ex: msg = "error in SiteTempDatabase.force_clear(), " + filename ErrorLogger.log_error("SiteTempDatabase", ex, msg) finally: return remove_ok
def get_progress(self) -> list:
    """
    ProgressLogInterface: get the progress data in tuple format so that it can be compiled to the standard format.
    :return: list containing progress data, with the exact length of the column names from get_column_names()
    """
    total_record = self._db_buffer.get_total_record()
    if ((self._job_done == self._job_done_shadow and self._job_done > 0)
            or (self._job_done > self._min_buff_delete_threshold * 0.9
                and total_record > self._min_buff_delete_threshold)):
        self._speed_penalty_count += 1
        if self._speed_penalty_count >= 2:
            ErrorLogger.log_error("WhoisChecker.get_progress()",
                                  TimeoutError("progress is stuck, restarting internals."),
                                  self._db_buffer._file_name)
            print("going to clear cache")
            self._db_buffer.clear_cache()
            self.reset()
            total_record = 0
            self._db_buffer.start_input_output_cycle()
    else:
        print("no need to clear cache.")
        self._job_done_shadow = self._job_done
        self._speed_penalty_count = 0
    return [self._job_done, total_record]
def _check_whois_with_dns(self, page: OnSiteLink):
    real_response_code = ResponseCode.DNSError
    skip_whois_check = False
    try:
        if not self._is_debug:
            root_result = LinkChecker.get_root_domain(page.link)
            root_domain = root_result[1]
            sub_domain = root_result[4]
            suffix = root_result[5]
            if len(sub_domain) == 0 or suffix not in TldUtility.TOP_TLD_LIST:
                skip_whois_check = True
            else:
                if LinkChecker.is_domain_DNS_OK(sub_domain):  # check DNS first
                    real_response_code = ResponseCode.NoDNSError
                    skip_whois_check = True
                elif not sub_domain.startswith("www."):
                    if LinkChecker.is_domain_DNS_OK("www." + root_domain):
                        real_response_code = ResponseCode.NoDNSError
                        skip_whois_check = True
                page.response_code = real_response_code
                page.link_type = OnSiteLink.TypeOutbound
                page.link = root_domain
    except Exception as ex:
        ErrorLogger.log_error("WhoisChecker", ex, "_check_whois_with_dns() " + page.link)
        skip_whois_check = True
    finally:
        if not skip_whois_check and real_response_code == ResponseCode.DNSError:
            self._check_whois(page)
        else:
            self._add_job_done_one()
def append_many(self, new_data_list, convert_tuple=True) -> bool:
    append_OK = False
    if new_data_list is not None and len(new_data_list) > 0:
        self.put_lock.acquire()
        try:
            tempdb = SiteTempDatabase(self.ref)
            try:
                if convert_tuple:
                    to_tuple = [(x.link, x.response_code, x.link_level, x.link_type) for x in new_data_list]
                else:
                    to_tuple = new_data_list
                tempdb.cur.execute("BEGIN")
                tempdb.cur.executemany("INSERT OR IGNORE INTO TEMP (LINK, RS_CODE, LEV, L_TYPE) "
                                       "VALUES (?, ?, ?, ?);", to_tuple)
                tempdb.db.commit()
                append_OK = True
            except OperationalError as ex:
                msg = "error in SiteTempDataDisk.append_many(), operation failed. " + self.ref
                ErrorLogger.log_error("SiteTempDataDisk", ex, msg)
        except Exception as outer_ex:
            msg = "error in SiteTempDataDisk.append_many() OperationalError, " + self.ref
            ErrorLogger.log_error("SiteTempDataDisk", outer_ex, msg)
        finally:
            self.put_lock.release()
    return append_OK
def process_data_batch(self, data: collections.Iterable, **kwargs):
    # print("MozFilter processing: ", data)
    account = kwargs.get("Account")
    temp = []
    try:
        if isinstance(data, collections.Iterable) and isinstance(account, SiteAccount):
            temp = [x for x in data if isinstance(x, FilteredDomainData) and TldUtility.is_top_tld(x.domain)]
            check_list = [y.domain for y in temp]
            sleep_time = random.randint(self._min_sleep_time, self._max_wait)
            time.sleep(sleep_time)
            moz = MozCom(account)
            if not self._is_throughput_debug:
                rankings = moz.get_ranking_data_batch(check_list, limit=len(check_list))
            else:
                rankings = [100] * len(temp)
            for i in range(len(temp)):
                temp[i].da = rankings[i]
        else:
            raise ValueError("account is none in process_data_batch()")
    except Exception as ex:
        ErrorLogger.log_error("MozFilter", ex, "process_data_batch() " + str(data) + " account: " + account.userID)
    finally:
        PrintLogger.print("Moz processed: " + str(data) + " with: " + account.userID)
        with self._sync_lock:
            job_done = [x for x in data if x is not None]
            self._job_done += len(job_done)
            if account is not None:
                account.Available = True
        for item in temp:
            if isinstance(item, FilteredDomainData):
                # print("moz processed:", item.domain)
                if item.da >= self._min_DA_value:
                    if not self._is_throughput_debug:
                        CsvLogger.log_to_file(self._log_file, [(item.domain, item.da)])  # log this to file
                    self._output_queue.put(item)
def run(self):
    try:
        for item in self._filters:
            item.start()
        for item in self._filters:
            item.join()
    except Exception as ex:
        ErrorLogger.log_error("FilterPool", ex, "start()")
def remove_slaves(self, s: MiningMasterController, data: SlaveOperationData):
    if data.slave_addrs is not None and len(data.slave_addrs) > 0:
        s.remove_slaves(data.slave_addrs)
    elif data.ref != "" and data.count > 0:
        print("init from cloud")
        EC2 = EC2Controller("")  # test this
        s.add_slaves(EC2.shut_down_machines(data.ref, data.count))
    else:
        ErrorLogger.log_error("MasterRequestHandler.remove_slaves()", ValueError("Remove Slaves failed"))
def _put_output_result_in_queue(self, domain_data: OnSiteLink):
    if not self._stop_event.is_set() or not self._internal_stop_event.is_set():
        try:
            self._output_q.put((domain_data.link, domain_data.response_code))
        except Exception as inner_ex:
            if self._output_q is None:
                manager, self._output_q = get_queue_client(QueueManager.MachineSettingCrawler,
                                                           QueueManager.Method_Whois_Output)
            ErrorLogger.log_error("WhoisChecker", inner_ex, addtional="failed to put result in queue.")
            time.sleep(0.01)
            self._put_output_result_in_queue(domain_data)
def _sample_data(self):
    ref_time = time.time()
    manager, result_queue = get_queue_client(QueueManager.MachineSettingCrawler, QueueManager.Method_Whois_Input)
    if result_queue is None:
        ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker._sample_data()",
                              ValueError("result queue is None, cannot get data."))
        if not (self._stop_event.is_set() or self._internal_stop_event.is_set()):
            self._sample_data()
    else:
        while not (self._stop_event.is_set() or self._internal_stop_event.is_set()):
            data_list = []
            counter = 0
            while not result_queue.empty():
                data = None
                try:
                    data = result_queue.get()
                except Exception as ex:
                    ErrorLogger.log_error("WhoisChecker._sampling_data", ex)
                    if result_queue is None:
                        manager, result_queue = get_queue_client(QueueManager.MachineSettingCrawler,
                                                                 QueueManager.Method_Whois_Input)
                if isinstance(data, OnSiteLink):
                    counter += 1
                    data_list.append((data.link, data.response_code))
                elif isinstance(data, tuple) and len(data) == 2:
                    # print("External Site checker: received:", data)
                    counter += 1
                    data_list.append(data)
                if isinstance(data, list):
                    data_list += data
                    counter += 1
                if counter >= self._sample_batch_size:
                    break
                current_time = time.time()
                if current_time - ref_time >= self._sample_batch_timeout:
                    break
                time.sleep(self._min_sampling_duration)
            ref_time = time.time()
            if len(data_list) > 0:
                # print("whois checker input data in db_buff: ", len(data_list))
                self._db_buffer.append_to_buffer(data_list, convert_tuple=False)
                data_list.clear()
            else:
                pass
            time.sleep(self._input_period)
def check_whois(domain: str):
    """
    :param domain: domain name to check
    :return: (True if the domain might be available to buy now, datetime of expiry or None,
              True if the result is 100% sure)
    """
    if domain is not None and len(domain) > 0:
        try:
            match = re.match(r"^[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,10}$", domain)
            if match is None:
                raise ValueError("domain name error.")
            server = net.get_root_server(domain)
            raw = net.get_whois_raw(domain, server=server)
            parsed = parse.parse_raw_whois(raw_data=raw)
            expire_record = parsed.get("expiration_date")
            name_servers = parsed.get("nameservers")
            if len(parsed) <= 1:
                return True, None, True
            else:
                if expire_record is not None and len(expire_record) > 0:
                    temp = expire_record[0]
                else:
                    if name_servers is None:
                        return True, None, True
                    else:
                        return False, None, False
                expire_dates = len(expire_record)
                if expire_dates > 1:
                    for i in range(1, expire_dates):
                        data = expire_record[i]
                        if data > temp:
                            temp = data
                date = datetime.datetime.utcnow()
                if temp is not None:
                    if date < temp:
                        # print(domain + " is not expired")
                        return False, temp, True
                    else:
                        if name_servers is None:
                            return True, temp, True
                        else:
                            return True, temp, False
                else:
                    return True, None, False
        except Exception as ex:
            msg = "error in LinkChecker.check_whois(), checking " + domain
            ErrorLogger.log_error("LinkChecker", ex, msg)
            return False, None, False
    else:
        return False, None, True
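# Usage sketch (not part of the original source): check_whois() returns the 3-tuple described
# in its docstring. The domain below and the surrounding handling are illustrative assumptions only.
might_be_available, expire_date, certain = LinkChecker.check_whois("example.com")
if might_be_available and certain:
    print("domain appears to be expired; recorded expiration:", expire_date)
elif might_be_available:
    print("domain might be expired; verify manually before acting on it.")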
def delete_db_data(self, data=None, **kwargs) -> bool:
    success = False
    try:
        if isinstance(data, MiningList):
            with self._db_lock:
                db = CategoryDomainSiteDB(self._db_addr)
                db.delete_sites(data.data)
                db.close()
                success = True
    except Exception as ex:
        ErrorLogger.log_error("MarketplaceDBManager.delete_db_data", ex)
    finally:
        return success
def get_next(self, link_tpye: int=OnSiteLink.TypeAll, response_code: int=ResponseCode.All):
    counter = 0
    while True:
        if not self.can_continue():
            # print("data source is set not to continue!")
            return  # end the generator (PEP 479: do not raise StopIteration inside a generator)
        item = None
        self.get_lock.acquire()
        try:
            tempdb = SiteTempDatabase(self.ref)
            cur = tempdb.cur.execute(u"SELECT LINK, RS_CODE, LEV, L_TYPE, rowid FROM TEMP "
                                     u"ORDER BY ID LIMIT 1 OFFSET {0:d};".format(counter,))
            item = cur.fetchone()
            tempdb.close()
        except Exception as ex:
            msg = "error in SiteTempDataDisk.get_next(), " + self.ref
            ErrorLogger.log_error("SiteTempDataDisk", ex, msg)
        finally:
            self.get_lock.release()
        output_obj = None
        if item is not None and len(item) > 0:
            counter += 1
            link = item[0]
            rs_code = item[1]
            level = item[2]
            inner_link_type = item[3]
            obj = OnSiteLink(link, response_code=rs_code, link_level=level, link_type=inner_link_type)
            # print("load: ", str(obj))
            if link_tpye == OnSiteLink.TypeAll or inner_link_type == link_tpye:
                if response_code == ResponseCode.All:
                    output_obj = obj
                elif response_code == ResponseCode.LinkNotBroken and not ResponseCode.is_link_broken(rs_code):
                    output_obj = obj
                elif response_code == ResponseCode.LinkBroken and ResponseCode.is_link_broken(rs_code):
                    output_obj = obj
                elif rs_code == response_code:
                    output_obj = obj
                else:
                    continue
            else:
                continue
        else:
            return
        if output_obj is not None:
            yield output_obj
        else:
            return
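# Usage sketch (assumed caller, not part of the original source): get_next() is a generator,
# so it is normally consumed with a for-loop. `disk` stands for a SiteTempDataDisk instance;
# its construction is omitted here because the constructor signature is not shown above.
# for link in disk.get_next(link_tpye=OnSiteLink.TypeOutbound, response_code=ResponseCode.All):
#     print(link.link, link.response_code)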
def add_db_data(self, data=None, **kwargs) -> bool:
    success = False
    try:
        if isinstance(data, MiningList):
            with self._db_lock:
                db = CategoryDomainSiteDB(self._db_addr)
                db.add_sites(data.data)
                db.close()
                success = True
        else:
            raise ValueError("input value has incorrect type.")
    except Exception as ex:
        ErrorLogger.log_error("MarketplaceDBManager.add_db_data", ex)
    finally:
        return success
def begin_crawl(self, level=0):
    # while self.can_continue() and self.data_source.can_continue():
    # print("continue to work, page limit:", self.max_page, " max_level: ", self.max_level)
    # target_func = functools.partial(PageChecker.crawl_page, self)
    try:
        self.pool.imap_unordered(PageChecker.crawl_page_for_iter, self.data_source)
        while self.data_source.can_continue():
            time.sleep(0.1)
        # results = [self.pool.apply_async(PageChecker.crawl_page, args=(self, page))
        #            for page in self.data_source.get_next(OnSiteLink.TypeOnSite, ResponseCode.LinkOK)]
        # [p.get() for p in results]
    except Exception as ex:
        # self.stop()
        msg = "begin_crawl() " + str(self.get_site_info())
        ErrorLogger.log_error("SiteThreadChecker", ex, msg)
def wrap(*args, **kw):
    ts = time.time()
    result = method(*args, **kw)
    te = time.time()
    gap = te - ts
    if gap > log_if_longer > 0:
        PrintLogger.print('%r (%r, %r) %2.2f sec' % (method.__name__, args, kw, gap))
        ErrorLogger.log_error(ref, ValueError("Operation took too long."), "completed in " + str(gap))
    elif log_if_longer == 0:
        # PrintLogger.print('%r (%r, %r) %2.2f sec' % (method.__name__, args, kw, gap))
        PrintLogger.print('%r took %2.2f sec' % (method.__name__, gap))
    return result
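# The free variables `method`, `ref` and `log_if_longer` indicate that wrap() is the inner
# function of a timing decorator. A minimal sketch of how such a factory could be assembled
# is shown below; the name `log_time` and its defaults are assumptions, not the project's API.
def log_time(ref="Timer", log_if_longer=0):
    def decorator(method):
        def wrap(*args, **kw):
            ts = time.time()
            result = method(*args, **kw)
            gap = time.time() - ts
            if gap > log_if_longer > 0:
                ErrorLogger.log_error(ref, ValueError("Operation took too long."), "completed in " + str(gap))
            elif log_if_longer == 0:
                PrintLogger.print('%r took %2.2f sec' % (method.__name__, gap))
            return result
        return wrap
    return decorator

# usage sketch:
# @log_time(ref="LinkChecker", log_if_longer=5)
# def fetch_page(url): ...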
def __init__(self, stop_event: Event, input_queue: Queue=None, output_queue: Queue=None,
             worker_number: int=1, queue_lock: multiprocessing.RLock=None,
             throughput_debug=False, batch=1, batch_get_timeout=60, **kwargs):
    self._input_queue = input_queue
    self._output_queue = output_queue
    self._stop_event = stop_event
    if worker_number <= 0:
        worker_number = 1
        ErrorLogger.log_error("FilterInterface", ValueError("worker number is 0, reset to 1"), "__init__")
    self._worker_number = worker_number
    self._process_queue_lock = queue_lock
    self._is_throughput_debug = throughput_debug
    self._sync_lock = threading.RLock()
    self._job_done = 0
    self._batch = batch
    self._batch_get_timeout = batch_get_timeout
    threading.Thread.__init__(self)
def __init__(self, input_queue: Queue, output_queue: Queue, queue_lock: multiprocessing.RLock,
             stop_event: Event, matrix: CrawlMatrix, accounts=[]):
    self._input_queue = input_queue
    self._output_queue = output_queue
    self._queue_lock = queue_lock
    self._stop_event = stop_event
    self._maxtrix = matrix
    self._filters = []
    manager = AccountManager()
    self._proxies = ProxyManager().get_proxies()
    # majestic_queue = Queue()
    # archive_queue = Queue()
    if accounts is None:
        ErrorLogger.log_error("FilterPool.___init__", ValueError("accounts len is None"))
    moz_batch = 50
    moz_batch_timeout = int(moz_batch * 2)
    moz_accounts = (manager.get_accounts(AccountType.Moz) if len(accounts) == 0
                    else [x for x in accounts if x.siteType == AccountType.Moz])
    majestic_accounts = (manager.get_accounts(AccountType.Majestic) if len(accounts) == 0
                         else [x for x in accounts if x.siteType == AccountType.Majestic])
    filter_moz = MozFilter(  # input_queue=self._input_queue, output_queue=archive_queue,
        stop_event=self._stop_event, min_DA_value=self._maxtrix.da,
        manager=manager, accounts=moz_accounts, proxies=self._proxies,
        batch=moz_batch, batch_get_timeout=moz_batch_timeout)  # depend on number of accounts
    workers_for_moz = len(moz_accounts)
    workers_for_archive = int(workers_for_moz / 32 * moz_batch)
    workers_for_majestic = int(workers_for_moz / 200 * moz_batch)
    # self._filters.append(filter_moz)
    # if is_majestic_filter_on:
    filter_archive = ArchiveOrgFilter(  # input_queue=archive_queue, output_queue=majestic_queue,
        stop_event=self._stop_event, queue_lock=self._queue_lock,
        worker_number=workers_for_archive,
        en_profile_check=matrix.en_archive_check)  # min one worker
    filter_maj = MajesticFilter(  # input_queue=majestic_queue, output_queue=self._output_queue,
        stop_event=self._stop_event, TF=self._maxtrix.tf, CF=self._maxtrix.cf,
        CF_TF_Deviation=self._maxtrix.tf_cf_deviation, Ref_Domains=self._maxtrix.ref_domains,
        manager=manager, worker_number=workers_for_majestic,
        en_tf_check=matrix.en_tf_check, en_spam_check=matrix.en_spam_check,
        accounts=majestic_accounts)  # depend on number of accounts
    if matrix.en_moz:
        self._filters.append(filter_moz)
    if matrix.archive_count:
        self._filters.append(filter_archive)
    if matrix.en_majestic:
        self._filters.append(filter_maj)
    filter_len = len(self._filters)
    if filter_len == 0:
        output_queue = input_queue  # todo: short circuit, need to test
    else:
        if filter_len > 1:
            for i in range(0, filter_len - 1):
                new_queue = Queue()
                self._filters[i]._output_queue = new_queue
                self._filters[i + 1]._input_queue = new_queue
        self._filters[0]._input_queue = self._input_queue
        self._filters[filter_len - 1]._output_queue = self._output_queue
    # else:
    #     filter_archive = ArchiveOrgFilter(input_queue=archive_queue, output_queue=self._output_queue,
    #                                       stop_event=self._stop_event, queue_lock=self._queue_lock,
    #                                       worker_number=workers_for_archive)  # min one worker
    #     self._filters.append(filter_archive)
    threading.Thread.__init__(self)
def terminate_callback():
    ErrorLogger.log_error("WhoisChecker", StopIteration("terminated."))
def is_domain_available_whois(domain: str) -> (bool, bool):
    """
    Availability check with whois.
    :param domain: domain name to check, e.g. google.com; make sure the domain is in lower case in the first place.
    :return: (True if the domain is available, True if the domain is in redemption)
    """
    if domain is not None and len(domain) > 0:
        try:
            match = re.match(r"^[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,10}$", domain)
            if match is None:
                raise ValueError("domain name error.")
            # gr0 = match.group(0)
            server = net.get_root_server(domain)
            raw_data = net.get_whois_raw(domain, server=server)
            available = False
            is_redemption = False
            force_break = False
            status = ""
            fomat_line = ""
            all_lines = []
            patterns = [status_pattern, available_pattern0, available_pattern1]
            raw_data = [segment.replace("\r", "") for segment in raw_data]  # carriage returns are the devil
            for segment in raw_data:
                all_lines += str(segment).splitlines()
            for pattern in patterns:
                # if status is not None and len(status) > 0:
                #     break
                if available or is_redemption or force_break:
                    break
                for line in all_lines:
                    if len(line) == 0:
                        continue
                    temp = line.strip()
                    if temp.endswith(":"):
                        fomat_line = temp
                        continue
                    if fomat_line.endswith(":"):
                        fomat_line += temp
                    else:
                        fomat_line = temp
                    if fomat_line.startswith("%"):
                        continue
                    else:
                        fomat_line = fomat_line.lower()
                        if pattern is status_pattern:
                            match_status = re.search(status_pattern, fomat_line)
                            if match_status is not None:
                                status = match_status.group(2)
                                if status is not None and len(status) > 0:
                                    if re.search(status_value_pattern, status) is not None:
                                        available = True
                                        break
                                    elif re.search(redemption_pattern, status) is not None:
                                        is_redemption = True
                                        break
                                    elif re.search(other_official_status_pattern, status) is not None:
                                        force_break = True
                                        break
                        elif re.search(pattern, fomat_line) is not None:
                            available = True
                            break
            # if status is not None and len(status) > 0:
            #     if re.search(status_value_pattern, status) is not None:
            #         available = True
            #     elif re.search(redemption_pattern, status) is not None:
            #         is_redemption = True
            return available, is_redemption
        except ValueError:
            return False, False
        except Exception as ex:
            ErrorLogger.log_error("LinkChecker", ex, "is_domain_available_whois() " + domain)
            return False, False
    else:
        return False, False
def send_and_receive(self):
    in_buffer = self.rfile
    out_buffer = self.wfile
    s = self.server.addtional_obj
    command = CommandProcessor.receive_command(in_buffer)
    # print("process cmd: ", command.cmd)
    if command is not None and isinstance(s, MiningMasterController):
        reply = CommandStruct(cmd=ServerCommand.Com_ReplyOK)
        if command.cmd == ServerCommand.Com_Start:
            # print("start conversation:")
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Stop:
            # print("end conversation:")
            return  # exit point
        elif command.cmd == ServerCommand.Com_Get_DB_DATA:
            data = command.data
            if isinstance(data, DBRequestFields):
                try:
                    reply.data = s.get_db_results(db_type=data.db_type, db_name=data.db_name,
                                                  index=data.index, length=data.length)
                except Exception as ex:
                    ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex,
                                          "cmd = ServerCommand.Com-Get-DB-DATA")
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "Get DB data failed"
            else:
                ErrorLogger.log_error("MasterRequestHandler.send_and_receive()",
                                      Exception("wrong data type received."),
                                      "cmd = ServerCommand.Com-Get-DB-DATA")
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Get DB data failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Remove_DB:
            data = command.data
            if isinstance(data, DBRequestFields):
                try:
                    s.remove_db(db_type=data.db_type, db_name=data.db_name)
                except Exception as ex:
                    ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex,
                                          "cmd = ServerCommand.Com_DB-RM-DB")
                    reply.cmd = ServerCommand.Com_ReplyError
                    reply.data = "Remove DB failed"
            else:
                ErrorLogger.log_error("MasterRequestHandler.send_and_receive()",
                                      Exception("wrong data type received."),
                                      "cmd = ServerCommand.Com_DB-RM-DB")
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Remove DB failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Start_Filter:
            data = command.data
            try:
                if s.is_alive():
                    s.stop()
                    s.join()
                if isinstance(data, FilteringSetupData):  # MatrixFilterControl.FilteringSetupData
                    self.server.addtional_obj = MiningMasterController(ref=data.ref, accounts=data.accounts,
                                                                       crawl_matrix=data.crawl_matrix,
                                                                       filtering_only_mode=True,
                                                                       filtering_offset=data.offset,
                                                                       filtering_total=data.total)
                    self.server.addtional_obj.start()
            except Exception as ex:
                print(ex)
                ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex,
                                      "cmd = ServerCommand.Com_Start_Filter()")
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Com_Start_Filter failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Setup:  # test this
            data = command.data
            try:
                if s.is_alive():
                    s.stop()
                    s.join()
                if isinstance(data, SetupData):
                    self.server.addtional_obj = MiningMasterController(ref=data.ref, accounts=data.accounts,
                                                                       cap_slave=data.cap,
                                                                       cap_slave_process=data.cap2,
                                                                       cap_concurrent_page=data.cap3,
                                                                       all_job=data.total, offset=data.offset,
                                                                       max_page_level=data.max_page_level,
                                                                       max_page_limit=data.max_page_limit,
                                                                       loopback_database=data.loopback,
                                                                       refresh_rate=data.refresh_rate,
                                                                       filters=data.db_filter,
                                                                       crawl_matrix=data.crawl_matrix)
                    if data.addtional_data is not None and isinstance(data.addtional_data, SlaveOperationData):
                        self.add_slaves(self.server.addtional_obj, data.addtional_data)
                    self.server.addtional_obj.setup_minging_slaves()
                    self.server.addtional_obj.start()
                else:
                    raise NotImplementedError("other data type is not implemented.")
            except Exception as ex:
                print(ex)
                ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex,
                                      "cmd = ServerCommand.Com_Setup()")
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Setup failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Clear_Cache:
            try:
                if s.is_alive():
                    s.stop()
                    s.join()
                s.clear_host_cache()
                s.clear_slave_cache()
            except Exception as ex:
                print(ex)
                ErrorLogger.log_error("MasterRequestHandler.send_and_receive()", ex,
                                      "cmd = ServerCommand.Com_Clear_Cache()")
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Clear cache failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Add_Seed:
            data = command.data
            if isinstance(data, MiningList):
                s.add_seeds(data)
            else:
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Add Seed Failed, format is wrong in server handler."
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Add_Slave:  # test this
            try:
                data = command.data
                if isinstance(data, SlaveOperationData):
                    self.add_slaves(s, data)
                else:
                    raise NotImplementedError("other data type is not implemented.")
            except Exception as ex:
                print(ex)
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Add slave failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Del_Slave:  # test this
            try:
                data = command.data
                if isinstance(data, SlaveOperationData):
                    self.remove_slaves(s, data)
                else:
                    raise NotImplementedError("other data type is not implemented.")
            except Exception as ex:
                print(ex)
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Remove slave failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Begin_Mining:
            # not implemented, use setup to begin mining
            reply.cmd = ServerCommand.Com_ReplyError
            reply.data = "Begin mining is not implemented, use setup to begin mining."
        elif command.cmd == ServerCommand.Com_Stop_Mining:  # test this
            try:
                EC2 = EC2Controller("")
                addrs = [slave.address.address for slave in s.slaves if isinstance(slave, Server)]
                s.pause()
                # s.slaves.clear()
                # if s.isAlive:
                #     s.join(0)
                # self.server.addtional_obj = MiningMasterController()
                EC2.shut_down_machines_list(addrs)
            except:
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "Stop site failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Status:  # test this
            try:
                CPU = MachineInfo.get_cpu(1)
                MEM = MachineInfo.get_memory()
                NET = MachineInfo.get_network(1)
                slaveStatus = [slave.status for slave in s.slaves]
                totalPage = sum([slave.total_page_done for slave in slaveStatus])
                ave_page = 0
                filter_progress = s.get_filter_progress()
                if len(s.slaves) > 0:
                    ave_page = int(sum([slave.page_per_site for slave in slaveStatus]) / len(s.slaves))
                total_result = sum([slave.result for slave in slaveStatus])
                total_cap_slave = sum([slave.cap_slave for slave in slaveStatus])
                total_cap_process = sum([slave.cap_slave * slave.cap_process for slave in slaveStatus])
                total_cap_page = sum([slave.cap_slave * slave.cap_process * slave.cap_concurrent_page
                                      for slave in slaveStatus])
                status = ServerStatus(wait_job=s.job_all - s.job_done, done_job=s.job_done, all_job=s.job_all,
                                      total_page_done=totalPage, page_per_site=ave_page, result=total_result,
                                      cpu_cores=CPU[0], cpu_percent=CPU[1], toal_memory=MEM[0],
                                      memory_percent=MEM[1], net_recieved=NET[0], net_send=NET[1],
                                      cap_slave=total_cap_slave, cap_process=total_cap_process,
                                      cap_concurrent_page=total_cap_page,
                                      filter_done=filter_progress[0], filter_total=filter_progress[1])
                server = Server(server_type=ServerType.ty_Host, status=status,
                                address=ServerAddress("localhost", MiningTCPServer.DefaultListenPort))
                servers = []
                servers.append(server)
                servers += s.slaves
                reply.data = MiningList(s.ref, servers)
            except:
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "getting status failed"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_DataBase_Status:  # test this
            reply.data = s.get_db_stats()  # send back a copy
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Set_DB_Filter:
            data = command.data
            if isinstance(data, DBFilterCollection):
                if data != s.filter_shadow:
                    s.filter_shadow = data
                    s.update_db_stats(True)
            else:
                reply.cmd = ServerCommand.Com_ReplyError
                reply.data = "wrong data type for filters, should be DBFilterCollection"
            CommandProcessor.send_command(out_buffer, reply)
        elif command.cmd == ServerCommand.Com_Progress:  # test this
            reply.data = PrograssData(ref=s.ref, done=s.job_done, all_job=s.job_all, offset=s.offset,
                                      duration=s.end_time - s.start_time, in_progress=s.in_progress)
            CommandProcessor.send_command(out_buffer, reply)
        else:
            reply.cmd = ServerCommand.Com_ReplyError
            reply.data = "command is not valid, please try again"
            CommandProcessor.send_command(out_buffer, reply)
        # CommandProcessor.send_command(out_buffer, reply)
        # print("finished cmd ", command.cmd)
        self.send_and_receive()  # recursive to make a conversation