def process_data(self, data: FilteredDomainData, **kwargs):
    """Filter a single domain through Majestic metrics.

    Runs optional spam checks and TF/CF/ref-domain threshold checks via the
    Majestic API, then (in the finally block) logs the outcome, releases the
    account, and pushes domains that pass all thresholds to the output queue.

    :param data: the FilteredDomainData candidate to evaluate.
    :param kwargs: expected to contain "Account" -> a SiteAccount for MajesticCom.
    :return: data when it passes all thresholds, otherwise None.
    """
    account = kwargs.get("Account")
    is_spammed = False
    try:
        if isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount):
            majestic = MajesticCom(account)
            if self._en_spam_check:
                self._filter_domain_name(domain=data.domain)
            if self._en_tf_check:
                data = self._filter_tf_cf_backlink_ratio(majestic, data)
            if not (data.tf >= self._min_tf and data.ref_domains >= self._min_ref_domains):
                raise ValueError("tf or cf doesn't match. tf:" + str(data.tf) + " cf: " + str(data.cf) + " ref domain: " + str(data.ref_domains))
            if self._en_spam_check:
                self._filter_anchor_text(majestic, data.domain)
                self._filter_ref_domains(majestic, data.domain)
        else:
            raise ValueError("account is none in process_data")
    except MajesticSpamException as mjx_ex:
        is_spammed = True
        if isinstance(data, FilteredDomainData):  # data may be an arbitrary object here
            data.exception = str(mjx_ex)
    except Exception as ex:
        if isinstance(data, FilteredDomainData):
            data.exception = str(ex)
    finally:
        # account can legitimately be None (missing "Account" kwarg) -- do not
        # dereference it unguarded, or the real error is masked by an AttributeError
        # raised inside this finally block.
        user_id = account.userID if account is not None else "None"
        PrintLogger.print("Majestic processed: '" + str(data) + "' with: " + user_id)
        if isinstance(data, FilteredDomainData):
            with self._sync_lock:
                self._job_done += 1
            if account is not None:
                account.Available = True  # release the account for reuse
            if data.tf >= self._min_tf and data.cf >= self._min_cf and data.ref_domains >= self._min_ref_domains:
                if not self._is_throughput_debug:
                    # domains flagged as spam go to the bad log, clean ones to the good log
                    target = self._bad_log_file if is_spammed else self._log_file
                    CsvLogger.log_to_file(target, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir())
                self._output_queue.put(data)
                # All exceptions were handled above, so this return inside
                # finally does not swallow anything.
                return data
            elif self._is_throughput_debug:
                self._output_queue.put(data)
def process_data_batch(self, data: collections.Iterable, **kwargs):
    """Look up Moz DA rankings for a batch of domains.

    Filters the batch down to top-TLD FilteredDomainData items, queries Moz
    once for the whole batch (after a randomized sleep to throttle API use),
    assigns each item its DA, then logs/queues items meeting the DA minimum.

    :param data: iterable of candidate FilteredDomainData objects.
    :param kwargs: expected to contain "Account" -> a SiteAccount for MozCom.
    """
    # NOTE(review): the annotation above still uses collections.Iterable,
    # which is removed in Python 3.10+; migrate to collections.abc at module level.
    from collections import abc as _abc  # valid on every Python 3 version
    account = kwargs.get("Account")
    temp = []
    try:
        if isinstance(data, _abc.Iterable) and isinstance(account, SiteAccount):
            temp = [x for x in data if isinstance(x, FilteredDomainData) and TldUtility.is_top_tld(x.domain)]
            check_list = [y.domain for y in temp]
            # randomized delay to throttle requests against the Moz API
            sleep_time = random.randint(self._min_sleep_time, self._max_wait)
            time.sleep(sleep_time)
            moz = MozCom(account)
            if not self._is_throughput_debug:
                rankings = moz.get_ranking_data_batch(check_list, limit=len(check_list))
            else:
                rankings = [100] * len(temp)  # debug mode: fake a passing DA for every item
            # index-based on purpose: a short rankings list must raise (and be
            # logged) rather than silently leave items unranked
            for i, item in enumerate(temp):
                item.da = rankings[i]
        else:
            raise ValueError("account is none in process_data_batch()")
    except Exception as ex:
        # account may be None here -- guard the dereference so the original
        # error is what gets logged, not an AttributeError
        user_id = account.userID if account is not None else "None"
        ErrorLogger.log_error("MozFilter", ex, "process_data_batch() " + str(data) + " account: " + user_id)
    finally:
        user_id = account.userID if account is not None else "None"
        PrintLogger.print("Moz processed: " + str(data) + " with: " + user_id)
        with self._sync_lock:
            # data may not be iterable at all when a bad batch was passed in
            if isinstance(data, _abc.Iterable):
                self._job_done += len([x for x in data if x is not None])
        if account is not None:
            account.Available = True  # release the account for reuse
        for item in temp:
            if isinstance(item, FilteredDomainData) and item.da >= self._min_DA_value:
                if not self._is_throughput_debug:
                    CsvLogger.log_to_file(self._log_file, [(item.domain, item.da)])  # log this to file
                self._output_queue.put(item)
def process_data(self, data: FilteredDomainData, **kwargs):
    """Look up the Moz DA ranking for a single domain.

    Skips non-top-TLD domains; otherwise queries Moz (after a randomized
    throttling sleep), stores the DA on the item, then (in finally) logs and
    queues it when it meets the DA minimum.

    :param data: the FilteredDomainData candidate to evaluate.
    :param kwargs: expected to contain "Account" -> a SiteAccount for MozCom.
    """
    account = kwargs.get("Account")
    try:
        if isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount):
            if TldUtility.is_top_tld(data.domain):
                # randomized delay to throttle requests against the Moz API
                sleep_time = random.randint(self._min_sleep_time, self._max_wait)
                time.sleep(sleep_time)
                moz = MozCom(account)
                if not self._is_throughput_debug:
                    ranking = moz.get_ranking_data(data.domain)
                else:
                    ranking = 100  # debug mode: fake a passing DA
                data.da = ranking
        else:
            raise ValueError("account is none in process_data")
    except Exception as ex:
        # account may be None here -- guard the dereference so the original
        # error is what gets logged, not an AttributeError
        user_id = account.userID if account is not None else "None"
        ErrorLogger.log_error("MozFilter", ex, "process_data() " + str(data) + " account: " + user_id)
    finally:
        user_id = account.userID if account is not None else "None"
        PrintLogger.print("Moz processed: " + str(data) + " with: " + user_id)
        if isinstance(data, FilteredDomainData):
            with self._sync_lock:
                self._job_done += 1
            if account is not None:
                account.Available = True  # release the account for reuse
            if data.da >= self._min_DA_value:
                if not self._is_throughput_debug:
                    CsvLogger.log_to_file(self._log_file, [(data.domain, data.da)])  # log this to file
                self._output_queue.put(data)
def vaccum_db(self):
    """Rebuild the SQLite database file to reclaim unused space.

    Interrupts any in-flight query, closes the stale connection (the old
    code leaked it on every call), reconnects, and issues VACUUM.
    """
    try:
        self.db.interrupt()  # abort any running query so the reconnect cannot block
    except Exception as ex:
        PrintLogger.print(ex)
    finally:
        try:
            # best-effort close of the old handle before replacing it
            self.db.close()
        except Exception:
            pass
        self.db = sqlite3.connect(self.filename, timeout=10)
        # BUG FIX: SQLite's VACUUM takes no table name -- "VACUUM <table>" is a
        # syntax error (and on SQLite >= 3.15 the token would be misread as a
        # schema name), so the old statement failed on every call.
        self.db.execute("VACUUM;")
def vaccum_db(self):
    """Rebuild the SQLite database file to reclaim unused space.

    Interrupts any in-flight query, closes the stale connection (the old
    code leaked it on every call), reconnects, and issues VACUUM.
    """
    try:
        self.db.interrupt()  # abort any running query so the reconnect cannot block
    except Exception as ex:
        PrintLogger.print(ex)
    finally:
        try:
            # best-effort close of the old handle before replacing it
            self.db.close()
        except Exception:
            pass
        self.db = sqlite3.connect(self.filename, timeout=10)
        # BUG FIX: SQLite's VACUUM takes no table name -- "VACUUM <table>" is a
        # syntax error (and on SQLite >= 3.15 the token would be misread as a
        # schema name), so the old statement failed on every call.
        self.db.execute("VACUUM;")
def empty_feedback_queue(self):
    """Drain the feedback queue, forwarding every non-None item to the
    memory-limit callback.

    Best-effort: any exception (including the queue-empty race between
    empty() and get()) ends the drain and is logged, never raised.
    """
    try:
        PrintLogger.print("in MemoryControlPs: trying to empty queue")
        feedback = self._feedback_queue
        while not feedback.empty():
            item = feedback.get(block=False, timeout=0.001)
            if item is None:
                continue
            self.memory_limit_callback(item)
    except Exception as ex:
        PrintLogger.print("in MemoryControlPs.empty_feedback_queue()" + str(ex))
def wrap(*args, **kw):
    # Time one invocation of the wrapped method and pass its result through.
    start = time.time()
    result = method(*args, **kw)
    elapsed = time.time() - start
    if 0 < log_if_longer < elapsed:
        # Call exceeded the positive threshold: print it and record an error.
        PrintLogger.print('%r (%r, %r) %2.2f sec' % (method.__name__, args, kw, elapsed))
        ErrorLogger.log_error(ref, ValueError("Operation took too long."), "completed in " + str(elapsed))
    elif log_if_longer == 0:
        # A threshold of zero means: always log the duration.
        PrintLogger.print('%r took %2.2f sec' % (method.__name__, elapsed))
    return result