def process_data(self, data: FilteredDomainData, **kwargs):
    result_ok = False
    if isinstance(data, FilteredDomainData):
        try:
            if len(data.domain_var) == 0:
                data.domain_var = data.domain
            links = ArchiveOrg.get_url_info(data.domain_var, min_size=self._min_page_size, limit=-100)
            count = len(links)
            data.archive = count
            if count < self._min_profile:
                pass  # optionally: raise ValueError("profile count is less than: " + str(self._min_profile))
            result_ok = True
        except Exception as ex:
            if not self._is_throughput_debug:
                ErrorLogger.log_error("ArchiveOrgFilter.process_data()", ex, data.domain_var)
        finally:
            with self._sync_lock:
                self._job_done += 1
        if result_ok:
            if not self._is_throughput_debug:
                CsvLogger.log_to_file(self._log_file, [(data.domain, data.da, data.archive)])  # log this to file
            self._output_queue.put(data)
            return data  # also returned directly so callers can chain filters
        else:
            if self._is_throughput_debug:
                self._output_queue.put(data)
            return None
    else:
        return None
def process_data(self, data: FilteredDomainData, **kwargs):
    account = kwargs.get("Account")
    user_id = account.userID if isinstance(account, SiteAccount) else "unknown account"
    is_spammed = False
    try:
        if isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount):
            majestic = MajesticCom(account)
            if self._en_spam_check:
                self._filter_domain_name(domain=data.domain)
            if self._en_tf_check:
                data = self._filter_tf_cf_backlink_ratio(majestic, data)
                if not (data.tf >= self._min_tf and data.ref_domains >= self._min_ref_domains):
                    raise ValueError("tf or cf doesn't match. tf: " + str(data.tf) +
                                     " cf: " + str(data.cf) +
                                     " ref domains: " + str(data.ref_domains))
                # Optional extra spam signal (disabled):
                # if data.backlinks / data.ref_domains > self._max_backlink_to_ref_domain_ratio:
                #     raise MajesticSpamException("backlink to ref domain ratio is greater than {0:.1f}".format(self._max_backlink_to_ref_domain_ratio))
            if self._en_spam_check:
                self._filter_anchor_text(majestic, data.domain)
                self._filter_ref_domains(majestic, data.domain)
        else:
            raise ValueError("account is None in process_data")
    except MajesticSpamException as mjx_ex:
        is_spammed = True
        data.exception = str(mjx_ex)
    except Exception as ex:
        if isinstance(data, FilteredDomainData):
            data.exception = str(ex)
        # ErrorLogger.log_error("MajesticFilter.process_data()", ex, str(data))
    finally:
        PrintLogger.print("Majestic processed: '" + str(data) + "' with: " + user_id)
        if isinstance(data, FilteredDomainData):
            with self._sync_lock:
                self._job_done += 1
            if account is not None:
                account.Available = True
            if data.tf >= self._min_tf and data.cf >= self._min_cf and data.ref_domains >= self._min_ref_domains:
                if not self._is_throughput_debug:
                    if is_spammed:
                        CsvLogger.log_to_file(self._bad_log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir())
                    else:
                        CsvLogger.log_to_file(self._log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir())  # log this to file
                self._output_queue.put(data)
                return data
            elif self._is_throughput_debug:
                self._output_queue.put(data)
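# A minimal sketch of how a spam check can signal failure to process_data() above:
# the _filter_* helpers are assumed to raise MajesticSpamException, which flips
# is_spammed and routes the domain to the bad-domain CSV. The keyword list and
# body below are hypothetical illustrations, not the project's actual implementation.
def _filter_domain_name(self, domain: str):
    spam_keywords = ("casino", "viagra")  # hypothetical examples
    for word in spam_keywords:
        if word in domain:
            raise MajesticSpamException("spam keyword '" + word + "' in domain: " + domain)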
def process_data(self, data: FilteredDomainData, **kwargs):
    account = kwargs.get("Account")
    user_id = account.userID if isinstance(account, SiteAccount) else "unknown account"
    try:
        if isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount):
            if TldUtility.is_top_tld(data.domain):
                sleep_time = random.randint(self._min_sleep_time, self._max_wait)
                time.sleep(sleep_time)
                moz = MozCom(account)
                if not self._is_throughput_debug:
                    ranking = moz.get_ranking_data(data.domain)
                else:
                    ranking = 100  # fixed dummy DA while throughput-debugging
                data.da = ranking
        else:
            raise ValueError("account is None in process_data")
    except Exception as ex:
        ErrorLogger.log_error("MozFilter", ex, "process_data() " + str(data) + " account: " + user_id)
    finally:
        PrintLogger.print("Moz processed: " + str(data) + " with: " + user_id)
        if isinstance(data, FilteredDomainData):
            with self._sync_lock:
                self._job_done += 1
            if account is not None:
                account.Available = True
            if data.da >= self._min_DA_value:
                if not self._is_throughput_debug:
                    CsvLogger.log_to_file(self._log_file, [(data.domain, data.da)])  # log this to file
                self._output_queue.put(data)
def testArchiveProfileFilter1(self):
    input_param = {"input_queue": queue.Queue(), "output_queue": queue.Queue(), "stop_event": Event(),
                   "throughput_debug": True, "worker_number": 2}
    archive_filter = get_archive_filter(**input_param)
    links = FileIO.FileHandler.read_lines_from_file("/Users/superCat/Desktop/PycharmProjectPortable/test/archive_domain_test.txt")
    for link in links:
        site = FilteredDomainData(domain=link)
        archive_filter.process_data(data=site)
def testMajesticFilter(self):
    majestic_filter = get_majestic_filter(worker_number=1, input_queue=Queue(), output_queue=Queue(),
                                          stop_event=Event())
    param = {"Account": majestic_account}
    links = FileIO.FileHandler.read_lines_from_file("/Users/superCat/Desktop/PycharmProjectPortable/test/spam_test2.txt")
    for link in links:
        link = LinkChecker.get_root_domain(link)[1]
        print("doing link:", link)
        site = FilteredDomainData(domain=link)
        majestic_filter.process_data(data=site, **param)
def _filter_tf_cf_backlink_ratio(self, majestic: MajesticCom, data: FilteredDomainData) -> FilteredDomainData:
    ranking = majestic.get_cf_tf_list(["http://" + data.domain, "www." + data.domain, "http://www." + data.domain],
                                      is_dev=DomainFinderSrc.IS_DEBUG)
    if ranking is not None and len(ranking) > 0:
        for item in ranking:
            if isinstance(item, MajesticComStruct):
                # Deviation measures how far the CF/TF ratio is from 1:1;
                # 999 is a sentinel meaning "no usable ratio".
                item_deviation = 999
                data_deviation = 999
                if item.tf > 0 and item.cf > 0:
                    item_deviation = min(abs(1 - item.cf / item.tf), abs(1 - item.tf / item.cf))
                else:
                    continue
                if data.tf > 0 and data.cf > 0:
                    data_deviation = min(abs(1 - data.cf / data.tf), abs(1 - data.tf / data.cf))
                # Adopt this domain variant if it clears the minimum TF/CF bars, is
                # better balanced than the current data, and is within the allowed deviation.
                if item.tf >= self._min_tf and item.cf >= self._min_cf \
                        and item_deviation < data_deviation and item_deviation <= self._cf_tf_deviation:
                    data.domain_var = item.domain
                    data.tf = item.tf
                    data.cf = item.cf
                    data.backlinks = item.backlinks
                    data.ref_domains = item.ref_domains
                    data.topic = item.topic
    return data
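# Worked example of the CF/TF deviation used in _filter_tf_cf_backlink_ratio()
# (a self-contained sketch with made-up numbers): the deviation is
# min(|1 - cf/tf|, |1 - tf/cf|), so 0.0 means a perfectly balanced CF/TF profile
# and larger values mean a more lopsided one.
def cf_tf_deviation(cf: int, tf: int) -> float:
    if cf <= 0 or tf <= 0:
        return 999  # same sentinel as above: no usable ratio
    return min(abs(1 - cf / tf), abs(1 - tf / cf))

assert cf_tf_deviation(30, 30) == 0.0            # balanced profile
assert round(cf_tf_deviation(20, 40), 2) == 0.5  # CF is half of TF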
def input_thread():
    data_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/03-09-2015-Bad-Results.csv"
    input_c = 0
    with open(data_path, mode='r', newline='') as csv_file:
        rd = csv.reader(csv_file, delimiter=',')
        for row in rd:
            if input_c < site_count:
                input_q.put(FilteredDomainData(domain=row[0]))
                time.sleep(0.0001)
            else:
                break
            input_c += 1
def testFilter(self):
    manager = AccountManager()
    manager.AccountList.append(majestic_account)
    input_param = {"input_queue": queue.Queue(), "output_queue": queue.Queue(), "stop_event": Event()}
    majestic_filter = MajesticFilter(manager=manager, **input_param)
    param = {"Account": majestic_account}
    links = FileIO.FileHandler.read_lines_from_file("/Users/superCat/Desktop/PycharmProjectPortable/test/spam_test1.txt")
    for link in links:
        site = FilteredDomainData(domain=link)
        majestic_filter.process_data(data=site, **param)
def testWriteToDb(self):
    db_path = "/Users/superCat/Desktop/PycharmProjectPortable/sync/FilteredSitesList"
    good_db = "/Users/superCat/Desktop/PycharmProjectPortable/sync/Majestic_filtering_good.csv"
    table = "20/12/2015 Legal"
    db = FilteredResultDB(table=table, offset=0, db_addr=db_path)
    count = 0
    temp_sites = []
    with open(good_db, mode='r', newline='') as csv_file:
        rd = csv.reader(csv_file, delimiter=',')
        for row in rd:
            if int(row[10]) > 1450612100:  # epoch seconds; keeps rows newer than ~2015-12-20, matching the table name
                data = FilteredDomainData.from_tuple(row)
                print(data.__dict__)
                count += 1
                temp_sites.append(data)
    print("total:", count)
    db.add_sites(temp_sites, skip_check=False)
    db.close()
def testFilterAll1(self):
    db_path = "/Users/superCat/Desktop/PycharmProjectPortable/sync/"
    manager = AccountManager(db_path)
    input_param = {"input_queue": queue.Queue(), "output_queue": queue.Queue(), "stop_event": Event(),
                   "throughput_debug": True, "worker_number": 2}
    file_path = "/Users/superCat/Desktop/PycharmProjectPortable/sync/Moz_filtering.csv"
    archive_filter = get_archive_filter(**input_param)
    majestic_filter = get_majestic_filter(**input_param)
    param = {"Account": majestic_account}
    count = 0
    with open(file_path, 'rt') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        for row in reader:
            link, da = row
            site = FilteredDomainData(domain=link, da=int(da))
            print(count, " process:", link)
            archive_data = archive_filter.process_data(data=site)
            if archive_data is not None:
                majestic_data = majestic_filter.process_data(data=site, **param)
                print(majestic_data)
            count += 1
def format_input(data):
    if isinstance(data, FilteredDomainData):
        return data
    elif isinstance(data, tuple) and len(data) == 2:
        return FilteredDomainData(domain=data[0])  # only the domain field of the (domain, da) pair is used
    else:
        return None
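# Usage sketch for format_input() above, with hypothetical values: both accepted
# input shapes normalize to a FilteredDomainData instance; anything else yields None.
assert isinstance(format_input(FilteredDomainData(domain="example.com")), FilteredDomainData)
assert isinstance(format_input(("example.com", 25)), FilteredDomainData)
assert format_input("example.com") is None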
def input_thread():
    for i in range(site_count):
        input_q.put(FilteredDomainData(domain="domain{0:d}.com".format(i)))
        time.sleep(0.0001)
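# A minimal harness sketch (assumptions: input_q, site_count, and input_thread come
# from the surrounding test module) showing how this producer pairs with a
# queue-driven filter; it is an illustration, not part of the original suite.
import threading

producer = threading.Thread(target=input_thread, daemon=True)
producer.start()
# A filter constructed with input_queue=input_q would consume the queued
# FilteredDomainData items concurrently, e.g. one built via get_archive_filter().
producer.join()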