Exemple #1
0
 def process_data(self, data: FilteredDomainData, **kwargs):
     account = kwargs.get("Account")
     # is_domain_good = False
     is_spammed = False
     try:
         if isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount):
             majestic = MajesticCom(account)
             if self._en_spam_check:
                 self._filter_domain_name(domain=data.domain)
                 # self._filter_anchor_text(majestic, data.domain)
                 # self._filter_ref_domains(majestic, data.domain)
             if self._en_tf_check:
                 data = self._filter_tf_cf_backlink_ratio(majestic, data)
             if not (data.tf >= self._min_tf and data.ref_domains >= self._min_ref_domains):
                 raise ValueError("tf or cf doesn't match. tf:" + str(data.tf) + " cf: " + str(data.cf) + " ref domain: " + str(data.ref_domains))
             # if data.backlinks / data.ref_domains > self._max_backlink_to_ref_domain_ratio:
             #     raise MajesticSpamException("backlink to ref domain ratio is greater than {0:.1f}".format(self._max_backlink_to_ref_domain_ratio,))
             if self._en_spam_check:
                 self._filter_anchor_text(majestic, data.domain)
                 self._filter_ref_domains(majestic, data.domain)
             # is_domain_good = True
         else:
             raise ValueError("account is none in process_data")
     except MajesticSpamException as mjx_ex:
         is_spammed = True
         data.exception = str(mjx_ex)
     except Exception as ex:
         data.exception = str(ex)
         # ErrorLogger.log_error("MajesticFilter.process_data()", ex, str(data))
     finally:
         PrintLogger.print("Majestic processed: '" + str(data) + "' with: " + account.userID)
         if isinstance(data, FilteredDomainData):
             with self._sync_lock:
                 self._job_done += 1
                 if account is not None:
                     account.Available = True
                 # if data.cf >= self._min_cf and data.tf >= self._min_tf:
                 if data.tf >= self._min_tf and data.cf >= self._min_cf and data.ref_domains >= self._min_ref_domains:
                 # if data.tf >= self._min_tf and data.ref_domains >= self._min_ref_domains:
                     #print("Majatic output:", data)
                     # PrintLogger.print("domain: " + data.domain + " is good.")
                     if not self._is_throughput_debug:
                         if is_spammed:
                             CsvLogger.log_to_file(self._bad_log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir())
                         else:
                             CsvLogger.log_to_file(self._log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir()) # log this to file
                     self._output_queue.put(data)
                     return data
                 # elif is_spammed:
                 #     if not self._is_throughput_debug:
                 #         CsvLogger.log_to_file(self._bad_log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir())
                 #     self._output_queue.put(data)
                     # return data
                 else:
                     if self._is_throughput_debug:
                         self._output_queue.put(data)
                     # return None
                     # print("domain: " + data.domain + " has exception:" + data.exception)
         else:
             pass
Exemple #2
0
 def process_data(self, data: FilteredDomainData, **kwargs):
     #print("MozFilter processing: ", data)
     account = kwargs.get("Account")
     try:
         if isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount):
             if TldUtility.is_top_tld(data.domain):
                 sleep_time =random.randint(self._min_sleep_time, self._max_wait)
                 time.sleep(sleep_time)
                 moz = MozCom(account)
                 if not self._is_throughput_debug:
                     ranking = moz.get_ranking_data(data.domain)
                 else:
                     ranking = 100
                 data.da = ranking
             else:
                 pass
         else:
             raise ValueError("account is none in process_data")
     except Exception as ex:
         ErrorLogger.log_error("MozFilter", ex, "process_data() " + str(data) + " account: " + account.userID)
     finally:
         PrintLogger.print("Moz processed: " + str(data) + " with: " + account.userID)
         if isinstance(data, FilteredDomainData):
             with self._sync_lock:
                 self._job_done += 1
                 if account is not None:
                     account.Available = True
             if data.da >= self._min_DA_value:
                 if not self._is_throughput_debug:
                     CsvLogger.log_to_file(self._log_file, [(data.domain, data.da)]) # log this to file
                 self._output_queue.put(data)
Exemple #3
0
 def process_data_batch(self, data: collections.Iterable, **kwargs):
     #print("MozFilter processing: ", data)
     account = kwargs.get("Account")
     temp = []
     try:
         if isinstance(data, collections.Iterable) and isinstance(account, SiteAccount):
             temp = [x for x in data if isinstance(x, FilteredDomainData) and TldUtility.is_top_tld(x.domain)]
             check_list = [y.domain for y in temp]
             sleep_time =random.randint(self._min_sleep_time, self._max_wait)
             time.sleep(sleep_time)
             moz = MozCom(account)
             if not self._is_throughput_debug:
                 rankings = moz.get_ranking_data_batch(check_list, limit=len(check_list))
             else:
                 rankings = [100] * len(temp)
             for i in range(len(temp)):
                 temp[i].da = rankings[i]
         else:
             raise ValueError("account is none in process_data_batch()")
     except Exception as ex:
         ErrorLogger.log_error("MozFilter", ex, "process_data_batch() " + str(data) + " account: " + account.userID)
     finally:
         PrintLogger.print("Moz processed: " + str(data) + " with: " + account.userID)
         with self._sync_lock:
             job_done = [x for x in data if x is not None]
             self._job_done += len(job_done)
             if account is not None:
                 account.Available = True
             for item in temp:
                 if isinstance(item, FilteredDomainData):
                     # print("moz processed:", item.domain)
                     if item.da >= self._min_DA_value:
                         if not self._is_throughput_debug:
                             CsvLogger.log_to_file(self._log_file, [(item.domain, item.da)]) # log this to file
                         self._output_queue.put(item)
Exemple #4
0
 def process_data(self, data: FilteredDomainData, **kwargs):
     result_ok = False
     if isinstance(data, FilteredDomainData):
         try:
             if len(data.domain_var) == 0:
                 data.domain_var = data.domain
             links = ArchiveOrg.get_url_info(data.domain_var, min_size=self._min_page_size, limit=-100)
             count = len(links)
             data.archive = count
             if count < self._min_profile:
                 pass
                 # raise ValueError("profile count is less than:" + str(self._min_profile))
             result_ok = True
         except Exception as ex:
             if not self._is_throughput_debug:
                 pass
                 # ErrorLogger.log_error("ArchiveOrgFilter.process_data()", ex, data.domain_var)
         finally:
             with self._sync_lock:
                 self._job_done += 1
                     #with self._process_queue_lock:
                 if result_ok:
                     if not self._is_throughput_debug:
                         CsvLogger.log_to_file(self._log_file, [(data.domain, data.da, data.archive)]) # log this to file
                     self._output_queue.put(data)
                     # return data
                 else:
                     if self._is_throughput_debug:
                         self._output_queue.put(data)
                     # return None
     else:
         return None