Ejemplo n.º 1
0
 def process_data(self, data: FilteredDomainData, **kwargs):
     """
     Run the enabled Majestic checks on one domain record and publish the outcome.

     The checks run in this order: domain-name filter (if spam checking is on),
     TF/CF refresh (if TF checking is on), the minimum tf / ref_domains gate,
     then the anchor-text and ref-domain filters (if spam checking is on).
     Any failure is recorded as text in ``data.exception``; a
     MajesticSpamException additionally marks the record as spammed.

     Bookkeeping always runs afterwards: the job counter is incremented and the
     account is flagged as available again, both under ``self._sync_lock``.

     :param data: the domain record to check.
     :param kwargs: must contain "Account", the SiteAccount used to build MajesticCom.
     :return: ``data`` when its tf, cf and ref_domains all reach the configured
              minimums (it is also put on the output queue), otherwise None.
     """
     account = kwargs.get("Account")
     is_spammed = False
     result = None
     try:
         if not (isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount)):
             raise ValueError("account is none in process_data")
         majestic = MajesticCom(account)
         if self._en_spam_check:
             self._filter_domain_name(domain=data.domain)
         if self._en_tf_check:
             data = self._filter_tf_cf_backlink_ratio(majestic, data)
         if not (data.tf >= self._min_tf and data.ref_domains >= self._min_ref_domains):
             raise ValueError("tf or cf doesn't match. tf:" + str(data.tf) + " cf: " + str(data.cf) + " ref domain: " + str(data.ref_domains))
         # The remaining spam filters run only after the metric gate above.
         if self._en_spam_check:
             self._filter_anchor_text(majestic, data.domain)
             self._filter_ref_domains(majestic, data.domain)
     except MajesticSpamException as mjx_ex:
         is_spammed = True
         data.exception = str(mjx_ex)
     except Exception as ex:
         # The validation error above can fire because data itself is not a
         # FilteredDomainData; only annotate objects known to carry the field.
         if isinstance(data, FilteredDomainData):
             data.exception = str(ex)
     finally:
         # account may be None (that is what the validation error reports), so
         # never dereference it directly here: an error raised inside finally
         # would skip the counter update and the account release below.
         account_label = str(getattr(account, "userID", None))
         PrintLogger.print("Majestic processed: '" + str(data) + "' with: " + account_label)
         if isinstance(data, FilteredDomainData):
             with self._sync_lock:
                 self._job_done += 1
                 if account is not None:
                     account.Available = True
                 if data.tf >= self._min_tf and data.cf >= self._min_cf and data.ref_domains >= self._min_ref_domains:
                     if not self._is_throughput_debug:
                         # Spam-flagged records go to the "bad" log, the rest to the main log.
                         target_log = self._bad_log_file if is_spammed else self._log_file
                         CsvLogger.log_to_file(target_log, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir())
                     self._output_queue.put(data)
                     result = data
                 elif self._is_throughput_debug:
                     # In throughput-debug mode every record is forwarded, good or not.
                     self._output_queue.put(data)
     # Returning here rather than inside finally keeps an in-flight exception
     # from being silently discarded.
     return result
Ejemplo n.º 2
0
 def _filter_ref_domains(self, majestic: MajesticCom, domain: str) -> bool:
     """
     Check the referring domains of a domain against the bad-country list.

     :param majestic: client used to fetch the referring domains.
     :param domain: domain whose referring domains are inspected.
     :return: True if everything is ok, else raise Exception.
     :raises MajesticSpamException: when a single bad-country referring domain has
         more than 30 backlinks, when 5 or more referring domains come from bad
         countries, or when more than 25 percent of the fetched records do.
     """
     max_bad_country_ratio = 0.25
     max_bad_country_count = 5
     max_backlinks_for_single_bad_country = 30
     ref_domains = majestic.get_ref_domains(domain, max_count=self._majestic_result_ref_domain_limit,
                                            is_dev=DomainFinderSrc.IS_DEBUG, fresh_data=True)
     total_record = len(ref_domains)
     bad_country_count = 0
     for ref_domain in ref_domains:
         if not isinstance(ref_domain, MajesticRefDomainStruct):
             continue
         if ref_domain.country not in self._bad_country:
             continue
         bad_country_count += 1
         if ref_domain.backlinks > max_backlinks_for_single_bad_country:
             raise MajesticSpamException("{0:s} from bad country has more than {1:d} backlinks.".format(ref_domain.domain,max_backlinks_for_single_bad_country))
     if bad_country_count >= max_bad_country_count:
         raise MajesticSpamException("too many bad countries, {0:d} detected.".format(bad_country_count,))
     # Only compute the ratio when there are records: dividing first and
     # checking total_record afterwards raises ZeroDivisionError on an empty result.
     if total_record > 0:
         bad_country_ratio = bad_country_count/total_record
         if bad_country_ratio > max_bad_country_ratio:
             raise MajesticSpamException("bad country ratio in ref domains is too high: {0:.1f} percent.".format(bad_country_ratio*100,))
     return True
Ejemplo n.º 3
0
    def _filter_tf_cf_backlink_ratio(self, majestic: MajesticCom, data: FilteredDomainData) -> FilteredDomainData:
        """
        Refresh the metrics of ``data`` from the best-balanced URL variant of its domain.

        The "http://", "www." and "http://www." variants of ``data.domain`` are
        looked up. A variant replaces the metrics currently held by ``data`` when
        it reaches the minimum tf and cf, its cf/tf deviation is within
        ``self._cf_tf_deviation``, and that deviation is strictly smaller than the
        deviation of the metrics ``data`` holds at that point.

        :param majestic: client used to fetch the cf/tf ranking list.
        :param data: domain record to update in place.
        :return: the same ``data`` object, possibly with updated metrics.
        """
        def deviation(tf, cf):
            # Smaller of |1 - cf/tf| and |1 - tf/cf|; callers guarantee tf > 0.
            # When cf is 0 the second ratio is undefined, so only the first one
            # is used instead of raising ZeroDivisionError.
            cf_over_tf = abs(1-cf/tf)
            if cf == 0:
                return cf_over_tf
            return min(cf_over_tf, abs(1-tf/cf))

        ranking = majestic.get_cf_tf_list(["http://"+data.domain,
                                           "www."+data.domain,
                                           "http://www."+data.domain],
                                          is_dev=DomainFinderSrc.IS_DEBUG)
        if not ranking:
            return data
        for item in ranking:
            # Entries of another type, or without a positive tf, are never candidates.
            if not isinstance(item, MajesticComStruct) or not item.tf > 0:
                continue
            item_deviation = deviation(item.tf, item.cf)
            # Recomputed on every pass because data.tf / data.cf change whenever
            # an earlier candidate was adopted; 999 means "no usable metrics yet".
            data_deviation = deviation(data.tf, data.cf) if data.tf > 0 else 999

            if item.tf >= self._min_tf and item.cf >= self._min_cf and item_deviation < data_deviation and item_deviation <= self._cf_tf_deviation:
                data.domain_var = item.domain
                data.tf = item.tf
                data.cf = item.cf
                data.backlinks = item.backlinks
                data.ref_domains = item.ref_domains
                data.topic = item.topic
        return data
Ejemplo n.º 4
0
    def _filter_anchor_text(self, majestic: MajesticCom, domain: str) -> bool:
        """
        Check the anchor texts pointing at a domain for spam signals.

        :param majestic: client used to fetch the anchor text information.
        :param domain: domain to check.
        :return: True if everything ok, else raise Exception.
        :raises MajesticSpamException: when there are too few anchor variations,
            when a non-branded anchor accounts for more than 25 percent of the
            referring domains, or when an anchor contains a spam word and none of
            the white-listed keywords.
        """
        brand = LinkChecker.get_root_domain(domain)[6]
        min_anchor_variation_limit = 2
        no_follow_limit = 0.5
        non_brand_share_limit = 0.25
        anchor_list, total_backlinks, _deleted, nofollow, total_ref_domains \
            = majestic.get_anchor_text_info(domain=domain, max_count=self._majestic_result_anchor_limit,
                                            is_dev=DomainFinderSrc.IS_DEBUG, fresh_data=True)
        # NOTE(review): the condition rejects "<= 2" variations while the message
        # says "less than 2" - confirm which one is intended.
        if len(anchor_list) <= min_anchor_variation_limit:
            raise MajesticSpamException("number of anchor variation is less than 2.")

        # Guard the division: with no backlinks at all the ratio is treated as 0.
        nofollow_ratio = nofollow/total_backlinks if total_backlinks else 0.0
        # NOTE(review): a high nofollow ratio is not rejected, yet it skips the
        # anchor scan below; this mirrors the existing if/elif chain - confirm
        # that skipping is intended.
        if nofollow_ratio > no_follow_limit or len(self._spam_anchor) == 0:
            return True

        for anchor, ref_domains, _total_links, _deleted_links, _no_follow_links in anchor_list:
            is_branded = brand in anchor or brand in anchor.replace(' ', '')
            # The share is only meaningful when there are referring domains;
            # skipping it otherwise avoids a ZeroDivisionError.
            if not is_branded and total_ref_domains:
                share = ref_domains/total_ref_domains
                if share > non_brand_share_limit:
                    raise MajesticSpamException("non branded anchor '{0:s}' exceeded limit {1:.2f}.".format(anchor, share))

            for spam in self._spam_anchor:
                if spam in anchor and not any(x in anchor for x in self._white_keyword_list):
                    raise MajesticSpamException("anchor {0:s} is in spam word {1:s}".format(anchor, spam))

        return True
Ejemplo n.º 5
0
from DomainFinderSrc.MajesticCom import *
from DomainFinderSrc.SiteConst import *

# Service accounts used by this script.
# NOTE(review): the API keys and access IDs below are hard-coded in source -
# consider loading them from configuration kept outside version control.

# Majestic account and the client built from it.
majestic_account = SiteAccount(
    siteType=AccountType.Majestic,
    userID="*****@*****.**",
    password="******",
    APIkey="1BB1D141D20CAF35D331F086F55C1CEE",
)
majestic = MajesticCom(majestic_account)

# Moz accounts.
moz_account = SiteAccount(
    siteType=AccountType.Moz,
    userID="*****@*****.**",
    password="******",
    AccessID="mozscape-320a4616a8",
    APIkey="f03c19321b0973573137288c647b31ea",
)

moz_account_fake = SiteAccount(
    siteType=AccountType.Moz,
    userID="*****@*****.**",
    password="******",
    AccessID="mozscape-44a37bfcd5",
    APIkey="bedefa75b4c17317a94a421108974f1d",
)

# Amazon EC2 account (no password is given for this one).
amazon_ec2_account = SiteAccount(
    siteType=AccountType.AmazonEC2,
    userID="*****@*****.**",
    AccessID="AKIAIPA2WM3ILJWR2KSA",
    APIkey="7EisLQmbOv04ExZM9Fj1rxnmWiKw8wae5shRPDdx",
)

buy_proxy_org_account = SiteAccount(siteType=AccountType.BuyProxyOrg,
                                    userID="*****@*****.**",
                                    password="******",
                                    AccessID="49885",