    def __init__(self, original_domain: str, link: str, external_stop_event: multiprocessing.Event, max_thread: int = 1,
                 download_content=True, download_base_dir=None, max_level=2, max_page=200):
        self._original_domain = original_domain
        self._archive_link = link
        self._external_stop_event = external_stop_event
        self._internal_pages = []  # a list of PageAttrs for page comparison
        self._external_ref_page = []  # a list of PageAttrs for page comparison
        self._internal_list = []  # a list of LinkAttrs for checking the download list
        self._broken_res_list = []
        inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
        file_path, ref_path = LinkUtility.make_valid_web_res_path(path, fragment)
        self._internal_list.append(LinkAttrs(link=link, path=file_path, ref_link=ref_path, shadow_ref_link=ref_path,
                                             source=file_path,
                                             res_type=LinkUtility.EXT_WEBPAGE, level=0))
        self._max_thread = max_thread
        self._max_level = max_level
        self._current_level = 0
        self._max_page = max_page
        if max_thread < 1:
            self._max_thread = 1
        self._download_content = download_content
        if self._download_content and download_base_dir is None:
            raise ValueError("ArchiveExplorer.__init__: download_base_dir cannot be None.")
        # note: files are written under the default archive dir; download_base_dir is only validated above
        self._file_manager = SiteFileManager(base_dir_path=FilePath.get_default_archive_dir(), file_name=original_domain)
        self._file_manager.write_to_error_log(LinkAttrs.get_titles())
        self._max_redirect = 10
        self._max_retries = 2
        self._pool = None
        self._sync_lock = threading.RLock()

        self._broken_webpage_count = 0
        self._broken_image_count = 0
        self._broken_css_count = 0
        self._broken_js_count = 0
        self._broken_others_count = 0

        self._total_webpage_count = 0
        self._total_image_count = 0
        self._total_css_count = 0
        self._total_js_count = 0
        self._total_others_count = 0

        self._total_res_done = 0

        self._timeout = 10
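
A minimal usage sketch (not part of the listing), assuming the enclosing class is ArchiveExplorer, the name its own ValueError message reports, and that FilePath is importable at the call site; the domain and wayback-style URL are placeholders.

import multiprocessing

# Hypothetical driver code for the constructor above.
stop_event = multiprocessing.Event()
explorer = ArchiveExplorer(original_domain="example.com",
                           link="https://web.archive.org/web/2015/http://example.com/",
                           external_stop_event=stop_event,
                           max_thread=4,
                           download_content=True,
                           download_base_dir=FilePath.get_default_archive_dir(),
                           max_level=2,
                           max_page=200)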
Example #2
from DomainFinderSrc.Utilities import FilePath
from DomainFinderSrc.MiniServer.DatabaseServer.SiteDB import CategoryDomainSiteDB, CatagoryDomainSiteDataStruct
from DomainFinderSrc.MiniServer.DatabaseServer.CategoryDB import *
from DomainFinderSrc.MiniServer.DatabaseServer.DBManager import DBManagerInterface
from threading import Event, RLock
from DomainFinderSrc.Utilities.Logging import ErrorLogger
from DomainFinderSrc.MiniServer.Common.SocketCommands import ServerState, MiningList, ServerCommand, CommandStruct
from DomainFinderSrc.Utilities.Serializable import Serializable, NamedMutableSequence

market_place_db_addr = FilePath.get_marketplace_db_path("MarketplaceSites.db")
market_place_skeleton_db_addr = FilePath.get_marketplace_db_path("MarketplaceSkeletonSites.db")


def db_update_process(skeleton_db_addr: str = "", market_db_addr: str = ""):
    if len(skeleton_db_addr) == 0:
        skeleton_db_addr = market_place_skeleton_db_addr
    if len(market_db_addr) == 0:
        market_db_addr = market_place_db_addr
    skeleton_db_manager = CategoryDBManager(skeleton_db_addr)
    skeleton_db_manager.reset_category_count()
    len_per_patch = 20000
    db = CategoryDomainSiteDB(market_db_addr)
    site_count = db.site_count(False)
    current_count = 0
    while current_count < site_count:
        sites = db.get_next_patch_no_rollover(current_count, len_per_patch)
        for site in sites:
            if isinstance(site, CatagoryDomainSiteDataStruct):
                for topic in site.get_categories():
                    sub_category = skeleton_db_manager.get_sub_category(CategoryManager.decode_sub_category(topic, False))
                    sub_category.count += 1
                current_count += 1  # note: only advances for recognized CatagoryDomainSiteDataStruct rows
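
A sketch of invoking db_update_process off the main process (an assumption; the listing never shows the call site). Empty-string arguments fall back to the module-level default DB paths defined above.

from multiprocessing import Process

# Hypothetical invocation; defaults resolve to the marketplace DB paths.
updater = Process(target=db_update_process)
updater.start()
updater.join()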
Example #3
 def process_data(self, data: FilteredDomainData, **kwargs):
     account = kwargs.get("Account")
     # is_domain_good = False
     is_spammed = False
     try:
         if isinstance(data, FilteredDomainData) and isinstance(account, SiteAccount):
             majestic = MajesticCom(account)
             if self._en_spam_check:
                 self._filter_domain_name(domain=data.domain)
                 # self._filter_anchor_text(majestic, data.domain)
                 # self._filter_ref_domains(majestic, data.domain)
             if self._en_tf_check:
                 data = self._filter_tf_cf_backlink_ratio(majestic, data)
             if not (data.tf >= self._min_tf and data.ref_domains >= self._min_ref_domains):
                 raise ValueError("tf or ref domains don't match. tf:" + str(data.tf) + " cf: " + str(data.cf) + " ref domains: " + str(data.ref_domains))
             # if data.backlinks / data.ref_domains > self._max_backlink_to_ref_domain_ratio:
             #     raise MajesticSpamException("backlink to ref domain ratio is greater than {0:.1f}".format(self._max_backlink_to_ref_domain_ratio,))
             if self._en_spam_check:
                 self._filter_anchor_text(majestic, data.domain)
                 self._filter_ref_domains(majestic, data.domain)
             # is_domain_good = True
         else:
             raise ValueError("account is none in process_data")
     except MajesticSpamException as mjx_ex:
         is_spammed = True
         data.exception = str(mjx_ex)
     except Exception as ex:
         data.exception = str(ex)
         # ErrorLogger.log_error("MajesticFilter.process_data()", ex, str(data))
     finally:
         user_id = account.userID if account is not None else "no account"  # guard: account may be None here
         PrintLogger.print("Majestic processed: '" + str(data) + "' with: " + user_id)
         if isinstance(data, FilteredDomainData):
             with self._sync_lock:
                 self._job_done += 1
                 if account is not None:
                     account.Available = True
                 # if data.cf >= self._min_cf and data.tf >= self._min_tf:
                 if data.tf >= self._min_tf and data.cf >= self._min_cf and data.ref_domains >= self._min_ref_domains:
                 # if data.tf >= self._min_tf and data.ref_domains >= self._min_ref_domains:
                     #print("Majatic output:", data)
                     # PrintLogger.print("domain: " + data.domain + " is good.")
                     if not self._is_throughput_debug:
                         if is_spammed:
                             CsvLogger.log_to_file(self._bad_log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir())
                         else:
                             CsvLogger.log_to_file(self._log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir()) # log this to file
                     self._output_queue.put(data)
                     return data
                 # elif is_spammed:
                 #     if not self._is_throughput_debug:
                 #         CsvLogger.log_to_file(self._bad_log_file, [data.to_tuple()], dir_path=FilePath.get_temp_db_dir())
                 #     self._output_queue.put(data)
                     # return data
                 else:
                     if self._is_throughput_debug:
                         self._output_queue.put(data)
                     # return None
                     # print("domain: " + data.domain + " has exception:" + data.exception)
         else:
             pass
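
A sketch of a single call into process_data (hypothetical; the real driver is presumably the FilterInterface worker pool, which also checks accounts out before each call). It assumes FilteredDomainData accepts a domain keyword and that account is a checked-out SiteAccount.

# Hypothetical call site; "example.com" is a placeholder.
data = FilteredDomainData(domain="example.com")
result = majestic_filter.process_data(data, Account=account)
if result is not None:
    print("passed filtering:", result.domain)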
Example #4
 def __init__(self, *args, TF=15, CF=15, CF_TF_Deviation=0.80, Ref_Domains=10, manager: AccountManager,
              accounts=None, en_tf_check=True, en_spam_check=True, **kwargs):
     self._min_tf = TF
     self._min_cf = CF
     self._min_ref_domains = Ref_Domains
     acc_manager = manager
     self._cf_tf_deviation = CF_TF_Deviation
     self._majestic_result_anchor_limit = 50
     self._majestic_result_ref_domain_limit = 50
     self._max_backlink_to_ref_domain_ratio = 6.0
     self._max_percentage_for_anchor_text_ratio = 0.1
     self._en_spam_check = en_spam_check
     self._en_tf_check = en_tf_check
     self._log_file = "Majestic_filtering_good.csv"
     self._bad_log_file = "Majestic_filtering_bad.csv"
     self._spam_keyword = [x.lower() for x in FileIO.FileHandler.read_lines_from_file(FilePath.get_spam_filter_keywords_file_path())]
     self._spam_anchor = [x.lower() for x in FileIO.FileHandler.read_lines_from_file(FilePath.get_spam_filter_anchors_file_path())]
     self._white_keyword_list = [x.lower() for x in FileIO.FileHandler.read_lines_from_file(FilePath.get_spam_filter_white_list_file_path())]
     self._bad_country = [x.upper() for x in FileIO.FileHandler.read_lines_from_file(FilePath.get_spam_filter_bad_country_path())]
      if not accounts:  # avoid the shared mutable-default pitfall; None or empty falls back to the manager
         self._account_list = acc_manager.get_accounts(AccountType.Majestic)
     else:
         self._account_list = [x for x in accounts if isinstance(x, SiteAccount)]
      worker_number = kwargs["worker_number"]  # required by the caller; <= 0 means one worker per account
      if worker_number <= 0:
          worker_number = len(self._account_list)
     kwargs.update({"worker_number": worker_number})
     FilterInterface.__init__(self, *args, **kwargs)
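
A minimal construction sketch, assuming the class above is MajesticFilter (the name referenced in process_data's commented-out error logging) and that AccountManager needs no constructor arguments; the thresholds shown mirror the defaults.

# Hypothetical wiring for the constructor above.
acc_manager = AccountManager()
majestic_filter = MajesticFilter(manager=acc_manager,
                                 TF=15, CF=15, Ref_Domains=10,
                                 en_tf_check=True, en_spam_check=True,
                                 worker_number=0)  # <= 0 falls back to one worker per account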