Code example #1
def check_whois_with_dns(page: OnSiteLink):

    real_response_code = ResponseCode.DNSError
    skip_whois_check = False
    try:
        root_result = LinkChecker.get_root_domain(page.link)
        root_domain = root_result[1]
        sub_domain = root_result[4]
        suffix = root_result[5]

        if len(sub_domain) == 0 or suffix not in TldUtility.TOP_TLD_LIST:
            skip_whois_check = True
        else:

            if LinkChecker.is_domain_DNS_OK(sub_domain):  # check DNS first
                real_response_code = ResponseCode.NoDNSError
                skip_whois_check = True
            elif not sub_domain.startswith("www."):
                if LinkChecker.is_domain_DNS_OK("www." + root_domain):
                    real_response_code = ResponseCode.NoDNSError
                    skip_whois_check = True
                # response = LinkChecker.get_response(page.link, timeout)  # check 404 error

            page.response_code = real_response_code
            page.link_type = OnSiteLink.TypeOutbound
            page.link = root_domain

    except Exception as ex:
        # ErrorLogger.log_error("WhoisChecker", ex, "_check_whois_with_dns() " + page.link)
        skip_whois_check = True
    finally:
        if not skip_whois_check and real_response_code == ResponseCode.DNSError:
            return check_whois(page)
        else:
            return page.link, page.response_code
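LinkChecker.is_domain_DNS_OK itself is not shown in these examples. A minimal sketch of the kind of DNS probe it presumably performs, using only the standard socket module (the helper name dns_resolves is hypothetical):

import socket

def dns_resolves(host: str, timeout: float = 3.0) -> bool:
    # A host name that fails to resolve suggests the domain has no usable DNS record.
    try:
        socket.setdefaulttimeout(timeout)
        socket.gethostbyname(host)
        return True
    except OSError:  # socket.gaierror and socket.timeout are OSError subclasses
        return False

print(dns_resolves("www.example.com"))        # expected: True
print(dns_resolves("no-such-host.invalid"))   # expected: False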
Code example #2
 def _check_whois(self, domain_data: OnSiteLink):
     root_domain = domain_data.link.lower()
     try:
         if not self._is_debug:
             if root_domain.startswith("http"):
                 root_domain = LinkChecker.get_root_domain(
                     domain_data.link)[1]
             is_available, is_redemption = LinkChecker.is_domain_available_whois(
                 root_domain)  # check whois record
             if is_available or is_redemption:
                 if is_available:
                     real_response_code = ResponseCode.Expired
                 else:
                     real_response_code = ResponseCode.MightBeExpired
                 domain_data.link = root_domain
                 domain_data.response_code = real_response_code
                 #return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
                 self._put_output_result_in_queue(domain_data)
         else:
             self._put_output_result_in_queue(domain_data)
     except Exception as ex:
         ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex,
                               "_check_whois() " + root_domain)
     finally:
         self._add_job_done_one()
Code example #3
 def _check_whois_v1(self, domain_data: OnSiteLink):
     root_domain = domain_data.link
     try:
         if root_domain.startswith("http"):
             root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
         real_response_code = domain_data.response_code
         whois = LinkChecker.check_whois(root_domain)  # check whois record
         if whois[0]:
             if whois[2]:  # domain is expired
                 real_response_code = ResponseCode.Expired
             else:
                 real_response_code = ResponseCode.MightBeExpired
         if real_response_code == ResponseCode.Expired:
             #if ResponseCode.domain_might_be_expired(real_response_code):
             domain_data.link = root_domain
             domain_data.response_code = real_response_code
             #return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
             # if isinstance(self._queue_lock, multiprocessing.RLock):
             with self._queue_lock:
                 self._output_q.put(
                     (domain_data.link, domain_data.response_code))
     except Exception as ex:
         ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex,
                               "_check_whois() " + root_domain)
     finally:
         self._add_job_done_one()
Code example #4
def check_whois_with_dns(page: OnSiteLink):

    real_response_code = ResponseCode.DNSError
    skip_whois_check = False
    try:
        root_result = LinkChecker.get_root_domain(page.link)
        root_domain = root_result[1]
        sub_domain = root_result[4]
        suffix = root_result[5]

        if len(sub_domain) == 0 or suffix not in TldUtility.TOP_TLD_LIST:
            skip_whois_check = True
        else:

            if LinkChecker.is_domain_DNS_OK(sub_domain):  # check DNS first
                real_response_code = ResponseCode.NoDNSError
                skip_whois_check = True
            elif not sub_domain.startswith("www."):
                if LinkChecker.is_domain_DNS_OK("www." + root_domain):
                    real_response_code = ResponseCode.NoDNSError
                    skip_whois_check = True
                # response = LinkChecker.get_response(page.link, timeout)  # check 404 error

            page.response_code = real_response_code
            page.link_type = OnSiteLink.TypeOutbound
            page.link = root_domain

    except Exception as ex:
        # ErrorLogger.log_error("WhoisChecker", ex, "_check_whois_with_dns() " + page.link)
        skip_whois_check = True
    finally:
        if not skip_whois_check and real_response_code == ResponseCode.DNSError:
            return check_whois(page)
        else:
            return page.link, page.response_code
Code example #5
    def get_sites_by_seed_sites(account: MajesticCom, seed_domains: [], catagories: [], fresh_data=False, index=0,
                                iteration=1, loop_count=0, count_per_domain=100, callback=None, current_count=0,
                                max_count=-1, tf=20) -> []:
        if iteration < 0:
            raise ValueError("get_sites_by_seed_sites: iteration should >= 0.")
        sub_domains = [LinkChecker.get_root_domain(x, use_www=False)[4] for x in seed_domains[index:]]
        if len(sub_domains) == 0:
            return []
        backlinks = []
        # counter = index
        if max_count > 0 and current_count >= max_count:
            return backlinks
        temp_sub_domains = []
        temp = []
        # target_catagories = []
        # for catagory in catagories:
        #     target_catagories.append(str(CategoryManager.decode_sub_category(catagory, False)))
        for sub_domain in sub_domains:
            print("doing backlinks of domain:", sub_domain, "seed len:", len(temp_sub_domains))
            try:
                temp = account.get_backlinks(sub_domain, count_per_domain, topic="", is_dev=False,
                                             fresh_data=fresh_data)
                current_count += 1
            except Exception as ex:
                print(ex)
            for item in temp:
                if isinstance(item, MajesticBacklinkDataStruct):

                    # item_catagory = str(CategoryManager.decode_sub_category(item.src_topic, False))
                    domain = LinkChecker.get_root_domain(item.backlink, use_www=False)[4]
                    item.ref_domain = domain
                    # if callback is not None:
                    #     callback(item)
                    # if len(target_catagories) > 0 and item_catagory not in target_catagories:
                    #         continue
                    if domain not in sub_domains and domain not in temp_sub_domains:
                        if len(catagories) > 0:
                            is_in = False
                            if len(item.src_topic) > 0:
                                decoded = str(CategoryManager.decode_sub_category(item.src_topic, False))
                                for cate in catagories:
                                    if cate in decoded:
                                        is_in = True
                                        break
                                if is_in and item.src_tf >= tf:
                                    temp_sub_domains.append(domain)
                        elif item.src_tf >= tf:
                            temp_sub_domains.append(domain)
                        item.ref_domain = domain
                        if callback is not None:
                            callback(item)

            if max_count > 0 and current_count >= max_count:
                break
        if loop_count >= iteration:
            return backlinks
        else:
            return backlinks + GoogleMajestic.get_sites_by_seed_sites(account, sub_domains + temp_sub_domains, catagories, fresh_data, len(seed_domains),
                                                                      iteration, loop_count+1, count_per_domain, callback, current_count, max_count, tf)
Code example #6
 def testRequestAllLink(self):
     url = "http://www.jehovahs-witness.com"
     agent = "VegeBot-Careful"
     source = LinkChecker.get_page_source(url, agent=agent, from_src="*****@*****.**", retries=0)
     links = LinkChecker.get_all_links_from_source(source)
     for link in links:
         paras = urlsplit(link)
         page_scheme, page_domain = paras[0], paras[1]
         print(LinkChecker.get_valid_link(page_domain, link.strip(), page_scheme))
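The test above splits each link with urlsplit before rebuilding a valid URL. A self-contained illustration of that decomposition, using only the standard library:

from urllib.parse import urlsplit

paras = urlsplit("http://www.example.com/path/page.html?q=1#top")
page_scheme, page_domain = paras[0], paras[1]   # same indexing as in the test
print(page_scheme)   # 'http'
print(page_domain)   # 'www.example.com'
print(paras.path)    # '/path/page.html'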
Code example #7
    def get_sites(keyword: str, page_number: int=1, result_per_page: int=100,
                  index: int=0, length: int=100, use_browser=False,
                  source_type="", filter_list=[], country_code='', return_domain_home_only=True, days_ago=0,
                  **kwargs) -> []:

    # def get_sites(keyword: str, page: int=1, index: int=0, length: int=100,
    #               history=SeedSiteSettings.TIME_NOW, blog=False) -> []:
        assert page_number > 0, "page number should greater than 0"
        assert index >= 0, "index should greater or equal to 0"
        assert length > 0, "length should greater than 0"
        search_query = BingConst.SearchLink.format(quote(keyword), quote(keyword), (page_number-1)*length + index + 1, length)
        user_agent = WebRequestCommonHeader.webpage_agent
        try:
            req = BingCom._get_response(request_link=search_query, user_agent=user_agent, **kwargs)
            # req = requests.get(search_query, timeout=30, headers=WebRequestCommonHeader.get_html_header())
            result = req.text
            soup = bs4.BeautifulSoup(result)
            tags = soup.select(BingConst.SitePath)
            domains = []
            for tag in tags:
                try:
                    domain = tag.attrs["href"].strip().replace(" ", "")
                    if return_domain_home_only:
                        domain = LinkChecker.get_root_domain(domain, use_www=False)[2]  # get the link
                    else:
                        domain = LinkChecker.get_root_domain(domain, use_www=False)[3]
                    if len(domain) > 0:
                        domains.append(domain)
                except:
                    pass

            new_list = []
            if isinstance(domains, list):
                if len(filter_list) > 0:
                    for domain in domains:
                        if isinstance(domain, str):
                            temp = domain.lower().strip()
                            if not any(x in temp for x in filter_list):
                                new_list.append(temp)
                else:
                    new_list = domains

            end = index + length
            data_len = len(new_list)
            if domains is not None and index < data_len:
                if data_len >= end:
                    return new_list[index:end]
                else:
                    return new_list[index:]
            else:
                return []

        except Exception as ex:
            print(ex)
            return None
Code example #8
    def get_search_results(keyword: str, page_number: int, proxy: ProxyStruct=None, result_per_page: int=GoogleConst.Result100, timeout=5,
                           return_domain_home_only=True, use_forbidden_filter=True, days_ago=0, addtional_query_parameter: str="",
                           country_code="us", use_browser=False) -> list:
        """
        generic normal search, get a list of domains from a page
        :param keyword:
        :param page_number:  > 0
        :param result_per_page:
        :param timeout:
        :param return_domain_home_only: return root domain name if True, else return protocol suffix + domain name
        :param use_forbidden_filter:
        :param days_ago: specify how many days ago before when results were indexed.
        :return:
        """
        assert page_number > 0, "page number should be greater than 0."
        page_range = GoogleCom.get_result_per_page_range()
        assert result_per_page in page_range, "result per page should be one of those values:" + str(page_range)

        sub_domain = "www"
        request_link = GoogleUtility.get_local_endpoint(country_code, sub_domain) \
                       + GoogleConst.CommonSearchPath.format(quote(keyword), result_per_page, (page_number - 1) * result_per_page, country_code) \
                       + addtional_query_parameter+GoogleUtility.get_query_for_days(days_ago)
        try:
            user_agent = WebRequestCommonHeader.webpage_agent
            if not use_browser:
                response = GoogleCom._get_response(request_link, proxy=proxy, timeout=timeout, user_agent=user_agent)
                if not response.status_code == 200:
                    # if response.status_code == 503:
                        # print(response.text)
                    raise ConnectionRefusedError("error getting result, with status code:", response.status_code)
                result = response.text
            else:
                result = GoogleCom._get_response_browser(request_link, proxy=proxy, timeout=timeout, user_agent=user_agent)
            soup = bs4.BeautifulSoup(result)
            tags = soup.select(GoogleConst.SitePath)
            domains = []
            for tag in tags:
                try:
                    domain = tag.text.strip().replace(" ", "")
                    if return_domain_home_only:
                        domain = LinkChecker.get_root_domain(domain, use_www=False)[2]  # get the link
                    else:
                        domain = LinkChecker.get_root_domain(domain, use_www=False)[3]
                    if use_forbidden_filter and LinkChecker.is_domain_forbidden(domain):
                        continue
                    if len(domain) > 0:
                        domains.append(domain)
                except:
                    pass
            return domains

        except Exception as ex:
            print(ex)
            return None
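The scraping step in the two search helpers above (code examples #7 and #8) is a CSS-selector query over the fetched HTML. A minimal, self-contained sketch of that pattern; the markup and selector are illustrative stand-ins for the real pages and for BingConst.SitePath / GoogleConst.SitePath, and an explicit parser is passed to avoid bs4's "no parser specified" warning:

import bs4

html = '<div class="result"><a href="https://example.com/page">Example</a></div>'
soup = bs4.BeautifulSoup(html, "html.parser")
for tag in soup.select("div.result a"):        # selector is illustrative only
    print(tag.attrs.get("href", "").strip())   # https://example.com/page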
Code example #9
 def testRequestAllLink(self):
     url = "http://www.jehovahs-witness.com"
     agent = "VegeBot-Careful"
     source = LinkChecker.get_page_source(
         url,
         agent=agent,
         from_src="*****@*****.**",
         retries=0)
     links = LinkChecker.get_all_links_from_source(source)
     for link in links:
         paras = urlsplit(link)
         page_scheme, page_domain = paras[0], paras[1]
         print(
             LinkChecker.get_valid_link(page_domain, link.strip(),
                                        page_scheme))
Code example #10
    def populate_with_state(self, state):
        if state is not None and isinstance(state, SiteCheckerState):
            self._status = "Restarted"
            self.page_count = state.page_count
            self.page_allocated = state.page_count
            self.internal_page_count = state.internal_page_count
            self.internal_page_last_count = state.internal_page_count
            self.external_links_checked = state.external_page_count
            self._external_db_buffer.set_progress(state.external_page_count)
            self.page_need_look_up = state.page_need_look_up
            self.current_level = state.current_level
            self.progress_logger.set_reference(state.log_sample_index, state.log_started_time)
            counter = 0
            if self.data_source is not None:
                try:
                    for item in self.data_source.get_next():
                        if counter >= self.cache_size:
                            break
                        if isinstance(item, OnSiteLink) and not LinkChecker.is_external_link(self.root_domain, item.link):
                            self.cache_list.append(item.link)
                            # print("--restore: ", item)
                            counter += 1
                except Exception as ex:
                    msg = "error in SiteChecker.populate_with_state(), trying to populate cache, " + self.root_domain
                    ErrorLogger.log_error("SiteChecker", ex, msg)

                self.data_source.ref = state.datasource_ref
                self.data_source.output_c = state.datasource_output_c
                self.data_source.set_progress(state.datasource_index if state.datasource_index < state.page_count else state.page_count)
                self.data_source.set_continue_lock(True)
Code example #11
 def testScrapePageBatch(self):
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_links.txt"
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     domains_links = FileHandler.read_lines_from_file(file_path)
     for link in domains_links:
         # link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
         #link ="http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
         stop_event = multiprocessing.Event()
         inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(
             link)
         root_domain = LinkChecker.get_root_domain(domain)[1]
         path = "/index.html"
         link_s = LinkAttrs(link=link,
                            path=path,
                            ref_link="/",
                            shadow_ref_link="/",
                            source=path,
                            res_type=LinkUtility.EXT_WEBPAGE,
                            level=0)
         explorer = ArchiveExplorer(
             original_domain=root_domain,
             link=link,
             external_stop_event=stop_event,
             download_base_dir=FilePath.get_default_archive_dir(),
             max_thread=10,
             max_level=2)
         explorer.run()
         archive_detail = explorer.get_archive_detail()
         CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
Code example #12
 def testRobot2(self):
     rp = LinkChecker.get_robot_agent("http://pointshound.com/robots.txt")
     if rp is not None:
         for i in range(1, 1000):
             print("count:", i, "can fetch:", rp.can_fetch("*", "http://www.bbc.co.uk/fafdjiaofpadpvhagaarga/news/agqrgfv/y"))
     else:
         print("domain is not available.")
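LinkChecker.get_robot_agent is not shown here; the standard library's urllib.robotparser exposes the same can_fetch check the test exercises, so the helper presumably wraps something like the following sketch:

from urllib.robotparser import RobotFileParser

rp = RobotFileParser()
rp.set_url("http://pointshound.com/robots.txt")
rp.read()   # fetch and parse robots.txt
print(rp.can_fetch("*", "http://pointshound.com/some/path"))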
Code example #13
 def testRobot4(self):
     #rules = LinkChecker.get_robot_agent("sbnet.se")
     rules = LinkChecker.get_robot_agent("realwire.com")
     crawl_delay = rules.delay("idiot")
     print("delay is:", crawl_delay)
     for i in range(1, 1000):
         print(rules.allowed("http://api.google.com/search/", agent="idiot"))
Code example #14
    def _parse_text_res(self, page: LinkAttrs) -> str:
        page.link = page.link.replace("\\/", "/")  # in case of javascript
        response = LinkChecker.get_common_web_resource(
            page.link,
            timeout=self._timeout,
            redirect=self._max_redirect,
            retries=self._max_retries)
        result = ""
        groups = []
        parse_str_sp = functools.partial(ArchiveExplorer._map_res_str, groups,
                                         self._original_domain, page)
        if page.res_type == LinkUtility.EXT_WEBPAGE:
            text = str(LinkUtility.remove_archive_org_footprint(response.text))
        else:
            text = response.text
        result = re.sub(link_pattern, parse_str_sp, text)
        for item in groups:
            if isinstance(item, LinkAttrs):
                if not ArchiveExplorer._is_in_list(item.path, self._internal_list) and\
                        ArchiveExplorer.is_downloadable_content(item, self._max_level):
                    with self._sync_lock:
                        # print("appending:", item)
                        # print("adding to list:", item.link, "level: ", item.level)
                        if not item.shadow_ref_link == item.ref_link:
                            self._file_manager.write_to_redirect(
                                item.shadow_ref_link, item.ref_link)
                        self._internal_list.append(item)

        return result
Code example #15
 def _map_res_str(captured: [], root_domain: str, page: LinkAttrs, current_match) -> str:
     returned = None
     level = page.level
     try:
         link = current_match.group(0)
         # print("cap:", link)
         match2 = current_match.group(2)
         current_link = current_match.group(1) + match2
         begin_index = str(link).index("/")
         begin_mark = str(link[:begin_index]).strip()
         end_index = begin_index + len(current_link)
         if end_index >= len(link):
             end_mark = ""
         else:
             end_mark = str(link[end_index:]).strip()
         # if "%3" in current_link:  # transform encoded url
         inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(current_link)
         if len(inner_link) > 0:
             if root_domain in domain or link_class != LinkUtility.EXT_WEBPAGE:  # data will be saved in file system
                 if root_domain in domain:
                     is_internal = True
                 else:
                     is_internal = False
                 path_decoded = parse.unquote(path)
                 if len(path_decoded) > ArchiveExplorer.MAX_PATH_LEN:
                     short_path, ext = LinkChecker.get_shorter_url_path(path)
                     short_path += ext
                 else:
                     short_path = path
                 if link_class == LinkUtility.EXT_WEBPAGE:
                     if len(ext) > 0 and not ext == ".html":
                         valid_short_path = short_path.replace(ext, ".html")
                     else:
                         valid_short_path = short_path
                 else:
                     valid_short_path = short_path
                 file_path, ref_path = LinkUtility.make_valid_web_res_path(path, fragment)
                 short_file_path, short_ref_path = LinkUtility.make_valid_web_res_path(valid_short_path, fragment)
                 current_link = current_link.replace("\\/", "/")
                 captured.append(LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN+current_link, short_file_path,
                                           short_ref_path, ref_path,
                                           page.path, link_class, level+1, is_internal=is_internal))
                 returned = begin_mark + short_ref_path + end_mark
             else: #root_domain not in domain and ext == LinkUtility.EXT_WEBPAGE:
                 returned = begin_mark + parse.unquote(match2) + end_mark
             # else:  # capture other resources except external webpage
             #     file_path, ref_path = LinkUtility.make_valid_web_res_path(path)
             #     captured.append(LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN+current_link, file_path, ref_path, file_path, ext, level+1))
             #     returned = begin_mark + ref_path + end_mark
         else:
             returned = begin_mark + parse.unquote(current_link) + end_mark
     except Exception as ex:
         print("ex in mapping:", ex)
     finally:
         if isinstance(returned, str):
             # print("sub:", returned)
             return returned
         else:
             return ""
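_map_res_str is hooked into _parse_text_res (code example #14) through functools.partial, so re.sub calls it with the pre-bound list, root domain, and page, plus the current match as the final argument. A small self-contained demonstration of that collect-while-substituting pattern:

import functools
import re

def collect_and_rewrite(captured: list, prefix: str, match) -> str:
    # Record each matched URL and return its rewritten form for re.sub.
    url = match.group(0)
    captured.append(url)
    return prefix + url

links = []
rewriter = functools.partial(collect_and_rewrite, links, "archived:")
text = "see http://a.example and http://b.example"
print(re.sub(r"http://[\w.]+", rewriter, text))
print(links)   # ['http://a.example', 'http://b.example']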
Code example #16
def test_response(link: str) -> bool:
    status_code, content_type = LinkChecker.get_response(link)
    if status_code != 200:
        print(link, "status bad:", status_code, " content: ", content_type)
        return False
    else:
        print(link, "status good:", status_code, " content: ", content_type)
        return True
Code example #17
 def test_get_sub_domains(self):
     full_link = "http://blogspot.co.uk/"
     domain_data = LinkChecker.get_root_domain(full_link, False)
     root_domain = domain_data[1]
     sub_domain = domain_data[4]
     domain_suffix = domain_data[5]
     sub_domain_no_local = sub_domain.strip(domain_suffix)
     print(sub_domain_no_local)
Code example #18
def test_response(link: str) -> bool:
    status_code, content_type = LinkChecker.get_response(link)
    if status_code != 200:
        print(link, "status bad:", status_code, " content: ", content_type)
        return False
    else:
        print(link, "status good:", status_code, " content: ", content_type)
        return True
Code example #19
 def test_get_sub_domains(self):
     full_link = "http://blogspot.co.uk/"
     domain_data = LinkChecker.get_root_domain(full_link, False)
     root_domain = domain_data[1]
     sub_domain = domain_data[4]
     domain_suffix = domain_data[5]
     sub_domain_no_local = sub_domain.strip(domain_suffix)
     print(sub_domain_no_local)
Code example #20
 def testRobot4(self):
     #rules = LinkChecker.get_robot_agent("sbnet.se")
     rules = LinkChecker.get_robot_agent("realwire.com")
     crawl_delay = rules.delay("idiot")
     print("delay is:", crawl_delay)
     for i in range(1, 1000):
         print(rules.allowed("http://api.google.com/search/",
                             agent="idiot"))
Code example #21
    def check_external_page(checker: SiteChecker, page: OnSiteLink, timeout=10):
        """
        check DNS Error Only
        :param checker:
        :param page:
        :param timeout:
        :return:
        """
        # response = LinkChecker.get_response(page.link, timeout)
        #real_response_code = response[0]
        #real_response_code = ResponseCode.LinkOK

        #print("-------checking external " + page.link)
        try:
            root_result = LinkChecker.get_root_domain(page.link)
            root_domain = root_result[1]
            sub_domain = root_result[4]

            if len(sub_domain) == 0 or root_domain in checker.external_cache_list:
                return
            else:
                if len(checker.external_cache_list) < checker.external_cache_size:
                    checker.external_cache_list.append(root_domain)

            real_response_code = page.response_code
            if LinkChecker.is_domain_DNS_OK(sub_domain):  # check DNS first
                real_response_code = ResponseCode.NoDNSError
            elif not sub_domain.startswith("www."):
                if LinkChecker.is_domain_DNS_OK("www." + root_domain):
                    real_response_code = ResponseCode.NoDNSError
                # response = LinkChecker.get_response(page.link, timeout)  # check 404 error

            page.response_code = real_response_code
            page.link_type = OnSiteLink.TypeOutbound
            page.link = root_domain
            #print(" ready to output external:", str(page))
            if checker.output_all_external or ResponseCode.domain_might_be_expired(real_response_code):
                    # if checker.delegate is not None:
                    #     checker.delegate(new_page)
                if checker.output_queue is not None:
                    with checker._queue_lock:
                        checker.output_queue.put(page)
        except Exception as ex:
            PrintLogger.print(ex)
            ErrorLogger.log_error("PageChecker", ex, "check_external_page() " + page.link)
Code example #22
 def get_archives_lang(root_domain: str, thread_size=10, profile_check=300) -> list:
     url = LinkChecker.get_valid_link(root_domain, link="")
     profiles = ArchiveOrg.get_url_info(url, min_size=1, limit=0-profile_check)
     today_stamp = datetime.utcnow().timestamp()
     for item in profiles:
         if isinstance(item, ArchiveStruct):
             timestamp = item.get_datestamp_unix_time()
             print(str(item), " converted:", str(timestamp))
     return []
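ArchiveStruct.get_datestamp_unix_time is not shown. The archive.org links in the other examples carry 14-digit stamps such as 20140711025724, so a conversion along these lines is presumably what it performs (a sketch, not the project's code):

from datetime import datetime

stamp = "20140711025724"                      # YYYYMMDDHHMMSS, as in the archive.org URLs
dt = datetime.strptime(stamp, "%Y%m%d%H%M%S")
print(str(dt), " converted:", dt.timestamp())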
Code example #23
 def testRobot5(self):
     base_link = "http://pointshound.com"
     test_sub_paths = [
                      "/", "/why", "/about", "/privacy", "/howitworks", "/help",
                      "/press", "/terms", "/guarantee", "/contact_form", "/something-else"]
     rules = LinkChecker.get_robot_agent("pointshound.com", protocol="https")
     for item in test_sub_paths:
         path = base_link + item
         is_allowed = rules.allowed(path, agent="VegeBot Test")
         print("sub_path:", item, " is allowed:", is_allowed)
Code example #24
def check_whois(domain_data: OnSiteLink):
    root_domain = domain_data.link.lower()
    try:
        if root_domain.startswith("http"):
            root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
        is_available, is_redemption = LinkChecker.is_domain_available_whois(root_domain)  # check whois record
        if is_available or is_redemption:
            if is_available:
                real_response_code = ResponseCode.Expired
            else:
                real_response_code = ResponseCode.MightBeExpired
            domain_data.link = root_domain
            domain_data.response_code = real_response_code
            #return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
            # self._output_q.put((domain_data.link, domain_data.response_code))
    except Exception as ex:
        print(ex)
    finally:
        return domain_data.link, domain_data.response_code
Code example #25
def check_whois(domain_data: OnSiteLink):
    root_domain = domain_data.link.lower()
    try:
        if root_domain.startswith("http"):
            root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
        is_available, is_redemption = LinkChecker.is_domain_available_whois(
            root_domain)  # check whois record
        if is_available or is_redemption:
            if is_available:
                real_response_code = ResponseCode.Expired
            else:
                real_response_code = ResponseCode.MightBeExpired
            domain_data.link = root_domain
            domain_data.response_code = real_response_code
            #return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
            # self._output_q.put((domain_data.link, domain_data.response_code))
    except Exception as ex:
        print(ex)
    finally:
        return domain_data.link, domain_data.response_code
Code example #26
 def testShortUrl2(self):
     urls = ["http://gamblingaddiction.cc/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%94%e0%b9%8c%e0%b8%a1%e0%b8%b2%e0%b8%a3%e0%b9%8c%e0%b8%8a%e0%b8%82%e0%b9%88%e0%b8%b2%e0%b8%a7%e0%b8%a3%e0%b8%b4%e0%b8%9f.html",
             "/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%.html",
             "/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%",
             "/中国人民解放军/中国人民解放军/中国人民解放军.html",
             "strongholeqp4tfq;eafak;faf"]
     for url in urls:
         short_path, ext = LinkChecker.get_shorter_url_path(url)
         print("doing:", url)
         print("new path:", short_path)
         print("extension:", ext)
Code example #27
 def testRobot2(self):
     rp = LinkChecker.get_robot_agent("http://pointshound.com/robots.txt")
     if rp is not None:
         for i in range(1, 1000):
             print(
                 "count:", i, "can fetch:",
                 rp.can_fetch(
                     "*",
                     "http://www.bbc.co.uk/fafdjiaofpadpvhagaarga/news/agqrgfv/y"
                 ))
     else:
         print("domain is not available.")
Code example #28
 def testShortUrl2(self):
     urls = [
         "http://gamblingaddiction.cc/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%94%e0%b9%8c%e0%b8%a1%e0%b8%b2%e0%b8%a3%e0%b9%8c%e0%b8%8a%e0%b8%82%e0%b9%88%e0%b8%b2%e0%b8%a7%e0%b8%a3%e0%b8%b4%e0%b8%9f.html",
         "/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%.html",
         "/salendine-%e0%b8%99%e0%b8%b8%e0%b9%8a%e0%b8%81%e0%b8%a5%e0%b8%b4%e0%b8%99%e0%b8%",
         "/中国人民解放军/中国人民解放军/中国人民解放军.html", "strongholeqp4tfq;eafak;faf"
     ]
     for url in urls:
         short_path, ext = LinkChecker.get_shorter_url_path(url)
         print("doing:", url)
         print("new path:", short_path)
         print("extension:", ext)
Code example #29
    def scrape_web_res(self, page: LinkAttrs):
        print("look:", page.link, "level: ", page.level)
        try:
            if len(page.path) > ArchiveExplorer.MAX_PATH_LEN:  # max file path in any file system
                raise OSError("file path is too long:" + page.path)
            response_code, content_type = LinkChecker.get_response(page.link)
            if response_code not in [ResponseCode.LinkOK, ResponseCode.LinkFound, ResponseCode.LinkRedirect]:
                raise ConnectionError("res is not available: " + page.link)
            if page.res_type in [LinkUtility.EXT_WEBPAGE, LinkUtility.EXT_CSS, LinkUtility.EXT_JS]:  # parse a webpage
                save_text = self._parse_text_res(page)
                self._file_manager.write_to_file(page.path, save_text)
            # elif page.res_type != LinkUtility.EXT_OTHER:  # TODO: download normal resources
            #     response = LinkChecker.get_common_web_resource(page.link)
            #     if page.res_type == LinkUtility.EXT_IMAGE or page.res_type == LinkUtility.EXT_FONT:
            #         self._downloader.write_to_file(page.path, response.content, mode="b")
            #     else:
            #         self._downloader.write_to_file(page.path, response.text, mode="t")
            else:
                # response = LinkChecker.get_common_web_resource(page.link)
                # self._downloader.write_to_file(page.path, response.content, mode="b")
                self._file_manager.download_file(sub_path=page.path, url=page.link, timeout=self._timeout,
                                                 redirect=self._max_redirect, retries=self._max_retries)
        except Exception as ex:
            print("exception:", ex)
            print("broken res:", page)
            with self._sync_lock:
                self._file_manager.write_to_error_log(page.to_tuple(str(ex)))
                if page.res_type == LinkUtility.EXT_WEBPAGE:
                    self._broken_webpage_count += 1
                elif page.res_type == LinkUtility.EXT_CSS:
                    self._broken_css_count += 1
                elif page.res_type == LinkUtility.EXT_IMAGE:
                    self._broken_image_count += 1
                elif page.res_type == LinkUtility.EXT_JS:
                    self._broken_js_count += 1
                else:
                    self._broken_others_count += 1

                self._broken_res_list.append(page)
        finally:
            with self._sync_lock:
                self._total_res_done += 1
                if page.res_type == LinkUtility.EXT_WEBPAGE:
                    self._total_webpage_count += 1
                elif page.res_type == LinkUtility.EXT_CSS:
                    self._total_css_count += 1
                elif page.res_type == LinkUtility.EXT_IMAGE:
                    self._total_image_count += 1
                elif page.res_type == LinkUtility.EXT_JS:
                    self._total_js_count += 1
                else:
                    self._total_others_count += 1
Code example #30
 def _check_whois(self, domain_data: OnSiteLink):
     root_domain = domain_data.link.lower()
     try:
         if not self._is_debug:
             if root_domain.startswith("http"):
                 root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
             is_available, is_redemption = LinkChecker.is_domain_available_whois(root_domain)  # check whois record
             if is_available or is_redemption:
                 if is_available:
                     real_response_code = ResponseCode.Expired
                 else:
                     real_response_code = ResponseCode.MightBeExpired
                 domain_data.link = root_domain
                 domain_data.response_code = real_response_code
             #return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
                 self._put_output_result_in_queue(domain_data)
         else:
             self._put_output_result_in_queue(domain_data)
     except Exception as ex:
         ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex, "_check_whois() " + root_domain)
     finally:
         self._add_job_done_one()
Code example #31
 def testRemoveFootprint2(self):
     link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
     page_source = LinkChecker.get_page_source(link)
     bs4_tree = LinkUtility.remove_archive_org_footprint(page_source.text)
     link_list = []
     for child in bs4_tree.find_all():
         if isinstance(child, bs4.Tag):
             if "href" in child.attrs:
                 link_list.append(child["href"])
             elif "src" in child.attrs:
                 link_list.append(child["src"])
     for item in link_list:
         print(item)
Code example #32
 def get_archives_lang(root_domain: str,
                       thread_size=10,
                       profile_check=300) -> list:
     url = LinkChecker.get_valid_link(root_domain, link="")
     profiles = ArchiveOrg.get_url_info(url,
                                        min_size=1,
                                        limit=0 - profile_check)
     today_stamp = datetime.utcnow().timestamp()
     for item in profiles:
         if isinstance(item, ArchiveStruct):
             timestamp = item.get_datestamp_unix_time()
             print(str(item), " converted:", str(timestamp))
     return []
Code example #33
 def testRemoveFootprint2(self):
     link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
     page_source = LinkChecker.get_page_source(link)
     bs4_tree = LinkUtility.remove_archive_org_footprint(page_source.text)
     link_list = []
     for child in bs4_tree.find_all():
         if isinstance(child, bs4.Tag):
             if "href" in child.attrs:
                 link_list.append(child["href"])
             elif "src" in child.attrs:
                 link_list.append(child["src"])
     for item in link_list:
         print(item)
Code example #34
 def testRobot5(self):
     base_link = "http://pointshound.com"
     test_sub_paths = [
         "/", "/why", "/about", "/privacy", "/howitworks", "/help",
         "/press", "/terms", "/guarantee", "/contact_form",
         "/something-else"
     ]
     rules = LinkChecker.get_robot_agent("pointshound.com",
                                         protocol="https")
     for item in test_sub_paths:
         path = base_link + item
         is_allowed = rules.allowed(path, agent="VegeBot Test")
         print("sub_path:", item, " is allowed:", is_allowed)
Code example #35
 def get_link_class(link: str) -> (str, str):
     ext = LinkChecker.get_link_extension(link).lower()
     if len(ext) == 0 or ext in LinkChecker.common_html_page_ex:
         return LinkUtility.EXT_WEBPAGE, ext
     elif ext in LinkChecker.common_img_ex:
         return LinkUtility.EXT_IMAGE, ext
     elif ext in LinkChecker.common_font_ex:
         return LinkUtility.EXT_FONT, ext
     elif ext.endswith("css"):
         return LinkUtility.EXT_CSS, ext
     elif ext.endswith("js"):
         return LinkUtility.EXT_JS, ext
     else:
         return LinkUtility.EXT_OTHER, ext
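get_link_class only needs the extension of the URL path. A self-contained sketch of that extraction with the standard library; LinkChecker.get_link_extension is assumed to do something similar, though its exact behaviour is not shown:

import os
from urllib.parse import urlsplit

def link_extension(link: str) -> str:
    # Lower-cased extension of the URL's path component, e.g. ".js" or "".
    path = urlsplit(link).path
    return os.path.splitext(path)[1].lower()

print(link_extension("http://example.com/assets/app.min.js"))   # '.js'
print(link_extension("http://example.com/index"))               # ''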
Code example #36
 def get_link_class(link: str) -> (str, str):
     ext = LinkChecker.get_link_extension(link).lower()
     if len(ext) == 0 or ext in LinkChecker.common_html_page_ex:
         return LinkUtility.EXT_WEBPAGE, ext
     elif ext in LinkChecker.common_img_ex:
         return LinkUtility.EXT_IMAGE, ext
     elif ext in LinkChecker.common_font_ex:
         return LinkUtility.EXT_FONT, ext
     elif ext.endswith("css"):
         return LinkUtility.EXT_CSS, ext
     elif ext.endswith("js"):
         return LinkUtility.EXT_JS, ext
     else:
         return LinkUtility.EXT_OTHER, ext
Code example #37
 def _check_whois_v1(self, domain_data: OnSiteLink):
     root_domain = domain_data.link
     try:
         if root_domain.startswith("http"):
             root_domain = LinkChecker.get_root_domain(domain_data.link)[1]
         real_response_code = domain_data.response_code
         whois = LinkChecker.check_whois(root_domain)  # check whois record
         if whois[0]:
             if whois[2]:  # domain is expired
                 real_response_code = ResponseCode.Expired
             else:
                 real_response_code = ResponseCode.MightBeExpired
         if real_response_code == ResponseCode.Expired:
         #if ResponseCode.domain_might_be_expired(real_response_code):
             domain_data.link = root_domain
             domain_data.response_code = real_response_code
             #return_obj = OnSiteLink(root_domain, real_response_code, domain_data.link_level, OnSiteLink.TypeOutbound)
             # if isinstance(self._queue_lock, multiprocessing.RLock):
             with self._queue_lock:
                 self._output_q.put((domain_data.link, domain_data.response_code))
     except Exception as ex:
         ErrorLogger.log_error("ExternalSiteChecker.WhoisChecker", ex, "_check_whois() " + root_domain)
     finally:
         self._add_job_done_one()
Code example #38
 def testScrapePage(self):
     # link = "http://web.archive.org/web/20111102054835/http://www.agfdh.org:80/"
     link = "http://web.archive.org/web/20150425143742/http://susodigital.com/"
     #link ="http://web.archive.org/web/20130415001342/http://www.bbc.co.uk/"
     stop_event = multiprocessing.Event()
     inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(link)
     root_domain = LinkChecker.get_root_domain(domain)[1]
     path = "/index.html"
     link_s = LinkAttrs(link=link, path=path, ref_link="/", shadow_ref_link="/", source=path, res_type=LinkUtility.EXT_WEBPAGE, level=0)
     explorer = ArchiveExplorer(original_domain=root_domain, link=link,
                                external_stop_event=stop_event,
                                download_base_dir=FilePath.get_default_archive_dir(), max_thread=10, max_level=2)
     explorer.run()
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive_downloaded.csv"
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     archive_detail = explorer.get_archive_detail()
     CsvLogger.log_to_file_path(save_path, [archive_detail.to_tuple()])
Code example #39
 def download_file(self, sub_path: str, url: str, timeout=5, retries=1, redirect=5):
     full_path = self._dir_path + sub_path
     full_path = full_path.replace("//", "/")
     print("download to file:", full_path)
     bytes_count = 0
     s = LinkChecker.get_common_request_session(retries=retries, redirect=redirect)
     # NOTE the stream=True parameter
     r = s.get(url, stream=True, timeout=timeout)
     FileIO.FileHandler.create_file_if_not_exist(full_path)
     with open(full_path, 'wb') as f:
         for chunk in r.iter_content(chunk_size=1024):
             if chunk:  # filter out keep-alive new chunks
                 f.write(chunk)
                 f.flush()
             bytes_count += len(chunk)
     if bytes_count == 0:
         raise ConnectionError("URL broken: " + url)
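LinkChecker.get_common_request_session is assumed to return a requests Session configured with retry and redirect limits. A minimal sketch of building such a session and streaming a download with it; the URL and output path are placeholders:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_session(retries: int = 1, redirect: int = 5) -> requests.Session:
    # Mount an adapter that retries failed requests and caps redirects.
    session = requests.Session()
    adapter = HTTPAdapter(max_retries=Retry(total=retries, redirect=redirect))
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session

session = make_session()
response = session.get("https://example.com/", stream=True, timeout=5)
with open("/tmp/example.html", "wb") as output:
    for chunk in response.iter_content(chunk_size=1024):
        if chunk:   # skip keep-alive chunks
            output.write(chunk)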
Code example #40
def test_response(link: str) -> (bool, str):
    # link_cls, ext = LinkUtility.get_link_class(link)
    status_code, content_type = LinkChecker.get_response(link)
    link_cls, ext = LinkUtility.get_link_class(link.rstrip('/'))
    # print("checking link:", link, " link cls:", link_cls, " ext:", ext)
    # if "image" in content_type:
    #     link_cls = LinkUtility.EXT_WEBPAGE
    # elif "html" in content_type:
    #     link_cls = LinkUtility.EXT_WEBPAGE
    # elif "css" in content_type:
    #     link_cls = LinkUtility.EXT_CSS
    # elif "javascript" in content_type:
    #     link_cls = LinkUtility.EXT_JS
    # else:
    #     link_cls = LinkUtility.EXT_OTHER

    if status_code != 200:
        # print(link, "status bad:", status_code, " content: ", content_type)
        return False, link_cls
    else:
        # print(link, "status good:", status_code, " content: ", content_type)
        return True, link_cls
Code example #41
def test_response(link: str) -> (bool, str):
    # link_cls, ext = LinkUtility.get_link_class(link)
    status_code, content_type = LinkChecker.get_response(link)
    link_cls, ext = LinkUtility.get_link_class(link.rstrip('/'))
    # print("checking link:", link, " link cls:", link_cls, " ext:", ext)
    # if "image" in content_type:
    #     link_cls = LinkUtility.EXT_WEBPAGE
    # elif "html" in content_type:
    #     link_cls = LinkUtility.EXT_WEBPAGE
    # elif "css" in content_type:
    #     link_cls = LinkUtility.EXT_CSS
    # elif "javascript" in content_type:
    #     link_cls = LinkUtility.EXT_JS
    # else:
    #     link_cls = LinkUtility.EXT_OTHER

    if status_code != 200:
        # print(link, "status bad:", status_code, " content: ", content_type)
        return False, link_cls
    else:
        # print(link, "status good:", status_code, " content: ", content_type)
        return True, link_cls
Code example #42
    def _get_back_link_thread(account: MajesticCom, sub_domain: str, count_per_domain: int, fresh_data: bool,
                              sub_domains: [], temp_sub_domains: [], categories: [], callback, tf=20, bad_country_list=[]):
        temp = []
        print("doing backlinks of domain:", sub_domain, " domain len:", len(temp_sub_domains))
        try:

            temp = account.get_backlinks(sub_domain, count_per_domain, topic="", is_dev=False, fresh_data=fresh_data)
        except Exception as ex:
            print(ex)
        for item in temp:
            if isinstance(item, MajesticBacklinkDataStruct):

                # item_catagory = str(CategoryManager.decode_sub_category(item.src_topic, False))
                domain = LinkChecker.get_root_domain(item.backlink, use_www=False)[4]
                item.ref_domain = domain
                # if callback is not None:
                #     callback(item)
                # if len(target_catagories) > 0 and item_catagory not in target_catagories:
                #         continue
                if domain not in sub_domains and domain not in temp_sub_domains:
                    if len(categories) > 0:
                        is_in = False
                        if len(item.src_topic) > 0:
                            decoded = str(CategoryManager.decode_sub_category(item.src_topic, False))
                            for cate in categories:
                                if cate in decoded:
                                    is_in = True
                                    break
                            if is_in and item.src_tf >= tf:
                                temp_sub_domains.append(domain)
                    elif item.src_tf >= tf:
                        temp_sub_domains.append(domain)
                    item.ref_domain = domain
                    if callback is not None:
                        callback(item)
        time.sleep(1)
Code example #43
 def download_file(self,
                   sub_path: str,
                   url: str,
                   timeout=5,
                   retries=1,
                   redirect=5):
     full_path = self._dir_path + sub_path
     full_path = full_path.replace("//", "/")
     print("download to file:", full_path)
     bytes_count = 0
     s = LinkChecker.get_common_request_session(retries=retries,
                                                redirect=redirect)
     # NOTE the stream=True parameter
     r = s.get(url, stream=True, timeout=timeout)
     FileIO.FileHandler.create_file_if_not_exist(full_path)
     with open(full_path, 'wb') as f:
         for chunk in r.iter_content(chunk_size=1024):
             if chunk:  # filter out keep-alive new chunks
                 f.write(chunk)
                 f.flush()
             bytes_count += len(chunk)
     if bytes_count == 0:
         raise ConnectionError("URL broken: " + url)
Code example #44
    def _parse_text_res(self, page: LinkAttrs) -> str:
        page.link = page.link.replace("\\/", "/")  # in case of javascript
        response = LinkChecker.get_common_web_resource(page.link, timeout=self._timeout,
                                                       redirect=self._max_redirect, retries=self._max_retries)
        result = ""
        groups = []
        parse_str_sp = functools.partial(ArchiveExplorer._map_res_str, groups, self._original_domain, page)
        if page.res_type == LinkUtility.EXT_WEBPAGE:
            text = str(LinkUtility.remove_archive_org_footprint(response.text))
        else:
            text = response.text
        result = re.sub(link_pattern, parse_str_sp, text)
        for item in groups:
            if isinstance(item, LinkAttrs):
                if not ArchiveExplorer._is_in_list(item.path, self._internal_list) and\
                        ArchiveExplorer.is_downloadable_content(item, self._max_level):
                    with self._sync_lock:
                        # print("appending:", item)
                        # print("adding to list:", item.link, "level: ", item.level)
                        if not item.shadow_ref_link == item.ref_link:
                            self._file_manager.write_to_redirect(item.shadow_ref_link, item.ref_link)
                        self._internal_list.append(item)

        return result
Code example #45
 def test_from_url(self):
     response = LinkChecker.get_page_source(link="http://www.frenchweb.fr/sisense-decroche-50-millions-de-dollars-pour-accelerer-dans-lanalyse-de-donnees/221848")
     print(langid.classify(response.text))
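langid.classify returns a (language code, confidence score) pair, which is what the test prints. A tiny standalone call on a fixed string:

import langid

lang, score = langid.classify("Ceci est un court texte en français.")
print(lang, score)   # expected: 'fr' plus a confidence score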
Code example #46
    def get_sites_by_seed_sites_muti_threads(account: MajesticCom, seed_domains: [], catagories: [], fresh_data=False, index=0,
                                iteration=1, loop_count=0, count_per_domain=100, callback=None, current_count=0,
                                max_count=-1, tf=20, thread_pool_size=20, get_backlinks=True, bad_country_list=[]):
        """

        :param account:
        :param seed_domains:
        :param catagories:
        :param fresh_data:
        :param index:
        :param iteration:
        :param loop_count:
        :param count_per_domain:
        :param callback:
        :param current_count:
        :param max_count:
        :param tf:
        :param thread_pool_size:
        :param get_backlinks: it will get backlinks of domains if True, else it will get ref domains instead, which is cheaper.
        :return:
        """
        target_func = GoogleMajestic._get_back_link_thread if get_backlinks else GoogleMajestic._get_ref_domain_thread
        if iteration < 0:
            raise ValueError("get_sites_by_seed_sites: iteration should >= 0.")
        sub_domains = [LinkChecker.get_root_domain(x, use_www=False)[4] for x in seed_domains[index:]]
        if len(sub_domains) == 0:
            print("sub_domains is len 0.")
            return
        # counter = index
        process_len = len(sub_domains)
        if max_count > 0:
            if current_count >= max_count:
                print("exceeded seed len.")
                return
            elif current_count + process_len > max_count:
                process_len = max_count - current_count
        #target_catagories = []
        # for catagory in catagories:
        #     target_catagories.append(str(CategoryManager.decode_sub_category(catagory, False)))
        temp_sub_domains = []

        thread_pool = ThreadPool(processes=thread_pool_size)
        processes = [thread_pool.apply_async(target_func,
                                             args=(account, x, count_per_domain, fresh_data, sub_domains,
                                                   temp_sub_domains, catagories,  callback, tf, bad_country_list))
                     for x in sub_domains[0: process_len]]
        results = [y.get() for y in processes]
        thread_pool.terminate()
        current_count += process_len
        if loop_count >= iteration:
            return
        else:
            new_seeds = sub_domains + temp_sub_domains
            print("going to next level with seeds:", len(new_seeds))
            return GoogleMajestic.get_sites_by_seed_sites_muti_threads(account, new_seeds,
                                                                       catagories, fresh_data, len(seed_domains),
                                                                       iteration, loop_count+1, count_per_domain,
                                                                       callback, current_count, max_count,
                                                                       thread_pool_size=10, tf=tf,
                                                                       get_backlinks=get_backlinks,
                                                                       bad_country_list=bad_country_list)
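The helper above fans work out with multiprocessing.pool.ThreadPool, queueing one apply_async call per seed domain and collecting results with .get() (get_best_archive in code example #48 uses the same pattern). A stripped-down, self-contained version, where fetch is a stand-in for the real per-domain worker:

from multiprocessing.pool import ThreadPool

def fetch(domain: str) -> str:
    # Stand-in for a worker such as _get_back_link_thread.
    return domain.upper()

domains = ["example.com", "example.org", "example.net"]
pool = ThreadPool(processes=3)
async_results = [pool.apply_async(fetch, args=(d,)) for d in domains]
results = [r.get() for r in async_results]   # .get() blocks until each task finishes
pool.terminate()
print(results)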
Code example #47
 def test_from_url(self):
     response = LinkChecker.get_page_source(
         link=
         "http://www.frenchweb.fr/sisense-decroche-50-millions-de-dollars-pour-accelerer-dans-lanalyse-de-donnees/221848"
     )
     print(langid.classify(response.text))
Code example #48
    def get_best_archive(root_domain: str, thread_size=100, profile_check=10, pass_threshold=0.8, res_limit=2000) -> ArchiveDetail:
        """
        get the best profile from archive.org by doing profile spectrum analysis, given a root domain name.
        spectrum analysis: comparison between resources of current profile to all historic resources.
        :param root_domain: root domain in str, e.g: "google.co.uk"
        :param thread_size: number of thread to check resource link simultaneously
        :param profile_check: max number of profile to check
        :param pass_threshold: threshold that defines whether a profile is good enough.
        :param res_limit: number of resource links in the domain resource spectrum, including css, js, html etc.
        :return: an ArchiveDetail summarising the best archive profile and its per-type good-resource rates.
        """
        url = LinkChecker.get_valid_link(root_domain, link="")
        profiles = ArchiveOrg.get_url_info(url, min_size=1, limit=-profile_check)
        timestamp = ""
        info = ArchiveOrg.get_domain_urls(url, limit=res_limit)
        res_count = len(info)
        archive = None
        current_rate = 0.0
        min_broken_res_count = 0
        good_rate_web_page = 0
        good_rate_image = 0
        good_rate_css = 0
        good_rate_js = 0
        good_rate_other = 0

        total_web_page_min = 0
        total_js_min = 0
        total_css_min = 0
        total_image_min = 0
        total_other_min = 0
        if res_count > 0:
            for profile in profiles:
                if isinstance(profile, ArchiveStruct):
                    total_web_page = 0
                    total_js = 0
                    total_css = 0
                    total_image = 0
                    total_other = 0

                    broken_web_page = 0
                    broken_js = 0
                    broken_css = 0
                    broken_image = 0
                    broken_other = 0

                    test_pool = pool.ThreadPool(processes=thread_size)
                    timestamp = profile.date_stamp
                    print("checking:", str(profile))
                    links = []
                    for item in info:
                        item.date_stamp = timestamp
                        links.append(ArchiveOrg.get_archive_link(item))
                    results = [test_pool.apply_async(func=test_response, args=(x,)) for x in links]
                    returned = [y.get() for y in results]
                    test_pool.terminate()
                    for result_good, link_cls in returned:
                        if link_cls == LinkUtility.EXT_WEBPAGE:
                            total_web_page += 1
                            if not result_good:
                                broken_web_page += 1
                        elif link_cls == LinkUtility.EXT_CSS:
                            total_css += 1
                            if not result_good:
                                broken_css += 1
                        elif link_cls == LinkUtility.EXT_JS:
                            total_js += 1
                            if not result_good:
                                broken_js += 1
                        elif link_cls == LinkUtility.EXT_IMAGE:
                            total_image += 1
                            if not result_good:
                                broken_image += 1
                        else:
                            total_other += 1
                            if not result_good:
                                broken_other += 1
                    broken_res_count = broken_web_page + broken_js + broken_css + broken_image + broken_other
                    passed = False
                    total_broken_rate = 1 - broken_res_count / res_count  # fraction of resources still reachable (higher is better)
                    if total_broken_rate >= pass_threshold:
                        passed = True
                    if total_broken_rate > current_rate:
                        current_rate = total_broken_rate
                        archive = profile
                        good_rate_web_page = 0 if total_web_page == 0 else 1 - broken_web_page/total_web_page
                        good_rate_image = 0 if total_image == 0 else 1 - broken_image/total_image
                        good_rate_css = 0 if total_css == 0 else 1 - broken_css/total_css
                        good_rate_js = 0 if total_js == 0 else 1 - broken_js/total_js
                        good_rate_other = 0 if total_other == 0 else 1 - broken_other/total_other

                        total_web_page_min = total_web_page
                        total_js_min = total_js
                        total_css_min = total_css
                        total_image_min = total_image
                        total_other_min = total_other
                        min_broken_res_count = total_broken_rate  # best good-resource rate seen so far; reported below as good_res_rate
                    print("total:", res_count, " broken res:", broken_res_count, " stamp: ",
                          profile.date_stamp, " pass? ", passed, " rate:", total_broken_rate)
        return ArchiveDetail(root_domain, archive_link=ArchiveOrg.get_archive_link(archive), total_res=res_count, good_res_rate=min_broken_res_count,
                             total_web_page=total_web_page_min, good_webpage_rate=good_rate_web_page,
                             total_css=total_css_min, good_css_rate=good_rate_css,
                             total_js=total_js_min,  good_js_rate=good_rate_js,
                             total_image=total_image_min, good_image_rate=good_rate_image,
                             total_other=total_other_min, good_other_rate=good_rate_other)
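
A hedged usage sketch for get_best_archive. The import paths are assumptions (the hosting module is not shown here), and the ArchiveDetail attribute names are inferred from the keyword arguments passed to its constructor above:

# assumed imports; adjust to the project's real package layout
from ArchiveOrg import ArchiveDetail          # assumption
from ArchiveExplorer import get_best_archive  # assumption

detail = get_best_archive("example.co.uk",
                          thread_size=50,    # smaller pool for a quick check
                          profile_check=5,
                          pass_threshold=0.8,
                          res_limit=500)
if isinstance(detail, ArchiveDetail):
    print("best snapshot:", detail.archive_link)        # attribute names assumed from the constructor call
    print("good resource rate:", detail.good_res_rate)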
Code example #49
0
    def __init__(self, full_link: str="", data_source: SiteTempDataSrcInterface=None,
                 controller: SiteCheckerController=None,
                 max_level=10, max_page=1000, delegate=None, output_buff_size=2000,
                 output_queue=None, output_all_external=False, result_delegate=None,
                 memory_control_terminate_event=None, check_robot_text=True,
                 **kwargs):
        """
        :param full_link: the full link of a domain, e.g. https://www.google.co.uk
        :param data_source: temporary data source of OnSiteLink items; a disk-backed source is created when None
        :param max_level: stop crawling once this link level is reached
        :param max_page: maximum number of pages to check within a site; crawling also stops at this limit
        :param delegate: if not None, it receives the latest external-domain results whose ResponseCode is 404 or 999
        :param result_delegate: receives site_info when the crawl finishes
        :param memory_control_terminate_event: if not None and set, allows an external memory-controlled process to terminate this checker
        :return:
        """
        FeedbackInterface.__init__(self, **kwargs)
        #super(SiteChecker, self).__init__(**kwargs)
        if full_link is None or len(full_link) == 0:
            raise ValueError()

        original_path = ""
        try:
            paras = urlsplit(full_link)
            self.scheme, self.domain, original_path = paras[0], paras[1], paras[2]
        except:
            pass

        domain_data = LinkChecker.get_root_domain(full_link, False)
        self.root_domain = domain_data[1]
        self.sub_domain = domain_data[4]
        self.domain_suffix = domain_data[5]
        self.sub_domain_no_local = self.sub_domain.strip(self.domain_suffix)
        if self.scheme == "":
            self.scheme = "http"
        if self.domain == "":
            self.domain = self.root_domain
        self.orginal_link = full_link
        self.domain_link = LinkChecker.get_valid_link(self.root_domain, full_link, self.scheme)
        self.max_level = max_level
        self.max_page = max_page
        self.page_count = 0  # keep track page done
        self._page_count_shadow = 0 # track previous count
        self._all_page_count_shadow = 0 #track previous count in datasource
        self.internal_page_count = 0
        self.internal_page_last_count = 0
        self.page_allocated = 0
        self.current_level = 0  # if this = 0, it is root domain/home_page
        self._stop_event = Event()
        valid_file_name = SiteTempDataSrcInterface.get_valid_file_name(self.domain_link)
        self._external_db_buffer = ExternalTempDataDiskBuffer(valid_file_name+".ext.db", self,
                                                              stop_event=self._stop_event,
                                                              buf_size=int(output_buff_size/2),
                                                              dir_path=get_db_buffer_default_dir(),
                                                              convert_output=False)
        self._external_db_buffer.append_to_buffer([(self.root_domain, ResponseCode.DNSError),], convert_tuple=False)
        self._memory_control_terminate_event = memory_control_terminate_event
        self.task_control_lock = threading.RLock()
        if data_source is None:
            #self.data_source = SiteTempDataDisk(self.root_domain, ref_obj=self)
            self.data_source = SiteTempDataDiskWithBuff(ref=self.domain_link, output_buff_size=output_buff_size, ref_obj=self)
        else:
            self.data_source = data_source  # a list of OnSiteLink
        self.delegate = delegate
        if LinkChecker.might_be_link_html_page(original_path):
            self.data_source.append(OnSiteLink(self.domain_link, response_code=ResponseCode.LinkOK, link_level=1)) # add the root domain as a starting point
        self.data_source.append(OnSiteLink(self.scheme + "://www."+self.sub_domain, ResponseCode.LinkOK, link_level=1))
        self.data_source.append(OnSiteLink(self.scheme + "://" + self.domain, ResponseCode.LinkOK, link_level=1))
        self.cache_list = []  # internal page cache
        self.page_need_look_up_temp = 0
        self.cache_list.append(self.domain_link)
        if "www." not in self.sub_domain:
            self.cache_list.append(self.scheme + "://www."+self.sub_domain)
        self.cache_list.append(self.scheme + "://" + self.domain)
        self.page_need_look_up = self.data_source.count_all()
        self.cache_size = 500  # small cache to avoid re-checking links on the file system (lots of reads and writes)
        self._double_check_cache_lock = threading.RLock()
        self._double_check_cache = deque(maxlen=self.cache_size)
        self.external_cache_list = []
        self.external_cache_size = 500  # cache that hold external sites
        self.external_links_checked = 0
        self.add_internal_page_OK_only = True
        self.output_queue = output_queue
        self.output_all_external = output_all_external
        self.controller = controller
        self.result_delegate = result_delegate
        self.page_count_lock = threading.RLock()
        self.internal_page_count_lock = threading.RLock()
        self.level_lock = threading.RLock()
        self.page_look_up_lock = threading.RLock()
        self.external_link_check_lock = threading.RLock()
        self._finihsed = False
        self.task_control_max = 1
        self.agent = "VegeBot (we follow your robots.txt settings before crawling, you can slow down the bot by change the Crawl-Delay parameter in the settings." \
                     "if you have an enquiry, please email to: [email protected])"
        self.agent_from = "*****@*****.**"
        if check_robot_text:
            self.robot_agent = LinkChecker.get_robot_agent(self.sub_domain, protocol=self.scheme)
        else:
            self.robot_agent = None
        self.site_crawl_delay = 0.60

        if isinstance(self.robot_agent, Rules):
            delay_temp = self.robot_agent.delay(self.agent)
            if delay_temp is not None and delay_temp != self.site_crawl_delay:
                self.site_crawl_delay = delay_temp

        self.task_control_counter = 1
        self._speed_penalty_count = 0
        self._speed_penalty_threshold = 10
        self._progress_logging_speed = 120
        self._output_period = 120
        self._output_batch_size = 100
        self._death_wish_sent = False
        SiteChecker._is_lxml_parser_exist()
        self._output_thread = None
        self._output_queue = None
        self.progress_logger = ProgressLogger(self._progress_logging_speed, self, self._stop_event)
        self._status = "Start"
        self._populate_with_state()  # restore last known state
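
A minimal instantiation sketch for the constructor above. The SiteChecker import path is an assumption, and the method that actually starts crawling is outside this snippet, so it is only hinted at in a comment:

import queue

from SiteChecker import SiteChecker  # assumed module path

output_q = queue.Queue()
checker = SiteChecker(full_link="https://www.example.co.uk",
                      max_level=3,    # stop crawling beyond this link depth
                      max_page=200,   # and after at most this many pages
                      output_queue=output_q,
                      output_all_external=True,
                      check_robot_text=True)
# checker.crawl()  # hypothetical entry point; the run method is not part of this snippet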
Code example #50
0
 def _map_res_str(captured: list, root_domain: str, page: LinkAttrs,
                  current_match) -> str:
     returned = None
     level = page.level
     try:
         link = current_match.group(0)
         # print("cap:", link)
         match2 = current_match.group(2)
         current_link = current_match.group(1) + match2
         begin_index = str(link).index("/")
         begin_mark = str(link[:begin_index]).strip()
         end_index = begin_index + len(current_link)
         if end_index >= len(link):
             end_mark = ""
         else:
             end_mark = str(link[end_index:]).strip()
         # if "%3" in current_link:  # transform encoded url
         inner_link, domain, path, link_class, ext, fragment = LinkUtility.get_link_detail(
             current_link)
         if len(inner_link) > 0:
             if root_domain in domain or link_class != LinkUtility.EXT_WEBPAGE:  # data will be saved in file system
                 if root_domain in domain:
                     is_internal = True
                 else:
                     is_internal = False
                 path_decoded = parse.unquote(path)
                 if len(path_decoded) > ArchiveExplorer.MAX_PATH_LEN:
                     short_path, ext = LinkChecker.get_shorter_url_path(
                         path)
                     short_path += ext
                 else:
                     short_path = path
                 if link_class == LinkUtility.EXT_WEBPAGE:
                     if len(ext) > 0 and not ext == ".html":
                         valid_short_path = short_path.replace(ext, ".html")
                     else:
                         valid_short_path = short_path
                 else:
                     valid_short_path = short_path
                 file_path, ref_path = LinkUtility.make_valid_web_res_path(
                     path, fragment)
                 short_file_path, short_ref_path = LinkUtility.make_valid_web_res_path(
                     valid_short_path, fragment)
                 current_link = current_link.replace("\\/", "/")
                 captured.append(
                     LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN +
                               current_link,
                               short_file_path,
                               short_ref_path,
                               ref_path,
                               page.path,
                               link_class,
                               level + 1,
                               is_internal=is_internal))
                 returned = begin_mark + short_ref_path + end_mark
             else:  #root_domain not in domain and ext == LinkUtility.EXT_WEBPAGE:
                 returned = begin_mark + parse.unquote(match2) + end_mark
             # else:  # capture other resources except external webpage
             #     file_path, ref_path = LinkUtility.make_valid_web_res_path(path)
             #     captured.append(LinkAttrs(ArchiveExplorer.ARCHIVE_DOMAIN+current_link, file_path, ref_path, file_path, ext, level+1))
             #     returned = begin_mark + ref_path + end_mark
         else:
             returned = begin_mark + parse.unquote(current_link) + end_mark
     except Exception as ex:
         print("ex in mapping:", ex)
     finally:
         if isinstance(returned, str):
             # print("sub:", returned)
             return returned
         else:
             return ""
Code example #51
0
    def scrape_web_res(self, page: LinkAttrs):
        print("look:", page.link, "level: ", page.level)
        try:
            if len(
                    page.path
            ) > ArchiveExplorer.MAX_PATH_LEN:  # max file path in any file system
                raise OSError("file path is too long:" + page.path)
            response_code, content_type = LinkChecker.get_response(page.link)
            if response_code not in [
                    ResponseCode.LinkOK, ResponseCode.LinkFound,
                    ResponseCode.LinkRedirect
            ]:
                raise ConnectionError("res is not available: " + page.link)
            if page.res_type in [
                    LinkUtility.EXT_WEBPAGE, LinkUtility.EXT_CSS,
                    LinkUtility.EXT_JS
            ]:  # parse a webpage
                save_text = self._parse_text_res(page)
                self._file_manager.write_to_file(page.path, save_text)
            # elif page.res_type != LinkUtility.EXT_OTHER:  # TODO: download normal resources
            #     response = LinkChecker.get_common_web_resource(page.link)
            #     if page.res_type == LinkUtility.EXT_IMAGE or page.res_type == LinkUtility.EXT_FONT:
            #         self._downloader.write_to_file(page.path, response.content, mode="b")
            #     else:
            #         self._downloader.write_to_file(page.path, response.text, mode="t")
            else:
                # response = LinkChecker.get_common_web_resource(page.link)
                # self._downloader.write_to_file(page.path, response.content, mode="b")
                self._file_manager.download_file(sub_path=page.path,
                                                 url=page.link,
                                                 timeout=self._timeout,
                                                 redirect=self._max_redirect,
                                                 retries=self._max_retries)
        except Exception as ex:
            print("exception:", ex)
            print("broken res:", page)
            with self._sync_lock:
                self._file_manager.write_to_error_log(page.to_tuple(str(ex)))
                if page.res_type == LinkUtility.EXT_WEBPAGE:
                    self._broken_webpage_count += 1
                elif page.res_type == LinkUtility.EXT_CSS:
                    self._broken_css_count += 1
                elif page.res_type == LinkUtility.EXT_IMAGE:
                    self._broken_image_count += 1
                elif page.res_type == LinkUtility.EXT_JS:
                    self._broken_js_count += 1
                else:
                    self._broken_others_count += 1

                self._broken_res_list.append(page)
        finally:
            with self._sync_lock:
                self._total_res_done += 1
                if page.res_type == LinkUtility.EXT_WEBPAGE:
                    self._total_webpage_count += 1
                elif page.res_type == LinkUtility.EXT_CSS:
                    self._total_css_count += 1
                elif page.res_type == LinkUtility.EXT_IMAGE:
                    self._total_image_count += 1
                elif page.res_type == LinkUtility.EXT_JS:
                    self._total_js_count += 1
                else:
                    self._total_others_count += 1
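
scrape_web_res handles one LinkAttrs at a time, so a caller presumably fans it out over a worker pool. A hedged sketch of that dispatch, assuming an already-constructed explorer object and a list of pending LinkAttrs from the surrounding (unshown) code:

from multiprocessing import pool

def scrape_all(explorer, pending_pages: list, workers: int = 20) -> None:
    """Run explorer.scrape_web_res over every queued LinkAttrs using a thread pool."""
    workers_pool = pool.ThreadPool(processes=workers)
    try:
        results = [workers_pool.apply_async(explorer.scrape_web_res, args=(page,))
                   for page in pending_pages]
        for result in results:
            result.get()  # surface any unexpected worker error here
    finally:
        workers_pool.terminate()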
Code example #52
0
 def test_get_all_links(self):
     link = "http://web.archive.org/web/20140711025724/http://susodigital.com/"
     source = LinkChecker.get_page_source(link)
     all_links = LinkChecker.get_all_links_from_source(source)
     for link in all_links:
         print(link)
Code example #53
0
 def testRequest(self):
     url = "http://127.0.0.1:8000/"
     agent = "VegeBot"
     source = LinkChecker.get_page_source(url, agent=agent, from_src="*****@*****.**")
     print(source)
Code example #54
0
 def testGetAgent(self):
     root_domain = "halifaxnational.com"
     agent = LinkChecker.get_robot_agent(root_domain)
     can_fetch = agent.can_fetch("*", "http://halifaxnational.com/somethin")
     print(agent,"can fetch:", can_fetch)
Code example #56
0
    def get_search_results(keyword: str,
                           page_number: int,
                           proxy: ProxyStruct = None,
                           result_per_page: int = GoogleConst.Result100,
                           timeout=5,
                           return_domain_home_only=True,
                           use_forbidden_filter=True,
                           days_ago=0,
                           addtional_query_parameter: str = "",
                           country_code="us",
                           use_browser=False) -> list:
        """
        generic normal search; get a list of domains from a result page.
        :param keyword: search keyword
        :param page_number: result page number, must be > 0
        :param proxy: optional proxy used for the request
        :param result_per_page: number of results per page; must be one of the supported values
        :param timeout: request timeout in seconds
        :param return_domain_home_only: return the root domain name if True, else the domain name with its scheme
        :param use_forbidden_filter: skip domains flagged as forbidden if True
        :param days_ago: only include results indexed within this many days
        :param addtional_query_parameter: extra query string appended to the search request
        :param country_code: country code used to pick the local search endpoint
        :param use_browser: fetch results with a browser-based client instead of a plain HTTP request
        :return: list of domains, or None on error
        """
        assert page_number > 0, "page number should be greater than 0."
        page_range = GoogleCom.get_result_per_page_range()
        assert result_per_page in page_range, "result per page should be one of those values:" + str(
            page_range)

        sub_domain = "www"
        request_link = GoogleUtility.get_local_endpoint(country_code, sub_domain) \
                       + GoogleConst.CommonSearchPath.format(quote(keyword), result_per_page, (page_number - 1) * result_per_page, country_code) \
                       + addtional_query_parameter+GoogleUtility.get_query_for_days(days_ago)
        try:
            user_agent = WebRequestCommonHeader.webpage_agent
            if not use_browser:
                response = GoogleCom._get_response(request_link,
                                                   proxy=proxy,
                                                   timeout=timeout,
                                                   user_agent=user_agent)
                if not response.status_code == 200:
                    # if response.status_code == 503:
                    # print(response.text)
                    raise ConnectionRefusedError(
                        "error getting result, with status code:",
                        response.status_code)
                result = response.text
            else:
                result = GoogleCom._get_response_browser(request_link,
                                                         proxy=proxy,
                                                         timeout=timeout,
                                                         user_agent=user_agent)
            soup = bs4.BeautifulSoup(result)
            tags = soup.select(GoogleConst.SitePath)
            domains = []
            for tag in tags:
                try:
                    domain = tag.text.strip().replace(" ", "")
                    if return_domain_home_only:
                        domain = LinkChecker.get_root_domain(
                            domain, use_www=False)[2]  # get the link
                    else:
                        domain = LinkChecker.get_root_domain(domain,
                                                             use_www=False)[3]
                    if use_forbidden_filter and LinkChecker.is_domain_forbidden(
                            domain):
                        continue
                    if len(domain) > 0:
                        domains.append(domain)
                except:
                    pass
            return domains

        except Exception as ex:
            print(ex)
            return None
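
A hedged usage sketch for get_search_results. The GoogleCom import path is an assumption; the GoogleConst.Result100 constant and the keyword arguments are taken from the snippet above:

from GoogleCom import GoogleCom, GoogleConst  # assumed module path

domains = GoogleCom.get_search_results(keyword="expired domains",
                                       page_number=1,
                                       result_per_page=GoogleConst.Result100,
                                       timeout=10,
                                       return_domain_home_only=True,
                                       days_ago=30,
                                       country_code="us")
if domains is None:
    print("search failed")
else:
    for domain in domains:
        print(domain)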