def testGettingLinks(self):
     """Print the archive.org link for every capture of susodigital.com."""
     records = ArchiveOrg.get_url_info("http://susodigital.com", min_size=1, limit=-100)
     for record in records:
         print(ArchiveOrg.get_archive_link(record))
 def testGetBestProfile(self):
     """Find and print the best archive profile (and its pass rate) for susodigital.com."""
     best, pass_rate = ArchiveOrg.get_best_archive(
         root_domain="susodigital.com", thread_size=100,
         profile_check=10, pass_threshold=0.7, res_limit=2000)
     profile_link = ArchiveOrg.get_archive_link(best)
     print("best profile:", profile_link, "rate:", pass_rate)
# Example #3
 def process_data(self, data: FilteredDomainData, **kwargs):
     """Count archive.org captures for *data*'s domain and forward the item downstream.

     Sets ``data.archive`` to the number of archived URLs found. On success the
     item is pushed to the output queue (and appended to the CSV log unless
     running in throughput-debug mode). Input that is not a FilteredDomainData
     is ignored and returns None.
     """
     result_ok = False
     if isinstance(data, FilteredDomainData):
         try:
             # Fall back to the bare domain when no domain variant was supplied.
             if len(data.domain_var) == 0:
                 data.domain_var = data.domain
             links = ArchiveOrg.get_url_info(data.domain_var, min_size=self._min_page_size, limit=-100)
             count = len(links)
             data.archive = count
             # NOTE(review): the minimum-profile check is currently a no-op;
             # the rejection was deliberately disabled (see commented raise).
             if count < self._min_profile:
                 pass
                 # raise ValueError("profile count is less than:" + str(self._min_profile))
             result_ok = True
         except Exception as ex:
             # Best-effort: lookup failures are swallowed; error logging was
             # deliberately disabled (see commented call below).
             if not self._is_throughput_debug:
                 pass
                 # ErrorLogger.log_error("ArchiveOrgFilter.process_data()", ex, data.domain_var)
         finally:
             # Progress bookkeeping and hand-off happen under the shared lock
             # regardless of success or failure.
             with self._sync_lock:
                 self._job_done += 1
                     #with self._process_queue_lock:
                 if result_ok:
                     if not self._is_throughput_debug:
                         CsvLogger.log_to_file(self._log_file, [(data.domain, data.da, data.archive)]) # log this to file
                     self._output_queue.put(data)
                     # return data
                 else:
                     # In throughput-debug mode even failed items are forwarded.
                     if self._is_throughput_debug:
                         self._output_queue.put(data)
                     # return None
     else:
         return None
    def testGettingLinksVariation(self):
        """Fetch up to 2000 archived resources of bbc.co.uk, pinned to the
        newest capture's timestamp, and report how many respond as broken.
        """
        url = "http://bbc.co.uk"
        # Use the most recent capture's timestamp for every resource link.
        latest = ArchiveOrg.get_url_info(url, min_size=1, limit=-1)[0]
        timestamp = ""
        if isinstance(latest, ArchiveStruct):
            timestamp = latest.date_stamp

        info = ArchiveOrg.get_domain_urls(url, limit=2000)
        res_count = len(info)
        links = []
        for item in info:
            item.date_stamp = timestamp
            links.append(ArchiveOrg.get_archive_link(item))
        # FIX: the original leaked the ThreadPool (never closed/joined).
        # The context manager tears the pool down once all results are in.
        with pool.ThreadPool(processes=100) as test_pool:
            results = [test_pool.apply_async(func=test_response, args=(x,)) for x in links]
            returned = [y.get() for y in results]
        broken_res_count = 0
        for result in returned:
            if result == False:
                broken_res_count += 1
        print("total:", res_count, " broken res:", broken_res_count)
 def testGetBestProfileBatch(self):
     """Compute the best archive profile for each domain in the source file,
     appending each result row to a CSV.
     """
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
     domains = FileHandler.read_lines_from_file(file_path)
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
     # Header row first, then one row per successfully profiled domain.
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     for domain in domains:
         print("begin domain:", domain)
         try:
             best = ArchiveOrg.get_best_archive(root_domain=domain, thread_size=100,
                                                profile_check=10, pass_threshold=0.9,
                                                res_limit=2000)
             CsvLogger.log_to_file_path(save_path, [best.to_tuple()])
         except Exception as ex:
             # Best-effort batch: report the failure and continue.
             print(ex)
    def testGettingLinksVariation(self):
        """Request up to 2000 archived resources of bbc.co.uk, stamped with the
        latest capture date, and count how many come back broken.
        """
        worker_pool = pool.ThreadPool(processes=100)
        url = "http://bbc.co.uk"
        latest = ArchiveOrg.get_url_info(url, min_size=1, limit=-1)[0]
        timestamp = latest.date_stamp if isinstance(latest, ArchiveStruct) else ""

        resources = ArchiveOrg.get_domain_urls(url, limit=2000)
        res_count = len(resources)
        archive_links = []
        for resource in resources:
            resource.date_stamp = timestamp
            archive_links.append(ArchiveOrg.get_archive_link(resource))
        pending = [worker_pool.apply_async(func=test_response, args=(link,)) for link in archive_links]
        outcomes = [task.get() for task in pending]
        broken_res_count = 0
        for outcome in outcomes:
            if outcome == False:
                broken_res_count += 1
        print("total:", res_count, " broken res:", broken_res_count)
 def testGetBestProfileBatch(self):
     """For every domain in the source list, find its best archive profile
     and log the result tuple to a CSV file.
     """
     file_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_test_source.txt"
     domain_list = FileHandler.read_lines_from_file(file_path)
     save_path = "/Users/superCat/Desktop/PycharmProjectPortable/test/profile_archive.csv"
     CsvLogger.log_to_file_path(save_path, [ArchiveDetail.get_title()])
     for current_domain in domain_list:
         print("begin domain:", current_domain)
         try:
             detail = ArchiveOrg.get_best_archive(
                 root_domain=current_domain, thread_size=100, profile_check=10,
                 pass_threshold=0.9, res_limit=2000)
             CsvLogger.log_to_file_path(save_path, [detail.to_tuple()])
         except Exception as ex:
             # Keep going on per-domain failures.
             print(ex)
 def testArchiveTimeStamps(self):
     """Print the languages detected across archives of susodigital.com."""
     print(ArchiveOrg.get_archives_lang("susodigital.com"))
 def testGetBestProfile(self):
     """Print the link and pass rate of the best archive profile for susodigital.com."""
     best_archive, best_rate = ArchiveOrg.get_best_archive(
         root_domain="susodigital.com",
         thread_size=100,
         profile_check=10,
         pass_threshold=0.7,
         res_limit=2000,
     )
     print("best profile:", ArchiveOrg.get_archive_link(best_archive), "rate:", best_rate)
 def testGettingLinks(self):
     """Dump the archive link of each capture of susodigital.com."""
     for entry in ArchiveOrg.get_url_info("http://susodigital.com", min_size=1, limit=-100):
         print(ArchiveOrg.get_archive_link(entry))
 def testArchiveTimeStamps(self):
     """Show which languages the archives of susodigital.com use."""
     target = "susodigital.com"
     result = ArchiveOrg.get_archives_lang(target)
     print(result)