Example #1
def process_data(self, data: FilteredDomainData, **kwargs):
    result_ok = False
    if isinstance(data, FilteredDomainData):
        try:
            # fall back to the bare domain when no variation is set
            if len(data.domain_var) == 0:
                data.domain_var = data.domain
            links = ArchiveOrg.get_url_info(data.domain_var, min_size=self._min_page_size, limit=-100)
            count = len(links)
            data.archive = count
            if count < self._min_profile:
                pass
                # raise ValueError("profile count is less than: " + str(self._min_profile))
            result_ok = True
        except Exception as ex:
            if not self._is_throughput_debug:
                pass
                # ErrorLogger.log_error("ArchiveOrgFilter.process_data()", ex, data.domain_var)
        finally:
            with self._sync_lock:
                self._job_done += 1
                # with self._process_queue_lock:
                if result_ok:
                    if not self._is_throughput_debug:
                        # log the accepted domain to file
                        CsvLogger.log_to_file(self._log_file, [(data.domain, data.da, data.archive)])
                    self._output_queue.put(data)
                    # return data
                else:
                    if self._is_throughput_debug:
                        self._output_queue.put(data)
                    # return None
    else:
        return None
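# For reference, process_data() touches only four attributes of FilteredDomainData.
# A minimal stand-in documenting that shape could look like the stub below, a
# hypothetical sketch rather than the project's actual class (and note that the
# isinstance check above means this stub cannot be passed to process_data() itself):
from dataclasses import dataclass

@dataclass
class FilteredDomainDataStub:
    domain: str = ""       # bare domain, used as a fallback for domain_var
    domain_var: str = ""   # domain variation queried against archive.org
    da: int = 0            # domain authority, logged to the CSV file
    archive: int = 0       # snapshot count filled in by process_data()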
def testGettingLinks(self):
    # negative limit follows the CDX API convention: take the last (most recent) 100 captures
    info = ArchiveOrg.get_url_info("http://susodigital.com", min_size=1, limit=-100)
    for item in info:
        link = ArchiveOrg.get_archive_link(item)
        print(link)
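# ArchiveOrg above is a project wrapper; the public Wayback Machine endpoints it
# presumably sits on look roughly like the sketch below. The function names here
# are hypothetical and follow the documented CDX API, not the project's code.
import requests

def get_url_info_sketch(url, limit=-100):
    """Query the Wayback CDX API; a negative limit returns the last N captures."""
    resp = requests.get(
        "http://web.archive.org/cdx/search/cdx",
        params={"url": url, "output": "json", "limit": limit},
        timeout=30,
    )
    rows = resp.json()  # first row is the header, the rest are captures
    if not rows:
        return []
    header, captures = rows[0], rows[1:]
    return [dict(zip(header, row)) for row in captures]

def get_archive_link_sketch(capture):
    """Archive links follow web.archive.org/web/<timestamp>/<original>."""
    return "http://web.archive.org/web/{timestamp}/{original}".format(**capture)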
def testGettingLinksVariation(self):
    # assumes: from multiprocessing import pool
    test_pool = pool.ThreadPool(processes=100)
    url = "http://bbc.co.uk"
    # grab the timestamp of the most recent capture of the domain
    latest = ArchiveOrg.get_url_info(url, min_size=1, limit=-1)[0]
    timestamp = ""
    if isinstance(latest, ArchiveStruct):
        timestamp = latest.date_stamp

    # pin every archived resource to that timestamp and build its archive link
    info = ArchiveOrg.get_domain_urls(url, limit=2000)
    res_count = len(info)
    broken_res_count = 0
    links = []
    for item in info:
        item.date_stamp = timestamp
        links.append(ArchiveOrg.get_archive_link(item))
    # probe all the links concurrently and count the broken ones
    results = [test_pool.apply_async(func=test_response, args=(x,)) for x in links]
    returned = [y.get() for y in results]
    for result in returned:
        if not result:
            broken_res_count += 1
    print("total:", res_count, " broken res:", broken_res_count)