def process_data(self, data: FilteredDomainData, **kwargs):
    result_ok = False
    if isinstance(data, FilteredDomainData):
        try:
            # Fall back to the bare domain when no variation was supplied.
            if len(data.domain_var) == 0:
                data.domain_var = data.domain
            links = ArchiveOrg.get_url_info(data.domain_var, min_size=self._min_page_size, limit=-100)
            count = len(links)
            data.archive = count
            if count < self._min_profile:
                pass  # raise ValueError("profile count is less than:" + str(self._min_profile))
            result_ok = True
        except Exception as ex:
            if not self._is_throughput_debug:
                pass  # ErrorLogger.log_error("ArchiveOrgFilter.process_data()", ex, data.domain_var)
        finally:
            with self._sync_lock:
                self._job_done += 1
            # with self._process_queue_lock:
            if result_ok:
                if not self._is_throughput_debug:
                    CsvLogger.log_to_file(self._log_file, [(data.domain, data.da, data.archive)])  # log this to file
                self._output_queue.put(data)
                # return data
            else:
                if self._is_throughput_debug:
                    self._output_queue.put(data)
                    # return None
                else:
                    return None
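# For context, process_data reads like one stage of a queue-based pipeline:
# items arrive on an input queue and accepted domains are re-enqueued on
# self._output_queue. A minimal sketch of how such a stage might be driven,
# assuming a plain queue.Queue and a sentinel shutdown value; run_filter_stage
# and its parameters are illustrative, not part of the codebase shown.
import queue

def run_filter_stage(stage, input_queue: queue.Queue, sentinel=None):
    # Hypothetical driver loop: pull items until the sentinel arrives and
    # hand each one to process_data, which enqueues results itself.
    while True:
        item = input_queue.get()
        if item is sentinel:
            break
        stage.process_data(item)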
def testGettingLinks(self):
    info = ArchiveOrg.get_url_info("http://susodigital.com", min_size=1, limit=-100)
    for item in info:
        link = ArchiveOrg.get_archive_link(item)
        print(link)
def testGettingLinksVariation(self):
    # 'pool' here is multiprocessing.pool; a sketch of test_response follows this test.
    test_pool = pool.ThreadPool(processes=100)
    url = "http://bbc.co.uk"
    # Take the timestamp of the most recent snapshot of the domain root.
    latest = ArchiveOrg.get_url_info(url, min_size=1, limit=-1)[0]
    timestamp = ""
    if isinstance(latest, ArchiveStruct):
        timestamp = latest.date_stamp
    info = ArchiveOrg.get_domain_urls(url, limit=2000)
    res_count = len(info)
    broken_res_count = 0
    links = []
    # Pin every resource to the latest snapshot and build its archive link.
    for item in info:
        item.date_stamp = timestamp
        links.append(ArchiveOrg.get_archive_link(item))
    results = [test_pool.apply_async(func=test_response, args=(x,)) for x in links]
    returned = [y.get() for y in results]
    for result in returned:
        if not result:
            broken_res_count += 1
    print("total:", res_count, " broken res:", broken_res_count)
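# test_response is called above but not defined in this section. A minimal
# sketch of what it might look like, assuming it should return True when the
# archived link answers with HTTP 200; the use of requests, the HEAD method,
# and the 30-second timeout are assumptions, not the project's actual helper.
import requests

def test_response(link):
    # Hypothetical checker: True if the archive link is reachable and returns
    # HTTP 200, False otherwise. Network errors are swallowed so the thread
    # pool's workers never raise.
    try:
        response = requests.head(link, allow_redirects=True, timeout=30)
        return response.status_code == 200
    except requests.RequestException:
        return False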