def testThreadChecker(self):
    """Run a full crawl of munichre.com alongside a queue-server process and an
    output process, then force-stop everything after a fixed time budget."""
    stop_event = Event()
    target_link = "munichre.com"
    site_checker = SiteThreadChecker(full_link=target_link, thread_pool_size=3,
                                     max_page=3000, max_level=10)

    def crawl():
        site_checker.crawling()

    # Supporting processes: shared queue server plus the result printer.
    queue_proc = Process(target=run_queue_server)
    queue_proc.start()
    output_proc = Process(target=single_output, args=(stop_event,))
    output_proc.start()
    # link = "http://sweetlifebake.com/#axzz3t4Nx7b7N"
    crawl_thread = Thread(target=crawl)
    crawl_thread.start()
    # Let the crawl run for a fixed wall-clock budget before tearing down.
    timeout = 1000
    for _ in range(timeout):
        time.sleep(1)
    print("is going to sudden death.")
    stop_event.set()
    site_checker.sudden_death()
    if crawl_thread.is_alive():
        crawl_thread.join()
    output_proc.terminate()
    queue_proc.terminate()
    print("finished")
def testThreadChecker(self):
    """Crawl munichre.com with helper processes (queue server, output sink)
    running in parallel; after a fixed delay, signal stop and terminate all."""
    halt_signal = Event()
    site = "munichre.com"
    thread_checker = SiteThreadChecker(full_link=site, thread_pool_size=3,
                                       max_page=3000, max_level=10)

    def do_crawl():
        thread_checker.crawling()

    queue_server_proc = Process(target=run_queue_server)
    queue_server_proc.start()
    printer_proc = Process(target=single_output, args=(halt_signal,))
    printer_proc.start()
    # link = "http://sweetlifebake.com/#axzz3t4Nx7b7N"
    crawler = Thread(target=do_crawl)
    crawler.start()
    # Fixed 1000-second budget, waited out one second at a time.
    elapsed, budget = 0, 1000
    while elapsed < budget:
        time.sleep(1)
        elapsed += 1
    print("is going to sudden death.")
    halt_signal.set()
    thread_checker.sudden_death()
    if crawler.is_alive():
        crawler.join()
    printer_proc.terminate()
    queue_server_proc.terminate()
    print("finished")
def testPageCrawl2(self):
    """Check one internal stackoverflow.com question page and print the
    internal and external links found on it."""
    root = "http://stackoverflow.com/"
    site_checker = SiteThreadChecker(full_link=root, thread_pool_size=2,
                                     max_page=1000, max_level=10)
    site_checker.agent = "VegeBot-Careful"
    root_page = OnSiteLink(link=root, response_code=999)
    question_page = OnSiteLink(
        link="http://stackoverflow.com/questions/5836674/why-does-debug-false-setting-make-my-django-static-files-access-fail",
        response_code=999)
    # First pass warms the checker on the site root; its result is discarded.
    PageChecker.check_internal_page(site_checker, root_page)
    internal, external = PageChecker.check_internal_page(site_checker, question_page)
    print("external links:")
    for found in external:
        print(found)
    print("internal links:")
    for found in internal:
        print(found)
def testPageCrawl(self):
    """Check one internal secondcityhockey.com article page and print the
    internal and external links found on it."""
    root = "http://www.secondcityhockey.com"
    site_checker = SiteThreadChecker(full_link=root, thread_pool_size=2,
                                     max_page=1000, max_level=10)
    site_checker.agent = "VegeBot-Careful"
    root_page = OnSiteLink(link=root, response_code=999)
    article_page = OnSiteLink(
        link="http://www.secondcityhockey.com/2014/10/9/6951991/state-of-the-blog-lets-party-and-be-nice-and-hip-and-cool/in/6645018",
        response_code=999)
    # next_page = OnSiteLink(link="http://www.secondcityhockey.com/2014/", response_code=999)
    # PageChecker.check_internal_page(checker, page)
    internal, external = PageChecker.check_internal_page(site_checker, article_page)
    print("external links:")
    for found in external:
        print(found)
    print("internal links:")
    for found in internal:
        print(found)
def testPageCrawl2(self):
    """Crawl a single stackoverflow.com question page, then dump the external
    and internal links it contains."""
    site_root = "http://stackoverflow.com/"
    checker_obj = SiteThreadChecker(full_link=site_root, thread_pool_size=2,
                                    max_page=1000, max_level=10)
    checker_obj.agent = "VegeBot-Careful"
    landing = OnSiteLink(link=site_root, response_code=999)
    target = OnSiteLink(
        link="http://stackoverflow.com/questions/5836674/why-does-debug-false-setting-make-my-django-static-files-access-fail",
        response_code=999)
    # Root page is checked first; only the target page's results are printed.
    PageChecker.check_internal_page(checker_obj, landing)
    inner_links, outer_links = PageChecker.check_internal_page(checker_obj, target)
    print("external links:")
    for entry in outer_links:
        print(entry)
    print("internal links:")
    for entry in inner_links:
        print(entry)
def testPageCrawl(self):
    """Crawl a single secondcityhockey.com article page, then dump the
    external and internal links it contains."""
    site_root = "http://www.secondcityhockey.com"
    checker_obj = SiteThreadChecker(full_link=site_root, thread_pool_size=2,
                                    max_page=1000, max_level=10)
    checker_obj.agent = "VegeBot-Careful"
    landing = OnSiteLink(link=site_root, response_code=999)
    target = OnSiteLink(
        link="http://www.secondcityhockey.com/2014/10/9/6951991/state-of-the-blog-lets-party-and-be-nice-and-hip-and-cool/in/6645018",
        response_code=999)
    # next_page = OnSiteLink(link="http://www.secondcityhockey.com/2014/", response_code=999)
    # PageChecker.check_internal_page(checker, page)
    inner_links, outer_links = PageChecker.check_internal_page(checker_obj, target)
    print("external links:")
    for entry in outer_links:
        print(entry)
    print("internal links:")
    for entry in inner_links:
        print(entry)
def run(self):
    """Main loop of the site-check process manager.

    Starts the whois queue process, the whois-checking and trash-cleaning
    threads, and the output thread, then feeds sites from self.input_iter
    into a process pool via imap. Polls self.can_continue() until told to
    stop, then tears everything down in the finally block.
    NOTE(review): output_thread and trash_clean_thread are never joined
    here — presumably they exit via self.stop_event; confirm.
    """
    # self.set_system_limit()
    self._create_all_file_dirs()
    self.whois_queue_process.start()
    whois_thread = Thread(target=self.checking_whois)
    trash_clean_thread = Thread(target=self.clear_trash)
    # Connect to the shared whois-output queue; `manager` keeps the
    # queue-client connection alive for the lifetime of this frame.
    manager, self.outputQueue = get_queue_client(
        QueueManager.MachineSettingCrawler,
        QueueManager.Method_Whois_Output)
    # self.output_thread = outputThread(0, self.threadPrfix+"Output", self.stop_event, self.outputQueue,
    # delegate=self.output_delegate, failsure_reset_queue=self.queue_failure_reset)
    self.output_thread = outputThread(
        threadID=0,
        name=self.threadPrfix + "Output",
        stop_event=self.stop_event,
        inputQ=self.outputQueue,
        delegate=self.output_delegate,
        failsure_reset_queue=self.queue_failure_reset)
    self.output_thread.start()
    trash_clean_thread.start()
    whois_thread.start()
    # self.whois_queue_process.start()
    # Template kwargs for each site check; full_link is filled in per-site
    # by self.input_iter.
    self.input_iter.func_kwarg = SiteThreadChecker.get_input_parameter(
        full_link="",  # this parameter will be updated in self.input_iter
        max_page=self.max_page_per_site,
        max_level=self.page_max_level,
        output_queue=self._whoisQueue,
        pool_size=self.concurrent_page)
    self.input_iter.callback = self.process_feedback
    self.input_iter.Memlimit = self.memory_limit_per_process
    try:
        #print("monitor process started: pid: ", os.getpid())
        # chunksize=1: hand one site to a worker at a time.
        self.pool.imap(site_check_process_iter, self.input_iter, 1)
        #self.pool.imap_unordered(site_check_process_iter, self.input_iter)
        # Poll until an external condition (stop request / work exhausted)
        # tells us to shut down.
        while self.can_continue():
            time.sleep(0.5)
    except Exception as ex:
        msg = "run(), with database: " + self.name
        ErrorLogger.log_error("SiteCheckProcessManager", ex, msg)
    finally:
        # Teardown order: kill workers, wait for whois thread, then kill
        # the whois queue process and drop accumulated state.
        print("terminate miner!")
        self.pool.terminate()
        whois_thread.join()
        self.whois_queue_process.terminate()
        self.temp_results.clear()
        self.site_info.clear()
        self.finished = True
def run(self):
    """Drive the site-check pipeline: start helper process/threads, stream
    sites from self.input_iter through the worker pool, and poll
    self.can_continue() until shutdown, cleaning up in the finally block.
    NOTE(review): trash_clean_thread and output_thread are not joined on
    exit — presumably stopped via self.stop_event; verify.
    """
    # self.set_system_limit()
    self._create_all_file_dirs()
    self.whois_queue_process.start()
    whois_thread = Thread(target=self.checking_whois)
    trash_clean_thread = Thread(target=self.clear_trash)
    # `manager` holds the queue-client connection; outputQueue receives
    # whois results for the output thread to consume.
    manager, self.outputQueue = get_queue_client(QueueManager.MachineSettingCrawler, QueueManager.Method_Whois_Output)
    # self.output_thread = outputThread(0, self.threadPrfix+"Output", self.stop_event, self.outputQueue,
    # delegate=self.output_delegate, failsure_reset_queue=self.queue_failure_reset)
    self.output_thread = outputThread(threadID=0, name=self.threadPrfix+"Output", stop_event=self.stop_event,
                                      inputQ=self.outputQueue, delegate=self.output_delegate,
                                      failsure_reset_queue=self.queue_failure_reset)
    self.output_thread.start()
    trash_clean_thread.start()
    whois_thread.start()
    # self.whois_queue_process.start()
    # Per-site keyword template; full_link is overwritten for each site
    # inside self.input_iter.
    self.input_iter.func_kwarg = SiteThreadChecker.get_input_parameter(full_link="",  # this parameter will be updated in self.input_iter
                                                                       max_page=self.max_page_per_site,
                                                                       max_level=self.page_max_level,
                                                                       output_queue=self._whoisQueue,
                                                                       pool_size=self.concurrent_page)
    self.input_iter.callback = self.process_feedback
    self.input_iter.Memlimit = self.memory_limit_per_process
    try:
        #print("monitor process started: pid: ", os.getpid())
        # chunksize=1 so each worker takes exactly one site at a time.
        self.pool.imap(site_check_process_iter, self.input_iter, 1)
        #self.pool.imap_unordered(site_check_process_iter, self.input_iter)
        while self.can_continue():
            time.sleep(0.5)
    except Exception as ex:
        msg = "run(), with database: " + self.name
        ErrorLogger.log_error("SiteCheckProcessManager", ex, msg)
    finally:
        # Shut down workers first, then the whois thread/process, then
        # release accumulated per-run state.
        print("terminate miner!")
        self.pool.terminate()
        whois_thread.join()
        self.whois_queue_process.terminate()
        self.temp_results.clear()
        self.site_info.clear()
        self.finished = True
def site_check_process(*args, **kwargs):
    """Build a SiteThreadChecker from the given arguments and run its crawl.

    Thin wrapper suitable as a process target; all arguments are forwarded
    unchanged to the SiteThreadChecker constructor.
    """
    SiteThreadChecker(*args, **kwargs).crawling()