# Imports these snippets rely on; Event must be the multiprocessing variant,
# since it is shared with child processes. Project classes (SiteThreadChecker,
# PageChecker, OnSiteLink, ...) come from the surrounding package.
import time
from threading import Thread
from multiprocessing import Process, Event


def testThreadChecker(self):
    stop_event = Event()
    link = "munichre.com"
    # link = "http://sweetlifebake.com/#axzz3t4Nx7b7N"
    checker = SiteThreadChecker(full_link=link,
                                thread_pool_size=3,
                                max_page=3000,
                                max_level=10)

    def crawl():
        checker.crawling()

    # Run the queue server and the result printer in separate processes.
    queue_server_t = Process(target=run_queue_server)
    queue_server_t.start()
    output_t = Process(target=single_output, args=(stop_event,))
    output_t.start()
    crawl_t = Thread(target=crawl)
    crawl_t.start()
    # Let the crawl run for `timeout` seconds before forcing a shutdown.
    timeout = 1000
    counter = 0
    while counter < timeout:
        time.sleep(1)
        counter += 1
    print("is going to sudden death.")
    stop_event.set()
    checker.sudden_death()  # abort all crawl threads
    if crawl_t.is_alive():
        crawl_t.join()
    output_t.terminate()
    queue_server_t.terminate()

    print("finished")
def testPageCrawl2(self):
    link = "http://stackoverflow.com/"
    checker = SiteThreadChecker(full_link=link,
                                thread_pool_size=2,
                                max_page=1000,
                                max_level=10)
    checker.agent = "VegeBot-Careful"
    # 999 appears to be a sentinel for "response code not yet known".
    page = OnSiteLink(link=link, response_code=999)
    next_page = OnSiteLink(
        link="http://stackoverflow.com/questions/5836674/why-does-debug-false-setting-make-my-django-static-files-access-fail",
        response_code=999)
    PageChecker.check_internal_page(checker, page)
    internal, external = PageChecker.check_internal_page(checker, next_page)
    print("external links:")
    for item in external:
        print(item)
    print("internal links:")
    for item in internal:
        print(item)
def testPageCrawl(self):
    link = "http://www.secondcityhockey.com"
    checker = SiteThreadChecker(full_link=link,
                                thread_pool_size=2,
                                max_page=1000,
                                max_level=10)
    checker.agent = "VegeBot-Careful"
    page = OnSiteLink(link=link, response_code=999)
    next_page = OnSiteLink(
        link="http://www.secondcityhockey.com/2014/10/9/6951991/state-of-the-blog-lets-party-and-be-nice-and-hip-and-cool/in/6645018",
        response_code=999)
    # next_page = OnSiteLink(link="http://www.secondcityhockey.com/2014/", response_code=999)

    # PageChecker.check_internal_page(checker, page)
    internal, external = PageChecker.check_internal_page(checker, next_page)
    print("external links:")
    for item in external:
        print(item)
    print("internal links:")
    for item in internal:
        print(item)
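
Both tests suggest the contract of PageChecker.check_internal_page: given a checker and an OnSiteLink, it fetches the page and returns (internal, external) link lists. A sketch of how those lists could seed a breadth-first crawl, assuming OnSiteLink exposes a .link attribute (crawl_site itself is illustrative, not project code):

from collections import deque

def crawl_site(checker, seed_page, max_pages=50):
    # Breadth-first walk over internal links, collecting external links.
    seen = {seed_page.link}
    frontier = deque([seed_page])
    externals = []
    while frontier and len(seen) <= max_pages:
        page = frontier.popleft()
        internal, external = PageChecker.check_internal_page(checker, page)
        externals.extend(external)
        for item in internal:
            if item.link not in seen:
                seen.add(item.link)
                frontier.append(item)
    return externals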
def run(self):
    # self.set_system_limit()
    self._create_all_file_dirs()
    self.whois_queue_process.start()
    whois_thread = Thread(target=self.checking_whois)
    trash_clean_thread = Thread(target=self.clear_trash)
    manager, self.outputQueue = get_queue_client(QueueManager.MachineSettingCrawler,
                                                 QueueManager.Method_Whois_Output)
    self.output_thread = outputThread(threadID=0,
                                      name=self.threadPrfix + "Output",
                                      stop_event=self.stop_event,
                                      inputQ=self.outputQueue,
                                      delegate=self.output_delegate,
                                      failsure_reset_queue=self.queue_failure_reset)
    self.output_thread.start()
    trash_clean_thread.start()
    whois_thread.start()
    # full_link is a placeholder here; self.input_iter fills it in per site.
    self.input_iter.func_kwarg = SiteThreadChecker.get_input_parameter(
        full_link="",
        max_page=self.max_page_per_site,
        max_level=self.page_max_level,
        output_queue=self._whoisQueue,
        pool_size=self.concurrent_page)
    self.input_iter.callback = self.process_feedback
    self.input_iter.Memlimit = self.memory_limit_per_process
    try:
        # Dispatch one site per worker (chunksize 1); the result iterator is
        # discarded, so this call only schedules the work in the background.
        self.pool.imap(site_check_process_iter, self.input_iter, 1)
        # self.pool.imap_unordered(site_check_process_iter, self.input_iter)
        while self.can_continue():
            time.sleep(0.5)
    except Exception as ex:
        msg = "run(), with database: " + self.name
        ErrorLogger.log_error("SiteCheckProcessManager", ex, msg)
    finally:
        print("terminate miner!")
        self.pool.terminate()
        whois_thread.join()
        self.whois_queue_process.terminate()
        self.temp_results.clear()
        self.site_info.clear()
        self.finished = True
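
run() maps site_check_process_iter over self.input_iter, but that function is not shown here. Given site_check_process below, a plausible sketch, assuming each item yielded by the iterator is the kwargs dict built by SiteThreadChecker.get_input_parameter:

def site_check_process_iter(kwargs):
    # imap passes one item from input_iter per call; each item is assumed
    # to carry the keyword arguments for one site, with full_link filled in.
    site_checker = SiteThreadChecker(**kwargs)
    site_checker.crawling()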
def site_check_process(*args, **kwargs):
    # Entry point for running one full site crawl inside a worker process.
    site_checker = SiteThreadChecker(*args, **kwargs)
    site_checker.crawling()
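
Since site_check_process only builds a checker and then blocks in crawling(), it fits naturally into a worker process. A minimal usage sketch, reusing the constructor arguments from the tests above (the domain is illustrative):

from multiprocessing import Process

worker = Process(target=site_check_process,
                 kwargs={"full_link": "example.com",  # illustrative domain
                         "thread_pool_size": 2,
                         "max_page": 1000,
                         "max_level": 10})
worker.start()
worker.join()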