def begin_crawl(self, level=0):
    """
    Crawl the site level by level.

    :return: True when crawling stops, either because the next level has no
        links left or because the page/level limit was reached; results
        collected so far remain in the data source.
    """
    print(self.domain, " get next level: ", str(level + 1))
    # fetch the unbroken on-site links discovered for the next level
    pages = self.data_source.get_onsite_links(
        level + 1, ResponseCode.LinkNotBroken
    )  # + self.get_onsite_links(level+1, ResponseCode.LinkRedirect)
    if pages is None or len(pages) == 0:  # no links left: end of crawling
        print(self.domain, " next level is empty, crawling stopped at page:",
              str(self.page_count), " level:", str(level))
        return True
    else:
        for page in pages:
            if self.page_count >= self.max_page or level >= self.max_level:
                print(self.domain, " limit reached, crawling stopped at page:",
                      str(self.page_count), " level:", str(level))
                return True
            try:
                PageChecker.crawl_page(self, page)
                self.page_count += 1
            except Exception as e:
                print(self.domain + " " + page.link + " " + str(e))
        return self.begin_crawl(level + 1)  # crawl the next level recursively
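# Sketch (assumption, not part of the original source): the per-level recursion in
# begin_crawl() above can also be written as an iterative loop, which avoids growing
# the call stack when max_level is large. Names mirror the method above and are
# assumed to behave the same way; this is illustrative only.
def begin_crawl_iterative(self, level=0):
    while True:
        pages = self.data_source.get_onsite_links(level + 1, ResponseCode.LinkNotBroken)
        if not pages:
            return True  # no links at the next level: crawl finished
        for page in pages:
            if self.page_count >= self.max_page or level >= self.max_level:
                return True  # page or depth limit reached
            try:
                PageChecker.crawl_page(self, page)
                self.page_count += 1
            except Exception as e:
                print(self.domain, page.link, e)
        level += 1  # move on to the next level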
def testPageCrawl2(self):
    link = "http://stackoverflow.com/"
    checker = SiteThreadChecker(full_link=link, thread_pool_size=2, max_page=1000, max_level=10)
    checker.agent = "VegeBot-Careful"
    page = OnSiteLink(link=link, response_code=999)
    next_page = OnSiteLink(link="http://stackoverflow.com/questions/5836674/why-does-debug-false-setting-make-my-django-static-files-access-fail", response_code=999)
    PageChecker.check_internal_page(checker, page)  # check the front page first; return value not used
    internal, external = PageChecker.check_internal_page(checker, next_page)
    print("external links:")
    for item in external:
        print(item)
    print("internal links:")
    for item in internal:
        print(item)
def testPageCrawl(self):
    link = "http://www.secondcityhockey.com"
    checker = SiteThreadChecker(full_link=link, thread_pool_size=2, max_page=1000, max_level=10)
    checker.agent = "VegeBot-Careful"
    page = OnSiteLink(link=link, response_code=999)
    next_page = OnSiteLink(link="http://www.secondcityhockey.com/2014/10/9/6951991/state-of-the-blog-lets-party-and-be-nice-and-hip-and-cool/in/6645018", response_code=999)
    # next_page = OnSiteLink(link="http://www.secondcityhockey.com/2014/", response_code=999)

    # PageChecker.check_internal_page(checker, page)
    internal, external = PageChecker.check_internal_page(checker, next_page)
    print("external links:")
    for item in external:
        print(item)
    print("internal links:")
    for item in internal:
        print(item)
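# Sketch (assumption, not in the original): the test methods above look like methods of a
# unittest.TestCase subclass (e.g. a hypothetical "PageCheckerTest(unittest.TestCase)").
# If so, the module can be run with the standard unittest runner:
if __name__ == "__main__":
    import unittest
    unittest.main()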