def begin_crawl(self, level=0):
    """
    Crawl every stored link of the level after ``level``, then recurse one level deeper.

    :param level: depth already handled; links are requested for ``level + 1``
    :return: True in every path visible here -- when no links are left, when the
        page/level limit is reached, or as the result of the recursive call
    """
    next_level = level + 1
    print(self.domain, " get next level : ", str(next_level))
    # Only links marked LinkNotBroken are requested; LinkRedirect links are
    # not added to the batch.
    pages = self.data_source.get_onsite_links(next_level, ResponseCode.LinkNotBroken)

    # Nothing stored for the next level: crawling is over.
    if pages is None or len(pages) == 0:
        print(self.domain, " next level is None crawling stop at page: ", str(self.page_count),
              " level: ", str(level))
        return True

    for page in pages:
        # The limits are re-checked before each page because page_count grows
        # while this loop runs.
        if self.page_count >= self.max_page or level >= self.max_level:
            print(self.domain, " reach limit crawling stop at page: " + str(self.page_count),
                  " level: ", str(level))
            return True
        try:
            PageChecker.crawl_page(self, page)
            self.page_count += 1
        except Exception as e:
            # Best effort: a failing page is reported and skipped (not counted).
            print(self.domain + " " + page.link + " " + str(e))

    # Every page of this batch was attempted; continue with the next level.
    return self.begin_crawl(next_level)
def testPageCrawl2(self):
    """
    Run PageChecker.check_internal_page on a Stack Overflow question page and print the links.

    The site root is checked first (its result is discarded), then the
    question page; the two collections returned for it are printed, the
    external one first.
    """
    root_url = "http://stackoverflow.com/"
    question_url = "http://stackoverflow.com/questions/5836674/why-does-debug-false-setting-make-my-django-static-files-access-fail"

    checker = SiteThreadChecker(full_link=root_url, thread_pool_size=2, max_page=1000, max_level=10)
    checker.agent = "VegeBot-Careful"

    root_page = OnSiteLink(link=root_url, response_code=999)
    question_page = OnSiteLink(link=question_url, response_code=999)

    PageChecker.check_internal_page(checker, root_page)
    internal, external = PageChecker.check_internal_page(checker, question_page)

    # Same output order as before: heading, then one link per line.
    for heading, links in (("external links:", external), ("internal links:", internal)):
        print(heading)
        for entry in links:
            print(entry)
def begin_crawl(self, level=0):
    """
    Process the links recorded for depth ``level + 1`` and then descend recursively.

    :param level: the depth that has already been crawled
    :return: True -- all exits visible in this method (no more links, limit
        reached, or the recursive call's own result) yield True
    """
    deeper = level + 1
    print(self.domain, " get next level : ", str(deeper))
    # Redirecting links (ResponseCode.LinkRedirect) are not part of the query.
    batch = self.data_source.get_onsite_links(deeper, ResponseCode.LinkNotBroken)

    nothing_left = batch is None or len(batch) == 0
    if nothing_left:
        # end of crawling
        print(self.domain, " next level is None crawling stop at page: ", str(self.page_count),
              " level: ", str(level))
        return True

    for page in batch:
        limit_hit = self.page_count >= self.max_page or level >= self.max_level
        if limit_hit:
            print(self.domain, " reach limit crawling stop at page: " + str(self.page_count),
                  " level: ", str(level))
            return True
        try:
            PageChecker.crawl_page(self, page)
            self.page_count += 1
        except Exception as e:
            # A page that raises is only logged; the loop moves on to the next one.
            print(self.domain + " " + page.link + " " + str(e))

    # crawl next level, recursive
    return self.begin_crawl(deeper)
def testPageCrawl2(self):
    """
    Check two stackoverflow.com pages with PageChecker and print the second page's links.

    The front page is checked and its result ignored; the question page is
    checked next and its (internal, external) result is printed, external
    links first.
    """
    site = "http://stackoverflow.com/"
    checker = SiteThreadChecker(full_link=site, thread_pool_size=2, max_page=1000, max_level=10)
    checker.agent = "VegeBot-Careful"

    front_page = OnSiteLink(link=site, response_code=999)
    detail_page = OnSiteLink(
        link="http://stackoverflow.com/questions/5836674/why-does-debug-false-setting-make-my-django-static-files-access-fail",
        response_code=999,
    )

    PageChecker.check_internal_page(checker, front_page)
    found = PageChecker.check_internal_page(checker, detail_page)
    internal_links, external_links = found

    print("external links:")
    for link_out in external_links:
        print(link_out)

    print("internal links:")
    for link_in in internal_links:
        print(link_in)
def testPageCrawl(self):
    """
    Run PageChecker.check_internal_page on one secondcityhockey.com article and print the links.

    Only the article page is checked; the link object for the site root is
    still constructed but is not passed to the checker.
    """
    root_url = "http://www.secondcityhockey.com"
    article_url = "http://www.secondcityhockey.com/2014/10/9/6951991/state-of-the-blog-lets-party-and-be-nice-and-hip-and-cool/in/6645018"

    checker = SiteThreadChecker(full_link=root_url, thread_pool_size=2, max_page=1000, max_level=10)
    checker.agent = "VegeBot-Careful"

    root_page = OnSiteLink(link=root_url, response_code=999)  # built but currently unused
    article_page = OnSiteLink(link=article_url, response_code=999)
    # Alternative target kept for manual experiments:
    # article_page = OnSiteLink(link="http://www.secondcityhockey.com/2014/", response_code=999)
    # Root-page check is disabled:
    # PageChecker.check_internal_page(checker, root_page)

    internal, external = PageChecker.check_internal_page(checker, article_page)

    # Heading followed by one link per line, external group first.
    for heading, links in (("external links:", external), ("internal links:", internal)):
        print(heading)
        for entry in links:
            print(entry)
def testPageCrawl(self):
    """
    Check a single article page of www.secondcityhockey.com and print what the checker returns.

    The result of PageChecker.check_internal_page is unpacked as
    (internal, external) and printed with the external links first.
    """
    site = "http://www.secondcityhockey.com"
    checker = SiteThreadChecker(full_link=site, thread_pool_size=2, max_page=1000, max_level=10)
    checker.agent = "VegeBot-Careful"

    # The home-page link object is created but not checked in this test.
    home_page = OnSiteLink(link=site, response_code=999)
    target_page = OnSiteLink(
        link="http://www.secondcityhockey.com/2014/10/9/6951991/state-of-the-blog-lets-party-and-be-nice-and-hip-and-cool/in/6645018",
        response_code=999,
    )
    # target_page = OnSiteLink(link="http://www.secondcityhockey.com/2014/", response_code=999)
    # PageChecker.check_internal_page(checker, home_page)

    result = PageChecker.check_internal_page(checker, target_page)
    internal_links, external_links = result

    print("external links:")
    for link_out in external_links:
        print(link_out)

    print("internal links:")
    for link_in in internal_links:
        print(link_in)