def test_backend(backend):
    """Crawl SITE_LIST with the given *backend* and print the crawl sequence.

    :param str backend: dotted path of the frontier backend class to test.
    """
    # Build the test graph from the module-level site list.
    graph = graphs.Manager()
    graph.add_site_list(SITE_LIST)

    # Configure and create the frontier for the backend under test.
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    frontier = FrontierManager.from_settings(settings)
    # Python 3 print() calls (original used Python 2 print statements,
    # inconsistent with the print() style used elsewhere in this file).
    print('-' * 80)
    print(frontier.backend.name)
    print('-' * 80)

    # Run the tester and show the resulting crawling sequence.
    tester = FrontierTester(frontier, graph)
    tester.run()
    for page in tester.sequence:
        print(page.url)
def test_backend(backend):
    """Crawl SITE_LIST with the given *backend* and print the crawl sequence.

    :param str backend: dotted path of the frontier backend class to test.
    """
    # Build the test graph from the module-level site list.
    graph = graphs.Manager()
    graph.add_site_list(SITE_LIST)

    # Configure and create the frontier for the backend under test.
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    frontier = FrontierManager.from_settings(settings)
    # Python 3 print() calls (original used Python 2 print statements,
    # inconsistent with the print() style used elsewhere in this file).
    print('-' * 80)
    print(frontier.backend.name)
    print('-' * 80)

    # Run the tester and show the resulting crawling sequence.
    tester = FrontierTester(frontier, graph)
    tester.run()
    for page in tester.sequence:
        print(page.url)
def get_sequence(self, site_list, max_next_requests, downloader_simulator=None):
    """
    Returns a crawling sequence from a site list

    :param list site_list: A list of sites to use as frontier seeds.
    :param int max_next_requests: Max next requests for the frontier.
    :param downloader_simulator: Downloader simulator instance; when omitted a
        fresh ``BaseDownloaderSimulator`` is created for this call.
    """
    # Avoid the mutable-default pitfall: the original default
    # BaseDownloaderSimulator() was instantiated once at import time and
    # shared (with any accumulated state) across every call.
    if downloader_simulator is None:
        downloader_simulator = BaseDownloaderSimulator()

    # Graph: seed a fresh graph manager with the requested sites.
    graph_manager = graphs.Manager()
    graph_manager.add_site_list(site_list)

    # Tester: drive a new frontier over the graph.
    tester = FrontierTester(
        frontier=self.get_frontier(),
        graph_manager=graph_manager,
        max_next_requests=max_next_requests,
        downloader_simulator=downloader_simulator,
    )

    # Run tester and generate sequence
    tester.run()
    return [page.url for page in tester.sequence]
def test_logic(backend):
    """Exercise *backend* against the recorded graph and print the crawl order.

    :param str backend: dotted path of the frontier backend class to test.
    """
    # Recorded crawl graph used as input.
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Frontier configured for the backend under test.
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    settings.TEST_MODE = True
    frontier = FrontierManager.from_settings(settings)

    # Drive the frontier through the whole graph.
    tester = FrontierTester(frontier, graph)
    tester.run(add_all_pages=True)

    # Report the resulting crawling sequence.
    separator = '-' * 80
    print(separator)
    print(frontier.backend.name)
    print(separator)
    for page in tester.sequence:
        print(page.url)
def test_logic(backend):
    """Exercise *backend* against the recorded graph and print the crawl order.

    :param str backend: dotted path of the frontier backend class to test.
    """
    # Recorded crawl graph used as input.
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Frontier configured for the backend under test.
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    settings.TEST_MODE = True
    frontier = FrontierManager.from_settings(settings)

    # Drive the frontier through the whole graph.
    tester = FrontierTester(frontier, graph)
    tester.run(add_all_pages=True)

    # Show crawling sequence — Python 3 print() calls (original used
    # Python 2 print statements, inconsistent with the rest of the file).
    print('-' * 80)
    print(frontier.backend.name)
    print('-' * 80)
    for page in tester.sequence:
        print(page.url)
def get_sequence(self, site_list, max_next_requests, downloader_simulator=None):
    """
    Returns a crawling sequence from a site list

    :param list site_list: A list of sites to use as frontier seeds.
    :param int max_next_requests: Max next requests for the frontier.
    :param downloader_simulator: Downloader simulator instance; when omitted a
        fresh ``BaseDownloaderSimulator`` is created for this call.
    """
    # Avoid the mutable-default pitfall: the original default
    # BaseDownloaderSimulator() was instantiated once at import time and
    # shared (with any accumulated state) across every call.
    if downloader_simulator is None:
        downloader_simulator = BaseDownloaderSimulator()

    # Graph: seed a fresh graph manager with the requested sites.
    graph_manager = graphs.Manager()
    graph_manager.add_site_list(site_list)

    # Tester: drive a new frontier over the graph.
    tester = FrontierTester(
        frontier=self.get_frontier(),
        graph_manager=graph_manager,
        max_next_requests=max_next_requests,
        downloader_simulator=downloader_simulator,
    )

    # Run tester and generate sequence
    tester.run()
    return [page.url for page in tester.sequence]
# Frontier settings shared by the script entry point below.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory_heapq.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.LOGGING_DEBUGGING_ENABLED = False

if __name__ == '__main__':
    # Graph: recorded crawl of scrapinghub.com used as input.
    graph = graphs.Manager('sqlite:///recordings/scrapinghub.com.db')

    # Frontier
    frontier = FrontierManager.from_settings(SETTINGS)

    # Tester
    tester = FrontierTester(frontier, graph)

    # Run test
    tester.run()

    # Show frontier pages — Python 3 print() calls (original used
    # Python 2 print statements, inconsistent with the rest of the file).
    print('-' * 80)
    print(' Frontier pages')
    print('-' * 80)
    for page in frontier.backend.pages.values():
        print(page.url, page.depth, page.state)

    # Show crawling sequence
    print('-' * 80)
    print(' Crawling sequence')
    print('-' * 80)
# Frontier settings shared by the script entry point below.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory_heapq.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.LOGGING_DEBUGGING_ENABLED = False

if __name__ == '__main__':
    # Graph: recorded crawl of scrapinghub.com used as input.
    graph = graphs.Manager('sqlite:///recordings/scrapinghub.com.db')

    # Frontier
    frontier = FrontierManager.from_settings(SETTINGS)

    # Tester
    tester = FrontierTester(frontier, graph)

    # Run test
    tester.run()

    # Show frontier pages. The original wrapped .values() in list() (a
    # 2to3 artifact); iterating the view directly avoids materializing a
    # throwaway list.
    print('-' * 80)
    print(' Frontier pages')
    print('-' * 80)
    for page in frontier.backend.pages.values():
        print(page.url, page.depth, page.state)

    # Show crawling sequence
    print('-' * 80)
    print(' Crawling sequence')
    print('-' * 80)