def test_backend(backend): # Graph graph = graphs.Manager() graph.add_site_list(SITE_LIST) # Frontier settings = Settings() settings.BACKEND = backend settings.LOGGING_MANAGER_ENABLED = True settings.LOGGING_BACKEND_ENABLED = True settings.LOGGING_DEBUGGING_ENABLED = False frontier = FrontierManager.from_settings(settings) print '-' * 80 print frontier.backend.name print '-' * 80 # Tester tester = FrontierTester(frontier, graph) tester.run() # Show crawling sequence for page in tester.sequence: print page.url
def open(self, spider):
    """Scheduler hook: set up stats tracking and, when enabled and a storage
    engine is configured, create the recorder graph manager."""
    super(RecorderScheduler, self).open(spider)
    self.stats_manager = StatsManager(spider.crawler.stats)

    crawler_settings = spider.crawler.settings
    self.recorder_enabled = crawler_settings.get('RECORDER_ENABLED',
                                                 DEFAULT_RECORDER_ENABLED)
    if not self.recorder_enabled:
        log.msg('Recorder disabled!', log.WARNING)
        return

    log.msg('Starting recorder', log.INFO)
    storage_engine = crawler_settings.get('RECORDER_STORAGE_ENGINE', None)
    if not storage_engine:
        # A storage engine is mandatory for recording; disable and bail out.
        self.recorder_enabled = False
        log.msg('Missing Recorder storage! Recorder disabled...', log.WARNING)
        return

    drop_tables = crawler_settings.getbool(
        'RECORDER_STORAGE_DROP_ALL_TABLES',
        DEFAULT_RECORDER_STORAGE_DROP_ALL_TABLES)
    clear_content = crawler_settings.getbool(
        'RECORDER_STORAGE_CLEAR_CONTENT',
        DEFAULT_RECORDER_STORAGE_CLEAR_CONTENT)
    self.graph = graphs.Manager(engine=storage_engine,
                                drop_all_tables=drop_tables,
                                clear_content=clear_content)
def test_site(site): # Create graph graph = graphs.Manager() # Add site to graph graph.add_site(site) # Show graph pages print '-' * 80 for page in graph.pages: print page, page.status # Show single page a_page = graph.get_page("A") print a_page.url, [link.url for link in a_page.links]
def get_sequence(self, site_list, max_next_requests):
    """
    Returns a crawling sequence from a site list

    :param list site_list: A list of sites to use as frontier seeds.
    :param int max_next_requests: Max next requests for the frontier.
    """
    # Seed a graph manager with the given sites
    manager = graphs.Manager()
    manager.add_site_list(site_list)

    # Exercise the frontier over the seeded graph
    tester = FrontierTester(frontier=self.get_frontier(),
                            graph_manager=manager,
                            max_next_requests=max_next_requests)
    tester.run()

    # The resulting crawl order, as plain URLs
    return [crawled_page.url for crawled_page in tester.sequence]
def test_logic(backend): # Graph graph = graphs.Manager('sqlite:///data/graph.db') # Frontier settings = Settings() settings.BACKEND = backend settings.LOGGING_MANAGER_ENABLED = True settings.LOGGING_BACKEND_ENABLED = True settings.LOGGING_DEBUGGING_ENABLED = False settings.TEST_MODE = True frontier = FrontierManager.from_settings(settings) # Tester tester = FrontierTester(frontier, graph) tester.run(add_all_pages=True) # Show crawling sequence print '-' * 80 print frontier.backend.name print '-' * 80 for page in tester.sequence: print page.url
""" Graph manager example with site list """ from crawlfrontier import graphs SITE_LIST = [ [ ("A1", ["A2", "A3"]), ("A2", ["A4", "A5"]), ("A3", ["A6", "A7"]), ], [ ("B1", ["B2", "B3"]), ("B2", ["B4", "B5"]), ("B3", ["B6", "B7"]), ], ] if __name__ == '__main__': # Create graph graph = graphs.Manager() # Add site list to graph graph.add_site_list(SITE_LIST) # Show graph pages for page in graph.pages: print page
""" Graph manager with database """ from crawlfrontier import graphs SITE_LIST = [ [ ("A1", ["A2", "A3"]), ("A2", ["A4", "A5"]), ("A3", ["A6", "A7"]), ], [ ("B1", ["B2", "B3"]), ("B2", ["B4", "B5"]), ("B3", ["B6", "B7"]), ], ] if __name__ == '__main__': # Create graph with sqlite db graph = graphs.Manager('sqlite:///data/graph.db', drop_all_tables=True) # Add site list to graph graph.add_site_list(SITE_LIST) # Show graph pages for page in graph.pages: print page
from crawlfrontier import graphs

# Load the recorded crawl graph and render it as a labelled PNG diagram
recorded_graph = graphs.Manager('sqlite:///recordings/record.db')
recorded_graph.render(filename='recordings/record.png',
                      label='Record graph',
                      use_urls=True,
                      include_ids=True)
""" Frontier tester using recording data """ from crawlfrontier import FrontierManager, FrontierTester, Settings, graphs SETTINGS = Settings() SETTINGS.BACKEND = 'crawlfrontier.contrib.backends.memory_heapq.FIFO' SETTINGS.LOGGING_MANAGER_ENABLED = True SETTINGS.LOGGING_BACKEND_ENABLED = True SETTINGS.LOGGING_DEBUGGING_ENABLED = False if __name__ == '__main__': # Graph graph = graphs.Manager('sqlite:///recordings/scrapinghub.com.db') # Frontier frontier = FrontierManager.from_settings(SETTINGS) # Tester tester = FrontierTester(frontier, graph) # Run test tester.run() # Show frontier pages print '-' * 80 print ' Frontier pages' print '-' * 80 for page in frontier.backend.pages.values(): print page.url, page.depth, page.state
""" Frontier tester usage example """ from crawlfrontier import FrontierManager, FrontierTester, Settings, graphs if __name__ == '__main__': # Graph graph = graphs.Manager('sqlite:///data/graph.db') # Frontier settings = Settings() settings.TEST_MODE = True settings.LOGGING_MANAGER_ENABLED = True settings.LOGGING_BACKEND_ENABLED = True settings.LOGGING_DEBUGGING_ENABLED = False frontier = FrontierManager.from_settings(settings) # Tester tester = FrontierTester(frontier, graph) # Run test tester.run() # Show crawling sequence for page in tester.sequence: print page.url
def generate_graph(site_list, filename, title, use_urls=False): print 'Generating diagram "%s"...' % title graph = graphs.Manager() graph.add_site_list(site_list) graph.render(filename=filename, label=title, use_urls=use_urls)