Exemple #1
0
def test_backend(backend):
    """
    Run a frontier test against the given backend and print the crawl order.

    Builds a graph from ``SITE_LIST``, creates a frontier whose settings use
    ``backend``, runs a ``FrontierTester`` over both and prints the backend
    name followed by the URL of every page in the tester's sequence.

    :param backend: Value stored in ``settings.BACKEND`` before the frontier
        is created (presumably a dotted path to a backend class -- confirm
        against the callers).
    """

    # Graph
    graph = graphs.Manager()
    graph.add_site_list(SITE_LIST)

    # Frontier
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    frontier = FrontierManager.from_settings(settings)

    # print(...) with a single argument emits the same text on Python 2 and
    # Python 3; the bare print statement is a syntax error on Python 3.
    separator = '-' * 80
    print(separator)
    print(frontier.backend.name)
    print(separator)

    # Tester
    tester = FrontierTester(frontier, graph)
    tester.run()

    # Show crawling sequence
    for page in tester.sequence:
        print(page.url)
Exemple #2
0
def test_backend(backend):
    """
    Run a frontier test against the given backend and print the crawl order.

    A graph is built from ``SITE_LIST``, a frontier is created from settings
    that use ``backend``, and a ``FrontierTester`` is run over the pair. The
    backend name and then the URL of each page in the tester's sequence are
    written to standard output.

    :param backend: Value stored in ``settings.BACKEND`` before the frontier
        is created (presumably a dotted path to a backend class -- confirm
        against the callers).
    """

    # Graph
    graph = graphs.Manager()
    graph.add_site_list(SITE_LIST)

    # Frontier
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    frontier = FrontierManager.from_settings(settings)

    # Single-argument print(...) behaves identically on Python 2 and 3,
    # whereas the print statement does not parse on Python 3.
    separator = '-' * 80
    print(separator)
    print(frontier.backend.name)
    print(separator)

    # Tester
    tester = FrontierTester(frontier, graph)
    tester.run()

    # Show crawling sequence
    for page in tester.sequence:
        print(page.url)
Exemple #3
0
    def get_sequence(self, site_list, max_next_requests, downloader_simulator=None):
        """
        Returns a crawling sequence from a site list

        :param list site_list: A list of sites to use as frontier seeds.
        :param int max_next_requests: Max next requests for the frontier.
        :param downloader_simulator: Downloader simulator passed to the
            tester. When omitted (``None``), a new ``BaseDownloaderSimulator``
            is created for this call.
        :returns: A list with the ``url`` of every page in the tester's
            sequence, in sequence order.
        """

        # A default written as ``BaseDownloaderSimulator()`` in the signature
        # is evaluated once, when the method is defined, and that single
        # instance is then shared by every call. Build it per call instead.
        if downloader_simulator is None:
            downloader_simulator = BaseDownloaderSimulator()

        # Graph
        graph_manager = graphs.Manager()
        graph_manager.add_site_list(site_list)

        # Tester
        tester = FrontierTester(
            frontier=self.get_frontier(),
            graph_manager=graph_manager,
            max_next_requests=max_next_requests,
            downloader_simulator=downloader_simulator,
        )

        # Run tester and generate sequence
        tester.run()
        return [page.url for page in tester.sequence]
Exemple #4
0
def test_logic(backend):
    """
    Run a frontier tester over the graph at ``sqlite:///data/graph.db``.

    The frontier is created from settings that use ``backend`` with test mode
    switched on; the tester is run with ``add_all_pages=True``. Afterwards the
    backend name and the URL of each page in the tester's sequence are
    printed.

    :param backend: Value stored in the settings' ``BACKEND`` attribute.
    """
    # Graph
    site_graph = graphs.Manager('sqlite:///data/graph.db')

    # Frontier: apply the options in a fixed order, then build the manager
    config = Settings()
    options = (
        ('BACKEND', backend),
        ('LOGGING_MANAGER_ENABLED', True),
        ('LOGGING_BACKEND_ENABLED', True),
        ('LOGGING_DEBUGGING_ENABLED', False),
        ('TEST_MODE', True),
    )
    for option_name, option_value in options:
        setattr(config, option_name, option_value)
    manager = FrontierManager.from_settings(config)

    # Tester
    runner = FrontierTester(manager, site_graph)
    runner.run(add_all_pages=True)

    # Show crawling sequence, framed by two 80-character rules
    rule = '-' * 80
    print(rule)
    print(manager.backend.name)
    print(rule)
    for visited in runner.sequence:
        print(visited.url)
Exemple #5
0
def test_logic(backend):
    """
    Run a frontier tester over the graph at ``sqlite:///data/graph.db``.

    A frontier is created from settings that use ``backend`` with
    ``TEST_MODE`` enabled, the tester is run with ``add_all_pages=True``, and
    the backend name plus the URL of every page in the tester's sequence are
    printed.

    :param backend: Value stored in ``settings.BACKEND`` before the frontier
        is created.
    """
    # Graph
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Frontier
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    settings.TEST_MODE = True
    frontier = FrontierManager.from_settings(settings)

    # Tester
    tester = FrontierTester(frontier, graph)
    tester.run(add_all_pages=True)

    # Show crawling sequence
    # Single-argument print(...) gives the same output on Python 2 and 3;
    # the print statement would be a syntax error on Python 3.
    separator = '-' * 80
    print(separator)
    print(frontier.backend.name)
    print(separator)
    for page in tester.sequence:
        print(page.url)
Exemple #6
0
    def get_sequence(self,
                     site_list,
                     max_next_requests,
                     downloader_simulator=None):
        """
        Returns a crawling sequence from a site list

        :param list site_list: A list of sites to use as frontier seeds.
        :param int max_next_requests: Max next requests for the frontier.
        :param downloader_simulator: Downloader simulator passed to the
            tester. When omitted (``None``), a new ``BaseDownloaderSimulator``
            is created for this call.
        :returns: A list with the ``url`` of every page in the tester's
            sequence, in sequence order.
        """

        # Instantiating the simulator in the signature would create it once
        # at definition time and share that one object between all calls;
        # creating it here gives each call its own instance.
        if downloader_simulator is None:
            downloader_simulator = BaseDownloaderSimulator()

        # Graph
        graph_manager = graphs.Manager()
        graph_manager.add_site_list(site_list)

        # Tester
        tester = FrontierTester(frontier=self.get_frontier(),
                                graph_manager=graph_manager,
                                max_next_requests=max_next_requests,
                                downloader_simulator=downloader_simulator)

        # Run tester and generate sequence
        tester.run()
        return [page.url for page in tester.sequence]
Exemple #7
0
# Frontier settings used by the script below: FIFO heapq memory backend,
# manager and backend logging on, debugging logging off.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory_heapq.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.LOGGING_DEBUGGING_ENABLED = False


if __name__ == '__main__':
    # Graph
    graph = graphs.Manager('sqlite:///recordings/scrapinghub.com.db')

    # Frontier
    frontier = FrontierManager.from_settings(SETTINGS)

    # Tester
    tester = FrontierTester(frontier, graph)

    # Run test
    tester.run()

    # print(...) with one argument emits the same text on Python 2 and 3;
    # the print statement is a syntax error on Python 3.
    separator = '-' * 80

    # Show frontier pages
    print(separator)
    print(' Frontier pages')
    print(separator)
    for page in frontier.backend.pages.values():
        # %-formatting keeps the space-separated output of the former
        # multi-value print statement without printing a tuple on Python 2.
        print('%s %s %s' % (page.url, page.depth, page.state))

    # Show crawling sequence
    print(separator)
    print(' Crawling sequence')
    print(separator)
Exemple #8
0
# Frontier settings used by the script below: FIFO heapq memory backend,
# manager and backend logging on, debugging logging off.
SETTINGS = Settings()
SETTINGS.BACKEND = 'frontera.contrib.backends.memory_heapq.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.LOGGING_DEBUGGING_ENABLED = False

if __name__ == '__main__':
    # Graph
    graph = graphs.Manager('sqlite:///recordings/scrapinghub.com.db')

    # Frontier
    frontier = FrontierManager.from_settings(SETTINGS)

    # Tester
    tester = FrontierTester(frontier, graph)

    # Run test
    tester.run()

    separator = '-' * 80

    # Show frontier pages
    print(separator)
    print(' Frontier pages')
    print(separator)
    # The loop only reads and prints each page, so the values view is
    # iterated directly; no throwaway list copy is needed.
    for page in frontier.backend.pages.values():
        print(page.url, page.depth, page.state)

    # Show crawling sequence
    print(separator)
    print(' Crawling sequence')
    print(separator)