Esempio n. 1
0
def test_backend(backend):
    """
    Run a frontier test with the given backend and print the result.

    Builds a graph from the module-level ``SITE_LIST``, creates a frontier
    from settings that use ``backend``, runs a ``FrontierTester`` over the
    graph and prints the URL of every page in the tester's sequence.

    :param backend: Value assigned to ``settings.BACKEND``.
        NOTE(review): presumably a dotted path to a backend class -- confirm.
    """
    # Graph
    graph = graphs.Manager()
    graph.add_site_list(SITE_LIST)

    # Frontier
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    frontier = FrontierManager.from_settings(settings)

    # Single-argument, parenthesised prints behave identically on Python 2
    # (print statement) and Python 3 (print function).
    separator = '-' * 80
    print(separator)
    print(frontier.backend.name)
    print(separator)

    # Tester
    tester = FrontierTester(frontier, graph)
    tester.run()

    # Show crawling sequence
    for page in tester.sequence:
        print(page.url)
Esempio n. 2
0
    def open(self, spider):
        """
        Open the scheduler and, if the recorder is enabled, create its graph.

        After delegating to the parent ``open``, a ``StatsManager`` is built
        from the crawler stats and the ``RECORDER_ENABLED`` setting is read.
        The method returns early (logging a warning) when the recorder is
        disabled or when ``RECORDER_STORAGE_ENGINE`` is missing/empty; in the
        latter case ``self.recorder_enabled`` is also forced to ``False``.
        Otherwise ``self.graph`` is set to a new ``graphs.Manager``.

        :param spider: Spider being opened; its ``crawler`` attribute supplies
            the stats and settings used here.
        """
        super(RecorderScheduler, self).open(spider)

        self.stats_manager = StatsManager(spider.crawler.stats)

        settings = spider.crawler.settings
        self.recorder_enabled = settings.get('RECORDER_ENABLED',
                                             DEFAULT_RECORDER_ENABLED)

        # Guard: nothing else to set up when recording is switched off.
        if not self.recorder_enabled:
            log.msg('Recorder disabled!', log.WARNING)
            return

        log.msg('Starting recorder', log.INFO)

        # Guard: without a storage engine the recorder cannot run.
        storage_engine = settings.get('RECORDER_STORAGE_ENGINE', None)
        if not storage_engine:
            self.recorder_enabled = False
            log.msg('Missing Recorder storage! Recorder disabled...',
                    log.WARNING)
            return

        # Resolve the storage options first, then build the graph manager.
        drop_tables = settings.getbool(
            'RECORDER_STORAGE_DROP_ALL_TABLES',
            DEFAULT_RECORDER_STORAGE_DROP_ALL_TABLES)
        clear_stored_content = settings.getbool(
            'RECORDER_STORAGE_CLEAR_CONTENT',
            DEFAULT_RECORDER_STORAGE_CLEAR_CONTENT)
        self.graph = graphs.Manager(engine=storage_engine,
                                    drop_all_tables=drop_tables,
                                    clear_content=clear_stored_content)
Esempio n. 3
0
def test_site(site):
    """
    Build a graph from a single site and print its pages.

    Prints every page of the graph together with its ``status``, then looks
    up the page named ``"A"`` and prints its URL and the URLs of its links.

    :param site: Site definition handed to ``graph.add_site``.
    """
    # Create graph
    graph = graphs.Manager()

    # Add site to graph
    graph.add_site(site)

    # Show graph pages
    # A single pre-formatted argument keeps the output identical on Python 2
    # and Python 3 ("print a, b" is a SyntaxError on Python 3).
    print('-' * 80)
    for page in graph.pages:
        print('%s %s' % (page, page.status))

    # Show single page
    # NOTE(review): assumes the site contains a page "A" -- confirm callers.
    a_page = graph.get_page("A")
    print('%s %s' % (a_page.url, [link.url for link in a_page.links]))
Esempio n. 4
0
    def get_sequence(self, site_list, max_next_requests):
        """
        Crawl a graph built from ``site_list`` and return the visited URLs.

        A ``FrontierTester`` is created with the frontier returned by
        ``self.get_frontier()`` and run over the graph; the ``url`` of each
        page in the tester's sequence is collected in order.

        :param list site_list: A list of sites to use as frontier seeds.
        :param int max_next_requests: Max next requests for the frontier.
        :returns: List with the ``url`` of every page in ``tester.sequence``.
        """
        # Build the graph the tester will walk.
        seed_graph = graphs.Manager()
        seed_graph.add_site_list(site_list)

        # Wire the frontier and the graph together.
        runner = FrontierTester(frontier=self.get_frontier(),
                                graph_manager=seed_graph,
                                max_next_requests=max_next_requests)
        runner.run()

        # Collect the URLs in crawl order.
        crawled_urls = []
        for page in runner.sequence:
            crawled_urls.append(page.url)
        return crawled_urls
Esempio n. 5
0
def test_logic(backend):
    """
    Run a frontier test against the stored graph and print the sequence.

    Loads the graph from ``sqlite:///data/graph.db``, creates a frontier in
    test mode using ``backend``, runs a ``FrontierTester`` with
    ``add_all_pages=True`` and prints the backend name followed by the URL
    of every page in the tester's sequence.

    :param backend: Value assigned to ``settings.BACKEND``.
        NOTE(review): presumably a dotted path to a backend class -- confirm.
    """
    # Graph
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Frontier
    settings = Settings()
    settings.BACKEND = backend
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    settings.TEST_MODE = True
    frontier = FrontierManager.from_settings(settings)

    # Tester
    tester = FrontierTester(frontier, graph)
    tester.run(add_all_pages=True)

    # Show crawling sequence
    # Single-argument, parenthesised prints behave identically on Python 2
    # (print statement) and Python 3 (print function).
    separator = '-' * 80
    print(separator)
    print(frontier.backend.name)
    print(separator)
    for page in tester.sequence:
        print(page.url)
Esempio n. 6
0
"""
Graph manager example with site list
"""
from crawlfrontier import graphs

# Two sites; each site is a list of (name, [names]) tuples.
# NOTE(review): presumably (page, pages it links to) -- confirm against
# graphs.Manager.add_site_list.
SITE_LIST = [
    [
        ("A1", ["A2", "A3"]),
        ("A2", ["A4", "A5"]),
        ("A3", ["A6", "A7"]),
    ],
    [
        ("B1", ["B2", "B3"]),
        ("B2", ["B4", "B5"]),
        ("B3", ["B6", "B7"]),
    ],
]

if __name__ == '__main__':
    # Create graph
    graph = graphs.Manager()

    # Add site list to graph
    graph.add_site_list(SITE_LIST)

    # Show graph pages
    # Parenthesised single-argument print works on both Python 2 and 3.
    for page in graph.pages:
        print(page)
Esempio n. 7
0
"""
Graph manager with database
"""
from crawlfrontier import graphs

# Two sites; each site is a list of (name, [names]) tuples.
# NOTE(review): presumably (page, pages it links to) -- confirm against
# graphs.Manager.add_site_list.
SITE_LIST = [
    [
        ("A1", ["A2", "A3"]),
        ("A2", ["A4", "A5"]),
        ("A3", ["A6", "A7"]),
    ],
    [
        ("B1", ["B2", "B3"]),
        ("B2", ["B4", "B5"]),
        ("B3", ["B6", "B7"]),
    ],
]

if __name__ == '__main__':
    # Create graph with sqlite db
    graph = graphs.Manager('sqlite:///data/graph.db', drop_all_tables=True)

    # Add site list to graph
    graph.add_site_list(SITE_LIST)

    # Show graph pages
    # Parenthesised single-argument print works on both Python 2 and 3.
    for page in graph.pages:
        print(page)

Esempio n. 8
0
from crawlfrontier import graphs

# Open the graph stored in the recording database.
graph = graphs.Manager('sqlite:///recordings/record.db')

# Render it to an image file, using page URLs and including ids.
graph.render(
    filename='recordings/record.png',
    label='Record graph',
    use_urls=True,
    include_ids=True,
)
Esempio n. 9
0
"""
Frontier tester using recording data
"""
from crawlfrontier import FrontierManager, FrontierTester, Settings, graphs

# Frontier settings shared by the example: FIFO heapq memory backend with
# manager and backend logging on, debug logging off.
SETTINGS = Settings()
SETTINGS.BACKEND = 'crawlfrontier.contrib.backends.memory_heapq.FIFO'
SETTINGS.LOGGING_MANAGER_ENABLED = True
SETTINGS.LOGGING_BACKEND_ENABLED = True
SETTINGS.LOGGING_DEBUGGING_ENABLED = False

if __name__ == '__main__':
    # Graph
    graph = graphs.Manager('sqlite:///recordings/scrapinghub.com.db')

    # Frontier
    frontier = FrontierManager.from_settings(SETTINGS)

    # Tester
    tester = FrontierTester(frontier, graph)

    # Run test
    tester.run()

    # Show frontier pages
    # Each print receives one pre-formatted argument so the output is the
    # same on Python 2 and Python 3 ("print a, b, c" fails on Python 3).
    separator = '-' * 80
    print(separator)
    print(' Frontier pages')
    print(separator)
    for page in frontier.backend.pages.values():
        print('%s %s %s' % (page.url, page.depth, page.state))
Esempio n. 10
0
"""
Frontier tester usage example
"""
from crawlfrontier import FrontierManager, FrontierTester, Settings, graphs

if __name__ == '__main__':
    # Graph
    graph = graphs.Manager('sqlite:///data/graph.db')

    # Frontier (test mode, manager/backend logging on, debug logging off)
    settings = Settings()
    settings.TEST_MODE = True
    settings.LOGGING_MANAGER_ENABLED = True
    settings.LOGGING_BACKEND_ENABLED = True
    settings.LOGGING_DEBUGGING_ENABLED = False
    frontier = FrontierManager.from_settings(settings)

    # Tester
    tester = FrontierTester(frontier, graph)

    # Run test
    tester.run()

    # Show crawling sequence
    # Parenthesised single-argument print works on both Python 2 and 3.
    for page in tester.sequence:
        print(page.url)
Esempio n. 11
0
def generate_graph(site_list, filename, title, use_urls=False):
    """
    Build a graph from ``site_list`` and render it to ``filename``.

    :param site_list: Sites added to the graph via ``add_site_list``.
    :param filename: Passed to ``graph.render`` as ``filename``.
    :param title: Printed in the progress message and used as the diagram
        label.
    :param use_urls: Forwarded to ``graph.render`` as ``use_urls``.
    """
    # Parenthesised single-argument print works on both Python 2 and 3.
    print('Generating diagram "%s"...' % title)
    graph = graphs.Manager()
    graph.add_site_list(site_list)
    graph.render(filename=filename, label=title, use_urls=use_urls)