Esempio n. 1
0
    def test_with_multiple_active_queues(self):
        """Check that two queued hosts produce two active queues.

        One URI for each of two different hosts is added to a frontier
        configured with ``FRONTIER_ACTIVE_QUEUES = 2``. The test asserts
        that ``_current_queues`` is still empty after the URIs are added
        and holds two entries once ``_maybe_add_queues()`` has run.
        ``get_next()`` is then called once; its result is not inspected,
        so that call only has to complete without raising.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"
        s.FRONTIER_ACTIVE_QUEUES = 2
        s.FRONTIER_QUEUE_BUDGET = 4
        s.FRONTIER_QUEUE_BUDGET_PUNISH = 5

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        curi1 = CrawlUri("http://localhost")
        curi1.current_priority = 2
        curi1.req_time = 0.4
        frontier.add_uri(curi1)

        curi2 = CrawlUri("http://www.google.de")
        curi2.current_priority = 1
        curi2.req_time = 1.4
        frontier.add_uri(curi2)

        # Adding URIs on its own must not activate any queue.
        self.assertEqual(0, len(frontier._current_queues))
        frontier._maybe_add_queues()
        self.assertEqual(2, len(frontier._current_queues))

        # Smoke check only: the returned URI is deliberately not asserted on.
        frontier.get_next()
Esempio n. 2
0
    def test_with_multiple_active_queues(self):
        """Check that two queued hosts produce two active queues.

        One URI for each of two different hosts is added to a frontier
        configured with ``FRONTIER_ACTIVE_QUEUES = 2``. The test asserts
        that ``_current_queues`` is still empty after the URIs are added
        and holds two entries once ``_maybe_add_queues()`` has run.
        ``get_next()`` is then called once; its result is not inspected,
        so that call only has to complete without raising.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"
        s.FRONTIER_ACTIVE_QUEUES = 2
        s.FRONTIER_QUEUE_BUDGET = 4
        s.FRONTIER_QUEUE_BUDGET_PUNISH = 5

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        curi1 = CrawlUri("http://localhost")
        curi1.current_priority = 2
        curi1.req_time = 0.4
        frontier.add_uri(curi1)

        curi2 = CrawlUri("http://www.google.de")
        curi2.current_priority = 1
        curi2.req_time = 1.4
        frontier.add_uri(curi2)

        # Adding URIs on its own must not activate any queue.
        self.assertEqual(0, len(frontier._current_queues))
        frontier._maybe_add_queues()
        self.assertEqual(2, len(frontier._current_queues))

        # Smoke check only: the returned URI is deliberately not asserted on.
        frontier.get_next()
Esempio n. 3
0
    def test_queues_work(self):
        """Walk a single active queue through a server error and a success.

        The frontier is configured with ``FRONTIER_ACTIVE_QUEUES = 1``,
        ``FRONTIER_QUEUE_BUDGET = 4`` and ``FRONTIER_QUEUE_BUDGET_PUNISH = 5``,
        and one URI is added for each of two hosts. The test then asserts:

        * after ``_maybe_add_queues()`` exactly one queue is active and its
          ``_budget_politeness`` entry is 4, unchanged by a cleanup pass;
        * after ``process_server_error()`` for that queue's URI the entry
          is -1 (consistent with 4 minus the punish value of 5);
        * after the next cleanup pass one queue is active again with an
          entry of 4;
        * after ``process_successful_crawl()`` for that queue's URI the
          entry is 3.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"
        s.FRONTIER_ACTIVE_QUEUES = 1
        s.FRONTIER_QUEUE_BUDGET = 4
        s.FRONTIER_QUEUE_BUDGET_PUNISH = 5

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        curi1 = CrawlUri("http://localhost")
        curi1.current_priority = 2
        curi1.req_time = 0.4
        frontier.add_uri(curi1)

        curi2 = CrawlUri("http://foreignhost")
        curi2.current_priority = 1
        curi2.req_time = 1.4
        frontier.add_uri(curi2)

        self.assertEqual(0, len(frontier._current_queues))
        frontier._maybe_add_queues()

        self.assertEqual(1, len(frontier._current_queues))
        # Exactly one queue is active (asserted above); grab its identifier.
        (q1,) = frontier._current_queues.keys()

        # A cleanup pass must leave a queue with remaining budget untouched.
        self.assertEqual(4, frontier._budget_politeness[q1])
        frontier._cleanup_budget_politeness()
        self.assertEqual(4, frontier._budget_politeness[q1])

        frontier._update_heap()
        self.assertEqual(1, len(frontier._current_queues))

        # The test treats queue 1 as curi1's queue and anything else as
        # curi2's. Mark the URI that is actually reported as failed.
        if q1 == 1:
            curi1.status_code = 500
            frontier.process_server_error(curi1)
        else:
            curi2.status_code = 500
            frontier.process_server_error(curi2)

        self.assertEqual(-1, frontier._budget_politeness[q1])

        frontier._cleanup_budget_politeness()

        self.assertEqual(1, len(frontier._current_queues))
        (q2,) = frontier._current_queues.keys()

        self.assertEqual(4, frontier._budget_politeness[q2])
        frontier._cleanup_budget_politeness()
        self.assertEqual(4, frontier._budget_politeness[q2])

        frontier._update_heap()
        self.assertEqual(1, len(frontier._current_queues))

        if q2 == 1:
            curi1.status_code = 200
            frontier.process_successful_crawl(curi1)
        else:
            curi2.status_code = 200
            frontier.process_successful_crawl(curi2)

        self.assertEqual(3, frontier._budget_politeness[q2])

        frontier._cleanup_budget_politeness()
Esempio n. 4
0
    def test_queues_work(self):
        """Walk a single active queue through a server error and a success.

        The frontier is configured with ``FRONTIER_ACTIVE_QUEUES = 1``,
        ``FRONTIER_QUEUE_BUDGET = 4`` and ``FRONTIER_QUEUE_BUDGET_PUNISH = 5``,
        and one URI is added for each of two hosts. The test then asserts:

        * after ``_maybe_add_queues()`` exactly one queue is active and its
          ``_budget_politeness`` entry is 4, unchanged by a cleanup pass;
        * after ``process_server_error()`` for that queue's URI the entry
          is -1 (consistent with 4 minus the punish value of 5);
        * after the next cleanup pass one queue is active again with an
          entry of 4;
        * after ``process_successful_crawl()`` for that queue's URI the
          entry is 3.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"
        s.FRONTIER_ACTIVE_QUEUES = 1
        s.FRONTIER_QUEUE_BUDGET = 4
        s.FRONTIER_QUEUE_BUDGET_PUNISH = 5

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        curi1 = CrawlUri("http://localhost")
        curi1.current_priority = 2
        curi1.req_time = 0.4
        frontier.add_uri(curi1)

        curi2 = CrawlUri("http://foreignhost")
        curi2.current_priority = 1
        curi2.req_time = 1.4
        frontier.add_uri(curi2)

        self.assertEqual(0, len(frontier._current_queues))
        frontier._maybe_add_queues()

        self.assertEqual(1, len(frontier._current_queues))
        # Exactly one queue is active (asserted above); grab its identifier.
        (q1,) = frontier._current_queues.keys()

        # A cleanup pass must leave a queue with remaining budget untouched.
        self.assertEqual(4, frontier._budget_politeness[q1])
        frontier._cleanup_budget_politeness()
        self.assertEqual(4, frontier._budget_politeness[q1])

        frontier._update_heap()
        self.assertEqual(1, len(frontier._current_queues))

        # The test treats queue 1 as curi1's queue and anything else as
        # curi2's. Mark the URI that is actually reported as failed.
        if q1 == 1:
            curi1.status_code = 500
            frontier.process_server_error(curi1)
        else:
            curi2.status_code = 500
            frontier.process_server_error(curi2)

        self.assertEqual(-1, frontier._budget_politeness[q1])

        frontier._cleanup_budget_politeness()

        self.assertEqual(1, len(frontier._current_queues))
        (q2,) = frontier._current_queues.keys()

        self.assertEqual(4, frontier._budget_politeness[q2])
        frontier._cleanup_budget_politeness()
        self.assertEqual(4, frontier._budget_politeness[q2])

        frontier._update_heap()
        self.assertEqual(1, len(frontier._current_queues))

        if q2 == 1:
            curi1.status_code = 200
            frontier.process_successful_crawl(curi1)
        else:
            curi2.status_code = 200
            frontier.process_successful_crawl(curi2)

        self.assertEqual(3, frontier._budget_politeness[q2])

        frontier._cleanup_budget_politeness()