Example #1
0
    def test_removing_lists_works(self):
        """Removing a previously added batch of URIs empties the queue."""
        local_uri = ("http://localhost", "etag", int(time.time()*1000),
                     int(time.time() * 1000), 1)
        foreign_uri = ("http://fogeignhost", "ETAG", int(time.time()*1000),
                       int(time.time() * 1000), 2)
        batch = [local_uri, foreign_uri]

        queue = SQLiteSingleHostUriQueue(":memory:")
        queue.add_uris(batch)
        queue.remove_uris(batch)

        # Look at the backing table directly: no row may be left behind.
        leftover = queue._connection.execute(
            "SELECT * FROM queue").fetchone()
        self.assertTrue(leftover is None)
Example #2
0
    def test_crawluri_from_uri_with_credentials(self):
        """
        A stored URI with embedded credentials yields a complete crawl URI.

        The URI tuple is converted with `frontier._crawluri_from_uri` and the
        result is checked for: the unchanged URL, the `Etag` and
        `Last-Modified` request headers, and the user name and password in
        the optional vars (`CURI_SITE_USERNAME` / `CURI_SITE_PASSWORD`).
        """
        # Rebuild `now` from the first six time tuple fields so it carries no
        # microseconds; presumably needed so the serialized date round-trips.
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        now_timestamp = time.mktime(now.timetuple())
        next_crawl_date = now + timedelta(days=1)
        next_crawl_date_timestamp = time.mktime(next_crawl_date.timetuple())

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = AbstractBaseFrontier(
            s, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(s))

        # The credentials in the URL must match the "user" / "passwd" values
        # asserted below; a masked URL ("*****:*****") can never satisfy them.
        url = "http://user:passwd@localhost"
        uri = (url, "123", now_timestamp, 1, next_crawl_date_timestamp)

        curi = frontier._crawluri_from_uri(uri)

        self.assertEqual(url, curi.url)
        self.assertEqual("123", curi.req_header["Etag"])
        self.assertEqual(serialize_date_time(now),
                         curi.req_header["Last-Modified"])
        self.assertEqual("user", curi.optional_vars[CURI_SITE_USERNAME])
        self.assertEqual("passwd", curi.optional_vars[CURI_SITE_PASSWORD])
Example #3
0
    def test_sinks(self):
        """
        Invoke every ``process_*`` callback of a frontier that has a sink.

        The test contains no assertions; it only exercises the four result
        callbacks with an `AbstractCrawlUriSink` registered on the frontier.
        """
        timestamp = datetime.fromtimestamp(time.time())
        # Rebuilt from the first six time tuple fields, i.e. no microseconds.
        now = datetime(*timestamp.timetuple()[0:6])

        settings = Settings()
        settings.FRONTIER_STATE_FILE = ":memory:"

        log_handler = StreamHandler(sys.stdout)
        uri_queue = SQLiteSingleHostUriQueue(settings.FRONTIER_STATE_FILE)
        prioritizer = SimpleTimestampPrioritizer(settings)
        frontier = AbstractBaseFrontier(
            settings, log_handler, uri_queue, prioritizer)
        frontier.add_sink(AbstractCrawlUriSink())

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        callbacks = (
            frontier.process_successful_crawl,
            frontier.process_not_found,
            frontier.process_redirect,
            frontier.process_server_error,
        )
        for callback in callbacks:
            # Re-add the URI to the heap before each callback is invoked.
            frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
            callback(curi)
Example #4
0
    def test_adding_lists_works(self):
        """A URI added via `add_uris` is stored field by field in the table."""
        entry = ("http://localhost", "etag", int(time.time()*1000),
                 int(time.time() * 1010), 1)
        uris = [entry]

        queue = SQLiteSingleHostUriQueue(":memory:")
        queue.add_uris(uris)

        row = queue._connection.execute("SELECT * FROM queue").fetchone()
        want_url, want_etag, want_mod, want_next, want_prio = uris[0]
        got_url, got_etag, got_mod, got_next, got_prio = row

        # Compared in this order: url, etag, modification date, priority,
        # next crawl date.
        comparisons = (
            (want_url, got_url),
            (want_etag, got_etag),
            (want_mod, got_mod),
            (want_prio, got_prio),
            (want_next, got_next),
        )
        for wanted, stored in comparisons:
            self.assertEqual(wanted, stored)
Example #5
0
    def test_queue_head_works(self):
        """
        `queue_head` returns the expected entry for a given offset.

        With two URIs queued, offset 0 must yield the first and offset 1 the
        second. After adding a third URI and ignoring "http://localhost"
        with status 404, offset 1 must yield the third URI.
        """
        uris = [("http://localhost", "etag", int(time.time()*1000),
                int(time.time() * 1000), 1),
            ("http://fogeignhost", "ETAG", int(time.time()*1000),
             int(time.time() * 1001), 2),
        ]

        queue = SQLiteSingleHostUriQueue(":memory:")
        queue.add_uris(uris)

        def check_head(expected, offset):
            # Every entry yielded for `offset` must equal `expected`.
            (url, etag, mod_date, next_date, prio) = expected
            for entry in queue.queue_head(n=1, offset=offset):
                (url_res, etag_res, mod_date_res, next_date_res,
                 prio_res) = entry
                self.assertEqual(url, url_res)
                self.assertEqual(etag, etag_res)
                self.assertEqual(mod_date, mod_date_res)
                self.assertEqual(prio, prio_res)
                self.assertEqual(next_date, next_date_res)

        check_head(uris[0], 0)
        check_head(uris[1], 1)

        uris.append(("http://localhost/1", "eTag", int(time.time()*1000),
                    int(time.time()*1002), 1))
        queue.add_uri(uris[2])

        queue.ignore_uri("http://localhost", 404)

        check_head(uris[2], 1)
Example #6
0
    def test_iterating_over_all_uris_works(self):
        """
        Check single lookups via `get_uri` and iteration via `all_uris`.

        A known URL must return its full tuple, an unknown URL must raise
        `UriNotFound`, and every URL yielded by `all_uris` must be one of
        the two that were added.
        """
        local = ("http://localhost", "etag", int(time.time()*1000),
                 int(time.time() * 1000), 1)
        foreign = ("http://foreignhost", "ETAG", int(time.time()*1000),
                   int(time.time() * 1000), 2)
        uris = [local, foreign]
        known_urls = ["http://localhost", "http://foreignhost"]

        queue = SQLiteSingleHostUriQueue(":memory:")
        queue.add_uris(uris)

        found = queue.get_uri("http://foreignhost")
        self.assertEqual(uris[1], found)

        self.assertRaises(UriNotFound, queue.get_uri, "http://gibtsnuesch")

        for queued_url in queue.all_uris():
            self.assertTrue(queued_url in known_urls)
Example #7
0
    def test_adding_lists_works(self):
        """The row stored by `add_uris` matches the tuple that was added."""
        uris = [
            ("http://localhost", "etag", int(time.time() * 1000),
             int(time.time() * 1010), 1),
        ]

        queue = SQLiteSingleHostUriQueue(":memory:")
        queue.add_uris(uris)

        stored = queue._connection.execute("SELECT * FROM queue").fetchone()
        # Tuple layout: (url, etag, mod_date, next_date, prio).
        (url, etag, mod_date, next_date, prio) = uris[0]
        (stored_url, stored_etag, stored_mod_date, stored_next_date,
         stored_prio) = stored

        self.assertEqual(url, stored_url)
        self.assertEqual(etag, stored_etag)
        self.assertEqual(mod_date, stored_mod_date)
        self.assertEqual(prio, stored_prio)
        self.assertEqual(next_date, stored_next_date)
Example #8
0
    def __init__(self, settings, log_handler):
        """
        Initialize the frontier on top of a SQLite single host URI queue.

        The prioritizer class is resolved from `settings.PRIORITIZER_CLASS`
        and instantiated with the settings; the queue is opened on
        `settings.FRONTIER_STATE_FILE`. Both are handed to
        `AbstractBaseFrontier.__init__` together with `settings` and
        `log_handler`.

        Afterwards the crawl delay factor and the minimum delay are copied
        from the settings and the next possible crawl time is set to the
        current time.
        """
        prioritizer_class = import_class(settings.PRIORITIZER_CLASS)
        uri_queue = SQLiteSingleHostUriQueue(settings.FRONTIER_STATE_FILE)
        prioritizer = prioritizer_class(settings)
        AbstractBaseFrontier.__init__(
            self, settings, log_handler, uri_queue, prioritizer)

        self._crawl_delay = settings.FRONTIER_CRAWL_DELAY_FACTOR
        self._min_delay = settings.FRONTIER_MIN_DELAY
        self._next_possible_crawl = time.time()
Example #9
0
    def test_removing_lists_works(self):
        """After `remove_uris` on everything added, the table has no rows."""
        uris = [
            ("http://localhost", "etag", int(time.time() * 1000),
             int(time.time() * 1000), 1),
            ("http://fogeignhost", "ETAG", int(time.time() * 1000),
             int(time.time() * 1000), 2),
        ]

        uri_queue = SQLiteSingleHostUriQueue(":memory:")
        uri_queue.add_uris(uris)
        uri_queue.remove_uris(uris)

        # Query the underlying connection; an empty table yields no row.
        result = uri_queue._connection.execute("SELECT * FROM queue")
        first_row = result.fetchone()
        self.assertTrue(first_row is None)
Example #10
0
    def test_adding_uri_works(self):
        """
        A crawl URI added to the frontier shows up in the front end queue.

        Each entry returned by `queue_head` must carry the URL, the etag
        from the response header and a modification date equal to the
        `Date` header that was set on the crawl URI.
        """
        # Rebuild `now` from the first six time tuple fields so it carries no
        # microseconds and can be compared to a timestamp-derived datetime.
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier = AbstractBaseFrontier(
            s, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(s))
        frontier.add_uri(curi)

        for uri in frontier._front_end_queues.queue_head():
            # The last two fields are not checked by this test.
            (url, etag, mod_date, _queue, _next_date) = uri
            self.assertEqual("http://localhost", url)
            self.assertEqual("123", etag)
            self.assertEqual(now, datetime.fromtimestamp(mod_date))
            # NOTE(review): registers the entry in `_current_uris`; nothing
            # visible here reads it afterwards -- confirm it is still needed.
            frontier._current_uris[url] = uri
Example #11
0
    def test_iterating_over_all_uris_works(self):
        """
        `get_uri` finds stored URIs and `all_uris` yields only known URLs.

        Looking up an URL that was never added must raise `UriNotFound`.
        """
        uris = [
            ("http://localhost", "etag", int(time.time() * 1000),
             int(time.time() * 1000), 1),
            ("http://foreignhost", "ETAG", int(time.time() * 1000),
             int(time.time() * 1000), 2),
        ]
        # The URL is the first field of every URI tuple.
        urls = [entry[0] for entry in uris]

        uri_queue = SQLiteSingleHostUriQueue(":memory:")
        uri_queue.add_uris(uris)

        expected = uris[1]
        self.assertEqual(expected, uri_queue.get_uri("http://foreignhost"))

        self.assertRaises(UriNotFound, uri_queue.get_uri,
                          "http://gibtsnuesch")

        for candidate in uri_queue.all_uris():
            self.assertTrue(candidate in urls)
Example #12
0
    def test_queue_head_works(self):
        """
        Entries returned by `queue_head` match what was queued.

        Offsets 0 and 1 must return the first and second URI. Once a third
        URI is added and "http://localhost" is ignored with status 404,
        offset 1 must return the third URI.
        """
        first = ("http://localhost", "etag", int(time.time() * 1000),
                 int(time.time() * 1000), 1)
        second = ("http://fogeignhost", "ETAG", int(time.time() * 1000),
                  int(time.time() * 1001), 2)
        uris = [first, second]

        uri_queue = SQLiteSingleHostUriQueue(":memory:")
        uri_queue.add_uris(uris)

        # Tuple layout: (url, etag, mod_date, next_date, prio).
        url1, etag1, mod1, next1, prio1 = uris[0]
        url2, etag2, mod2, next2, prio2 = uris[1]

        for row in uri_queue.queue_head(n=1, offset=0):
            got_url, got_etag, got_mod, got_next, got_prio = row
            for want, got in ((url1, got_url), (etag1, got_etag),
                              (mod1, got_mod), (prio1, got_prio),
                              (next1, got_next)):
                self.assertEqual(want, got)

        for row in uri_queue.queue_head(n=1, offset=1):
            got_url, got_etag, got_mod, got_next, got_prio = row
            for want, got in ((url2, got_url), (etag2, got_etag),
                              (mod2, got_mod), (prio2, got_prio),
                              (next2, got_next)):
                self.assertEqual(want, got)

        third = ("http://localhost/1", "eTag", int(time.time() * 1000),
                 int(time.time() * 1002), 1)
        uris.append(third)
        url3, etag3, mod3, next3, prio3 = uris[2]
        uri_queue.add_uri(uris[2])

        uri_queue.ignore_uri("http://localhost", 404)

        for row in uri_queue.queue_head(n=1, offset=1):
            got_url, got_etag, got_mod, got_next, got_prio = row
            for want, got in ((url3, got_url), (etag3, got_etag),
                              (mod3, got_mod), (prio3, got_prio),
                              (next3, got_next)):
                self.assertEqual(want, got)