def test_that_adding_uris_works(self):
        """Adding URIs for two distinct hosts creates one front-end queue per host."""
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        # Truncate "now" to whole seconds so the serialized date round-trips.
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier.add_uri(curi)

        cur = frontier._front_end_queues._cursor

        curi = CrawlUri("http://foreignhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 1

        frontier.add_uri(curi)

        # Each host must have been registered with its own queue id.
        idents = {"localhost": -1, "foreignhost": -1}
        cur.execute("SELECT * FROM queue_identifiers")
        for row in cur:
            self.assertTrue(row['identifier'] in idents.keys())
            idents["http://%s" % row['identifier']] = row['queue']

        # Every stored URL must be assigned to the queue of its host.
        cur.execute("SELECT * FROM queues")
        for row in cur:
            self.assertEqual(idents[row['url']], row['queue'])

        self.assertEqual(2, frontier._front_end_queues.get_queue_count())
    def test_that_adding_uris_works(self):
        """Adding URIs for two distinct hosts creates one front-end queue per host."""
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

        # Truncate "now" to whole seconds so the serialized date round-trips.
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier.add_uri(curi)

        cur = frontier._front_end_queues._cursor

        curi = CrawlUri("http://foreignhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 1

        frontier.add_uri(curi)

        # Each host must have been registered with its own queue id.
        idents = {"localhost": -1, "foreignhost": -1}
        cur.execute("SELECT * FROM queue_identifiers")
        for row in cur:
            self.assertTrue(row['identifier'] in idents.keys())
            idents["http://%s" % row['identifier']] = row['queue']

        # Every stored URL must be assigned to the queue of its host.
        cur.execute("SELECT * FROM queues")
        for row in cur:
            self.assertEqual(idents[row['url']], row['queue'])

        self.assertEqual(2, frontier._front_end_queues.get_queue_count())
# Example #3
    def test_crawluri_from_uri_with_credentials(self):
        """_crawluri_from_uri must keep credentials and fill request headers."""
        current = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        current_ts = time.mktime(current.timetuple())
        recrawl = current + timedelta(days=1)
        recrawl_ts = time.mktime(recrawl.timetuple())

        settings = Settings()
        settings.FRONTIER_STATE_FILE = ":memory:"

        frontier = AbstractBaseFrontier(
            settings, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(settings.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(settings))

        # Raw queue tuple: (url, etag, mod_date, prio, next_crawl_date).
        uri_tuple = ("http://*****:*****@localhost", "123", current_ts, 1,
                     recrawl_ts)

        curi = frontier._crawluri_from_uri(uri_tuple)

        self.assertEqual("http://*****:*****@localhost", curi.url)
        self.assertEqual("123", curi.req_header["Etag"])
        self.assertEqual(serialize_date_time(current),
                         curi.req_header["Last-Modified"])
        self.assertEqual("user", curi.optional_vars[CURI_SITE_USERNAME])
        self.assertEqual("passwd", curi.optional_vars[CURI_SITE_PASSWORD])
# Example #4
    def test_sinks(self):
        """Every process_* hook must run cleanly with a sink attached."""
        current = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        settings = Settings()
        settings.FRONTIER_STATE_FILE = ":memory:"

        frontier = AbstractBaseFrontier(
            settings, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(settings.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(settings))
        frontier.add_sink(AbstractCrawlUriSink())

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(current)}
        curi.current_priority = 2

        # Push the URI onto the heap once per outcome and exercise each hook
        # in the same order as before.
        for handle in (frontier.process_successful_crawl,
                       frontier.process_not_found,
                       frontier.process_redirect,
                       frontier.process_server_error):
            frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
            handle(curi)
# Example #5
    def test_crawluri_from_uri_with_credentials(self):
        """Credentials embedded in the URL must survive _crawluri_from_uri."""
        moment = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        ts_now = time.mktime(moment.timetuple())
        ts_next = time.mktime((moment + timedelta(days=1)).timetuple())

        conf = Settings()
        conf.FRONTIER_STATE_FILE = ":memory:"

        frontier = AbstractBaseFrontier(
            conf, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(conf.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(conf))

        # (url, etag, mod_date, prio, next_crawl_date) as stored in the queue.
        stored = ("http://*****:*****@localhost", "123", ts_now, 1, ts_next)

        curi = frontier._crawluri_from_uri(stored)

        self.assertEqual("http://*****:*****@localhost", curi.url)
        self.assertEqual("123", curi.req_header["Etag"])
        self.assertEqual(serialize_date_time(moment),
                         curi.req_header["Last-Modified"])
        self.assertEqual("user", curi.optional_vars[CURI_SITE_USERNAME])
        self.assertEqual("passwd", curi.optional_vars[CURI_SITE_PASSWORD])
# Example #6
    def test_sinks(self):
        """Attached sinks must not break any of the process_* outcomes."""
        moment = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
        conf = Settings()
        conf.FRONTIER_STATE_FILE = ":memory:"

        frontier = AbstractBaseFrontier(
            conf, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(conf.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(conf))
        frontier.add_sink(AbstractCrawlUriSink())

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(moment)}
        curi.current_priority = 2

        # Re-heap the URI before each outcome hook, preserving the original
        # call order: success, not-found, redirect, server error.
        outcomes = [frontier.process_successful_crawl,
                    frontier.process_not_found,
                    frontier.process_redirect,
                    frontier.process_server_error]
        for outcome in outcomes:
            frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
            outcome(curi)
# Example #7
    def test_that_updating_heap_works(self):
        """Updating a queued URI's next-crawl date must control heap refills.

        Odd-numbered URIs (q1) keep their computed next-crawl date; the
        even-numbered ones (q2) get their date moved into the past, so only
        q2 entries should be handed out by ``get_next``.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(s, StreamHandler(sys.stdout))

        # q1: URIs left untouched; q2: URIs whose next-crawl date is rewound.
        q1 = []
        q2 = []

        # Two days in the past, truncated to whole seconds.
        now = datetime(
            *datetime.fromtimestamp(time.time()).timetuple()[0:6]) - timedelta(
                days=2)

        for i in range(1, 20):
            curi = CrawlUri("http://localhost/test/%s" % i)
            curi.current_priority = (i % 2 + 1)
            curi.rep_header = {
                "Etag": "123%s" % i,
                "Date": serialize_date_time(now)
            }

            frontier.add_uri(curi)

            if i % 2 == 0:
                # Rewind the stored next-crawl date by five minutes so this
                # URI becomes immediately due.
                (url, etag, mod_date, next_date,
                 prio) = frontier._uri_from_curi(curi)
                next_date = next_date - 1000 * 60 * 5
                frontier._front_end_queues.update_uri(
                    (url, etag, mod_date, next_date, prio))
                q2.append(curi.url)
            else:
                q1.append(curi.url)

        # Nothing has been scheduled onto the in-memory heap yet.
        self.assertRaises(Empty, frontier._heap.get_nowait)

        for i in range(1, 10):
            # Reset politeness so get_next is allowed to hand out a URI.
            frontier._next_possible_crawl = time.time()
            candidate_uri = frontier.get_next()

            if candidate_uri.url in q1:
                self.assertTrue(candidate_uri.url in q1)
                q1.remove(candidate_uri.url)
            elif candidate_uri.url in q2:
                self.assertTrue(candidate_uri.url in q2)
                q2.remove(candidate_uri.url)

        # Only the rewound (q2) URIs may have been crawled: q1 is intact,
        # q2 (9 entries) has been fully drained.
        self.assertEqual(10, len(q1))
        self.assertEqual(0, len(q2))

        self.assertRaises(Empty, frontier.get_next)
# Example #8
    def test_that_updating_heap_works(self):
        """Updating a queued URI's next-crawl date must control heap refills.

        Odd-numbered URIs (q1) keep their computed next-crawl date; the
        even-numbered ones (q2) get their date moved into the past, so only
        q2 entries should be handed out by ``get_next``.
        """
        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(s, StreamHandler(sys.stdout))

        # q1: URIs left untouched; q2: URIs whose next-crawl date is rewound.
        q1 = []
        q2 = []

        # Two days in the past, truncated to whole seconds.
        now = datetime(*datetime.fromtimestamp(
            time.time()).timetuple()[0:6]) - timedelta(days=2)

        for i in range(1, 20):
            curi = CrawlUri("http://localhost/test/%s" % i)
            curi.current_priority = (i % 2 + 1)
            curi.rep_header = { "Etag" : "123%s" % i, "Date" : serialize_date_time(now) }

            frontier.add_uri(curi)

            if i % 2 == 0:
                # Rewind the stored next-crawl date by five minutes so this
                # URI becomes immediately due.
                (url, etag, mod_date, next_date, prio) = frontier._uri_from_curi(curi)
                next_date = next_date - 1000 * 60 * 5
                frontier._front_end_queues.update_uri((url, etag, mod_date,
                            next_date, prio))
                q2.append(curi.url)
            else:
                q1.append(curi.url)

        # Nothing has been scheduled onto the in-memory heap yet.
        self.assertRaises(Empty, frontier._heap.get_nowait)

        for i in range(1, 10):
            # Reset politeness so get_next is allowed to hand out a URI.
            frontier._next_possible_crawl = time.time()
            candidate_uri = frontier.get_next()

            if candidate_uri.url in q1:
                self.assertTrue(candidate_uri.url in q1)
                q1.remove(candidate_uri.url)
            elif candidate_uri.url in q2:
                self.assertTrue(candidate_uri.url in q2)
                q2.remove(candidate_uri.url)

        # Only the rewound (q2) URIs may have been crawled: q1 is intact,
        # q2 (9 entries) has been fully drained.
        self.assertEqual(10, len(q1))
        self.assertEqual(0, len(q2))

        self.assertRaises(Empty, frontier.get_next)
# Example #9
    def test_that_time_based_politeness_works(self):
        """A successful crawl must push the next allowed crawl into the future."""
        settings = Settings()
        settings.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(settings, StreamHandler(sys.stdout))

        # A modification date two days in the past, truncated to seconds.
        two_days_ago = datetime(*datetime.fromtimestamp(
            time.time()).timetuple()[0:6]) - timedelta(days=2)
        curi = CrawlUri("http://localhost/test")
        curi.current_priority = 3
        curi.rep_header = {"Etag": "123",
                           "Date": serialize_date_time(two_days_ago)}
        curi.req_time = 0.5

        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)

        before = frontier._next_possible_crawl
        frontier.process_successful_crawl(curi)
        # The politeness window must have moved forward, past "now".
        self.assertTrue(frontier._next_possible_crawl > before)
        self.assertTrue(frontier._next_possible_crawl > time.time())
        self.assertRaises(Empty, frontier.get_next)
    def test_fetching_last_modified_works(self):
        """Fetching with an up-to-date Last-Modified header must yield a 304.

        Starts an async fetch worker, sends it a URI whose ``Last-Modified``
        request header matches the mtime of the served robots.txt, and
        asserts (inside the on_recv callback) that the server answered
        304 Not Modified with an empty body.
        """
        settings = Settings()
        fetcher = FetchProcessor(settings, io_loop=self._io_loop)

        worker = AsyncZmqWorker( self._worker_sockets['worker_pull'],
            self._worker_sockets['worker_pub'],
            self._mgmt,
            fetcher,
            StreamHandler(sys.stdout),
            logging.DEBUG,
            self._io_loop)
        worker.start()

        # Use the real file mtime so the conditional request is satisfiable.
        mtimestamp = datetime.fromtimestamp(os.stat(os.path.join(self._path,
                        "robots.txt")).st_mtime)
        mtime = serialize_date_time(mtimestamp)
        curi = CrawlUri(url="http://localhost:%s/robots.txt" % self.port,
                effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
                req_header = { "Last-Modified" :
                    mtime }
                )

        msg = DataMessage()
        msg.identity = "me"
        msg.curi = curi

        def assert_expected_result_and_stop(raw_msg):
            # Runs on the io_loop when the worker publishes its result.
            msg = DataMessage(raw_msg)
            self.assertEqual(304, msg.curi.status_code)
            self.assertEqual("", msg.curi.content_body)
            # Tell the worker to shut down so the io_loop can stop.
            death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                    data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

        self._worker_sockets['master_sub'].on_recv(assert_expected_result_and_stop)

        self._worker_sockets['master_push'].send_multipart(msg.serialize())

        # Blocks until the callback above triggers the shutdown.
        self._io_loop.start()
# Example #11
    def test_that_time_based_politeness_works(self):
        """Politeness: after a crawl, get_next must refuse to hand out URIs."""
        conf = Settings()
        conf.FRONTIER_STATE_FILE = ":memory:"

        frontier = SingleHostFrontier(conf, StreamHandler(sys.stdout))

        # Modification date two days back, truncated to whole seconds.
        stale = datetime(
            *datetime.fromtimestamp(time.time()).timetuple()[0:6]) - timedelta(
                days=2)
        curi = CrawlUri("http://localhost/test")
        curi.current_priority = 3
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(stale)}
        curi.req_time = 0.5

        frontier._add_to_heap(frontier._uri_from_curi(curi), 0)

        previous_limit = frontier._next_possible_crawl
        frontier.process_successful_crawl(curi)
        # The next allowed crawl time has to advance beyond both the old
        # limit and the current wall clock.
        self.assertTrue(frontier._next_possible_crawl > previous_limit)
        self.assertTrue(frontier._next_possible_crawl > time.time())
        self.assertRaises(Empty, frontier.get_next)
# Example #12
    def _crawluri_from_uri(self, uri):
        """
        Turn a raw URI tuple into a :class:`CrawlUri`.

        The hostname in the effective URL is swapped for its cached IP so
        DNS is resolved at most once per host.
        """
        (url, etag, mod_date, _next_date, prio) = uri

        parsed = urlparse(url)

        # Fall back to the scheme's default port for the DNS-cache key.
        port = parsed.port or PROTOCOLS_DEFAULT_PORT[parsed.scheme]

        resolved = self._dns_cache["%s:%s" % (parsed.hostname, port)]

        curi = CrawlUri(url)
        curi.effective_url = url.replace(parsed.netloc, "%s:%s" % resolved)
        curi.current_priority = prio
        curi.req_header = dict()
        if etag:
            curi.req_header["Etag"] = etag
        if mod_date:
            # Stored as a POSIX timestamp; serialize for the HTTP header.
            curi.req_header["Last-Modified"] = serialize_date_time(
                datetime.fromtimestamp(mod_date))

        curi.optional_vars = dict()
        if parsed.username and parsed.password:
            curi.optional_vars[CURI_SITE_USERNAME] = parsed.username.encode()
            curi.optional_vars[CURI_SITE_PASSWORD] = parsed.password.encode()

        return curi
# Example #13
    def _crawluri_from_uri(self, uri):
        """
        Convert an URI tuple to a :class:`CrawlUri`.

        Replace the hostname with the real IP in order to cache DNS queries.
        """
        # Tuple layout: (url, etag, mod_date, next_date, prio); next_date is
        # not needed here.
        (url, etag, mod_date, _next_date, prio) = uri

        parsed_url = urlparse(url)

        # dns resolution and caching
        port = parsed_url.port
        if not port:
            # No explicit port in the URL: use the scheme's default.
            port = PROTOCOLS_DEFAULT_PORT[parsed_url.scheme]

        effective_netloc = self._dns_cache["%s:%s" %
                                           (parsed_url.hostname, port)]

        curi = CrawlUri(url)
        # Substitute the resolved (host, port) pair into the URL.
        curi.effective_url = url.replace(parsed_url.netloc,
                                         "%s:%s" % effective_netloc)
        curi.current_priority = prio
        curi.req_header = dict()
        if etag:
            curi.req_header["Etag"] = etag
        if mod_date:
            # mod_date is a POSIX timestamp; serialize for the HTTP header.
            mod_date_time = datetime.fromtimestamp(mod_date)
            curi.req_header["Last-Modified"] = serialize_date_time(
                mod_date_time)

        curi.optional_vars = dict()
        if parsed_url.username and parsed_url.password:
            # Keep URL-embedded credentials available for authentication.
            curi.optional_vars[CURI_SITE_USERNAME] = \
                parsed_url.username.encode()
            curi.optional_vars[CURI_SITE_PASSWORD] = \
                parsed_url.password.encode()

        return curi
# Example #14
    def test_adding_uri_works(self):
        """A URI added to the frontier must show up at the queue head."""
        # Truncate "now" to whole seconds so the stored timestamp round-trips.
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier = AbstractBaseFrontier(
            s, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(s))
        frontier.add_uri(curi)

        # The stored row must round-trip url, etag and modification date.
        for uri in frontier._front_end_queues.queue_head():
            (url, etag, mod_date, queue, next_date) = uri
            self.assertEqual("http://localhost", url)
            self.assertEqual("123", etag)
            self.assertEqual(now, datetime.fromtimestamp(mod_date))
            frontier._current_uris[url] = uri
    def test_fetching_last_modified_works(self):
        """Fetching with an up-to-date Last-Modified header must yield a 304.

        Starts an async fetch worker, sends it a URI whose ``Last-Modified``
        request header matches the mtime of the served robots.txt, and
        asserts (inside the on_recv callback) that the server answered
        304 Not Modified with an empty body.
        """
        settings = Settings()
        fetcher = FetchProcessor(settings, io_loop=self._io_loop)

        worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
                                self._worker_sockets['worker_pub'], self._mgmt,
                                fetcher, StreamHandler(sys.stdout),
                                logging.DEBUG, self._io_loop)
        worker.start()

        # Use the real file mtime so the conditional request is satisfiable.
        mtimestamp = datetime.fromtimestamp(
            os.stat(os.path.join(self._path, "robots.txt")).st_mtime)
        mtime = serialize_date_time(mtimestamp)
        curi = CrawlUri(url="http://localhost:%s/robots.txt" % self.port,
                        effective_url="http://127.0.0.1:%s/robots.txt" %
                        self.port,
                        req_header={"Last-Modified": mtime})

        msg = DataMessage()
        msg.identity = "me"
        msg.curi = curi

        def assert_expected_result_and_stop(raw_msg):
            # Runs on the io_loop when the worker publishes its result.
            msg = DataMessage(raw_msg)
            self.assertEqual(304, msg.curi.status_code)
            self.assertEqual("", msg.curi.content_body)
            # Tell the worker to shut down so the io_loop can stop.
            death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

        self._worker_sockets['master_sub'].on_recv(
            assert_expected_result_and_stop)

        self._worker_sockets['master_push'].send_multipart(msg.serialize())

        # Blocks until the callback above triggers the shutdown.
        self._io_loop.start()
# Example #16
    def test_adding_uri_works(self):
        """A URI added to the frontier must show up at the queue head."""
        # Truncate "now" to whole seconds so the stored timestamp round-trips.
        now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])

        s = Settings()
        s.FRONTIER_STATE_FILE = ":memory:"

        curi = CrawlUri("http://localhost")
        curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
        curi.current_priority = 2

        frontier = AbstractBaseFrontier(
            s, StreamHandler(sys.stdout),
            SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
            SimpleTimestampPrioritizer(s))
        frontier.add_uri(curi)

        # The stored row must round-trip url, etag and modification date.
        for uri in frontier._front_end_queues.queue_head():
            (url, etag, mod_date, queue, next_date) = uri
            self.assertEqual("http://localhost", url)
            self.assertEqual("123", etag)
            self.assertEqual(now, datetime.fromtimestamp(mod_date))
            frontier._current_uris[url] = uri