def test_that_adding_uris_works(self):
    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"

    frontier = MultipleHostFrontier(s, StreamHandler(sys.stdout))

    now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
    next_crawl_date = now + timedelta(days=1)

    curi = CrawlUri("http://localhost")
    curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
    curi.current_priority = 2

    frontier.add_uri(curi)
    cur = frontier._front_end_queues._cursor

    curi = CrawlUri("http://foreignhost")
    curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
    curi.current_priority = 1

    frontier.add_uri(curi)

    idents = {"localhost": -1, "foreignhost": -1}

    cur.execute("SELECT * FROM queue_identifiers")
    for row in cur:
        self.assertTrue(row['identifier'] in idents.keys())
        idents["http://%s" % row['identifier']] = row['queue']

    cur.execute("SELECT * FROM queues")
    for row in cur:
        self.assertEqual(idents[row['url']], row['queue'])

    self.assertEqual(2, frontier._front_end_queues.get_queue_count())

def test_crawluri_from_uri_with_credentials(self):
    now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
    now_timestamp = time.mktime(now.timetuple())
    next_crawl_date = now + timedelta(days=1)
    next_crawl_date_timestamp = time.mktime(next_crawl_date.timetuple())

    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"

    frontier = AbstractBaseFrontier(
        s, StreamHandler(sys.stdout),
        SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
        SimpleTimestampPrioritizer(s))

    # the credentials in the URL must match the username/password
    # asserted on the optional vars below
    uri = ("http://user:passwd@localhost", "123", now_timestamp, 1,
           next_crawl_date_timestamp)

    curi = frontier._crawluri_from_uri(uri)

    self.assertEqual("http://user:passwd@localhost", curi.url)
    self.assertEqual("123", curi.req_header["Etag"])
    self.assertEqual(serialize_date_time(now),
                     curi.req_header["Last-Modified"])
    self.assertEqual("user", curi.optional_vars[CURI_SITE_USERNAME])
    self.assertEqual("passwd", curi.optional_vars[CURI_SITE_PASSWORD])

def test_sinks(self):
    now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"

    frontier = AbstractBaseFrontier(
        s, StreamHandler(sys.stdout),
        SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
        SimpleTimestampPrioritizer(s))
    frontier.add_sink(AbstractCrawlUriSink())

    curi = CrawlUri("http://localhost")
    curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
    curi.current_priority = 2

    frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
    frontier.process_successful_crawl(curi)
    frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
    frontier.process_not_found(curi)
    frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
    frontier.process_redirect(curi)
    frontier._add_to_heap(frontier._uri_from_curi(curi), 0)
    frontier.process_server_error(curi)

def test_that_updating_heap_works(self):
    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"

    frontier = SingleHostFrontier(s, StreamHandler(sys.stdout))

    q1 = []
    q2 = []

    now = datetime(*datetime.fromtimestamp(
        time.time()).timetuple()[0:6]) - timedelta(days=2)

    for i in range(1, 20):
        curi = CrawlUri("http://localhost/test/%s" % i)
        curi.current_priority = (i % 2 + 1)
        curi.rep_header = {"Etag": "123%s" % i,
                           "Date": serialize_date_time(now)}

        frontier.add_uri(curi)

        if i % 2 == 0:
            # move the next crawl date of every even URI forward so
            # these URIs get served before the odd ones
            (url, etag, mod_date, next_date, prio) = \
                frontier._uri_from_curi(curi)
            next_date = next_date - 1000 * 60 * 5
            frontier._front_end_queues.update_uri(
                (url, etag, mod_date, next_date, prio))
            q2.append(curi.url)
        else:
            q1.append(curi.url)

    self.assertRaises(Empty, frontier._heap.get_nowait)

    for i in range(1, 10):
        frontier._next_possible_crawl = time.time()
        candidate_uri = frontier.get_next()

        if candidate_uri.url in q1:
            q1.remove(candidate_uri.url)
        elif candidate_uri.url in q2:
            q2.remove(candidate_uri.url)

    # only the updated (even) URIs should have been crawled
    self.assertEqual(10, len(q1))
    self.assertEqual(0, len(q2))
    self.assertRaises(Empty, frontier.get_next)

def test_that_time_based_politeness_works(self):
    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"

    frontier = SingleHostFrontier(s, StreamHandler(sys.stdout))

    now = datetime(*datetime.fromtimestamp(
        time.time()).timetuple()[0:6]) - timedelta(days=2)
    curi = CrawlUri("http://localhost/test")
    curi.current_priority = 3
    curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
    curi.req_time = 0.5

    frontier._add_to_heap(frontier._uri_from_curi(curi), 0)

    a = frontier._next_possible_crawl
    frontier.process_successful_crawl(curi)
    # a successful crawl that took 0.5s must push the next possible
    # crawl time into the future, so no URI is available right away
    self.assertTrue(frontier._next_possible_crawl > a)
    self.assertTrue(frontier._next_possible_crawl > time.time())
    self.assertRaises(Empty, frontier.get_next)

def test_fetching_last_modified_works(self):
    settings = Settings()
    fetcher = FetchProcessor(settings, io_loop=self._io_loop)

    worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
                            self._worker_sockets['worker_pub'],
                            self._mgmt,
                            fetcher,
                            StreamHandler(sys.stdout),
                            logging.DEBUG,
                            self._io_loop)
    worker.start()

    mtimestamp = datetime.fromtimestamp(
        os.stat(os.path.join(self._path, "robots.txt")).st_mtime)
    mtime = serialize_date_time(mtimestamp)
    curi = CrawlUri(url="http://localhost:%s/robots.txt" % self.port,
                    effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
                    req_header={"Last-Modified": mtime})

    msg = DataMessage()
    msg.identity = "me"
    msg.curi = curi

    def assert_expected_result_and_stop(raw_msg):
        msg = DataMessage(raw_msg)
        self.assertEqual(304, msg.curi.status_code)
        self.assertEqual("", msg.curi.content_body)

        death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
        self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

    self._worker_sockets['master_sub'].on_recv(
        assert_expected_result_and_stop)

    self._worker_sockets['master_push'].send_multipart(msg.serialize())
    self._io_loop.start()

def _crawluri_from_uri(self, uri):
    """
    Convert a URI tuple to a :class:`CrawlUri`.

    Replace the hostname with the real IP in order to cache DNS
    queries.
    """
    (url, etag, mod_date, _next_date, prio) = uri

    parsed_url = urlparse(url)

    # DNS resolution and caching
    port = parsed_url.port
    if not port:
        port = PROTOCOLS_DEFAULT_PORT[parsed_url.scheme]

    effective_netloc = self._dns_cache["%s:%s" % (parsed_url.hostname,
                                                  port)]

    curi = CrawlUri(url)
    curi.effective_url = url.replace(parsed_url.netloc,
                                     "%s:%s" % effective_netloc)
    curi.current_priority = prio
    curi.req_header = dict()
    if etag:
        curi.req_header["Etag"] = etag
    if mod_date:
        mod_date_time = datetime.fromtimestamp(mod_date)
        curi.req_header["Last-Modified"] = serialize_date_time(
            mod_date_time)

    curi.optional_vars = dict()
    if parsed_url.username and parsed_url.password:
        curi.optional_vars[CURI_SITE_USERNAME] = \
            parsed_url.username.encode()
        curi.optional_vars[CURI_SITE_PASSWORD] = \
            parsed_url.password.encode()

    return curi

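# A minimal sketch (not part of the original module) of the netloc rewrite
# described in the docstring above. The dns_cache dict is a hypothetical
# stand-in for the frontier's self._dns_cache, which maps "host:port" keys
# to (ip, port) tuples:
#
#     url = "http://localhost/test"
#     parsed_url = urlparse(url)
#     dns_cache = {"localhost:80": ("127.0.0.1", 80)}
#     effective = url.replace(parsed_url.netloc,
#                             "%s:%s" % dns_cache["localhost:80"])
#     # effective == "http://127.0.0.1:80/test"
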
def test_adding_uri_works(self):
    now = datetime(*datetime.fromtimestamp(time.time()).timetuple()[0:6])
    next_crawl_date = now + timedelta(days=1)

    s = Settings()
    s.FRONTIER_STATE_FILE = ":memory:"

    curi = CrawlUri("http://localhost")
    curi.rep_header = {"Etag": "123", "Date": serialize_date_time(now)}
    curi.current_priority = 2

    frontier = AbstractBaseFrontier(
        s, StreamHandler(sys.stdout),
        SQLiteSingleHostUriQueue(s.FRONTIER_STATE_FILE),
        SimpleTimestampPrioritizer(s))
    frontier.add_uri(curi)

    for uri in frontier._front_end_queues.queue_head():
        (url, etag, mod_date, queue, next_date) = uri
        self.assertEqual("http://localhost", url)
        self.assertEqual("123", etag)
        self.assertEqual(now, datetime.fromtimestamp(mod_date))
        frontier._current_uris[url] = uri