Beispiel #1
0
    def test_that_stopping_worker_via_mgmt_works(self):
        # Round-trip one DataMessage through a ZmqWorker and verify both the
        # data reply and the management reply seen on the master side.
        worker = ZmqWorker(self._worker_sockets['worker_pull'],
                           self._worker_sockets['worker_pub'],
                           self._mgmt, self.echo_processing,
                           StreamHandler(sys.stdout), logging.DEBUG,
                           self._io_loop)

        worker.start()

        curi = CrawlUri(url="http://localhost")
        msg = DataMessage()
        msg.identity = "me"
        msg.curi = curi

        def assert_correct_data_answer(msg2):
            # The published reply must decode to the message that was pushed.
            self.assertEqual(msg, DataMessage(msg2))

        self._worker_sockets['master_sub'].on_recv(assert_correct_data_answer)

        def assert_correct_mgmt_answer(msg3):
            # NOTE(review): assumes the received message exposes `.data` --
            # confirm against what `on_recv` actually delivers.
            self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, msg3.data)

        # Management replies are verified by the management callback, not by
        # the data callback.
        self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt_answer)

        self._worker_sockets['master_push'].send_multipart(msg.serialize())

        # NOTE(review): nothing in this method stops the loop; presumably the
        # fixture / management shutdown handling does -- confirm.
        self._io_loop.start()
Beispiel #2
0
    def test_that_async_worker_works(self):
        # Push one DataMessage through an AsyncZmqWorker and check the data
        # reply and the management reply received on the master side.
        data_sockets = self._worker_sockets
        log_handler = StreamHandler(sys.stdout)
        worker = AsyncZmqWorker(data_sockets["worker_pull"],
                                data_sockets["worker_pub"],
                                self._mgmt, self.echo_processing,
                                log_handler, logging.DEBUG, self._io_loop)
        worker.start()

        crawl_uri = CrawlUri(url="http://localhost")
        msg = DataMessage()
        msg.identity = "me"
        msg.curi = crawl_uri

        def check_data_reply(frames):
            # The reply has to decode to the very message that was pushed.
            self.assertEqual(msg, DataMessage(frames))

        data_sockets["master_sub"].on_recv(check_data_reply)

        def check_mgmt_reply(reply):
            self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, reply.data)

        self._mgmt_sockets["master_sub"].on_recv(check_mgmt_reply)

        data_sockets["master_push"].send_multipart(msg.serialize())

        self._io_loop.start()
        # Runs only once the loop has returned.
        worker._in_stream.flush()
    def test_fetching_works(self):
        # Fetch robots.txt through an AsyncZmqWorker driving a FetchProcessor
        # and compare the returned body with the static fixture on disk.
        settings = Settings()
        fetcher = FetchProcessor(settings, io_loop=self._io_loop)

        worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
            self._worker_sockets['worker_pub'],
            self._mgmt,
            fetcher,
            StreamHandler(sys.stdout),
            logging.DEBUG,
            self._io_loop)
        worker.start()

        curi = CrawlUri(url="http://localhost:%s/robots.txt" % self.port,
                effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
                )
        msg = DataMessage()
        msg.identity = "me"
        msg.curi = curi

        self._worker_sockets['master_push'].send_multipart(msg.serialize())

        def assert_expected_result_and_stop(raw_msg):
            msg = DataMessage(raw_msg)
            fixture = os.path.join(os.path.dirname(__file__),
                    "static/robots.txt")
            # Read via a context manager so the file handle is always closed.
            with open(fixture) as robots_file:
                robots = robots_file.read()
            self.assertEqual(robots, msg.curi.content_body)
            # Publish the worker QUIT management message once the check is done.
            death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                    data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

        self._worker_sockets['master_sub'].on_recv(assert_expected_result_and_stop)

        self._io_loop.start()
Beispiel #4
0
    def _send_next_uri(self):
        """
        Hand pending URIs from the frontier over to the workers.

        Nothing is sent while the master is not running or while no worker is
        available. Otherwise URIs are sent for as long as
        `self._out_stream._send_queue` holds fewer than four entries per
        available worker and the frontier still has something to offer.
        """
        if not self._running:
            self._logger.error("Master is not running, not sending more uris")
            return

        worker_count = len(self._available_workers)
        if worker_count <= 0:
            # nobody could pick the work up
            return

        queue_limit = worker_count * 4
        while self._out_stream._send_queue.qsize() < queue_limit:
            try:
                pending_curi = self._frontier.get_next()
            except Empty:
                # the frontier has nothing to hand out at the moment
                self._logger.debug("zmqmaster::Nothing to crawl right now")
                break

            self._logger.info(
                "zmqmaster::Begin crawling next URL (%s)" % pending_curi.url)
            outgoing = DataMessage(identity=self._identity, curi=pending_curi)
            self._out_stream.send_multipart(outgoing.serialize())
Beispiel #5
0
    def test_that_stopping_worker_via_mgmt_works(self):
        # Round-trip one DataMessage through a ZmqWorker and verify both the
        # data reply and the management reply seen on the master side.
        worker = ZmqWorker(self._worker_sockets['worker_pull'],
            self._worker_sockets['worker_pub'],
            self._mgmt,
            self.echo_processing,
            StreamHandler(sys.stdout),
            logging.DEBUG,
            self._io_loop)

        worker.start()

        curi = CrawlUri(url="http://localhost")
        msg = DataMessage()
        msg.identity = "me"
        msg.curi = curi

        def assert_correct_data_answer(msg2):
            # The published reply must decode to the message that was pushed.
            self.assertEqual(msg, DataMessage(msg2))

        self._worker_sockets['master_sub'].on_recv(assert_correct_data_answer)

        def assert_correct_mgmt_answer(msg3):
            # NOTE(review): assumes the received message exposes `.data` --
            # confirm against what `on_recv` actually delivers.
            self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, msg3.data)

        # Management replies are verified by the management callback, not by
        # the data callback.
        self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt_answer)

        self._worker_sockets['master_push'].send_multipart(msg.serialize())

        # NOTE(review): nothing in this method stops the loop; presumably the
        # fixture / management shutdown handling does -- confirm.
        self._io_loop.start()
    def test_fetching_works(self):
        # Fetch robots.txt through an AsyncZmqWorker driving a FetchProcessor
        # and compare the returned body with the static fixture on disk.
        settings = Settings()
        fetcher = FetchProcessor(settings, io_loop=self._io_loop)

        worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
                                self._worker_sockets['worker_pub'], self._mgmt,
                                fetcher, StreamHandler(sys.stdout),
                                logging.DEBUG, self._io_loop)
        worker.start()

        curi = CrawlUri(
            url="http://localhost:%s/robots.txt" % self.port,
            effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        )
        msg = DataMessage()
        msg.identity = "me"
        msg.curi = curi

        self._worker_sockets['master_push'].send_multipart(msg.serialize())

        def assert_expected_result_and_stop(raw_msg):
            msg = DataMessage(raw_msg)
            fixture = os.path.join(os.path.dirname(__file__),
                                   "static/robots.txt")
            # Read via a context manager so the file handle is always closed.
            with open(fixture) as robots_file:
                robots = robots_file.read()
            self.assertEqual(robots, msg.curi.content_body)
            # Publish the worker QUIT management message once the check is done.
            death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

        self._worker_sockets['master_sub'].on_recv(
            assert_expected_result_and_stop)

        self._io_loop.start()
Beispiel #7
0
    def test_that_construction_works(self):
        # A DataMessage keeps the keyword it was given; the other field
        # stays None.
        with_identity = DataMessage(identity="me")
        self.assertEqual("me", with_identity.identity)
        self.assertEqual(None, with_identity.curi)

        with_curi = DataMessage(curi="bla")
        self.assertEqual("bla", with_curi.curi)
        self.assertEqual(None, with_curi.identity)
Beispiel #8
0
    def test_that_data_messages_work(self):
        # Building a DataMessage from [identity, serialized curi] must expose
        # both parts and serialize back to the same frames.
        sender = "me myself and i"
        crawl_uri = CrawlUri(url="http://localhost")
        wire_curi = serialize_crawl_uri(crawl_uri)

        message = DataMessage([sender, wire_curi])

        self.assertEqual(sender, message.identity)
        self.assertEqual(crawl_uri, message.curi)
        self.assertEqual([sender, wire_curi], message.serialize())

        # A message rebuilt from its own serialization compares equal.
        rebuilt = DataMessage(message.serialize())
        self.assertEqual(message, rebuilt)
Beispiel #9
0
    def test_that_data_messages_work(self):
        # Building a DataMessage from [identity, serialized curi] must expose
        # both parts and serialize back to the same frames.
        sender = "me myself and i"
        crawl_uri = CrawlUri(url="http://localhost")
        wire_curi = serialize_crawl_uri(crawl_uri)

        message = DataMessage([sender, wire_curi])

        self.assertEqual(sender, message.identity)
        self.assertEqual(crawl_uri, message.curi)
        self.assertEqual([sender, wire_curi], message.serialize())

        # A message rebuilt from its own serialization compares equal.
        rebuilt = DataMessage(message.serialize())
        self.assertEqual(message, rebuilt)
Beispiel #10
0
 def assert_expected_result_and_stop(raw_msg):
     """
     Check the extraction-finished flag of the decoded reply, then publish
     the worker QUIT management message.
     """
     reply = DataMessage(raw_msg)
     finished_flag = reply.curi.optional_vars[CURI_EXTRACTION_FINISHED]
     self.assertEqual(CURI_OPTIONAL_TRUE, finished_flag)

     quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
     self._mgmt_sockets['master_pub'].send_multipart(quit_msg.serialize())
Beispiel #11
0
    def _receive_processed_uri(self, raw_msg):
        """
        Receive and reschedule an URI that has been processed. Additionally add
        all extracted URLs to the frontier.

        `raw_msg` is decoded into a `DataMessage`; the status code of its
        `curi` selects the frontier callback to invoke. Any exception raised
        while doing so is logged as CRITICAL and the URI is reported to the
        frontier as a server error with status `CURI_EUNCAUGHT_EXCEPTION`.
        Finally `_send_next_uri` is called.
        """
        msg = DataMessage(raw_msg)
        self._logger.info("zmqmaster::Crawling URL (%s) finished" %
                msg.curi.url)

        try:
            if 200 <= msg.curi.status_code < 300:
                # we have some kind of success code! yay
                self._frontier.process_successful_crawl(msg.curi)
            elif 300 <= msg.curi.status_code < 400:
                # Some kind of redirect code. This will only happen if the number
                # of redirects exceeds settings.MAX_REDIRECTS
                self._frontier.process_redirect(msg.curi)
            elif 400 <= msg.curi.status_code < 500:
                # some kind of error where the resource could not be found.
                self._frontier.process_not_found(msg.curi)
            elif 500 <= msg.curi.status_code < 600:
                # some kind of server error
                self._frontier.process_server_error(msg.curi)
            # NOTE(review): status codes outside 200..599 are not reported to
            # the frontier at all -- confirm this is intended.
        except Exception:
            # Only real errors are handled here; KeyboardInterrupt and
            # SystemExit propagate so the process can still be stopped.
            self._logger.critical("zmqmaster::Uncaught exception in the sink")
            self._logger.critical("zmqmaster::%s" % (traceback.format_exc(),))
            msg.curi.status_code = CURI_EUNCAUGHT_EXCEPTION
            self._frontier.process_server_error(msg.curi)

        self._send_next_uri()
 def assert_expected_result_and_stop(raw_msg):
     """
     Expect a 304 reply with an empty body, then publish the worker QUIT
     management message.
     """
     fetched = DataMessage(raw_msg).curi
     self.assertEqual(304, fetched.status_code)
     self.assertEqual("", fetched.content_body)

     quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                            data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
     self._mgmt_sockets['master_pub'].send_multipart(quit_msg.serialize())
Beispiel #13
0
 def assert_expected_result_and_stop(raw_msg):
     """
     Compare the body of the decoded reply with `static/robots.txt` located
     next to this module, then publish the worker QUIT management message.
     """
     msg = DataMessage(raw_msg)
     fixture = os.path.join(os.path.dirname(__file__),
                            "static/robots.txt")
     # Read via a context manager so the file handle is always closed.
     with open(fixture) as robots_file:
         robots = robots_file.read()
     self.assertEqual(robots, msg.curi.content_body)
     death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                         data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
     self._mgmt_sockets['master_pub'].send_multipart(death.serialize())
Beispiel #14
0
    def test_that_creating_extractor_works(self):
        # Create an extractor worker from the settings, push one DataMessage
        # through it and expect the extraction-finished flag on the reply.
        self._settings.SPYDER_EXTRACTOR_PIPELINE = [
            'spyder.processor.limiter.DefaultLimiter']

        log_handler = StreamHandler(sys.stdout)
        extractor = workerprocess.create_worker_extractor(
            self._settings, self._mgmt, self._ctx, log_handler, self._io_loop)
        extractor.start()

        # NOTE(review): the "%s" in effective_url is never interpolated --
        # confirm whether a port was meant to be filled in.
        crawl_uri = CrawlUri(url="http://localhost:80/robots.txt",
                             effective_url="http://127.0.0.1:%s/robots.txt",
                             optional_vars={})
        msg = DataMessage()
        msg.identity = "me"
        msg.curi = crawl_uri

        def assert_expected_result_and_stop(raw_msg):
            reply = DataMessage(raw_msg)
            finished = reply.curi.optional_vars[CURI_EXTRACTION_FINISHED]
            self.assertEqual(CURI_OPTIONAL_TRUE, finished)
            # Publish the worker QUIT management message after the check.
            quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                   data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(
                quit_msg.serialize())

        data_sockets = self._worker_sockets
        data_sockets['master_sub'].on_recv(assert_expected_result_and_stop)

        def assert_correct_mgmt_message(raw_msg):
            self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, raw_msg)

        self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt_message)

        data_sockets['master_push'].send_multipart(msg.serialize())

        self._io_loop.start()

        # Tear down the extractor's streams and sockets, outgoing side first.
        extractor._out_stream.close()
        extractor._outsocket.close()
        extractor._in_stream.close()
        extractor._insocket.close()
Beispiel #15
0
    def _receive(self, msg):
        """
        We have a message!

        `msg` is a serialized version of a `DataMessage`.

        The message's `curi` is run through `self._processing` and replaced by
        the result. If processing raises, the error is logged as CRITICAL and
        the message is sent back with its `curi` not replaced.
        """
        message = DataMessage(msg)

        try:
            # this is the real work we want to do
            curi = self._processing(message.curi)
            message.curi = curi
        except Exception:
            # Log any processing error as CRITICAL instead of crashing the
            # worker; KeyboardInterrupt and SystemExit still propagate.
            self._logger.critical(
                    "worker::Uncaught exception executing the worker for URL %s!" %
                    (message.curi.url,))
            self._logger.critical("worker::%s" % (traceback.format_exc(),))

        # finished, now send the result back to the master
        self._out_stream.send_multipart(message.serialize())
    def test_fetching_last_modified_works(self):
        # Request robots.txt with a "Last-Modified" request header set to the
        # file's mtime and expect a 304 reply with an empty body.
        fetcher = FetchProcessor(Settings(), io_loop=self._io_loop)

        data_sockets = self._worker_sockets
        worker = AsyncZmqWorker(data_sockets['worker_pull'],
                                data_sockets['worker_pub'],
                                self._mgmt, fetcher,
                                StreamHandler(sys.stdout),
                                logging.DEBUG, self._io_loop)
        worker.start()

        robots_path = os.path.join(self._path, "robots.txt")
        modified_at = datetime.fromtimestamp(os.stat(robots_path).st_mtime)
        last_modified = serialize_date_time(modified_at)
        crawl_uri = CrawlUri(
            url="http://localhost:%s/robots.txt" % self.port,
            effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
            req_header={"Last-Modified": last_modified})

        msg = DataMessage()
        msg.identity = "me"
        msg.curi = crawl_uri

        def assert_expected_result_and_stop(raw_msg):
            fetched = DataMessage(raw_msg).curi
            self.assertEqual(304, fetched.status_code)
            self.assertEqual("", fetched.content_body)
            # Publish the worker QUIT management message after the checks.
            quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                   data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(
                quit_msg.serialize())

        data_sockets['master_sub'].on_recv(assert_expected_result_and_stop)

        data_sockets['master_push'].send_multipart(msg.serialize())

        self._io_loop.start()
    def test_that_creating_extractor_works(self):
        # Create an extractor worker from the settings, push one DataMessage
        # through it and expect the extraction-finished flag on the reply.
        self._settings.SPYDER_EXTRACTOR_PIPELINE = [
            'spyder.processor.limiter.DefaultLimiter']

        log_handler = StreamHandler(sys.stdout)
        extractor = workerprocess.create_worker_extractor(
            self._settings, self._mgmt, self._ctx, log_handler, self._io_loop)
        extractor.start()

        # NOTE(review): the "%s" in effective_url is never interpolated --
        # confirm whether a port was meant to be filled in.
        crawl_uri = CrawlUri(url="http://localhost:80/robots.txt",
                             effective_url="http://127.0.0.1:%s/robots.txt",
                             optional_vars={})
        msg = DataMessage()
        msg.identity = "me"
        msg.curi = crawl_uri

        def assert_expected_result_and_stop(raw_msg):
            reply = DataMessage(raw_msg)
            finished = reply.curi.optional_vars[CURI_EXTRACTION_FINISHED]
            self.assertEqual(CURI_OPTIONAL_TRUE, finished)
            # Publish the worker QUIT management message after the check.
            quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                   data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(
                quit_msg.serialize())

        data_sockets = self._worker_sockets
        data_sockets['master_sub'].on_recv(assert_expected_result_and_stop)

        def assert_correct_mgmt_message(raw_msg):
            self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, raw_msg)

        self._mgmt_sockets['master_sub'].on_recv(assert_correct_mgmt_message)

        data_sockets['master_push'].send_multipart(msg.serialize())

        self._io_loop.start()

        # Tear down the extractor's streams and sockets, outgoing side first.
        extractor._out_stream.close()
        extractor._outsocket.close()
        extractor._in_stream.close()
        extractor._insocket.close()
    def test_fetching_last_modified_works(self):
        # Request robots.txt with a "Last-Modified" request header set to the
        # file's mtime and expect a 304 reply with an empty body.
        fetcher = FetchProcessor(Settings(), io_loop=self._io_loop)

        data_sockets = self._worker_sockets
        worker = AsyncZmqWorker(data_sockets['worker_pull'],
                                data_sockets['worker_pub'],
                                self._mgmt, fetcher,
                                StreamHandler(sys.stdout),
                                logging.DEBUG, self._io_loop)
        worker.start()

        robots_path = os.path.join(self._path, "robots.txt")
        modified_at = datetime.fromtimestamp(os.stat(robots_path).st_mtime)
        last_modified = serialize_date_time(modified_at)
        crawl_uri = CrawlUri(
            url="http://localhost:%s/robots.txt" % self.port,
            effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
            req_header={"Last-Modified": last_modified})

        msg = DataMessage()
        msg.identity = "me"
        msg.curi = crawl_uri

        def assert_expected_result_and_stop(raw_msg):
            fetched = DataMessage(raw_msg).curi
            self.assertEqual(304, fetched.status_code)
            self.assertEqual("", fetched.content_body)
            # Publish the worker QUIT management message after the checks.
            quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                   data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(
                quit_msg.serialize())

        data_sockets['master_sub'].on_recv(assert_expected_result_and_stop)

        data_sockets['master_push'].send_multipart(msg.serialize())

        self._io_loop.start()
    def test_fetching_etag_works(self):
        # Request robots.txt with a fixed "Etag" request header and expect a
        # 304 reply with an empty body.
        fetcher = FetchProcessor(Settings(), io_loop=self._io_loop)

        data_sockets = self._worker_sockets
        worker = AsyncZmqWorker(data_sockets['worker_pull'],
                                data_sockets['worker_pub'],
                                self._mgmt, fetcher,
                                StreamHandler(sys.stdout),
                                logging.DEBUG, self._io_loop)
        worker.start()

        etag_header = {"Etag": "\"3926227169c58185234888b60000c6eb1169577d\""}
        crawl_uri = CrawlUri(
            url="http://localhost:%s/robots.txt" % self.port,
            effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
            req_header=etag_header)

        msg = DataMessage()
        msg.identity = "me"
        msg.curi = crawl_uri

        def assert_expected_result_and_stop(raw_msg):
            fetched = DataMessage(raw_msg).curi
            self.assertEqual(304, fetched.status_code)
            self.assertEqual("", fetched.content_body)
            # Publish the worker QUIT management message after the checks.
            quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                   data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(
                quit_msg.serialize())

        data_sockets['master_sub'].on_recv(assert_expected_result_and_stop)

        data_sockets['master_push'].send_multipart(msg.serialize())

        self._io_loop.start()
Beispiel #20
0
    def test_fetching_etag_works(self):
        # Request robots.txt with a fixed "Etag" request header and expect a
        # 304 reply with an empty body.
        fetcher = FetchProcessor(Settings(), io_loop=self._io_loop)

        data_sockets = self._worker_sockets
        worker = AsyncZmqWorker(data_sockets['worker_pull'],
                                data_sockets['worker_pub'],
                                self._mgmt, fetcher,
                                StreamHandler(sys.stdout),
                                logging.DEBUG, self._io_loop)
        worker.start()

        etag_header = {"Etag": "\"3926227169c58185234888b60000c6eb1169577d\""}
        crawl_uri = CrawlUri(
            url="http://localhost:%s/robots.txt" % self.port,
            effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
            req_header=etag_header)

        msg = DataMessage()
        msg.identity = "me"
        msg.curi = crawl_uri

        def assert_expected_result_and_stop(raw_msg):
            fetched = DataMessage(raw_msg).curi
            self.assertEqual(304, fetched.status_code)
            self.assertEqual("", fetched.content_body)
            # Publish the worker QUIT management message after the checks.
            quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                   data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(
                quit_msg.serialize())

        data_sockets['master_sub'].on_recv(assert_expected_result_and_stop)

        data_sockets['master_push'].send_multipart(msg.serialize())

        self._io_loop.start()
Beispiel #21
0
    def _receive(self, msg):
        """
        We have a message!

        Instead of the synchronous version we do not handle serializing and
        sending the result to the `self._outsocket`. This has to be handled by
        the `self._processing` method.

        `msg` is decoded into a `DataMessage` and handed to `self._processing`
        together with `self._out_stream`. Errors raised by the processing are
        logged as CRITICAL and not re-raised.
        """
        message = DataMessage(msg)

        try:
            self._processing(message, self._out_stream)
        except Exception:
            # Log any processing error as CRITICAL instead of crashing the
            # worker; KeyboardInterrupt and SystemExit still propagate.
            self._logger.critical("Uncaught exception executing the worker!")
            self._logger.critical(traceback.format_exc())
Beispiel #22
0
 def assert_correct_data_answer(msg2):
     """Check that the decoded `msg2` equals `msg` from the enclosing scope."""
     decoded = DataMessage(msg2)
     self.assertEqual(msg, decoded)
Beispiel #23
0
 def assert_correct_data(msg2):
     """Check that the decoded `msg2` equals `msg` from the enclosing scope."""
     self.assertEqual(msg, DataMessage(msg2))