Example #1
0
    def test_fetching_works(self):
        """
        Fetch `robots.txt` through an `AsyncZmqWorker` and compare the body.

        A `DataMessage` for the local `robots.txt` URL is pushed on the
        `master_push` socket. The callback registered on `master_sub`
        compares the returned `content_body` with `static/robots.txt` located
        next to this file and then publishes the worker quit message on the
        management `master_pub` socket.
        """
        settings = Settings()
        fetcher = FetchProcessor(settings, io_loop=self._io_loop)

        worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
                                self._worker_sockets['worker_pub'], self._mgmt,
                                fetcher, StreamHandler(sys.stdout),
                                logging.DEBUG, self._io_loop)
        worker.start()

        curi = CrawlUri(
            url="http://localhost:%s/robots.txt" % self.port,
            effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        )
        msg = DataMessage()
        msg.identity = "me"
        msg.curi = curi

        self._worker_sockets['master_push'].send_multipart(msg.serialize())

        def assert_expected_result_and_stop(raw_msg):
            msg = DataMessage(raw_msg)
            robots_path = os.path.join(os.path.dirname(__file__),
                                       "static/robots.txt")
            # Read the fixture inside a context manager so the file handle is
            # closed right away instead of whenever it gets garbage collected.
            with open(robots_path) as robots_file:
                robots = robots_file.read()
            self.assertEqual(robots, msg.curi.content_body)
            # Publish the quit command for the worker.
            # NOTE(review): presumably this is what ends the IO loop started
            # below -- confirm against the worker/mgmt implementation.
            death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

        self._worker_sockets['master_sub'].on_recv(
            assert_expected_result_and_stop)

        self._io_loop.start()
Example #2
0
    def test_that_async_worker_works(self):
        """
        Round-trip a `DataMessage` through a worker using `echo_processing`.

        The message received on the data `master_sub` socket must equal the
        one pushed on `master_push`; whatever arrives on the management
        `master_sub` socket must carry `ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK` as
        its `data`.
        """
        data_sockets = self._worker_sockets

        worker = AsyncZmqWorker(
            data_sockets["worker_pull"],
            data_sockets["worker_pub"],
            self._mgmt,
            self.echo_processing,
            StreamHandler(sys.stdout),
            logging.DEBUG,
            self._io_loop,
        )
        worker.start()

        outgoing = DataMessage()
        outgoing.identity = "me"
        outgoing.curi = CrawlUri(url="http://localhost")

        def check_echoed_data(raw_frames):
            # Rebuild the message from what was received and compare it with
            # the message that was sent.
            self.assertEqual(outgoing, DataMessage(raw_frames))

        def check_mgmt_ack(mgmt_msg):
            self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, mgmt_msg.data)

        data_sockets["master_sub"].on_recv(check_echoed_data)
        self._mgmt_sockets["master_sub"].on_recv(check_mgmt_ack)

        data_sockets["master_push"].send_multipart(outgoing.serialize())

        # NOTE(review): nothing in this method stops the IO loop; presumably
        # `echo_processing` or the fixture takes care of that -- confirm.
        self._io_loop.start()
        worker._in_stream.flush()
Example #3
0
    def test_that_async_worker_works(self):
        """
        Round-trip a `DataMessage` through a worker using `echo_processing`.

        The message received on the data `master_sub` socket must equal the
        one pushed on `master_push`; whatever arrives on the management
        `master_sub` socket must carry `ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK` as
        its `data`.
        """
        data_sockets = self._worker_sockets

        worker = AsyncZmqWorker(
            data_sockets['worker_pull'],
            data_sockets['worker_pub'],
            self._mgmt,
            self.echo_processing,
            StreamHandler(sys.stdout),
            logging.DEBUG,
            self._io_loop,
        )
        worker.start()

        outgoing = DataMessage()
        outgoing.identity = "me"
        outgoing.curi = CrawlUri(url="http://localhost")

        def check_echoed_data(raw_frames):
            # Rebuild the message from what was received and compare it with
            # the message that was sent.
            self.assertEqual(outgoing, DataMessage(raw_frames))

        def check_mgmt_ack(mgmt_msg):
            self.assertEqual(ZMQ_SPYDER_MGMT_WORKER_QUIT_ACK, mgmt_msg.data)

        data_sockets['master_sub'].on_recv(check_echoed_data)
        self._mgmt_sockets['master_sub'].on_recv(check_mgmt_ack)

        data_sockets['master_push'].send_multipart(outgoing.serialize())

        # NOTE(review): nothing in this method stops the IO loop; presumably
        # `echo_processing` or the fixture takes care of that -- confirm.
        self._io_loop.start()
        worker._in_stream.flush()
Example #4
0
    def test_fetching_works(self):
        """
        Fetch `robots.txt` through an `AsyncZmqWorker` and compare the body.

        A `DataMessage` for the local `robots.txt` URL is pushed on the
        `master_push` socket. The callback registered on `master_sub`
        compares the returned `content_body` with `static/robots.txt` located
        next to this file and then publishes the worker quit message on the
        management `master_pub` socket.
        """
        settings = Settings()
        fetcher = FetchProcessor(settings, io_loop=self._io_loop)

        worker = AsyncZmqWorker(self._worker_sockets['worker_pull'],
                                self._worker_sockets['worker_pub'],
                                self._mgmt,
                                fetcher,
                                StreamHandler(sys.stdout),
                                logging.DEBUG,
                                self._io_loop)
        worker.start()

        curi = CrawlUri(
            url="http://localhost:%s/robots.txt" % self.port,
            effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
        )
        msg = DataMessage()
        msg.identity = "me"
        msg.curi = curi

        self._worker_sockets['master_push'].send_multipart(msg.serialize())

        def assert_expected_result_and_stop(raw_msg):
            msg = DataMessage(raw_msg)
            robots_path = os.path.join(os.path.dirname(__file__),
                                       "static/robots.txt")
            # Read the fixture inside a context manager so the file handle is
            # closed right away instead of whenever it gets garbage collected.
            with open(robots_path) as robots_file:
                robots = robots_file.read()
            self.assertEqual(robots, msg.curi.content_body)
            # Publish the quit command for the worker.
            # NOTE(review): presumably this is what ends the IO loop started
            # below -- confirm against the worker/mgmt implementation.
            death = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(death.serialize())

        self._worker_sockets['master_sub'].on_recv(
            assert_expected_result_and_stop)

        self._io_loop.start()
    def test_fetching_last_modified_works(self):
        """
        Request `robots.txt` with a `Last-Modified` header and expect a 304.

        The header value is the serialized modification time of
        `robots.txt` under `self._path`. The callback on `master_sub` checks
        for status code 304 with an empty body and then publishes the worker
        quit message on the management `master_pub` socket.
        """
        fetcher = FetchProcessor(Settings(), io_loop=self._io_loop)

        worker = AsyncZmqWorker(
            self._worker_sockets['worker_pull'],
            self._worker_sockets['worker_pub'],
            self._mgmt,
            fetcher,
            StreamHandler(sys.stdout),
            logging.DEBUG,
            self._io_loop,
        )
        worker.start()

        # Modification time of the served file, in the serialized form that
        # goes into the request header.
        robots_path = os.path.join(self._path, "robots.txt")
        modified_at = datetime.fromtimestamp(os.stat(robots_path).st_mtime)
        conditional_header = {"Last-Modified": serialize_date_time(modified_at)}

        request = DataMessage()
        request.identity = "me"
        request.curi = CrawlUri(
            url="http://localhost:%s/robots.txt" % self.port,
            effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
            req_header=conditional_header,
        )

        def assert_expected_result_and_stop(raw_msg):
            fetched = DataMessage(raw_msg).curi
            self.assertEqual(304, fetched.status_code)
            self.assertEqual("", fetched.content_body)
            quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                   data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(
                quit_msg.serialize())

        self._worker_sockets['master_sub'].on_recv(
            assert_expected_result_and_stop)

        self._worker_sockets['master_push'].send_multipart(
            request.serialize())

        self._io_loop.start()
    def test_fetching_last_modified_works(self):
        """
        Request `robots.txt` with a `Last-Modified` header and expect a 304.

        The header value is the serialized modification time of
        `robots.txt` under `self._path`. The callback on `master_sub` checks
        for status code 304 with an empty body and then publishes the worker
        quit message on the management `master_pub` socket.
        """
        fetcher = FetchProcessor(Settings(), io_loop=self._io_loop)

        worker = AsyncZmqWorker(
            self._worker_sockets['worker_pull'],
            self._worker_sockets['worker_pub'],
            self._mgmt,
            fetcher,
            StreamHandler(sys.stdout),
            logging.DEBUG,
            self._io_loop,
        )
        worker.start()

        # Modification time of the served file, in the serialized form that
        # goes into the request header.
        robots_path = os.path.join(self._path, "robots.txt")
        modified_at = datetime.fromtimestamp(os.stat(robots_path).st_mtime)
        conditional_header = {"Last-Modified": serialize_date_time(modified_at)}

        request = DataMessage()
        request.identity = "me"
        request.curi = CrawlUri(
            url="http://localhost:%s/robots.txt" % self.port,
            effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
            req_header=conditional_header,
        )

        def assert_expected_result_and_stop(raw_msg):
            fetched = DataMessage(raw_msg).curi
            self.assertEqual(304, fetched.status_code)
            self.assertEqual("", fetched.content_body)
            quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                   data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(
                quit_msg.serialize())

        self._worker_sockets['master_sub'].on_recv(
            assert_expected_result_and_stop)

        self._worker_sockets['master_push'].send_multipart(
            request.serialize())

        self._io_loop.start()
    def test_fetching_etag_works(self):
        """
        Request `robots.txt` with an `Etag` request header and expect a 304.

        The callback on `master_sub` checks for status code 304 with an
        empty body and then publishes the worker quit message on the
        management `master_pub` socket.
        """
        fetcher = FetchProcessor(Settings(), io_loop=self._io_loop)

        worker = AsyncZmqWorker(
            self._worker_sockets['worker_pull'],
            self._worker_sockets['worker_pub'],
            self._mgmt,
            fetcher,
            StreamHandler(sys.stdout),
            logging.DEBUG,
            self._io_loop,
        )
        worker.start()

        # NOTE(review): presumably this is the entity tag the test server
        # reports for the static robots.txt -- confirm against the fixture.
        etag_header = {"Etag": "\"3926227169c58185234888b60000c6eb1169577d\""}

        request = DataMessage()
        request.identity = "me"
        request.curi = CrawlUri(
            url="http://localhost:%s/robots.txt" % self.port,
            effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
            req_header=etag_header,
        )

        def assert_expected_result_and_stop(raw_msg):
            fetched = DataMessage(raw_msg).curi
            self.assertEqual(304, fetched.status_code)
            self.assertEqual("", fetched.content_body)
            quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                   data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(
                quit_msg.serialize())

        self._worker_sockets['master_sub'].on_recv(
            assert_expected_result_and_stop)

        self._worker_sockets['master_push'].send_multipart(
            request.serialize())

        self._io_loop.start()
Example #8
0
    def test_fetching_etag_works(self):
        """
        Request `robots.txt` with an `Etag` request header and expect a 304.

        The callback on `master_sub` checks for status code 304 with an
        empty body and then publishes the worker quit message on the
        management `master_pub` socket.
        """
        fetcher = FetchProcessor(Settings(), io_loop=self._io_loop)

        worker = AsyncZmqWorker(
            self._worker_sockets['worker_pull'],
            self._worker_sockets['worker_pub'],
            self._mgmt,
            fetcher,
            StreamHandler(sys.stdout),
            logging.DEBUG,
            self._io_loop,
        )
        worker.start()

        # NOTE(review): presumably this is the entity tag the test server
        # reports for the static robots.txt -- confirm against the fixture.
        etag_header = {"Etag": "\"3926227169c58185234888b60000c6eb1169577d\""}

        request = DataMessage()
        request.identity = "me"
        request.curi = CrawlUri(
            url="http://localhost:%s/robots.txt" % self.port,
            effective_url="http://127.0.0.1:%s/robots.txt" % self.port,
            req_header=etag_header,
        )

        def assert_expected_result_and_stop(raw_msg):
            fetched = DataMessage(raw_msg).curi
            self.assertEqual(304, fetched.status_code)
            self.assertEqual("", fetched.content_body)
            quit_msg = MgmtMessage(topic=ZMQ_SPYDER_MGMT_WORKER,
                                   data=ZMQ_SPYDER_MGMT_WORKER_QUIT)
            self._mgmt_sockets['master_pub'].send_multipart(
                quit_msg.serialize())

        self._worker_sockets['master_sub'].on_recv(
            assert_expected_result_and_stop)

        self._worker_sockets['master_push'].send_multipart(
            request.serialize())

        self._io_loop.start()
Example #9
0
def create_worker_fetcher(settings, mgmt, zmq_context, log_handler, io_loop):
    """
    Build an `AsyncZmqWorker` that runs a `FetchProcessor`.

    A PULL socket is connected to `settings.ZEROMQ_WORKER_PROC_FETCHER_PULL`
    and a PUSH socket, with its `zmq.HWM` option taken from
    `settings.ZEROMQ_WORKER_PROC_FETCHER_PUSH_HWM`, is bound to
    `settings.ZEROMQ_WORKER_PROC_FETCHER_PUSH`. Both sockets are handed to
    the returned worker together with `mgmt`, `log_handler`,
    `settings.LOG_LEVEL_WORKER` and `io_loop`.
    """
    # Socket the worker receives from.
    inbound = zmq_context.socket(zmq.PULL)
    inbound.connect(settings.ZEROMQ_WORKER_PROC_FETCHER_PULL)

    # Socket the worker sends on; the HWM option is set before binding.
    outbound = zmq_context.socket(zmq.PUSH)
    outbound.setsockopt(
        zmq.HWM, settings.ZEROMQ_WORKER_PROC_FETCHER_PUSH_HWM)
    outbound.bind(settings.ZEROMQ_WORKER_PROC_FETCHER_PUSH)

    processor = FetchProcessor(settings, io_loop)

    return AsyncZmqWorker(
        inbound,
        outbound,
        mgmt,
        processor,
        log_handler,
        settings.LOG_LEVEL_WORKER,
        io_loop,
    )