Code example #1
from decimal import Decimal

import numpy as np

# Assumed import path; these examples appear to come from petastorm
from petastorm.reader_impl.pyarrow_serializer import PyArrowSerializer


def test_decimal():
    s = PyArrowSerializer()
    expected = [{'a': Decimal('1.2')}]
    actual = s.deserialize(s.serialize(expected))
    np.testing.assert_array_equal(actual[0]['a'], expected[0]['a'])

    expected = [{'a': [Decimal('1.2')]}]
    actual = s.deserialize(s.serialize(expected))
    np.testing.assert_array_equal(actual[0]['a'], expected[0]['a'])
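
For context, a minimal sketch of the serialize/deserialize interface these tests exercise, assuming a thin wrapper around the ``pyarrow.serialize`` API (deprecated and later removed from pyarrow); petastorm's actual PyArrowSerializer may differ:

import pyarrow


class MinimalPyArrowSerializer(object):
    """Hypothetical stand-in with the same interface as PyArrowSerializer."""

    def serialize(self, rows):
        # Returns a pyarrow Buffer holding the serialized list of row dicts
        return pyarrow.serialize(rows).to_buffer()

    def deserialize(self, serialized):
        # Restores the rows from the buffer produced by serialize()
        return pyarrow.deserialize(serialized)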
Code example #2
def test_all_matrix_types():
    s = PyArrowSerializer()
    # The serializer is used with arrays of dictionaries, or with arrays of dictionaries of dictionaries (ngram)
    serialized_values = [
        (np.int8, -127),
        (np.uint8, 255),
        (np.int16, -2**15),
        (np.uint16, 2**16 - 1),
        (np.int32, -2**31),
        (np.uint32, 2**32 - 1),
        (np.float16, 1.2),
        (np.float32, 1.2),
        (np.float64, 1.2),
        (np.string_, 'abc'),
        (np.unicode_, u'אבג'),
        (np.int64, -2**63),
        (np.uint64, 2**64 - 1),
    ]

    for type_factory, value in serialized_values:
        desired = [{'value': np.asarray(4 * [value], dtype=type_factory)}]
        actual = s.deserialize(s.serialize(desired))
        assert actual[0]['value'].dtype == desired[0]['value'].dtype
        np.testing.assert_array_equal(actual[0]['value'], desired[0]['value'])
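
The comment above mentions "dictionaries of dictionaries" (ngram) payloads. A hypothetical round-trip along the same lines (the test name and payload shape are illustrative, not taken from petastorm):

def test_ngram_shaped_payload():
    s = PyArrowSerializer()
    # Outer list -> rows; integer keys -> ngram timesteps; inner dicts -> fields
    desired = [{0: {'value': np.asarray([1, 2], dtype=np.float32)},
                1: {'value': np.asarray([3, 4], dtype=np.float32)}}]
    actual = s.deserialize(s.serialize(desired))
    np.testing.assert_array_equal(actual[0][0]['value'], desired[0][0]['value'])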
Code example #3
import logging
import pickle
from time import sleep, time

import zmq
from zmq.utils import monitor

# The remaining names (PyArrowSerializer, PickleSerializer, exec_in_new_process,
# _worker_bootstrap, EmptyResultError, VentilatedItemProcessedMessage, and the
# _CONTROL_FINISHED, _SOCKET_LINGER_MS and timeout constants) come from the
# surrounding petastorm module and are assumed here.

logger = logging.getLogger(__name__)


class ProcessPool(object):
    def __init__(self, workers_count, pyarrow_serialize=False):
        """Initializes a ProcessPool.

        This pool differs from standard Python pool implementations in that its workers are spawned
        without using fork. Some issues with the JVM-based HDFS driver were observed when the process was forked
        (the forked worker could not access HDFS if the driver had already been used in the parent process).

        :param workers_count: Number of processes to be spawned
        :param pyarrow_serialize: Use ``pyarrow.serialize`` serialization if True. ``pyarrow.serialize`` is much faster
          than pickling, but does not yet handle all integer types (int8, uint8, etc.), so all integer types are
          currently converted to 'int'.
        """
        self._workers = []
        self._ventilator_send = None
        self._control_sender = None
        self.workers_count = workers_count
        self._results_receiver_poller = None

        self._ventilated_items = 0
        self._ventilated_items_processed = 0
        self._ventilator = None
        self._serializer = PyArrowSerializer() if pyarrow_serialize else PickleSerializer()

    def _create_local_socket_on_random_port(self, context, socket_type):
        """Creates a zmq socket on a random port.

        :param context: zmq context
        :param socket_type: zmq socket type
        :return: A tuple: ``(zmq_socket, endpoint_address)``
        """
        LOCALHOST = 'tcp://127.0.0.1'
        socket = context.socket(socket_type)

        # There are race conditions where the socket can close when messages are still trying to be sent by zmq.
        # This can end up causing zmq to block indefinitely when sending objects or shutting down. Having the socket
        # linger on close helps prevent this.
        socket.linger = _SOCKET_LINGER_MS

        port = socket.bind_to_random_port(LOCALHOST)
        return socket, '{}:{}'.format(LOCALHOST, port)

    def start(self, worker_class, worker_setup_args=None, ventilator=None):
        """Starts worker processes.

        Will block until all processes subscribe to the worker queue (zmq distributes messages on write, so if only
        one worker out of many is up at the time of 'ventilation', the initial load won't be balanced between
        workers). If the workers can not be started in a timely fashion, an exception is raised.

        :param worker_class: The worker class. It will be instantiated in the worker process and must implement
            the :class:`.WorkerBase` protocol.
        :param worker_setup_args: Argument that will be passed to the 'args' property of the instantiated
            :class:`.WorkerBase`.
        :param ventilator: Optional ventilator to handle ventilating items to the process pool. Process pool needs
            to know about the ventilator to know if it has completed ventilating items.
        :return: ``None``
        """
        # Initialize a zeromq context
        self._context = zmq.Context()

        # Ventilator socket used to send out tasks to workers
        self._ventilator_send, worker_receiver_socket = self._create_local_socket_on_random_port(self._context,
                                                                                                 zmq.PUSH)

        # Control socket is used to signal termination of the pool
        self._control_sender, control_socket = self._create_local_socket_on_random_port(self._context, zmq.PUB)
        self._results_receiver, results_sender_socket = self._create_local_socket_on_random_port(self._context,
                                                                                                 zmq.PULL)

        # We need a poller to be able to read results from workers in a non-blocking manner
        self._results_receiver_poller = zmq.Poller()
        self._results_receiver_poller.register(self._results_receiver, zmq.POLLIN)

        # Monitors will be used to count the number of workers created.
        # We will block till all of them are ready to accept messages
        monitor_sockets = [
            self._ventilator_send.get_monitor_socket(zmq.constants.EVENT_ACCEPTED),
            self._control_sender.get_monitor_socket(zmq.constants.EVENT_ACCEPTED),
            self._results_receiver.get_monitor_socket(zmq.constants.EVENT_ACCEPTED),
        ]

        # Start a bunch of processes
        self._workers = [
            exec_in_new_process(_worker_bootstrap, worker_class, worker_id, control_socket, worker_receiver_socket,
                                results_sender_socket, self._serializer, worker_setup_args)
            for worker_id in range(self.workers_count)]

        # Block until we have all workers up. Will raise an error if they fail to start in a timely fashion
        self._wait_for_workers_to_start(monitor_sockets)

        if ventilator:
            self._ventilator = ventilator
            self._ventilator.start()

    def _wait_for_workers_to_start(self, monitor_sockets):
        """Waits for all workers to start."""
        now = time()
        for monitor_socket in monitor_sockets:
            started_count = 0
            while started_count < self.workers_count and time() < now + _WORKERS_STARTED_TIMEOUT_S:
                _keep_retrying_while_zmq_again(_KEEP_TRYING_WHILE_ZMQ_AGAIN_IS_RAIZED_TIMEOUT_S,
                                               lambda sock=monitor_socket: monitor.recv_monitor_message(
                                                   sock, flags=zmq.constants.NOBLOCK))
                started_count += 1

            if started_count < self.workers_count:
                raise RuntimeError(
                    'Workers were not able to start within timeout {} s ({} have started)'.format(
                        _WORKERS_STARTED_TIMEOUT_S,
                        started_count))

    def ventilate(self, *args, **kargs):
        """Sends a work item to a worker process. Will result in worker.process(...) call with arbitrary arguments."""
        self._ventilated_items += 1
        logger.debug('ventilate called. total ventilated items count %d', self._ventilated_items)
        # There is a race condition when sending objects to zmq that if all workers have been killed, sending objects
        # can block indefinitely. By using NOBLOCK, an exception is thrown stating that all resources have been
        # exhausted which the user can decide how to handle instead of just having the process hang.
        _keep_retrying_while_zmq_again(_KEEP_TRYING_WHILE_ZMQ_AGAIN_IS_RAIZED_TIMEOUT_S,
                                       lambda: self._ventilator_send.send_pyobj((args, kargs),
                                                                                flags=zmq.constants.NOBLOCK))
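
    # A minimal sketch (not petastorm's actual code) of the _keep_retrying_while_zmq_again
    # helper used above and in _wait_for_workers_to_start, assuming it retries the callable
    # while zmq raises Again (EAGAIN), until a deadline expires:
    #
    #     def _keep_retrying_while_zmq_again(timeout_s, func, retry_period_s=0.1):
    #         deadline = time() + timeout_s
    #         while True:
    #             try:
    #                 return func()
    #             except zmq.Again:
    #                 if time() >= deadline:
    #                     raise RuntimeError('zmq raised Again for over {} s'.format(timeout_s))
    #                 sleep(retry_period_s)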

    def get_results(self):
        """Returns results from worker pool

        :param timeout: If None, will block forever, otherwise will raise :class:`.TimeoutWaitingForResultError`
            exception if no data received within the timeout (in seconds)
        :return: arguments passed to ``publish_func(...)`` by a worker. If no more results are anticipated,
            :class:`.EmptyResultError` is raised.
        """

        while True:
            # If there is no more work to do, raise an EmptyResultError
            logger.debug('ventilated_items=%d ventilated_items_processed=%d ventilator.completed=%s',
                         self._ventilated_items, self._ventilated_items_processed,
                         str(self._ventilator.completed()) if self._ventilator else 'N/A')
            if self._ventilated_items == self._ventilated_items_processed:
                # We also need to check if we are using a ventilator and if it is completed
                if not self._ventilator or self._ventilator.completed():
                    logger.debug('ventilator reported it has completed. Reporting end of results')
                    raise EmptyResultError()

            logger.debug('get_results polling on the next result')
            socks = self._results_receiver_poller.poll(_VERIFY_END_OF_VENTILATION_PERIOD * 1e3)
            if not socks:
                continue
            # Result message is a tuple containing data payload and possible exception (or None).
            # By specifying pyarrow_serialize=True, we may choose to use pyarrow serializer which is faster, but
            # does not support all data types correctly.
            fast_serialized, pickle_serialized = self._results_receiver.recv_multipart(copy=False)
            pickle_serialized = pickle.loads(pickle_serialized)

            if pickle_serialized:
                logger.debug('get_results received a pickled message %s', type(pickle_serialized))
                if isinstance(pickle_serialized, VentilatedItemProcessedMessage):
                    self._ventilated_items_processed += 1
                    if self._ventilator:
                        self._ventilator.processed_item()
                elif isinstance(pickle_serialized, Exception):
                    self.stop()
                    self.join()
                    raise pickle_serialized
            else:
                logger.debug('get_results received new results')
                deserialized_result = self._serializer.deserialize(fast_serialized.buffer)
                return deserialized_result

    def stop(self):
        """Stops all workers (non-blocking)"""
        logger.debug('stopping')
        if self._ventilator:
            self._ventilator.stop()
        self._control_sender.send_string(_CONTROL_FINISHED)

    def join(self):
        """Blocks until all workers are terminated."""

        logger.debug('joining')

        # Slow joiner problem with zeromq means that not all workers are guaranteed to have gotten
        # the stop event. Therefore we will keep sending it until all workers are stopped to prevent
        # a deadlock.
        while any([w.poll() is None for w in self._workers]):
            self.stop()
            sleep(.1)

        for w in self._workers:
            w.wait()
        self._ventilator_send.close()
        self._control_sender.close()
        self._results_receiver.close()
        self._context.destroy()

    @property
    def diagnostics(self):
        # items_produced is updated only when a VentilatedItemProcessedMessage is received. This happens only on the
        # next call to get_results, so its value may lag.
        return {
            'items_consumed': self._ventilated_items,
            'items_produced': self._ventilated_items_processed,
            'items_inprocess': self._ventilated_items - self._ventilated_items_processed,
        }
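
A hedged usage sketch of the pool above. EchoWorker is hypothetical; the real :class:`.WorkerBase` protocol in petastorm may require more than is shown here:

pool = ProcessPool(workers_count=2)
pool.start(EchoWorker)              # EchoWorker.process(...) runs once per ventilated item
for i in range(10):
    pool.ventilate(i)               # each call becomes one worker.process(i) invocation
results = []
try:
    while True:
        results.append(pool.get_results())
except EmptyResultError:            # raised once all ventilated items were processed
    pass
pool.stop()
pool.join()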
Code example #4
def test_nominal():
    s = PyArrowSerializer()
    expected = [{'a': np.asarray([1, 2], dtype=np.uint64)}]
    actual = s.deserialize(s.serialize(expected))
    np.testing.assert_array_equal(actual[0]['a'], expected[0]['a'])
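
The ProcessPool docstring in code example #3 notes that workers are spawned rather than forked to avoid issues with the JVM-based HDFS driver. A minimal standard-library sketch of the same idea (petastorm's exec_in_new_process works differently; this is only illustrative):

import multiprocessing as mp


def _worker_main(worker_id):
    # A 'spawn' worker starts in a fresh interpreter: no threads, sockets or
    # JVM state are inherited from the parent, unlike with fork()
    print('worker {} started'.format(worker_id))


if __name__ == '__main__':
    ctx = mp.get_context('spawn')
    procs = [ctx.Process(target=_worker_main, args=(i,)) for i in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()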