Esempio n. 1
0
    def mask(self, row_labels, col_labels):
        """
        Lazily create a mask that extracts the indices provided.

        Parameters
        ----------
        row_labels : list-like, slice or label
            The row labels for the rows to extract.
        col_labels : list-like, slice or label
            The column labels for the columns to extract.

        Returns
        -------
        PandasOnDaskDataframePartition
            A new ``PandasOnDaskDataframePartition`` object.
        """
        result = super().mask(row_labels, col_labels)
        # If a dimension was sliced while its size is still an unmaterialized
        # Future, schedule the sliced-length computation lazily on the cluster
        # rather than blocking to resolve the cached size here.
        if isinstance(row_labels, slice) and isinstance(self._length_cache, Future):
            result._length_cache = DaskWrapper.deploy(
                compute_sliced_len, row_labels, self._length_cache
            )
        if isinstance(col_labels, slice) and isinstance(self._width_cache, Future):
            result._width_cache = DaskWrapper.deploy(
                compute_sliced_len, col_labels, self._width_cache
            )
        return result
Esempio n. 2
0
 def drain_call_queue(self):
     """Execute all operations stored in the call queue on the object wrapped by this partition."""
     queue = self.call_queue
     if not queue:
         return
     if len(queue) == 1:
         # Single-call fast path: deploying the function directly is slightly
         # cheaper than routing through ``apply_list_of_funcs``.
         fn, fn_args, fn_kwargs = queue[0]
         futures = DaskWrapper.deploy(
             apply_func,
             self.future,
             fn,
             *fn_args,
             num_returns=2,
             pure=False,
             **fn_kwargs,
         )
     else:
         futures = DaskWrapper.deploy(
             apply_list_of_funcs,
             queue,
             self.future,
             num_returns=2,
             pure=False,
         )
     # ``num_returns=2`` yields (result future, node-IP future).
     self.future, self._ip_cache = futures[0], futures[1]
     self.call_queue = []
Esempio n. 3
0
    def apply(self, func, *args, **kwargs):
        """
        Apply a function to the object wrapped by this partition.

        Parameters
        ----------
        func : callable
            A function to apply.
        *args : iterable
            Additional positional arguments to be passed in `func`.
        **kwargs : dict
            Additional keyword arguments to be passed in `func`.

        Returns
        -------
        PandasOnDaskDataframePartition
            A new ``PandasOnDaskDataframePartition`` object.

        Notes
        -----
        The keyword arguments are sent as a dictionary.
        """
        queue = self.call_queue + [[func, args, kwargs]]
        if len(queue) == 1:
            # Fast path when there are no queued calls: deploy the single
            # function directly, avoiding the ``apply_list_of_funcs`` wrapper
            # for a small performance gain.
            fn, fn_args, fn_kwargs = queue[0]
            futures = DaskWrapper.deploy(
                apply_func,
                self.future,
                fn,
                *fn_args,
                num_returns=2,
                pure=False,
                **fn_kwargs,
            )
        else:
            futures = DaskWrapper.deploy(
                apply_list_of_funcs,
                queue,
                self.future,
                num_returns=2,
                pure=False,
            )
        # futures[0] is the result data, futures[1] the node IP it lives on.
        return PandasOnDaskDataframePartition(futures[0], ip=futures[1])
Esempio n. 4
0
    def get(self):
        """
        Get the object wrapped by this partition out of the distributed memory.

        Returns
        -------
        pandas.DataFrame
            The object from the distributed memory.
        """
        # Flush all pending lazy operations first so the materialized value
        # reflects every queued call.
        self.drain_call_queue()
        materialized = DaskWrapper.materialize(self.future)
        return materialized
Esempio n. 5
0
    def ip(self):
        """
        Get the node IP address of the object wrapped by this partition.

        Returns
        -------
        str
            IP address of the node that holds the data.
        """
        cached = self._ip_cache
        if cached is None:
            # Run an identity ``apply`` solely to populate the IP cache.
            cached = self.apply(lambda df: df)._ip_cache
        if isinstance(cached, Future):
            cached = DaskWrapper.materialize(cached)
        self._ip_cache = cached
        return cached
Esempio n. 6
0
    def width(self):
        """
        Get the width of the object wrapped by the partition.

        Returns
        -------
        int
            The width of the object.
        """
        cached = self._width_cache
        if cached is None:
            # Compute the column count remotely; the result arrives as a Future.
            cached = self.apply(lambda df: len(df.columns)).future
        if isinstance(cached, Future):
            cached = DaskWrapper.materialize(cached)
        self._width_cache = cached
        return cached
Esempio n. 7
0
    def get_objects_from_partitions(cls, partitions):
        """
        Get the objects wrapped by `partitions` in parallel.

        Parameters
        ----------
        partitions : np.ndarray
            NumPy array with ``PandasDataframePartition``-s.

        Returns
        -------
        list
            The objects wrapped by `partitions`.
        """
        # Gather the backing futures first so they materialize in one batch
        # rather than one round-trip per partition.
        futures = [part.future for part in partitions]
        return DaskWrapper.materialize(futures)
Esempio n. 8
0
    def preprocess_func(cls, func):
        """
        Preprocess a function before an ``apply`` call.

        Parameters
        ----------
        func : callable
            The function to preprocess.

        Returns
        -------
        callable
            An object that can be accepted by ``apply``.
        """
        # Ship the function to every worker up front (broadcast) without
        # hashing it, so subsequent ``apply`` calls reference it cheaply.
        broadcast_handle = DaskWrapper.put(func, hash=False, broadcast=True)
        return broadcast_handle
Esempio n. 9
0
    def put(cls, obj):
        """
        Put an object into distributed memory and wrap it with partition object.

        Parameters
        ----------
        obj : any
            An object to be put.

        Returns
        -------
        PandasOnDaskDataframePartition
            A new ``PandasOnDaskDataframePartition`` object.
        """
        future = DaskWrapper.put(obj, hash=False)
        return cls(future)
Esempio n. 10
0
    def deploy_func_between_two_axis_partitions(cls, axis, func, num_splits,
                                                len_of_left, other_shape,
                                                kwargs, *partitions):
        """
        Deploy a function along a full axis between two data sets.

        Parameters
        ----------
        axis : {0, 1}
            The axis to perform the function along.
        func : callable
            The function to perform.
        num_splits : int
            The number of splits to return (see `split_result_of_axis_func_pandas`).
        len_of_left : int
            The number of values in `partitions` that belong to the left data set.
        other_shape : np.ndarray
            The shape of right frame in terms of partitions, i.e.
            (other_shape[i-1], other_shape[i]) will indicate slice to restore i-1 axis partition.
        kwargs : dict
            Additional keywords arguments to be passed in `func`.
        *partitions : iterable
            All partitions that make up the full axis (row or column) for both data sets.

        Returns
        -------
        list
            A list of distributed.Future.
        """
        # Bind the generic base implementation locally; the Dask-specific
        # wrapper (`deploy_dask_func`) invokes it on the worker.
        base_impl = (
            PandasDataframeAxisPartition.deploy_func_between_two_axis_partitions
        )
        return DaskWrapper.deploy(
            deploy_dask_func,
            base_impl,
            axis,
            func,
            num_splits,
            len_of_left,
            other_shape,
            kwargs,
            *partitions,
            # Four futures per split are returned by the deployed function.
            num_returns=num_splits * 4,
            pure=False,
        )
Esempio n. 11
0
    def deploy_axis_func(cls, axis, func, num_splits, kwargs,
                         maintain_partitioning, *partitions):
        """
        Deploy a function along a full axis.

        Parameters
        ----------
        axis : {0, 1}
            The axis to perform the function along.
        func : callable
            The function to perform.
        num_splits : int
            The number of splits to return (see `split_result_of_axis_func_pandas`).
        kwargs : dict
            Additional keywords arguments to be passed in `func`.
        maintain_partitioning : bool
            If True, keep the old partitioning if possible.
            If False, create a new partition layout.
        *partitions : iterable
            All partitions that make up the full axis (row or column).

        Returns
        -------
        list
            A list of distributed.Future.
        """
        # An explicit `_lengths` hint overrides how many splits come back.
        lengths = kwargs.get("_lengths", None)
        result_num_splits = num_splits if not lengths else len(lengths)
        return DaskWrapper.deploy(
            deploy_dask_func,
            PandasDataframeAxisPartition.deploy_axis_func,
            axis,
            func,
            num_splits,
            kwargs,
            maintain_partitioning,
            *partitions,
            # Four futures per resulting split are produced on the worker.
            num_returns=result_num_splits * 4,
            pure=False,
        )
Esempio n. 12
0
from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

# Partition class of the currently configured execution backend.
PartitionClass = (
    FactoryDispatcher.get_factory().io_cls.frame_cls._partition_mgr_cls._partition_class
)

if Engine.get() == "Ray":
    import ray

    put_func = ray.put
    get_func = ray.get
    FutureType = ray.ObjectRef
elif Engine.get() == "Dask":
    from distributed import Future

    from modin.core.execution.dask.common.engine_wrapper import DaskWrapper

    # Plain ``def``s instead of assigned lambdas (PEP 8 / flake8 E731, which
    # the previous code had to ``noqa``-suppress); behavior is identical.
    def put_func(x):
        """Put `x` into Dask distributed memory."""
        return DaskWrapper.put(x)

    def get_func(x):
        """Materialize the Dask future `x`."""
        return DaskWrapper.materialize(x)

    FutureType = Future
elif Engine.get() == "Python":

    def put_func(x):
        """Identity put for the serial Python engine (no distributed memory)."""
        return x

    def get_func(x):
        """Identity get for the serial Python engine."""
        return x

    FutureType = object
else:
    raise NotImplementedError(
        f"'{Engine.get()}' engine is not supported by these test suites"
    )

NPartitions.put(4)
# HACK: implicit engine initialization (Modin issue #2989)
pd.DataFrame([])