Beispiel #1
0
def deploy_python_func_between_two_axis_partitions(axis, func, num_splits,
                                                   len_of_left, kwargs,
                                                   *partitions):
    """Apply a binary function to two data sets joined along a full axis.

    The first `len_of_left` entries of `partitions` are concatenated into the
    left frame and the remaining entries into the right frame. `func` is then
    called as `func(left, right, **kwargs)`.

    Args:
        axis: The axis to concatenate along and to split the result over.
        func: The callable applied to the two concatenated frames.
        num_splits: The number of splits to return
            (see `split_result_of_axis_func_pandas`).
        len_of_left: How many leading entries of `partitions` belong to the
            left data set.
        kwargs: A dictionary of keyword arguments forwarded to `func`.
        partitions: All partitions that make up the full axis (row or column)
            for both data sets, left ones first.

    Returns:
        A list with a copy of every piece produced by
        `split_result_of_axis_func_pandas`.
    """
    left_parts = list(partitions[:len_of_left])
    right_parts = list(partitions[len_of_left:])
    left = pandas.concat(left_parts, axis=axis, copy=False)
    right = pandas.concat(right_parts, axis=axis, copy=False)

    combined = func(left, right, **kwargs)
    pieces = split_result_of_axis_func_pandas(axis, num_splits, combined)
    return [piece.copy() for piece in pieces]
Beispiel #2
0
def deploy_python_axis_func(axis, func, num_splits, kwargs, *partitions):
    """Apply a function to the concatenation of all partitions of one axis.

    Args:
        axis: The axis to concatenate along and to split the result over.
        func: The callable applied to the concatenated frame.
        num_splits: The number of splits to return
            (see `split_result_of_axis_func_pandas`)
        kwargs: A dictionary of keyword arguments forwarded to `func`.
        partitions: All partitions that make up the full axis (row or column)

    Returns:
        A list with a copy of every piece produced by
        `split_result_of_axis_func_pandas`.
    """
    joined = pandas.concat(partitions, axis=axis, copy=False)
    result = func(joined, **kwargs)

    # The input partition sizes are reused only when the number of splits is
    # unchanged, the result is not a Series, and the sizes still add up to the
    # size of the result along `axis`.
    lengths = None
    if num_splits == len(partitions) and not isinstance(result, pandas.Series):
        if axis == 0:
            sizes = [len(part) for part in partitions]
            total = len(result)
        else:
            sizes = [len(part.columns) for part in partitions]
            total = len(result.columns)
        if sum(sizes) == total:
            lengths = sizes

    pieces = split_result_of_axis_func_pandas(axis, num_splits, result,
                                              lengths)
    return [piece.copy() for piece in pieces]
    def from_pandas(cls, df, return_dims=False):
        """
        Build row partitions from a ``pandas.DataFrame/pandas.Series``.

        The input is split along axis 0 into ``GpuCount.get()`` pieces and
        each piece is handed to the GPU manager with the same position.

        Parameters
        ----------
        df : pandas.DataFrame/pandas.Series
            The pandas object to partition.
        return_dims : boolean, default: False
            Whether to also return the partition dimensions.

        Returns
        -------
        list or tuple
            The partitions alone when `return_dims` is False, otherwise the
            tuple (partitions, row lengths, col widths).
        """
        n_parts = GpuCount.get()
        put = cls._partition_class.put
        # Only row partitioning is produced for now.
        row_chunks = split_result_of_axis_func_pandas(0, n_parts, df)
        pending = []
        for idx in range(n_parts):
            pending.append(put(cls._get_gpu_managers()[idx], row_chunks[idx]))
        keys = ray.get(pending)
        parts = cls._create_partitions(keys, cls._get_gpu_managers())
        parts = parts.reshape((n_parts, 1))
        if return_dims:
            row_lengths = [len(chunk.index) for chunk in row_chunks]
            # A single width: the frame is never split along columns here.
            col_widths = [len(df.columns)]
            return parts, row_lengths, col_widths
        return parts
Beispiel #4
0
def _read_csv_with_offset_pandas_on_ray(fname, num_splits, start, end, kwargs,
                                        header):
    """Use a Ray task to read a chunk of a CSV into a Pandas DataFrame.

    Args:
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to separate the DataFrame into.
        start: The start byte offset.
        end: The end byte offset.
        kwargs: The kwargs for the Pandas `read_csv` function.
        header: The header of the file.

    Returns:
         A list containing the split Pandas DataFrames and the Index as the last
            element. If there is not `index_col` set, then we just return the length.
            This is used to determine the total length of the DataFrame to build a
            default Index.
    """
    bio = open(fname, "rb")
    bio.seek(start)
    to_read = header + bio.read(end - start)
    bio.close()
    pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
    pandas_df.columns = pandas.RangeIndex(len(pandas_df.columns))
    if kwargs.get("index_col", None) is not None:
        index = pandas_df.index
        # Partitions must have RangeIndex
        pandas_df.index = pandas.RangeIndex(0, len(pandas_df))
    else:
        # We will use the lengths to build the index if we are not given an
        # `index_col`.
        index = len(pandas_df)
    return split_result_of_axis_func_pandas(1, num_splits, pandas_df) + [index]
Beispiel #5
0
    def deploy_axis_func(cls, axis, func, num_splits, kwargs,
                         maintain_partitioning, *partitions):
        """Apply a function to the concatenation of all partitions of one axis.

            Args:
                axis: The axis to concatenate along and to split the result over.
                func: The callable applied to the concatenated frame.
                num_splits: The number of splits to return
                    (see `split_result_of_axis_func_pandas`)
                kwargs: A dictionary of keyword arguments forwarded to `func`.
                    The entries "manual_partition", "_lengths" and "_transposed"
                    are removed from it before `func` is called.
                maintain_partitioning: If True, keep the old partitioning if possible.
                    If False, create a new partition layout.
                partitions: All partitions that make up the full axis (row or column)

            Returns:
                The output of `split_result_of_axis_func_pandas` for a
                non-Series result. For a Series result: the Series itself when
                `num_splits` is 1, otherwise a list holding the Series followed
                by `num_splits - 1` empty Series.
            """
        # Strip the bookkeeping entries so that `func` never sees them.
        manual_partition = kwargs.pop("manual_partition", False)
        lengths = kwargs.pop("_lengths", None)
        transposed = kwargs.pop("_transposed", False)

        prepared = [
            set_indices_for_pandas_concat(part, transposed)
            for part in partitions
        ]
        dataframe = pandas.concat(prepared, axis=axis, copy=False)
        result = func(dataframe, **kwargs)

        if isinstance(result, pandas.Series):
            if num_splits == 1:
                return result
            padding = [pandas.Series([]) for _ in range(num_splits - 1)]
            return [result] + padding

        if manual_partition:
            # The split helper expects a list.
            lengths = list(lengths)
        elif num_splits != len(partitions) or not maintain_partitioning:
            # The old layout is dropped when the number of splits changes or
            # the caller asked for a new layout.
            lengths = None
        else:
            if axis == 0:
                lengths = [len(part) for part in partitions]
                produced = len(result)
            else:
                lengths = [len(part.columns) for part in partitions]
                produced = len(result.columns)
            # The old sizes are usable only if they still cover the result.
            if sum(lengths) != produced:
                lengths = None
        return split_result_of_axis_func_pandas(axis, num_splits, result,
                                                lengths)
Beispiel #6
0
    def deploy_axis_func(cls, axis, func, num_splits, kwargs,
                         maintain_partitioning, *partitions):
        """
        Apply a function to the concatenation of all partitions of one axis.

        Parameters
        ----------
        axis : {0, 1}
            The axis to concatenate along and to split the result over.
        func : callable
            The function applied to the concatenated frame.
        num_splits : int
            The number of splits to return (see `split_result_of_axis_func_pandas`).
        kwargs : dict
            Additional keyword arguments to be passed in `func`. The entries
            "manual_partition" and "_lengths" are removed from it before
            `func` is called.
        maintain_partitioning : bool
            If True, keep the old partitioning if possible.
            If False, create a new partition layout.
        *partitions : iterable
            All partitions that make up the full axis (row or column).

        Returns
        -------
        list
            The output of `split_result_of_axis_func_pandas`.
        """
        # Strip the bookkeeping entries so that `func` never sees them.
        manual_partition = kwargs.pop("manual_partition", False)
        lengths = kwargs.pop("_lengths", None)

        joined = pandas.concat(list(partitions), axis=axis, copy=False)
        result = func(joined, **kwargs)

        if manual_partition:
            # The split helper expects a list.
            lengths = list(lengths)
        elif num_splits != len(partitions) or not maintain_partitioning:
            # The old layout is dropped when the number of splits changes or
            # the caller asked for a new layout.
            lengths = None
        else:
            if axis == 0:
                lengths = [len(part) for part in partitions]
                produced = len(result)
            else:
                lengths = [len(part.columns) for part in partitions]
                produced = len(result.columns)
            # The old sizes are usable only if they still cover the result.
            if sum(lengths) != produced:
                lengths = None
        return split_result_of_axis_func_pandas(axis, num_splits, result,
                                                lengths)
Beispiel #7
0
def _split_result_for_readers(axis, num_splits, df):
    """Split a freshly read DataFrame and always hand back a list.

    Args:
        axis: Which axis to split over.
        num_splits: The number of splits to create.
        df: The DataFrame after it has been read.

    Returns:
        The output of `split_result_of_axis_func_pandas`, wrapped in a
        one-element list when it is not already a list.
    """
    pieces = split_result_of_axis_func_pandas(axis, num_splits, df)
    return pieces if isinstance(pieces, list) else [pieces]
Beispiel #8
0
    def deploy_func_between_two_axis_partitions(cls, axis, func, num_splits,
                                                len_of_left, other_shape,
                                                kwargs, *partitions):
        """
        Apply a binary function to two data sets joined along a full axis.

        Parameters
        ----------
        axis : {0, 1}
            The axis to perform the function along.
        func : callable
            The function called as ``func(left, right, **kwargs)``.
        num_splits : int
            The number of splits to return (see `split_result_of_axis_func_pandas`).
        len_of_left : int
            The number of values in `partitions` that belong to the left data set.
        other_shape : np.ndarray
            Offsets into the flattened right partitions: each consecutive pair
            (other_shape[i-1], other_shape[i]) delimits the slice that is
            concatenated along `axis` into one stripe of the right frame.
        kwargs : dict
            Additional keyword arguments to be passed in `func`.
        *partitions : iterable
            All partitions that make up the full axis (row or column) for both
            data sets, left ones first.

        Returns
        -------
        list
            The output of `split_result_of_axis_func_pandas`.
        """
        left_frame = pandas.concat(partitions[:len_of_left],
                                   axis=axis,
                                   copy=False)

        right_parts = partitions[len_of_left:]

        # Rebuild the right frame from its flattened partitions: first glue
        # each offset-delimited slice along `axis`, then glue the resulting
        # stripes along the other axis.
        stripes = []
        for pos in range(1, len(other_shape)):
            begin, stop = other_shape[pos - 1], other_shape[pos]
            stripes.append(
                pandas.concat(right_parts[begin:stop], axis=axis, copy=False))
        right_frame = pandas.concat(stripes, axis=axis ^ 1, copy=False)

        outcome = func(left_frame, right_frame, **kwargs)
        return split_result_of_axis_func_pandas(axis, num_splits, outcome)
Beispiel #9
0
    def deploy_func_between_two_axis_partitions(cls, axis, func, num_splits,
                                                len_of_left, other_shape,
                                                kwargs, *partitions):
        """Apply a binary function to two data sets joined along a full axis.

        Parameters
        ----------
            axis: The axis to perform the function along.
            func: The function called as `func(left, right, **kwargs)`.
            num_splits: The number of splits to return
                (see `split_result_of_axis_func_pandas`).
            len_of_left: The number of values in `partitions` that belong to the
                left data set.
            other_shape: The shape of the right frame in terms of partitions;
                indexed with `axis` and `axis ^ 1`.
            kwargs: A dictionary of keyword arguments forwarded to `func`.
            partitions: All partitions that make up the full axis (row or column)
                for both data sets, left ones first.

        Returns
        -------
            The output of `split_result_of_axis_func_pandas`.
        """
        left_frame = pandas.concat(partitions[:len_of_left],
                                   axis=axis,
                                   copy=False)

        right_parts = partitions[len_of_left:]

        # Rebuild the right frame from its flattened partitions: every run of
        # `other_shape[axis]` consecutive partitions is glued along `axis`,
        # and the resulting groups are glued along the other axis.
        groups = []
        for outer in range(other_shape[axis ^ 1]):
            width = other_shape[axis]
            offset = width * outer
            members = [right_parts[offset + inner] for inner in range(width)]
            groups.append(pandas.concat(members, axis=axis, copy=False))
        right_frame = pandas.concat(groups, axis=axis ^ 1, copy=False)

        outcome = func(left_frame, right_frame, **kwargs)
        return split_result_of_axis_func_pandas(axis, num_splits, outcome)
Beispiel #10
0
def _read_hdf_columns(path_or_buf, columns, num_splits, key, mode):
    """Read the given columns from HDF5 into split Pandas DataFrames.

    Args:
        path_or_buf: The path or buffer handed to `pandas.read_hdf`.
        columns: The list of column names to read.
        num_splits: The number of partitions to split the data into.
        key: The `key` argument handed to `pandas.read_hdf`.
        mode: The `mode` argument handed to `pandas.read_hdf`.

    Returns:
         The output of `split_result_of_axis_func_pandas` over axis 0 with the
            number of rows read appended as the last element. The row count
            lets the caller build the index externally.
    """
    frame = pandas.read_hdf(path_or_buf, key=key, mode=mode, columns=columns)
    pieces = split_result_of_axis_func_pandas(0, num_splits, frame)
    # The row count travels as the final element so the index can be built
    # outside of this task.
    return pieces + [len(frame.index)]
Beispiel #11
0
def _read_parquet_columns(path, columns, num_splits, kwargs):
    """Read the given columns from Parquet into split Pandas DataFrames.

    Args:
        path: The path of the Parquet file.
        columns: The list of column names to read.
        num_splits: The number of partitions to split the data into.
        kwargs: Extra keyword arguments forwarded to
            `pyarrow.parquet.read_pandas`.

    Returns:
         The output of `split_result_of_axis_func_pandas` over axis 0 with the
            number of rows read appended as the last element. The row count
            lets the caller build the index externally.
    """
    import pyarrow.parquet as pq

    table = pq.read_pandas(path, columns=columns, **kwargs)
    frame = table.to_pandas()
    pieces = split_result_of_axis_func_pandas(0, num_splits, frame)
    # The row count travels as the final element so the index can be built
    # outside of this task.
    return pieces + [len(frame.index)]
Beispiel #12
0
def deploy_axis_func(
    axis, func, num_splits, kwargs, maintain_partitioning, *partitions
):
    """Apply a function to the concatenation of all partitions of one axis.

    Args:
        axis: The axis to concatenate along and to split the result over.
        func: The callable applied to the concatenated frame.
        num_splits: The number of splits to return
            (see `split_result_of_axis_func_pandas`)
        kwargs: A dictionary of keyword arguments forwarded to `func`.
        maintain_partitioning: If True, reuse the sizes of the input
            partitions when they still fit the result; if False, no sizes are
            passed to the split helper.
        partitions: All partitions that make up the full axis (row or column)

    Returns:
        A list of `num_splits` items: for a Series result, the Series followed
        by empty Series; otherwise a copy of every piece produced by
        `split_result_of_axis_func_pandas`.
    """
    joined = pandas.concat(partitions, axis=axis, copy=False)
    result = func(joined, **kwargs)

    # NOTE: pandas_on_python.py is slightly different here but that
    # implementation seems wrong, as uncovered by test_var.
    if isinstance(result, pandas.Series):
        padding = [pandas.Series([]) for _ in range(num_splits - 1)]
        return [result] + padding

    lengths = None
    if num_splits == len(partitions) and maintain_partitioning:
        if axis == 0:
            sizes = [len(part) for part in partitions]
            produced = len(result)
        else:
            sizes = [len(part.columns) for part in partitions]
            produced = len(result.columns)
        # The old sizes are usable only if they still cover the result.
        if sum(sizes) == produced:
            lengths = sizes

    pieces = split_result_of_axis_func_pandas(axis, num_splits, result, lengths)
    return [piece.copy() for piece in pieces]
Beispiel #13
0
 def from_pandas(cls, df, return_dims=False):
     """
     Build row partitions from a pandas object.

     The input is split along axis 0 into ``GpuCount.get()`` pieces and each
     piece is handed to the GPU manager with the same position.

     Parameters
     ----------
     df : pandas.DataFrame
         The pandas object to partition.
     return_dims : bool, default: False
         Whether to also return the partition dimensions.

     Returns
     -------
     object or tuple
         The reshaped partitions alone when `return_dims` is False, otherwise
         the tuple (partitions, row lengths, col widths).
     """
     split_count = GpuCount.get()
     store = cls._partition_class.put
     # Only row partitioning is produced for now.
     chunks = split_result_of_axis_func_pandas(0, split_count, df)
     futures = [
         store(cls._get_gpu_managers()[pos], chunks[pos])
         for pos in range(split_count)
     ]
     keys = ray.get(futures)
     partitions = cls._create_partitions(keys, cls._get_gpu_managers())
     parts = partitions.reshape((split_count, 1))
     if not return_dims:
         return parts
     row_lengths = [len(chunk.index) for chunk in chunks]
     # A single width: the frame is never split along columns here.
     col_widths = [len(df.columns)]
     return parts, row_lengths, col_widths
Beispiel #14
0
def _split_result_for_readers(axis, num_splits, df):  # pragma: no cover
    """
    Split a freshly read DataFrame and always hand back a list.

    Parameters
    ----------
    axis : int
        The axis to split across (0 - index, 1 - columns).
    num_splits : int
        The number of splits to create.
    df : pandas.DataFrame
        `pandas.DataFrame` to split.

    Returns
    -------
    list
        The output of `split_result_of_axis_func_pandas`, wrapped in a
        one-element list when it is not already a list.
    """
    pieces = split_result_of_axis_func_pandas(axis, num_splits, df)
    if isinstance(pieces, list):
        return pieces
    return [pieces]
Beispiel #15
0
def deploy_ray_axis_func(
    axis, func, num_splits, kwargs, maintain_partitioning, *partitions
):
    """Apply a function to the concatenation of all partitions of one axis.

    Args:
        axis: The axis to concatenate along and to split the result over.
        func: The callable applied to the concatenated frame.
        num_splits: The number of splits to return
            (see `split_result_of_axis_func_pandas`)
        kwargs: A dictionary of keyword arguments forwarded to `func`.
        maintain_partitioning: If True, keep the old partitioning if possible.
            If False, create a new partition layout.
        partitions: All partitions that make up the full axis (row or column)

    Returns:
        The output of `split_result_of_axis_func_pandas` for a non-Series
        result. For a Series result: the Series itself when `num_splits` is 1,
        otherwise a list holding the Series followed by `num_splits - 1`
        empty Series.
    """
    joined = pandas.concat(partitions, axis=axis, copy=False)
    result = func(joined, **kwargs)

    if isinstance(result, pandas.Series):
        if num_splits == 1:
            return result
        padding = [pandas.Series([]) for _ in range(num_splits - 1)]
        return [result] + padding

    # The old sizes are reused only when the number of splits is unchanged,
    # the caller wants the layout kept, and the sizes still cover the result.
    lengths = None
    if num_splits == len(partitions) and maintain_partitioning:
        if axis == 0:
            sizes = [len(part) for part in partitions]
            produced = len(result)
        else:
            sizes = [len(part.columns) for part in partitions]
            produced = len(result.columns)
        if sum(sizes) == produced:
            lengths = sizes
    return split_result_of_axis_func_pandas(axis, num_splits, result, lengths)