def deploy_python_func_between_two_axis_partitions(
    axis, func, num_splits, len_of_left, kwargs, *partitions
):
    """Apply a two-frame function along a full axis and split the result.

    Args:
        axis: The axis to perform the function along.
        func: Callable invoked as ``func(left, right, **kwargs)``.
        num_splits: The number of splits to return
            (see `split_result_of_axis_func_pandas`).
        len_of_left: How many leading entries of `partitions` belong to the
            left data set; the remaining entries form the right data set.
        kwargs: A dictionary of keyword arguments forwarded to `func`.
        partitions: All partitions that make up the full axis (row or
            column) for both data sets, left ones first.

    Returns:
        A list of Pandas DataFrames, each one a copy of a split of the result.
    """
    left_parts = list(partitions[:len_of_left])
    right_parts = list(partitions[len_of_left:])
    left = pandas.concat(left_parts, axis=axis, copy=False)
    right = pandas.concat(right_parts, axis=axis, copy=False)
    combined = func(left, right, **kwargs)
    pieces = split_result_of_axis_func_pandas(axis, num_splits, combined)
    return [piece.copy() for piece in pieces]
def deploy_python_axis_func(axis, func, num_splits, kwargs, *partitions):
    """Apply a function to the concatenation of a full axis and split the result.

    Args:
        axis: The axis to perform the function along.
        func: Callable invoked as ``func(dataframe, **kwargs)``.
        num_splits: The number of splits to return
            (see `split_result_of_axis_func_pandas`).
        kwargs: A dictionary of keyword arguments forwarded to `func`.
        partitions: All partitions that make up the full axis (row or column).

    Returns:
        A list of Pandas DataFrames, each one a copy of a split of the result.
    """
    joined = pandas.concat(partitions, axis=axis, copy=False)
    outcome = func(joined, **kwargs)

    # The incoming partition sizes are only reused when the number of splits
    # is unchanged, the result is not a Series, and the result still has the
    # same total extent along `axis`; otherwise the splitter gets `None`.
    lengths = None
    if num_splits == len(partitions) and not isinstance(outcome, pandas.Series):
        if axis == 0:
            old_sizes = [len(part) for part in partitions]
            new_total = len(outcome)
        else:
            old_sizes = [len(part.columns) for part in partitions]
            new_total = len(outcome.columns)
        if sum(old_sizes) == new_total:
            lengths = old_sizes

    pieces = split_result_of_axis_func_pandas(axis, num_splits, outcome, lengths)
    return [piece.copy() for piece in pieces]
def from_pandas(cls, df, return_dims=False):
    """
    Build row partitions from a ``pandas.DataFrame/pandas.Series``.

    Parameters
    ----------
    df : pandas.DataFrame/pandas.Series
        The pandas object to split and upload.
    return_dims : boolean, default: False
        Whether to also return the partition dimensions.

    Returns
    -------
    list or tuple
        The partitions when `return_dims` is False, otherwise the tuple
        (partitions, row lengths, col widths).
    """
    split_count = GpuCount.get()
    put = cls._partition_class.put
    # For now, we default to row partitioning
    row_chunks = split_result_of_axis_func_pandas(0, split_count, df)

    # The i-th chunk is handed to the i-th GPU manager.
    pending = []
    for idx in range(split_count):
        pending.append(put(cls._get_gpu_managers()[idx], row_chunks[idx]))
    keys = ray.get(pending)

    created = cls._create_partitions(keys, cls._get_gpu_managers())
    parts = created.reshape((split_count, 1))
    if return_dims:
        row_lengths = [len(chunk.index) for chunk in row_chunks]
        # single value since we only have row partitions
        col_widths = [len(df.columns)]
        return parts, row_lengths, col_widths
    return parts
def _read_csv_with_offset_pandas_on_ray(fname, num_splits, start, end, kwargs, header):
    """Use a Ray task to read a chunk of a CSV into a Pandas DataFrame.

    Args:
        fname: The filename of the file to open.
        num_splits: The number of splits (partitions) to separate the
            DataFrame into.
        start: The start byte offset.
        end: The end byte offset.
        kwargs: The kwargs for the Pandas `read_csv` function.
        header: The header of the file. It is prepended to the bytes read
            from the file (opened in binary mode), so it must be `bytes`.

    Returns:
        A list containing the split Pandas DataFrames and the Index as the
        last element. If there is not `index_col` set, then we just return
        the length. This is used to determine the total length of the
        DataFrame to build a default Index.
    """
    # The context manager closes the handle even when `seek`/`read` raises,
    # which a manual open()/close() pair does not guarantee.
    with open(fname, "rb") as bio:
        bio.seek(start)
        to_read = header + bio.read(end - start)

    pandas_df = pandas.read_csv(BytesIO(to_read), **kwargs)
    # Column labels are replaced with positional ones inside the partition.
    pandas_df.columns = pandas.RangeIndex(len(pandas_df.columns))
    if kwargs.get("index_col", None) is not None:
        index = pandas_df.index
        # Partitions must have RangeIndex
        pandas_df.index = pandas.RangeIndex(0, len(pandas_df))
    else:
        # We will use the lengths to build the index if we are not given an
        # `index_col`.
        index = len(pandas_df)
    return split_result_of_axis_func_pandas(1, num_splits, pandas_df) + [index]
def deploy_axis_func(cls, axis, func, num_splits, kwargs, maintain_partitioning, *partitions):
    """Deploy a function along a full axis in Ray.

    Args:
        axis: The axis to perform the function along.
        func: The function to perform.
        num_splits: The number of splits to return
            (see `split_result_of_axis_func_pandas`)
        kwargs: A dictionary of keyword arguments. The control keys
            ``manual_partition``, ``_lengths`` and ``_transposed`` are
            consumed here and are not forwarded to `func`.
        maintain_partitioning: If True, keep the old partitioning if possible.
            If False, create a new partition layout.
        partitions: All partitions that make up the full axis (row or column)

    Returns:
        A list of Pandas DataFrames. If `func` returns a Series, the Series
        itself is returned when `num_splits` is 1; otherwise a list holding
        the Series followed by ``num_splits - 1`` empty Series.
    """
    # Pop the control keys from a shallow copy: they aren't expected by the
    # function, and the caller's dictionary must not be modified in case it
    # is reused for another call.
    kwargs = dict(kwargs)
    manual_partition = kwargs.pop("manual_partition", False)
    lengths = kwargs.pop("_lengths", None)
    transposed = kwargs.pop("_transposed", False)

    dataframe = pandas.concat(
        [set_indices_for_pandas_concat(df, transposed) for df in partitions],
        axis=axis,
        copy=False,
    )
    result = func(dataframe, **kwargs)

    if isinstance(result, pandas.Series):
        # A Series is not split: it fills the first slot and the remaining
        # slots are padded with empty Series.
        if num_splits == 1:
            return result
        return [result] + [pandas.Series([]) for _ in range(num_splits - 1)]

    if manual_partition:
        # The split function is expecting a list
        lengths = list(lengths)
    # We set lengths to None so we don't use the old lengths for the resulting
    # partition layout. This is done if the number of splits is changing or we
    # are told not to keep the old partitioning.
    elif num_splits != len(partitions) or not maintain_partitioning:
        lengths = None
    else:
        if axis == 0:
            lengths = [len(part) for part in partitions]
            if sum(lengths) != len(result):
                lengths = None
        else:
            lengths = [len(part.columns) for part in partitions]
            if sum(lengths) != len(result.columns):
                lengths = None
    return split_result_of_axis_func_pandas(axis, num_splits, result, lengths)
def deploy_axis_func(cls, axis, func, num_splits, kwargs, maintain_partitioning, *partitions):
    """
    Deploy a function along a full axis.

    Parameters
    ----------
    axis : {0, 1}
        The axis to perform the function along.
    func : callable
        The function to perform.
    num_splits : int
        The number of splits to return (see `split_result_of_axis_func_pandas`).
    kwargs : dict
        Additional keywords arguments to be passed in `func`. The control
        keys ``manual_partition`` and ``_lengths`` are consumed here and are
        not forwarded to `func`.
    maintain_partitioning : bool
        If True, keep the old partitioning if possible.
        If False, create a new partition layout.
    *partitions : iterable
        All partitions that make up the full axis (row or column).

    Returns
    -------
    list
        A list of pandas DataFrames.
    """
    # Pop the control keys from a shallow copy: they aren't expected by the
    # function, and the caller's dictionary must not be modified in case it
    # is reused for another call.
    kwargs = dict(kwargs)
    manual_partition = kwargs.pop("manual_partition", False)
    lengths = kwargs.pop("_lengths", None)

    dataframe = pandas.concat(list(partitions), axis=axis, copy=False)
    result = func(dataframe, **kwargs)

    if manual_partition:
        # The split function is expecting a list
        lengths = list(lengths)
    # We set lengths to None so we don't use the old lengths for the resulting
    # partition layout. This is done if the number of splits is changing or we
    # are told not to keep the old partitioning.
    elif num_splits != len(partitions) or not maintain_partitioning:
        lengths = None
    else:
        if axis == 0:
            lengths = [len(part) for part in partitions]
            if sum(lengths) != len(result):
                lengths = None
        else:
            lengths = [len(part.columns) for part in partitions]
            if sum(lengths) != len(result.columns):
                lengths = None
    return split_result_of_axis_func_pandas(axis, num_splits, result, lengths)
def _split_result_for_readers(axis, num_splits, df):
    """Split a freshly read DataFrame and always hand back a list.

    Args:
        axis: Which axis to split over.
        num_splits: The number of splits to create.
        df: The DataFrame after it has been read.

    Returns:
        A list of pandas DataFrames.
    """
    pieces = split_result_of_axis_func_pandas(axis, num_splits, df)
    # Wrap a non-list result so callers can rely on getting a list.
    return pieces if isinstance(pieces, list) else [pieces]
def deploy_func_between_two_axis_partitions(
    cls, axis, func, num_splits, len_of_left, other_shape, kwargs, *partitions
):
    """
    Apply a two-frame function along a full axis and split the result.

    Parameters
    ----------
    axis : {0, 1}
        The axis to perform the function along.
    func : callable
        The function to perform, invoked as ``func(left, right, **kwargs)``.
    num_splits : int
        The number of splits to return (see `split_result_of_axis_func_pandas`).
    len_of_left : int
        The number of values in `partitions` that belong to the left data set.
    other_shape : np.ndarray
        The shape of right frame in terms of partitions, i.e.
        (other_shape[i-1], other_shape[i]) will indicate slice
        to restore i-1 axis partition.
    kwargs : dict
        Additional keywords arguments to be passed in `func`.
    *partitions : iterable
        All partitions that make up the full axis (row or column) for both
        data sets.

    Returns
    -------
    list
        A list of pandas DataFrames.
    """
    left = pandas.concat(partitions[:len_of_left], axis=axis, copy=False)
    right_blocks = partitions[len_of_left:]

    # Rebuild the right frame from its flattened blocks: each consecutive
    # pair of offsets in `other_shape` bounds one group that is joined along
    # `axis`; the groups are then joined along the opposite axis.
    groups = []
    for i in range(1, len(other_shape)):
        begin, stop = other_shape[i - 1], other_shape[i]
        groups.append(pandas.concat(right_blocks[begin:stop], axis=axis, copy=False))
    right = pandas.concat(groups, axis=axis ^ 1, copy=False)

    outcome = func(left, right, **kwargs)
    return split_result_of_axis_func_pandas(axis, num_splits, outcome)
def deploy_func_between_two_axis_partitions(
    cls, axis, func, num_splits, len_of_left, other_shape, kwargs, *partitions
):
    """Apply a two-frame function along a full axis in Ray and split the result.

    Parameters
    ----------
    axis:
        The axis to perform the function along.
    func:
        The function to perform, invoked as ``func(left, right, **kwargs)``.
    num_splits:
        The number of splits to return (see `split_result_of_axis_func_pandas`).
    len_of_left:
        The number of values in `partitions` that belong to the left data set.
    other_shape:
        The shape of right frame in terms of partitions; indexed here with
        `axis` and `axis ^ 1`.
    kwargs:
        A dictionary of keyword arguments.
    partitions:
        All partitions that make up the full axis (row or column) for both
        data sets.

    Returns
    -------
    A list of Pandas DataFrames.
    """
    left = pandas.concat(partitions[:len_of_left], axis=axis, copy=False)
    right_blocks = partitions[len_of_left:]

    # Reshape the flattened right-hand blocks into a frame with shape
    # `other_shape`: runs of `other_shape[axis]` consecutive blocks are joined
    # along `axis`, then the runs are joined along the opposite axis.
    groups = []
    for i in range(other_shape[axis ^ 1]):
        run = [
            right_blocks[other_shape[axis] * i + j]
            for j in range(other_shape[axis])
        ]
        groups.append(pandas.concat(run, axis=axis, copy=False))
    right = pandas.concat(groups, axis=axis ^ 1, copy=False)

    outcome = func(left, right, **kwargs)
    return split_result_of_axis_func_pandas(axis, num_splits, outcome)
def _read_hdf_columns(path_or_buf, columns, num_splits, key, mode):
    """Use a Ray task to read columns from HDF5 into a Pandas DataFrame.

    Args:
        path_or_buf: The path (or buffer) of the HDF5 file, passed to
            `pandas.read_hdf`.
        columns: The list of column names to read.
        num_splits: The number of partitions to split the column into.
        key: The `key` argument passed to `pandas.read_hdf`.
        mode: The `mode` argument passed to `pandas.read_hdf`.

    Returns:
        A list containing the split Pandas DataFrames with the length of the
        frame's index as the last element. The length is used to build the
        Index externally.
    """
    frame = pandas.read_hdf(path_or_buf, key=key, mode=mode, columns=columns)
    chunks = split_result_of_axis_func_pandas(0, num_splits, frame)
    # Append the length of the index here to build it externally
    return chunks + [len(frame.index)]
def _read_parquet_columns(path, columns, num_splits, kwargs):
    """Use a Ray task to read columns from Parquet into a Pandas DataFrame.

    Args:
        path: The path of the Parquet file.
        columns: The list of column names to read.
        num_splits: The number of partitions to split the column into.
        kwargs: Extra keyword arguments forwarded to
            `pyarrow.parquet.read_pandas`.

    Returns:
        A list containing the split Pandas DataFrames with the length of the
        frame's index as the last element. The length is used to build the
        Index externally.
    """
    import pyarrow.parquet as pq

    table = pq.read_pandas(path, columns=columns, **kwargs)
    frame = table.to_pandas()
    chunks = split_result_of_axis_func_pandas(0, num_splits, frame)
    # Append the length of the index here to build it externally
    return chunks + [len(frame.index)]
def deploy_axis_func(
    axis, func, num_splits, kwargs, maintain_partitioning, *partitions
):
    """Apply a function to the concatenation of a full axis and split the result.

    Args:
        axis: The axis to perform the function along.
        func: Callable invoked as ``func(dataframe, **kwargs)``.
        num_splits: The number of splits to return
            (see `split_result_of_axis_func_pandas`)
        kwargs: A dictionary of keyword arguments forwarded to `func`.
        maintain_partitioning: If truthy, reuse the incoming partition sizes
            when the split count and the result's extent are unchanged.
        partitions: All partitions that make up the full axis (row or column)

    Returns:
        A list of Pandas DataFrames (copies of the splits). If `func` returns
        a Series, a list holding that Series followed by ``num_splits - 1``
        empty Series.
    """
    joined = pandas.concat(partitions, axis=axis, copy=False)
    outcome = func(joined, **kwargs)

    # NOTE: the original author flagged that pandas_on_python.py handles a
    # Series result slightly differently, and that that implementation seemed
    # wrong (uncovered by test_var). Here a Series is never split.
    if isinstance(outcome, pandas.Series):
        padding = [pandas.Series([]) for _ in range(num_splits - 1)]
        return [outcome] + padding

    lengths = None
    if num_splits == len(partitions) and maintain_partitioning:
        if axis == 0:
            old_sizes = [len(part) for part in partitions]
            new_total = len(outcome)
        else:
            old_sizes = [len(part.columns) for part in partitions]
            new_total = len(outcome.columns)
        # Old sizes are only valid if the result kept the same total extent.
        if sum(old_sizes) == new_total:
            lengths = old_sizes

    pieces = split_result_of_axis_func_pandas(axis, num_splits, outcome, lengths)
    return [piece.copy() for piece in pieces]
def from_pandas(cls, df, return_dims=False):
    """
    Build row partitions from a pandas object.

    Parameters
    ----------
    df : pandas.DataFrame
        The pandas object to split and upload.
    return_dims : bool, default: False
        Whether to also return the partition dimensions.

    Returns
    -------
    partitions or tuple
        The partitions when `return_dims` is False, otherwise the tuple
        (partitions, row lengths, col widths).
    """
    n_parts = GpuCount.get()
    upload = cls._partition_class.put
    # For now, we default to row partitioning
    frames = split_result_of_axis_func_pandas(0, n_parts, df)

    # The i-th frame is handed to the i-th GPU manager.
    futures = []
    for position in range(n_parts):
        manager = cls._get_gpu_managers()[position]
        futures.append(upload(manager, frames[position]))
    keys = ray.get(futures)

    grid = cls._create_partitions(keys, cls._get_gpu_managers())
    parts = grid.reshape((n_parts, 1))
    if return_dims:
        row_lengths = [len(frame.index) for frame in frames]
        # single value since we only have row partitions
        col_widths = [len(df.columns)]
        return parts, row_lengths, col_widths
    return parts
def _split_result_for_readers(axis, num_splits, df):  # pragma: no cover
    """
    Split the read DataFrame into smaller DataFrames, always returning a list.

    Parameters
    ----------
    axis : int
        The axis to split across (0 - index, 1 - columns).
    num_splits : int
        The number of splits to create.
    df : pandas.DataFrame
        `pandas.DataFrame` to split.

    Returns
    -------
    list
        A list of pandas DataFrames.
    """
    chunks = split_result_of_axis_func_pandas(axis, num_splits, df)
    if isinstance(chunks, list):
        return chunks
    # Wrap a non-list result so callers can rely on getting a list.
    return [chunks]
def deploy_ray_axis_func(
    axis, func, num_splits, kwargs, maintain_partitioning, *partitions
):
    """Apply a function to the concatenation of a full axis in Ray.

    Args:
        axis: The axis to perform the function along.
        func: Callable invoked as ``func(dataframe, **kwargs)``.
        num_splits: The number of splits to return
            (see `split_result_of_axis_func_pandas`)
        kwargs: A dictionary of keyword arguments forwarded to `func`.
        maintain_partitioning: If True, keep the old partitioning if possible.
            If False, create a new partition layout.
        partitions: All partitions that make up the full axis (row or column)

    Returns:
        A list of Pandas DataFrames. If `func` returns a Series, the Series
        itself is returned when `num_splits` is 1; otherwise a list holding
        the Series followed by ``num_splits - 1`` empty Series.
    """
    joined = pandas.concat(partitions, axis=axis, copy=False)
    outcome = func(joined, **kwargs)

    if isinstance(outcome, pandas.Series):
        # A Series is not split: it fills the first slot and the remaining
        # slots are padded with empty Series.
        if num_splits == 1:
            return outcome
        padding = [pandas.Series([]) for _ in range(num_splits - 1)]
        return [outcome] + padding

    # The old lengths are dropped (left as None) when the number of splits is
    # changing, when we are told not to keep the old partitioning, or when
    # the result no longer has the same total extent along `axis`.
    lengths = None
    if num_splits == len(partitions) and maintain_partitioning:
        if axis == 0:
            old_sizes = [len(part) for part in partitions]
            new_total = len(outcome)
        else:
            old_sizes = [len(part.columns) for part in partitions]
            new_total = len(outcome.columns)
        if sum(old_sizes) == new_total:
            lengths = old_sizes
    return split_result_of_axis_func_pandas(axis, num_splits, outcome, lengths)