Example #1
def merge_ordered(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_by=None,
    right_by=None,
    fill_method=None,
    suffixes=("_x", "_y"),
    how: str = "outer",
) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Perform a merge for ordered data with optional filling/interpolation.
    """
    if not isinstance(left, DataFrame):
        raise ValueError(
            "can not merge DataFrame with instance of type {}".format(
                type(left)))
    ErrorMessage.default_to_pandas("`merge_ordered`")
    if isinstance(right, DataFrame):
        right = to_pandas(right)
    return DataFrame(
        pandas.merge_ordered(
            to_pandas(left),
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_by=left_by,
            right_by=right_by,
            fill_method=fill_method,
            suffixes=suffixes,
            how=how,
        ))
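A minimal usage sketch of the pandas call this wrapper falls back to; the frames and column names below are illustrative, not taken from Modin.

import pandas

left = pandas.DataFrame({"key": ["a", "c", "e"], "lvalue": [1, 2, 3]})
right = pandas.DataFrame({"key": ["b", "c", "d"], "rvalue": [1, 2, 3]})

# Ordered outer merge on "key"; gaps on each side are forward-filled.
print(pandas.merge_ordered(left, right, on="key", fill_method="ffill", how="outer"))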
Example #2
    def get_indices(self, axis=0, index_func=None, old_blocks=None):
        """This gets the internal indices stored in the partitions.

        Note: These are the global indices of the object. This is mostly useful
            when you have deleted rows/columns internally, but do not know
            which ones were deleted.

        Args:
            axis: The axis to extract the labels from (0 - index, 1 - columns).
            index_func: The function used to extract the indices.
            old_blocks: An optional previous object that this object was
                created from. This is used to compute the correct offsets.

        Returns:
            A Pandas Index object.
        """
        ErrorMessage.catch_bugs_and_request_email(not callable(index_func))
        if axis == 0:
            func = self.preprocess_func(index_func)
            # We grab the first column of blocks and extract the indices
            new_indices = [
                idx.apply(func).get() for idx in self.partitions.T[0]
            ]
            # This is important because sometimes we have resized the data. The new
            # sizes will not be valid if we are trying to compute the index on a
            # new object that has a different length.
            if old_blocks is not None:
                cumulative_block_lengths = np.array(
                    old_blocks.block_lengths).cumsum()
            else:
                cumulative_block_lengths = np.array(
                    self.block_lengths).cumsum()
        else:
            func = self.preprocess_func(index_func)
            new_indices = [idx.apply(func).get() for idx in self.partitions[0]]

            if old_blocks is not None:
                cumulative_block_lengths = np.array(
                    old_blocks.block_widths).cumsum()
            else:
                cumulative_block_lengths = np.array(self.block_widths).cumsum()
        full_indices = new_indices[0]
        if old_blocks is not None:
            for i in range(len(new_indices)):
                # If the length is 0 there is nothing to append.
                if i == 0 or len(new_indices[i]) == 0:
                    continue
                # The try-except here is intended to catch issues where we are
                # trying to get a string index out of the internal index.
                try:
                    append_val = new_indices[i] + cumulative_block_lengths[i - 1]
                except TypeError:
                    append_val = new_indices[i]

                full_indices = full_indices.append(append_val)
        else:
            full_indices = full_indices.append(new_indices[1:])
        return full_indices
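A sketch of the offsetting step above without Modin internals: local per-partition indices are shifted by the cumulative lengths of the preceding partitions so that they line up as one global index (the partition sizes are made up).

import numpy as np
import pandas

old_block_lengths = [4, 3, 5]                       # hypothetical partition sizes
local_indices = [pandas.RangeIndex(n) for n in old_block_lengths]

cumulative_block_lengths = np.array(old_block_lengths).cumsum()  # [4, 7, 12]
full_indices = local_indices[0]
for i in range(1, len(local_indices)):
    if len(local_indices[i]) == 0:
        continue
    # Shift each local index by the total length of all partitions before it.
    full_indices = full_indices.append(
        local_indices[i] + cumulative_block_lengths[i - 1])
print(list(full_indices))                           # [0, 1, 2, ..., 11]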
Example #3
    def _apply_index_objs(self, axis=None):
        """Lazily applies the index object (Index or Columns) to the partitions.

        Args:
            axis: The axis to apply to, None applies to both axes.

        Returns:
            None. ``self._partitions`` is replaced with a new 2D array of partitions
            that have the index assignment added to their call queues.
        """
        self._filter_empties()
        if axis is None or axis == 0:
            cum_row_lengths = np.cumsum([0] + self._row_lengths)
        if axis is None or axis == 1:
            cum_col_widths = np.cumsum([0] + self._column_widths)

        if axis is None:

            def apply_idx_objs(df, idx, cols):
                df.index, df.columns = idx, cols
                return df

            self._partitions = np.array([[
                self._partitions[i][j].add_to_apply_calls(
                    apply_idx_objs,
                    idx=self.index[slice(cum_row_lengths[i],
                                         cum_row_lengths[i + 1])],
                    cols=self.columns[slice(cum_col_widths[j],
                                            cum_col_widths[j + 1])],
                ) for j in range(len(self._partitions[i]))
            ] for i in range(len(self._partitions))])
        elif axis == 0:

            def apply_idx_objs(df, idx):
                df.index = idx
                return df

            self._partitions = np.array([[
                self._partitions[i][j].add_to_apply_calls(
                    apply_idx_objs,
                    idx=self.index[slice(cum_row_lengths[i],
                                         cum_row_lengths[i + 1])],
                ) for j in range(len(self._partitions[i]))
            ] for i in range(len(self._partitions))])
        elif axis == 1:

            def apply_idx_objs(df, cols):
                df.columns = cols
                return df

            self._partitions = np.array([[
                self._partitions[i][j].add_to_apply_calls(
                    apply_idx_objs,
                    cols=self.columns[slice(cum_col_widths[j],
                                            cum_col_widths[j + 1])],
                ) for j in range(len(self._partitions[i]))
            ] for i in range(len(self._partitions))])
        else:
            # Sanity check: any other axis value indicates an internal bug.
            ErrorMessage.catch_bugs_and_request_email(axis is not None
                                                      and axis not in [0, 1])
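The slicing scheme used above, shown standalone: np.cumsum([0] + lengths) gives the boundaries that cut one global label set into per-partition pieces (the lengths below are made up).

import numpy as np
import pandas

row_lengths = [2, 3, 2]                         # hypothetical partition row counts
index = pandas.Index(list("abcdefg"))

cum_row_lengths = np.cumsum([0] + row_lengths)  # [0, 2, 5, 7]
pieces = [index[cum_row_lengths[i]:cum_row_lengths[i + 1]]
          for i in range(len(row_lengths))]
print(pieces)  # [Index(['a', 'b']), Index(['c', 'd', 'e']), Index(['f', 'g'])]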
Example #4
    def read_hdf(cls, path_or_buf, **kwargs):
        """Load a h5 file from the file path or buffer, returning a DataFrame.

        Args:
            path_or_buf: string, buffer or path object
                Path to the file to open, or an open :class:`pandas.HDFStore` object.
            kwargs: Pass into pandas.read_hdf function.

        Returns:
            DataFrame constructed from the h5 file.
        """
        if cls.read_hdf_remote_task is None:
            return super(RayIO, cls).read_hdf(path_or_buf, **kwargs)

        format = cls._validate_hdf_format(path_or_buf=path_or_buf)

        if format is None:
            ErrorMessage.default_to_pandas(
                "File format seems to be `fixed`. For better distribution consider saving the file in `table` format. "
                "df.to_hdf(format=`table`).")
            return cls.from_pandas(
                pandas.read_hdf(path_or_buf=path_or_buf, **kwargs))

        columns = kwargs.get("columns", None)
        if not columns:
            start = kwargs.pop("start", None)
            stop = kwargs.pop("stop", None)
            empty_pd_df = pandas.read_hdf(path_or_buf,
                                          start=0,
                                          stop=0,
                                          **kwargs)
            kwargs["start"] = start
            kwargs["stop"] = stop
            columns = empty_pd_df.columns

        num_partitions = cls.frame_mgr_cls._compute_num_partitions()
        num_splits = min(len(columns), num_partitions)
        # Number of columns to put in each partition (ceiling division).
        column_splits = (len(columns) // num_partitions if len(columns) %
                         num_partitions == 0 else
                         len(columns) // num_partitions + 1)
        # Each item in this list will be a list of column names of the original df
        col_partitions = [
            columns[i:i + column_splits]
            for i in range(0, len(columns), column_splits)
        ]
        blk_partitions = np.array([
            cls.read_hdf_remote_task._remote(
                args=(path_or_buf, cols, num_splits, kwargs),
                num_return_vals=num_splits + 1,
            ) for cols in col_partitions
        ]).T
        remote_partitions = np.array(
            [[cls.frame_partition_cls(obj) for obj in row]
             for row in blk_partitions[:-1]])
        index_len = ray.get(blk_partitions[-1][0])
        index = pandas.RangeIndex(index_len)
        new_query_compiler = cls.query_compiler_cls(
            cls.frame_mgr_cls(remote_partitions), index, columns)
        return new_query_compiler
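The column-chunking arithmetic from the reader above, run on a made-up column list.

columns = [f"col{i}" for i in range(10)]
num_partitions = 4

# Ceiling division: number of columns assigned to each partition.
column_splits = (len(columns) // num_partitions
                 if len(columns) % num_partitions == 0
                 else len(columns) // num_partitions + 1)
col_partitions = [columns[i:i + column_splits]
                  for i in range(0, len(columns), column_splits)]
print(col_partitions)
# [['col0', 'col1', 'col2'], ['col3', 'col4', 'col5'], ['col6', 'col7', 'col8'], ['col9']]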
Example #5
    def read(cls, path, engine, columns, **kwargs):
        """Load a parquet object from the file path, returning a DataFrame.
           Ray DataFrame only supports the pyarrow engine for now.

        Args:
            path: The filepath of the parquet file.
                  We only support local files for now.
            columns: The columns to read from the file.
            engine: Ray only supports the pyarrow reader.
                    This argument doesn't do anything for now.
            kwargs: Pass into parquet's read_pandas function.

        Notes:
            ParquetFile API is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/parquet.html
        """
        from pyarrow.parquet import ParquetFile, ParquetDataset
        from modin.pandas.io import PQ_INDEX_REGEX

        if os.path.isdir(path):
            partitioned_columns = set()
            directory = True
            original_path = path
            # We do a tree walk of the path directory because partitioned
            # parquet directories have a unique column at each directory level.
            # Thus, we can use os.walk(), which does a depth-first search, to
            # walk through the different columns that the data is partitioned on.
            for (root, dir_names, files) in os.walk(path):
                if dir_names:
                    partitioned_columns.add(dir_names[0].split("=")[0])
                if files:
                    # Skip hidden files (metadata files, git files, .DS_Store)
                    if files[0][0] == ".":
                        continue
                    path = os.path.join(root, files[0])
                    break
            partitioned_columns = list(partitioned_columns)
            if len(partitioned_columns):
                ErrorMessage.default_to_pandas(
                    "Partitioned Columns in Parquet")
                return cls.single_worker_read(original_path,
                                              engine=engine,
                                              columns=columns,
                                              **kwargs)
        else:
            directory = False

        if not columns:
            if directory:
                # Path of the sample file that we will read to get the remaining columns
                pd = ParquetDataset(path)
                column_names = pd.schema.names
            else:
                pf = ParquetFile(path)
                column_names = pf.metadata.schema.names
            columns = [
                name for name in column_names if not PQ_INDEX_REGEX.match(name)
            ]
        return cls.build_query_compiler(path, columns, **kwargs)
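A standalone look at the directory walk above: hive-style partitioned Parquet encodes each partition column as a column=value directory level, so the partition columns can be read off the directory names (the layout below is fabricated for illustration).

import os
import tempfile

root = tempfile.mkdtemp()
os.makedirs(os.path.join(root, "year=2020", "month=01"))
os.makedirs(os.path.join(root, "year=2021", "month=02"))

partitioned_columns = set()
for _base, dir_names, _files in os.walk(root):
    if dir_names:
        partitioned_columns.add(dir_names[0].split("=")[0])
print(sorted(partitioned_columns))  # ['month', 'year']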
Example #6
File: io.py Project: prutskov/modin
def read_orc(path,
             columns: Optional[List[str]] = None,
             **kwargs) -> DataFrame:  # noqa: PR01, RT01, D200
    """
    Load an ORC object from the file path, returning a DataFrame.
    """
    ErrorMessage.default_to_pandas("read_orc")
    Engine.subscribe(_update_engine)
    return DataFrame(pandas.read_orc(path, columns, **kwargs))
Example #7
    def _index_grouped(self):
        """
        Implement [METHOD_NAME].

        TODO: Add more details for this docstring template.

        Parameters
        ----------
        What arguments does this function have.
        [
        PARAMETER_NAME: PARAMETERS TYPES
            Description.
        ]

        Returns
        -------
        What this returns (if anything)
        """
        if self._index_grouped_cache is None:
            if hasattr(self._by, "columns") and len(self._by.columns) > 1:
                by = list(self._by.columns)
                is_multi_by = True
            else:
                by = self._by
                is_multi_by = self._is_multi_by
            if is_multi_by:
                # Because we are doing a collect (to_pandas) here and then groupby, we
                # end up using pandas implementation. Add the warning so the user is
                # aware.
                ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
                ErrorMessage.default_to_pandas("Groupby with multiple columns")
                if isinstance(by, list) and all(
                    hashable(o)
                    and (
                        o in self._df
                        or o in self._df._query_compiler.get_index_names(self._axis)
                    )
                    for o in by
                ):
                    pandas_df = self._df._query_compiler.getitem_column_array(
                        by
                    ).to_pandas()
                else:
                    by = try_cast_to_pandas(by, squeeze=True)
                    pandas_df = self._df._to_pandas()
                self._index_grouped_cache = pandas_df.groupby(by=by).groups
            else:
                if isinstance(self._by, type(self._query_compiler)):
                    by = self._by.to_pandas().squeeze().values
                else:
                    by = self._by
                if self._axis == 0:
                    self._index_grouped_cache = self._index.groupby(by)
                else:
                    self._index_grouped_cache = self._columns.groupby(by)
        return self._index_grouped_cache
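What the cached value looks like, sketched with plain pandas: both the multi-column path (DataFrame.groupby(...).groups) and the axis-label path (Index.groupby) yield a mapping of group key to the labels in that group.

import pandas

df = pandas.DataFrame({"a": [1, 1, 2], "b": [10, 20, 30]})
print(df.groupby(by="a").groups)         # labels per group: {1: [0, 1], 2: [2]}
print(df.index.groupby(df["a"].values))  # same mapping built from the axis labels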
Example #8
    def to_numpy(self):
        """Converts Modin DataFrame to NumPy Array.

        Returns:
            NumPy Array of the QueryCompiler.
        """
        arr = self._modin_frame.to_numpy()
        ErrorMessage.catch_bugs_and_request_email(
            len(arr) != len(self.index) or len(arr[0]) != len(self.columns))
        return arr
Example #9
 def single_worker_read(cls, fname, **kwargs):
     """Read the whole file with a single worker, defaulting to pandas."""
     ErrorMessage.default_to_pandas("Parameters provided")
     # Use default args for everything
     pandas_frame = cls.parse(fname, **kwargs)
     if isinstance(pandas_frame, pandas.io.parsers.TextFileReader):
         pd_read = pandas_frame.read
         pandas_frame.read = lambda *args, **kwargs: cls.query_compiler_cls.from_pandas(
             pd_read(*args, **kwargs), cls.frame_cls)
         return pandas_frame
     return cls.query_compiler_cls.from_pandas(pandas_frame, cls.frame_cls)
Example #10
    def __getitem__(self, row_lookup, col_lookup, ndim):
        """
        Retrieve dataset according to `row_lookup` and `col_lookup`.

        Parameters
        ----------
        row_lookup : slice(None), range or np.ndarray
            The global row index to retrieve data from.
        col_lookup : slice(None), range or np.ndarray
            The global col index to retrieve data from.
        ndim : {0, 1, 2}
            Number of dimensions in dataset to be retrieved.

        Returns
        -------
        modin.pandas.DataFrame or modin.pandas.Series
            Located dataset.

        Notes
        -----
        Usage of `slice(None)` as a lookup is a hack to pass information about
        full-axis grab without computing actual indices that triggers lazy computations.
        Ideally, this API should get rid of using slices as indexers and either use a
        common ``Indexer`` object or range and ``np.ndarray`` only.
        """
        if isinstance(row_lookup, slice):
            ErrorMessage.catch_bugs_and_request_email(
                failure_condition=row_lookup != slice(None),
                extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {row_lookup}",
            )
            row_lookup = None
        if isinstance(col_lookup, slice):
            ErrorMessage.catch_bugs_and_request_email(
                failure_condition=col_lookup != slice(None),
                extra_log=f"Only None-slices are acceptable as a slice argument in masking, got: {col_lookup}",
            )
            col_lookup = None
        qc_view = self.qc.view(row_lookup, col_lookup)
        if ndim == 2:
            return self.df.__constructor__(query_compiler=qc_view)
        if isinstance(self.df, Series) and not self.row_scalar:
            return self.df.__constructor__(query_compiler=qc_view)
        if isinstance(self.df, Series):
            axis = 0
        elif ndim == 0:
            axis = None
        else:
            axis = (
                None
                if self.col_scalar and self.row_scalar
                else 1
                if self.col_scalar
                else 0
            )
        return self.df.__constructor__(query_compiler=qc_view).squeeze(axis=axis)
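The squeeze rules at the end of the method, mirrored with plain pandas: which axis collapses depends on whether the row lookup, the column lookup, or both were scalar.

import pandas

df = pandas.DataFrame({"a": [1, 2], "b": [3, 4]})
print(df.iloc[[0], :].squeeze(axis=0))       # single row    -> Series
print(df.iloc[:, [0]].squeeze(axis=1))       # single column -> Series
print(df.iloc[[0], [0]].squeeze(axis=None))  # single cell   -> scalar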
Example #11
def get_dummies(
    data,
    prefix=None,
    prefix_sep="_",
    dummy_na=False,
    columns=None,
    sparse=False,
    drop_first=False,
    dtype=None,
):
    """Convert categorical variable into indicator variables.

    Args:
        data (array-like, Series, or DataFrame): data to encode.
        prefix (string, [string]): Prefix to apply to each encoded column
                                   label.
        prefix_sep (string, [string]): Separator between prefix and value.
        dummy_na (bool): Add a column to indicate NaNs.
        columns: Which columns to encode.
        sparse (bool): Not Implemented: If True, returns SparseDataFrame.
        drop_first (bool): Whether to remove the first level of encoded data.
        dtype: The dtype for the get_dummies call.

    Returns:
        DataFrame with the one-hot encoded data.
    """
    if sparse:
        raise NotImplementedError("SparseDataFrame is not implemented. "
                                  "To contribute to Modin, please visit "
                                  "github.com/modin-project/modin.")
    if not isinstance(data, DataFrame):
        ErrorMessage.default_to_pandas("`get_dummies` on non-DataFrame")
        if isinstance(data, Series):
            data = data._to_pandas()
        return DataFrame(
            pandas.get_dummies(
                data,
                prefix=prefix,
                prefix_sep=prefix_sep,
                dummy_na=dummy_na,
                columns=columns,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            ))
    else:
        new_manager = data._query_compiler.get_dummies(
            columns,
            prefix=prefix,
            prefix_sep=prefix_sep,
            dummy_na=dummy_na,
            drop_first=drop_first,
            dtype=dtype,
        )
        return DataFrame(query_compiler=new_manager)
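A minimal usage sketch of the pandas call used on the non-DataFrame fallback path; the data is made up.

import pandas

colors = pandas.Series(["red", "green", "red"], name="color")
print(pandas.get_dummies(colors, prefix="color", dummy_na=False, drop_first=False))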
Example #12
    def check_parameters_support(
        cls,
        filepath_or_buffer,
        read_kwargs: dict,
        skiprows_md: Union[Sequence, callable, int],
        header_size: int,
    ) -> bool:
        """
        Check support of only general parameters of `read_*` function.

        Parameters
        ----------
        filepath_or_buffer : str, path object or file-like object
            `filepath_or_buffer` parameter of `read_*` function.
        read_kwargs : dict
            Parameters of `read_*` function.
        skiprows_md : int, array or callable
            `skiprows` parameter modified for easier handling by Modin.
        header_size : int
            Number of rows used by the header.

        Returns
        -------
        bool
            Whether passed parameters are supported or not.
        """
        skiprows = read_kwargs.get("skiprows")
        if isinstance(filepath_or_buffer, str):
            if not cls.file_exists(filepath_or_buffer):
                return False
        elif not cls.pathlib_or_pypath(filepath_or_buffer):
            return False

        if read_kwargs["chunksize"] is not None:
            return False

        skiprows_supported = True
        if is_list_like(skiprows_md) and skiprows_md[0] < header_size:
            skiprows_supported = False
        elif callable(skiprows):
            # check if `skiprows` callable gives True for any of header indices
            is_intersection = any(
                cls._get_skip_mask(pandas.RangeIndex(header_size), skiprows))
            if is_intersection:
                skiprows_supported = False

        if not skiprows_supported:
            ErrorMessage.single_warning(
                "Values of the `header` and `skiprows` parameters intersect. "
                "This case is unsupported by Modin, so the pandas implementation "
                "will be used."
            )
            return False

        return True
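The intersection check above, sketched without the Modin-internal _get_skip_mask helper: a callable skiprows conflicts with the header when it returns True for any of the header row positions.

import pandas

header_size = 2                  # header occupies rows 0 and 1

def skiprows(n):
    # hypothetical callable a user might pass: skip every even row
    return n % 2 == 0

is_intersection = any(skiprows(i) for i in pandas.RangeIndex(header_size))
print(is_intersection)           # True -> fall back to pandas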
Example #13
 def read_pickle(
     cls, filepath_or_buffer, compression="infer", storage_options=None
 ):  # noqa: PR01
     ErrorMessage.default_to_pandas("`read_pickle`")
     return cls.from_pandas(
         pandas.read_pickle(
             filepath_or_buffer,
             compression=compression,
             storage_options=storage_options,
         )
     )
Example #14
    def to_csv(cls, obj, **kwargs):  # noqa: PR01
        """
        Write object to a comma-separated values (CSV) file using pandas.

        For parameters description please refer to pandas API.
        """
        ErrorMessage.default_to_pandas("`to_csv`")
        if isinstance(obj, BaseQueryCompiler):
            obj = obj.to_pandas()

        return obj.to_csv(**kwargs)
Example #15
 def read_msgpack(cls,
                  path_or_buf,
                  encoding="utf-8",
                  iterator=False,
                  **kwargs):
     ErrorMessage.default_to_pandas("`read_msgpack`")
     return cls.from_pandas(
         pandas.read_msgpack(path_or_buf,
                             encoding=encoding,
                             iterator=iterator,
                             **kwargs))
Example #16
File: io.py Project: RehanSD/modin
    def to_parquet(cls, obj, **kwargs):  # noqa: PR01
        """
        Write object to the binary parquet format using pandas.

        For parameters description please refer to pandas API.
        """
        ErrorMessage.default_to_pandas("`to_parquet`")
        if isinstance(obj, BaseQueryCompiler):
            obj = obj.to_pandas()

        return obj.to_parquet(**kwargs)
Example #17
def wide_to_long(df, stubnames, i, j, sep="", suffix=r"\d+"):
    """Unpivot a DataFrame from wide to long format, defaulting to pandas."""
    if not isinstance(df, DataFrame):
        raise ValueError(
            "can not wide_to_long with instance of type {}".format(type(df)))
    ErrorMessage.default_to_pandas("`wide_to_long`")
    return DataFrame(
        pandas.wide_to_long(to_pandas(df),
                            stubnames,
                            i,
                            j,
                            sep=sep,
                            suffix=suffix))
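A minimal usage sketch of the pandas call being wrapped; the frame and column names are illustrative.

import pandas

df = pandas.DataFrame({"id": [1, 2], "x2019": [10, 20], "x2020": [30, 40]})
# Columns "x2019"/"x2020" are gathered under stub "x", with the year as the new "j" level.
print(pandas.wide_to_long(df, stubnames="x", i="id", j="year"))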
Example #18
 def to_pickle(cls, obj, path, compression="infer", protocol=4):
     if protocol == 4:
         # Force the default (highest available) pickling protocol, which differs
         # between Python versions.
         protocol = -1
     ErrorMessage.default_to_pandas("`to_pickle`")
     if isinstance(obj, BaseQueryCompiler):
         return pandas.to_pickle(
             obj.to_pandas(), path, compression=compression, protocol=protocol
         )
     else:
         return pandas.to_pickle(
             obj, path, compression=compression, protocol=protocol
         )
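Background for the protocol == 4 -> -1 mapping above, sketched with the standard pickle module: -1 selects the highest protocol available to the running interpreter.

import pickle

print(pickle.HIGHEST_PROTOCOL)      # e.g. 5 on Python 3.8+
data = pickle.dumps([1, 2, 3], protocol=-1)
print(pickle.loads(data))           # [1, 2, 3]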
Example #19
 def read_feather(
     cls, path, columns=None, use_threads=True, storage_options=None
 ):  # noqa: PR01
     ErrorMessage.default_to_pandas("`read_feather`")
     return cls.from_pandas(
         pandas.read_feather(
             path,
             columns=columns,
             use_threads=use_threads,
             storage_options=storage_options,
         )
     )
Example #20
 def read_parquet(cls, path, engine, columns, storage_options,
                  use_nullable_dtypes, **kwargs):  # noqa: PR01
     ErrorMessage.default_to_pandas("`read_parquet`")
     return cls.from_pandas(
         pandas.read_parquet(
             path,
             engine=engine,
             columns=columns,
             storage_options=storage_options,
             use_nullable_dtypes=use_nullable_dtypes,
             **kwargs,
         ))
Example #21
 def read_fwf(cls,
              filepath_or_buffer,
              colspecs="infer",
              widths=None,
              infer_nrows=100,
              **kwds):
     ErrorMessage.default_to_pandas("`read_fwf`")
     return cls.from_pandas(
         pandas.read_fwf(filepath_or_buffer,
                         colspecs=colspecs,
                         widths=widths,
                         infer_nrows=infer_nrows,
                         **kwds))
Example #22
 def read_excel(
     cls,
     io,
     sheet_name=0,
     header=0,
     names=None,
     index_col=None,
     usecols=None,
     squeeze=False,
     dtype=None,
     engine=None,
     converters=None,
     true_values=None,
     false_values=None,
     skiprows=None,
     nrows=None,
     na_values=None,
     parse_dates=False,
     date_parser=None,
     thousands=None,
     comment=None,
     skipfooter=0,
     convert_float=True,
     **kwds
 ):
     ErrorMessage.default_to_pandas("`read_excel`")
     kwargs = {
         "io": io,
         "sheet_name": sheet_name,
         "header": header,
         "skiprows": skiprows,
         "nrows": nrows,
         "index_col": index_col,
         "names": names,
         "usecols": usecols,
         "parse_dates": parse_dates,
         "date_parser": date_parser,
         "na_values": na_values,
         "thousands": thousands,
         "comment": comment,
         "convert_float": convert_float,
         "converters": converters,
         "dtype": dtype,
         "true_values": true_values,
         "false_values": false_values,
         "engine": engine,
         "squeeze": squeeze,
         "skipfooter": skipfooter,
     }
     kwargs.update(kwds)
     return cls.from_pandas(pandas.read_excel(**kwargs))
Example #23
def json_normalize(
    data: Union[Dict, List[Dict]],
    record_path: Optional[Union[str, List]] = None,
    meta: Optional[Union[str, List[Union[str, List[str]]]]] = None,
    meta_prefix: Optional[str] = None,
    record_prefix: Optional[str] = None,
    errors: Optional[str] = "raise",
    sep: str = ".",
    max_level: Optional[int] = None,
) -> DataFrame:
    ErrorMessage.default_to_pandas("json_normalize")
    return DataFrame(
        pandas.json_normalize(data, record_path, meta, meta_prefix,
                              record_prefix, errors, sep, max_level))
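A minimal usage sketch of the pandas call being wrapped; the nested records are made up.

import pandas

records = [{"id": 1, "user": {"name": "ann"}},
           {"id": 2, "user": {"name": "bo"}}]
print(pandas.json_normalize(records, sep="."))
#    id user.name
# 0   1       ann
# 1   2        bo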
Example #24
File: io.py Project: wzhang1/modin
 def to_pickle(cls, obj, path, compression="infer", protocol=4):
     if protocol == 4:
         # This forces pandas to use default pickling, which is different for python3
         # and python2
         protocol = -1
     ErrorMessage.default_to_pandas("`to_pickle`")
     if isinstance(obj, BaseQueryCompiler):
         return pandas.to_pickle(
             obj.to_pandas(), path, compression=compression, protocol=protocol
         )
     else:
         return pandas.to_pickle(
             obj, path, compression=compression, protocol=protocol
         )
Example #25
    def _apply_index_objs(self, axis=None):
        """Eagerly applies the index object (Index or Columns) to the partitions.

        Args:
            axis: The axis to apply to, None applies to both axes.

        Returns
        -------
            A new 2D array of partitions that have the index assignment added to the
            call queue.
        """
        ErrorMessage.catch_bugs_and_request_email(axis is not None
                                                  and axis not in [0, 1])

        cum_row_lengths = np.cumsum([0] + self._row_lengths)
        cum_col_widths = np.cumsum([0] + self._column_widths)

        def apply_idx_objs(df, idx, cols, axis):
            # cudf does not support set_axis; it only supports rename with a
            # 1-to-1 mapping. Therefore, we build dictionaries that map the
            # current labels to the new ones for each axis.
            idx = {df.index[i]: idx[i] for i in range(len(idx))}
            cols = {df.columns[i]: cols[i] for i in range(len(cols))}

            if axis == 0:
                return df.rename(index=idx)
            elif axis == 1:
                return df.rename(columns=cols)
            else:
                return df.rename(index=idx, columns=cols)

        keys = np.array([[
            self._partitions[i][j].apply(
                apply_idx_objs,
                idx=self.index[slice(cum_row_lengths[i],
                                     cum_row_lengths[i + 1])],
                cols=self.columns[slice(cum_col_widths[j],
                                        cum_col_widths[j + 1])],
                axis=axis,
            ) for j in range(len(self._partitions[i]))
        ] for i in range(len(self._partitions))])

        self._partitions = np.array([[
            cuDFOnRayFramePartition(
                self._partitions[i][j].get_gpu_manager(),
                keys[i][j],
                self._partitions[i][j]._length_cache,
                self._partitions[i][j]._width_cache,
            ) for j in range(len(keys[i]))
        ] for i in range(len(keys))])
Example #26
    def synchronize_labels(self, axis=None):
        """
        Synchronize labels by applying the index object (Index or Columns) to the partitions eagerly.

        Parameters
        ----------
        axis : {0, 1, None}, default: None
            The axis to apply to. If None, it applies to both axes.
        """
        ErrorMessage.catch_bugs_and_request_email(axis is not None
                                                  and axis not in [0, 1])

        cum_row_lengths = np.cumsum([0] + self._row_lengths)
        cum_col_widths = np.cumsum([0] + self._column_widths)

        def apply_idx_objs(df, idx, cols, axis):
            # cudf does not support set_axis; it only supports rename with a
            # 1-to-1 mapping. Therefore, we build dictionaries that map the
            # current labels to the new ones for each axis.
            idx = {df.index[i]: idx[i] for i in range(len(idx))}
            cols = {df.columns[i]: cols[i] for i in range(len(cols))}

            if axis == 0:
                return df.rename(index=idx)
            elif axis == 1:
                return df.rename(columns=cols)
            else:
                return df.rename(index=idx, columns=cols)

        keys = np.array([[
            self._partitions[i][j].apply(
                apply_idx_objs,
                idx=self.index[slice(cum_row_lengths[i],
                                     cum_row_lengths[i + 1])],
                cols=self.columns[slice(cum_col_widths[j],
                                        cum_col_widths[j + 1])],
                axis=axis,
            ) for j in range(len(self._partitions[i]))
        ] for i in range(len(self._partitions))])

        self._partitions = np.array([[
            cuDFOnRayDataframePartition(
                self._partitions[i][j].get_gpu_manager(),
                keys[i][j],
                self._partitions[i][j]._length_cache,
                self._partitions[i][j]._width_cache,
            ) for j in range(len(keys[i]))
        ] for i in range(len(keys))])
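The rename-based relabeling used in the two snippets above, shown with plain pandas instead of cudf: one {old label -> new label} dict per axis is equivalent to assigning the new labels directly.

import pandas

df = pandas.DataFrame([[1, 2], [3, 4]])          # default integer labels on both axes
new_index, new_cols = ["r0", "r1"], ["c0", "c1"]

idx = {df.index[i]: new_index[i] for i in range(len(new_index))}
cols = {df.columns[i]: new_cols[i] for i in range(len(new_cols))}
print(df.rename(index=idx, columns=cols))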
Example #27
    def from_pandas(cls, df):
        new_index = df.index
        new_columns = df.columns
        # If there is a non-trivial index, we put it into columns.
        # That's what we usually have for arrow tables and execution
        # results. An unnamed index is renamed to __index__. Also, all
        # columns get an 'F_' prefix to handle names unsupported in
        # OmniSci.
        if cls._is_trivial_index(df.index):
            index_cols = None
        else:
            orig_index_names = df.index.names
            orig_df = df

            index_cols = [
                f"__index__{i}_{'__None__' if n is None else n}"
                for i, n in enumerate(df.index.names)
            ]
            df.index.names = index_cols
            df = df.reset_index()

            orig_df.index.names = orig_index_names
        new_dtypes = df.dtypes
        df = df.add_prefix("F_")

        (
            new_parts,
            new_lengths,
            new_widths,
            unsupported_cols,
        ) = cls._frame_mgr_cls.from_pandas(df, True)

        if len(unsupported_cols) > 0:
            ErrorMessage.single_warning(
                f"Frame contains columns with unsupported data types: {unsupported_cols}. "
                "All operations with this frame will default to pandas!")

        return cls(
            new_parts,
            new_index,
            new_columns,
            new_lengths,
            new_widths,
            dtypes=new_dtypes,
            index_cols=index_cols,
            has_unsupported_data=len(unsupported_cols) > 0,
        )
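The renaming scheme above, applied with plain pandas on a tiny made-up frame: an unnamed index level becomes an __index__{i}___None__ column and every column gets the F_ prefix.

import pandas

df = pandas.DataFrame({"x": [1, 2]}, index=pandas.Index([10, 20]))  # unnamed index

index_cols = [
    f"__index__{i}_{'__None__' if n is None else n}"
    for i, n in enumerate(df.index.names)
]
df.index.names = index_cols
df = df.reset_index().add_prefix("F_")
print(df.columns.tolist())  # ['F___index__0___None__', 'F_x']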
Example #28
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
):
    """Read SQL query or database table into a DataFrame.

    Args:
        sql: string or SQLAlchemy Selectable (select or text object) SQL query to be executed or a table name.
        con: SQLAlchemy connectable (engine/connection) or database string URI or DBAPI2 connection (fallback mode)
        index_col: Column(s) to set as index (MultiIndex).
        coerce_float: Attempts to convert values of non-string, non-numeric objects (like decimal.Decimal) to
                      floating point, useful for SQL result sets.
        params: List of parameters to pass to execute method. The syntax used
                to pass parameters is database driver dependent. Check your
                database driver documentation for which of the five syntax styles,
                described in PEP 249's paramstyle, is supported.
        parse_dates:
                     - List of column names to parse as dates.
                     - Dict of ``{column_name: format string}`` where format string is
                       strftime compatible in case of parsing string times, or is one of
                       (D, s, ns, ms, us) in case of parsing integer timestamps.
                     - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
                       to the keyword arguments of :func:`pandas.to_datetime`
                       Especially useful with databases without native Datetime support,
                       such as SQLite.
        columns: List of column names to select from SQL table (only used when reading a table).
        chunksize: If specified, return an iterator where `chunksize` is the number of rows to include in each chunk.

    Returns:
        Modin DataFrame
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.dispatcher import EngineDispatcher

    if kwargs.get("chunksize") is not None:
        ErrorMessage.default_to_pandas("Parameters provided [chunksize]")
        df_gen = pandas.read_sql(**kwargs)
        return (DataFrame(query_compiler=EngineDispatcher.from_pandas(df))
                for df in df_gen)
    return DataFrame(query_compiler=EngineDispatcher.read_sql(**kwargs))
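A usage sketch of the chunksize fallback path above against an in-memory SQLite database; the table name and data are made up.

import sqlite3

import pandas

con = sqlite3.connect(":memory:")
pandas.DataFrame({"a": range(5)}).to_sql("t", con, index=False)

# With chunksize, pandas.read_sql returns an iterator of DataFrames.
for chunk in pandas.read_sql("SELECT * FROM t", con, chunksize=2):
    print(len(chunk))  # 2, 2, 1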
Example #29
    def to_pickle(cls, obj, path, compression="infer", protocol=4):  # noqa: PR01
        """
        Pickle (serialize) object to file using pandas.

        For parameters description please refer to pandas API.
        """
        if protocol == 4:
            protocol = -1
        ErrorMessage.default_to_pandas("`to_pickle`")
        if isinstance(obj, BaseQueryCompiler):
            return pandas.to_pickle(
                obj.to_pandas(), path, compression=compression, protocol=protocol
            )
        else:
            return pandas.to_pickle(
                obj, path, compression=compression, protocol=protocol
            )
Example #30
    def read_parquet(cls, path, engine, columns, **kwargs):
        """Load a parquet object from the file path, returning a Modin DataFrame.
           Modin only supports the pyarrow engine for now.

        Args:
            path: The filepath of the parquet file.
                  We only support local files for now.
            columns: The columns to read from the file.
            engine: Modin only supports the pyarrow reader.
                    This argument doesn't do anything for now.
            kwargs: Pass into parquet's read_pandas function.

        Notes:
            ParquetFile API is used. Please refer to the documentation here
            https://arrow.apache.org/docs/python/parquet.html
        """
        ErrorMessage.default_to_pandas("`read_parquet`")
        return cls.from_pandas(pandas.read_parquet(path, engine, columns, **kwargs))