Example #1
 def read_json(
     cls,
     path_or_buf=None,
     orient=None,
     typ="frame",
     dtype=True,
     convert_axes=True,
     convert_dates=True,
     keep_default_dates=True,
     numpy=False,
     precise_float=False,
     date_unit=None,
     encoding=None,
     encoding_errors="strict",
     lines=False,
     chunksize=None,
     compression="infer",
     nrows: Optional[int] = None,
     storage_options=None,
 ):  # noqa: PR01
     ErrorMessage.default_to_pandas("`read_json`")
     kwargs = {
         "path_or_buf": path_or_buf,
         "orient": orient,
         "typ": typ,
         "dtype": dtype,
         "convert_axes": convert_axes,
         "convert_dates": convert_dates,
         "keep_default_dates": keep_default_dates,
         "numpy": numpy,
         "precise_float": precise_float,
         "date_unit": date_unit,
         "encoding": encoding,
         "encoding_errors": encoding_errors,
         "lines": lines,
         "chunksize": chunksize,
         "compression": compression,
         "nrows": nrows,
         "storage_options": storage_options,
     }
     return cls.from_pandas(pandas.read_json(**kwargs))
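Most snippets on this page follow the same shape: emit a "defaulting to pandas" warning, call the plain pandas function, and wrap the result back into the distributed frame. The sketch below shows that pattern in isolation; it is not Modin's actual implementation, and `FakeFrame` and `default_to_pandas` are hypothetical stand-ins for the query-compiler and `ErrorMessage` machinery.

import warnings

import pandas

class FakeFrame:
    """Toy stand-in for the object cls.from_pandas would normally build."""

    def __init__(self, pandas_df):
        self._df = pandas_df

    @classmethod
    def from_pandas(cls, pandas_df):
        return cls(pandas_df)

def default_to_pandas(fn_name):
    # Stand-in for ErrorMessage.default_to_pandas: just surface a warning.
    warnings.warn(f"{fn_name} is defaulting to the pandas implementation")

def read_json_like(path_or_buf, **kwargs):
    default_to_pandas("`read_json`")
    return FakeFrame.from_pandas(pandas.read_json(path_or_buf, **kwargs))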
Example #2
 def read_fwf(
     cls, filepath_or_buffer, colspecs="infer", widths=None, infer_nrows=100, **kwds
 ):  # noqa: PR01
     ErrorMessage.default_to_pandas("`read_fwf`")
     pd_obj = pandas.read_fwf(
         filepath_or_buffer,
         colspecs=colspecs,
         widths=widths,
         infer_nrows=infer_nrows,
         **kwds,
     )
     if isinstance(pd_obj, pandas.DataFrame):
         return cls.from_pandas(pd_obj)
     if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
         # Overwriting the read method should return a Modin DataFrame for calls
         # to __next__ and get_chunk
         pd_read = pd_obj.read
         pd_obj.read = lambda *args, **kwargs: cls.from_pandas(
             pd_read(*args, **kwargs)
         )
     return pd_obj
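The interesting part of this example is how it patches `read` on the returned `TextFileReader` so that `__next__` and `get_chunk` also hand back converted frames. A small, self-contained sketch of that trick follows; `wrap_chunk` is a made-up placeholder for `cls.from_pandas`.

import io

import pandas

def wrap_chunk(df):
    # Placeholder for cls.from_pandas: tag the columns so the effect is visible.
    return df.add_prefix("wrapped_")

reader = pandas.read_csv(io.StringIO("a,b\n1,2\n3,4\n"), chunksize=1)
pd_read = reader.read
# get_chunk() and __next__() both route through read(), so one patch covers all three.
reader.read = lambda *args, **kwargs: wrap_chunk(pd_read(*args, **kwargs))
print(next(reader).columns.tolist())  # ['wrapped_a', 'wrapped_b']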
Example #3
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import EngineDispatcher

    Engine.subscribe(_update_engine)
    if kwargs.get("chunksize") is not None:
        ErrorMessage.default_to_pandas("Parameters provided [chunksize]")
        df_gen = pandas.read_sql(**kwargs)
        return (DataFrame(query_compiler=EngineDispatcher.from_pandas(df))
                for df in df_gen)
    return DataFrame(query_compiler=EngineDispatcher.read_sql(**kwargs))
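When `chunksize` is set (here and in Example #18), the fallback cannot return a single frame, so it wraps each pandas chunk as it streams by. Below is a runnable sketch of that generator pattern using an in-memory SQLite database; `convert` is a hypothetical stand-in for building a Modin DataFrame from a pandas chunk.

import sqlite3

import pandas

def convert(chunk):
    return chunk  # placeholder: Modin would wrap the chunk in its own DataFrame

con = sqlite3.connect(":memory:")
con.executescript("CREATE TABLE t (x INTEGER); INSERT INTO t VALUES (1), (2), (3);")

chunks = pandas.read_sql("SELECT * FROM t", con, chunksize=2)
frames = (convert(chunk) for chunk in chunks)
for frame in frames:
    print(len(frame))  # 2, then 1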
Example #4
 def read_gbq(cls,
              query,
              project_id=None,
              index_col=None,
              col_order=None,
              reauth=False,
              verbose=None,
              private_key=None,
              dialect="legacy",
              **kwargs):
     ErrorMessage.default_to_pandas()
     return cls.from_pandas(
         pandas.read_gbq(query,
                         project_id=project_id,
                         index_col=index_col,
                         col_order=col_order,
                         reauth=reauth,
                         verbose=verbose,
                         private_key=private_key,
                         dialect=dialect,
                         **kwargs))
Example #5
 def read_sql_query(
     cls,
     sql,
     con,
     index_col=None,
     coerce_float=True,
     params=None,
     parse_dates=None,
     chunksize=None,
 ):
     ErrorMessage.default_to_pandas("`read_sql_query`")
     return cls.from_pandas(
         pandas.read_sql_query(
             sql,
             con,
             index_col=index_col,
             coerce_float=coerce_float,
             params=params,
             parse_dates=parse_dates,
             chunksize=chunksize,
         ))
Example #6
def merge_asof(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes=("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    if not isinstance(left, DataFrame):
        raise ValueError(
            "can not merge DataFrame with instance of type {}".format(
                type(right)))
    ErrorMessage.default_to_pandas("`merge_asof`")
    if isinstance(right, DataFrame):
        right = to_pandas(right)
    return DataFrame(
        pandas.merge_asof(
            to_pandas(left),
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            by=by,
            left_by=left_by,
            right_by=right_by,
            suffixes=suffixes,
            tolerance=tolerance,
            allow_exact_matches=allow_exact_matches,
            direction=direction,
        ))
Example #7
    def _read(cls, path_or_buf, **kwargs):
        """
        Load an h5 file from the file path or buffer, returning a query compiler.

        Parameters
        ----------
        path_or_buf : str, buffer or path object
            Path to the file to open, or an open :class:`pandas.HDFStore` object.
        **kwargs : dict
            Pass into pandas.read_hdf function.

        Returns
        -------
        BaseQueryCompiler
            Query compiler with imported data for further processing.
        """
        if cls._validate_hdf_format(path_or_buf=path_or_buf) is None:
            ErrorMessage.default_to_pandas(
                "File format seems to be `fixed`. For better distribution consider "
                +
                "saving the file in `table` format. df.to_hdf(format=`table`)."
            )
            return cls.single_worker_read(path_or_buf, **kwargs)

        columns = kwargs.pop("columns", None)
        # Have to do this because of Dask's keyword arguments
        kwargs["_key"] = kwargs.pop("key", None)
        if not columns:
            start = kwargs.pop("start", None)
            stop = kwargs.pop("stop", None)
            empty_pd_df = pandas.read_hdf(path_or_buf,
                                          start=0,
                                          stop=0,
                                          **kwargs)
            if start is not None:
                kwargs["start"] = start
            if stop is not None:
                kwargs["stop"] = stop
            columns = empty_pd_df.columns
        return cls.build_query_compiler(path_or_buf, columns, **kwargs)
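The branch above reads zero rows just to learn the column names before launching the real, partitioned read. The same idea is shown below with `read_csv(nrows=0)` instead of `read_hdf(start=0, stop=0)`, so it runs without an HDF5 file on disk.

import io

import pandas

buf = io.StringIO("a,b,c\n1,2,3\n4,5,6\n")
empty_df = pandas.read_csv(buf, nrows=0)  # schema only, no data rows are materialized
print(list(empty_df.columns))  # ['a', 'b', 'c']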
Example #8
    def to_pickle(
        cls,
        obj: Any,
        filepath_or_buffer,
        compression: CompressionOptions = "infer",
        protocol: int = pickle.HIGHEST_PROTOCOL,
        storage_options: StorageOptions = None,
    ):  # noqa: PR01, D200
        """
        Pickle (serialize) object to file.
        """
        ErrorMessage.default_to_pandas("`to_pickle`")
        if isinstance(obj, BaseQueryCompiler):
            obj = obj.to_pandas()

        return pandas.to_pickle(
            obj,
            filepath_or_buffer=filepath_or_buffer,
            compression=compression,
            protocol=protocol,
            storage_options=storage_options,
        )
Example #9
def get_dummies(
    data,
    prefix=None,
    prefix_sep="_",
    dummy_na=False,
    columns=None,
    sparse=False,
    drop_first=False,
    dtype=None,
):
    if sparse:
        raise NotImplementedError("SparseDataFrame is not implemented. "
                                  "To contribute to Modin, please visit "
                                  "github.com/modin-project/modin.")
    if not isinstance(data, DataFrame):
        ErrorMessage.default_to_pandas("`get_dummies` on non-DataFrame")
        if isinstance(data, Series):
            data = data._to_pandas()
        return DataFrame(
            pandas.get_dummies(
                data,
                prefix=prefix,
                prefix_sep=prefix_sep,
                dummy_na=dummy_na,
                columns=columns,
                sparse=sparse,
                drop_first=drop_first,
                dtype=dtype,
            ))
    else:
        new_manager = data._query_compiler.get_dummies(
            columns,
            prefix=prefix,
            prefix_sep=prefix_sep,
            dummy_na=dummy_na,
            drop_first=drop_first,
            dtype=dtype,
        )
        return DataFrame(query_compiler=new_manager)
Example #10
 def read_html(
     cls,
     io,
     match=".+",
     flavor=None,
     header=None,
     index_col=None,
     skiprows=None,
     attrs=None,
     parse_dates=False,
     tupleize_cols=None,
     thousands=",",
     encoding=None,
     decimal=".",
     converters=None,
     na_values=None,
     keep_default_na=True,
     displayed_only=True,
 ):
     ErrorMessage.default_to_pandas("`read_html`")
     kwargs = {
         "io": io,
         "match": match,
         "flavor": flavor,
         "header": header,
         "index_col": index_col,
         "skiprows": skiprows,
         "attrs": attrs,
         "parse_dates": parse_dates,
         "tupleize_cols": tupleize_cols,
         "thousands": thousands,
         "encoding": encoding,
         "decimal": decimal,
         "converters": converters,
         "na_values": na_values,
         "keep_default_na": keep_default_na,
         "displayed_only": displayed_only,
     }
     return cls.from_pandas(pandas.read_html(**kwargs)[0])
Example #11
 def read_gbq(
     cls,
     query: str,
     project_id=None,
     index_col=None,
     col_order=None,
     reauth=False,
     auth_local_webserver=False,
     dialect=None,
     location=None,
     configuration=None,
     credentials=None,
     use_bqstorage_api=None,
     private_key=None,
     verbose=None,
     progress_bar_type=None,
     max_results=None,
 ):  # noqa: PR01
     ErrorMessage.default_to_pandas("`read_gbq`")
     return cls.from_pandas(
         pandas.read_gbq(
             query,
             project_id=project_id,
             index_col=index_col,
             col_order=col_order,
             reauth=reauth,
             auth_local_webserver=auth_local_webserver,
             dialect=dialect,
             location=location,
             configuration=configuration,
             credentials=credentials,
             use_bqstorage_api=use_bqstorage_api,
             private_key=private_key,
             verbose=verbose,
             progress_bar_type=progress_bar_type,
             max_results=max_results,
         )
     )
Example #12
 def _index_grouped(self):
     if self._index_grouped_cache is None:
         if self._is_multi_by:
             # Because we are doing a collect (to_pandas) here and then groupby, we
             # end up using pandas implementation. Add the warning so the user is
             # aware.
             ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
             ErrorMessage.default_to_pandas("Groupby with multiple columns")
             self._index_grouped_cache = {
                 k: v.index
                 for k, v in self._df._query_compiler.getitem_column_array(
                     self._by).to_pandas().groupby(by=self._by)
             }
         else:
             if isinstance(self._by, type(self._query_compiler)):
                 by = self._by.to_pandas().squeeze()
             else:
                 by = self._by
             if self._axis == 0:
                 self._index_grouped_cache = self._index.groupby(by)
             else:
                 self._index_grouped_cache = self._columns.groupby(by)
     return self._index_grouped_cache
Example #13
    def to_pickle(cls,
                  obj,
                  path,
                  compression="infer",
                  protocol=4):  # noqa: PR01
        """
        Pickle (serialize) object to file using pandas.

        For parameters description please refer to pandas API.
        """
        if protocol == 4:
            protocol = -1
        ErrorMessage.default_to_pandas("`to_pickle`")
        if isinstance(obj, BaseQueryCompiler):
            return pandas.to_pickle(obj.to_pandas(),
                                    path,
                                    compression=compression,
                                    protocol=protocol)
        else:
            return pandas.to_pickle(obj,
                                    path,
                                    compression=compression,
                                    protocol=protocol)
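The `protocol == 4` special case maps the old default onto `-1`, which pickle interprets as "use the highest protocol available", so the choice tracks the running interpreter instead of being hard-coded. A quick check of that equivalence:

import pickle

data = {"a": 1}
# protocol=-1 means "use pickle.HIGHEST_PROTOCOL for this interpreter"
assert pickle.loads(pickle.dumps(data, protocol=-1)) == data
print(pickle.HIGHEST_PROTOCOL)  # e.g. 5 on Python 3.8+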
Example #14
 def read_sql_table(
     cls,
     table_name,
     con,
     schema=None,
     index_col=None,
     coerce_float=True,
     parse_dates=None,
     columns=None,
     chunksize=None,
 ):
     ErrorMessage.default_to_pandas("`read_sql_table`")
     return cls.from_pandas(
         pandas.read_sql_table(
             table_name,
             con,
             schema=schema,
             index_col=index_col,
             coerce_float=coerce_float,
             parse_dates=parse_dates,
             columns=columns,
             chunksize=chunksize,
         ))
Example #15
                def return_handler(*args, **kwargs):
                    """
                    Replace the default behavior of methods with inplace kwarg.

                    Returns
                    -------
                    A Modin DataFrame in place of a pandas DataFrame, or the same
                    return type as pandas.HDFStore.

                    Notes
                    -----
                    This function will replace all of the arguments passed to
                    methods of HDFStore with the pandas equivalent. It will convert
                    Modin DataFrame to pandas DataFrame, etc. Currently, pytables
                    does not accept Modin DataFrame objects, so we must convert to
                    pandas.
                    """
                    from modin.utils import to_pandas

                    # We don't want to constantly be giving this error message for
                    # internal methods.
                    if item[0] != "_":
                        ErrorMessage.default_to_pandas("`{}`".format(item))
                    args = [
                        to_pandas(arg) if isinstance(arg, DataFrame) else arg
                        for arg in args
                    ]
                    kwargs = {
                        k: to_pandas(v) if isinstance(v, DataFrame) else v
                        for k, v in kwargs.items()
                    }
                    obj = super(HDFStore,
                                self).__getattribute__(item)(*args, **kwargs)
                    if self._return_modin_dataframe and isinstance(
                            obj, pandas.DataFrame):
                        return DataFrame(obj)
                    return obj
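Stripped of the HDFStore specifics, the handler above is just an argument-translation wrapper: convert anything the backend cannot handle, call through, then convert the result back. Here is a minimal sketch of that shape, where `Wrapped` and `unwrap` are invented stand-ins for Modin's DataFrame and `to_pandas`.

from functools import wraps

class Wrapped(list):
    """Toy container standing in for a Modin DataFrame."""

def unwrap(obj):
    return list(obj) if isinstance(obj, Wrapped) else obj

def translating(func):
    @wraps(func)
    def handler(*args, **kwargs):
        # Convert every argument the wrapped backend cannot understand.
        args = [unwrap(a) for a in args]
        kwargs = {k: unwrap(v) for k, v in kwargs.items()}
        return func(*args, **kwargs)
    return handler

@translating
def backend_sum(values):
    assert type(values) is list  # this "backend" only understands plain lists
    return sum(values)

print(backend_sum(Wrapped([1, 2, 3])))  # 6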
Example #16
    def read_hdf(
        cls,
        path_or_buf,
        key=None,
        mode: str = "r",
        errors: str = "strict",
        where=None,
        start=None,
        stop=None,
        columns=None,
        iterator=False,
        chunksize=None,
        **kwargs,
    ):  # noqa: PR01
        from modin.pandas.io import HDFStore

        ErrorMessage.default_to_pandas("`read_hdf`")
        modin_store = isinstance(path_or_buf, HDFStore)
        if modin_store:
            path_or_buf._return_modin_dataframe = False
        df = pandas.read_hdf(
            path_or_buf,
            key=key,
            mode=mode,
            columns=columns,
            errors=errors,
            where=where,
            start=start,
            stop=stop,
            iterator=iterator,
            chunksize=chunksize,
            **kwargs,
        )
        if modin_store:
            path_or_buf._return_modin_dataframe = True

        return cls.from_pandas(df)
Example #17
    def single_worker_read(cls, fname, **kwargs):
        """
        Perform reading by single worker (default-to-pandas implementation).

        Parameters
        ----------
        fname : str, path object or file-like object
            Name of the file or file-like object to read.
        **kwargs : dict
            Keyword arguments to be passed into the `read_*` function.

        Returns
        -------
        BaseQueryCompiler or
        dict or
        pandas.io.parsers.TextFileReader
            Object with imported data (or with a reference to the data) for further
            processing; the object type depends on the result type of the child
            class's `parse` function.
        """
        ErrorMessage.default_to_pandas("Parameters provided")
        # Use default args for everything
        pandas_frame = cls.parse(fname, **kwargs)
        if isinstance(pandas_frame, pandas.io.parsers.TextFileReader):
            pd_read = pandas_frame.read
            pandas_frame.read = (
                lambda *args, **kwargs: cls.query_compiler_cls.from_pandas(
                    pd_read(*args, **kwargs), cls.frame_cls
                )
            )
            return pandas_frame
        elif isinstance(pandas_frame, (OrderedDict, dict)):
            return {
                i: cls.query_compiler_cls.from_pandas(frame, cls.frame_cls)
                for i, frame in pandas_frame.items()
            }
        return cls.query_compiler_cls.from_pandas(pandas_frame, cls.frame_cls)
Example #18
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
):  # noqa: PR01, RT01, D200
    """
    Read SQL query or database table into a DataFrame.
    """
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    Engine.subscribe(_update_engine)
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    if kwargs.get("chunksize") is not None:
        ErrorMessage.default_to_pandas("Parameters provided [chunksize]")
        df_gen = pandas.read_sql(**kwargs)
        return (DataFrame(query_compiler=FactoryDispatcher.from_pandas(df))
                for df in df_gen)
    return DataFrame(query_compiler=FactoryDispatcher.read_sql(**kwargs))
Example #19
    def _read(cls, filepath_or_buffer, **kwargs):
        """Read csv file from local disk.
        Args:
            filepath_or_buffer:
                  The filepath of the csv file.
                  We only support local files for now.
            kwargs: Keyword arguments in pandas.read_csv
        """
        # The intention of the inspection code is to reduce the amount of
        # communication we have to do between processes and nodes. We take a quick
        # pass over the arguments and remove those that are default values so we
        # don't have to serialize and send them to the workers. Because the
        # arguments list is so long, this does end up saving time based on the
        # number of nodes in the cluster.
        try:
            args, _, _, defaults, _, _, _ = inspect.getfullargspec(
                cls.read_csv)
            defaults = dict(zip(args[2:], defaults))
            filtered_kwargs = {
                kw: kwargs[kw]
                for kw in kwargs if kw in defaults
                and not isinstance(kwargs[kw], type(defaults[kw]))
                or kwargs[kw] != defaults[kw]
            }
        # This happens on Python 2; just default to serializing the entire dictionary
        except AttributeError:
            filtered_kwargs = kwargs

        if isinstance(filepath_or_buffer, str):
            if not os.path.exists(filepath_or_buffer):
                ErrorMessage.default_to_pandas("File not found on disk")
                return cls._read_csv_from_pandas(filepath_or_buffer,
                                                 filtered_kwargs)
        elif not isinstance(filepath_or_buffer, py.path.local):
            read_from_pandas = True
            # Pandas read_csv supports pathlib.Path
            try:
                import pathlib

                if isinstance(filepath_or_buffer, pathlib.Path):
                    read_from_pandas = False
            except ImportError:
                pass
            if read_from_pandas:
                ErrorMessage.default_to_pandas("Reading from buffer.")
                return cls._read_csv_from_pandas(filepath_or_buffer, kwargs)
        if (_infer_compression(filepath_or_buffer, kwargs.get("compression"))
                is not None):
            ErrorMessage.default_to_pandas("Compression detected.")
            return cls._read_csv_from_pandas(filepath_or_buffer,
                                             filtered_kwargs)

        chunksize = kwargs.get("chunksize")
        if chunksize is not None:
            ErrorMessage.default_to_pandas("Reading chunks from a file.")
            return cls._read_csv_from_pandas(filepath_or_buffer,
                                             filtered_kwargs)

        skiprows = kwargs.get("skiprows")
        if skiprows is not None and not isinstance(skiprows, int):
            ErrorMessage.default_to_pandas(
                "skiprows parameter not optimized yet.")
            return cls._read_csv_from_pandas(filepath_or_buffer, kwargs)
        # TODO: replace this by reading lines from file.
        if kwargs.get("nrows") is not None:
            ErrorMessage.default_to_pandas("`read_csv` with `nrows`")
            return cls._read_csv_from_pandas(filepath_or_buffer,
                                             filtered_kwargs)
        else:
            return cls._read_csv_from_file_pandas_on_ray(
                filepath_or_buffer, filtered_kwargs)
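The `inspect` block at the top of this example prunes keyword arguments that still equal their defaults so that less data has to be serialized to workers. A compact version of the same idea using `inspect.signature`; `read_like` is a made-up stand-in for `cls.read_csv`.

import inspect

def read_like(path, sep=",", header="infer", nrows=None):
    ...

defaults = {
    name: p.default
    for name, p in inspect.signature(read_like).parameters.items()
    if p.default is not inspect.Parameter.empty
}

kwargs = {"sep": ",", "header": 0, "nrows": None}
filtered = {k: v for k, v in kwargs.items() if k not in defaults or v != defaults[k]}
print(filtered)  # {'header': 0} -- only the non-default argument needs to travel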
Example #20
 def read_csv(
     cls,
     filepath_or_buffer,
     sep=",",
     delimiter=None,
     header="infer",
     names=None,
     index_col=None,
     usecols=None,
     squeeze=False,
     prefix=None,
     mangle_dupe_cols=True,
     dtype=None,
     engine=None,
     converters=None,
     true_values=None,
     false_values=None,
     skipinitialspace=False,
     skiprows=None,
     nrows=None,
     na_values=None,
     keep_default_na=True,
     na_filter=True,
     verbose=False,
     skip_blank_lines=True,
     parse_dates=False,
     infer_datetime_format=False,
     keep_date_col=False,
     date_parser=None,
     dayfirst=False,
     iterator=False,
     chunksize=None,
     compression="infer",
     thousands=None,
     decimal=b".",
     lineterminator=None,
     quotechar='"',
     quoting=0,
     escapechar=None,
     comment=None,
     encoding=None,
     dialect=None,
     tupleize_cols=None,
     error_bad_lines=True,
     warn_bad_lines=True,
     skipfooter=0,
     doublequote=True,
     delim_whitespace=False,
     low_memory=True,
     memory_map=False,
     float_precision=None,
 ):
     kwargs = {
         "filepath_or_buffer": filepath_or_buffer,
         "sep": sep,
         "delimiter": delimiter,
         "header": header,
         "names": names,
         "index_col": index_col,
         "usecols": usecols,
         "squeeze": squeeze,
         "prefix": prefix,
         "mangle_dupe_cols": mangle_dupe_cols,
         "dtype": dtype,
         "engine": engine,
         "converters": converters,
         "true_values": true_values,
         "false_values": false_values,
         "skipinitialspace": skipinitialspace,
         "skiprows": skiprows,
         "nrows": nrows,
         "na_values": na_values,
         "keep_default_na": keep_default_na,
         "na_filter": na_filter,
         "verbose": verbose,
         "skip_blank_lines": skip_blank_lines,
         "parse_dates": parse_dates,
         "infer_datetime_format": infer_datetime_format,
         "keep_date_col": keep_date_col,
         "date_parser": date_parser,
         "dayfirst": dayfirst,
         "iterator": iterator,
         "chunksize": chunksize,
         "compression": compression,
         "thousands": thousands,
         "decimal": decimal,
         "lineterminator": lineterminator,
         "quotechar": quotechar,
         "quoting": quoting,
         "escapechar": escapechar,
         "comment": comment,
         "encoding": encoding,
         "dialect": dialect,
         "tupleize_cols": tupleize_cols,
         "error_bad_lines": error_bad_lines,
         "warn_bad_lines": warn_bad_lines,
         "skipfooter": skipfooter,
         "doublequote": doublequote,
         "delim_whitespace": delim_whitespace,
         "low_memory": low_memory,
         "memory_map": memory_map,
         "float_precision": float_precision,
     }
     ErrorMessage.default_to_pandas()
     return cls._read(**kwargs)
Example #21
 def read_pickle(cls, path, compression="infer"):
     ErrorMessage.default_to_pandas()
     return cls.from_pandas(
         pandas.read_pickle(path, compression=compression))
Example #22
 def read_feather(cls, path, nthreads=1):
     ErrorMessage.default_to_pandas()
     return cls.from_pandas(pandas.read_feather(path, nthreads))
Example #23
 def read_msgpack(cls, path_or_buf, encoding="utf-8", iterator=False):
     ErrorMessage.default_to_pandas()
     return cls.from_pandas(
         pandas.read_msgpack(path_or_buf,
                             encoding=encoding,
                             iterator=iterator))
Example #24
 def read_hdf(cls, path_or_buf, key=None, mode="r", columns=None):
     ErrorMessage.default_to_pandas()
     return cls.from_pandas(
         pandas.read_hdf(path_or_buf, key=key, mode=mode, columns=columns))
Example #25
 def read_clipboard(cls, sep=r"\s+"):
     ErrorMessage.default_to_pandas()
     return cls.from_pandas(pandas.read_clipboard(sep=sep))
Example #26
    def read_csv(
        cls,
        filepath_or_buffer,
        sep=",",
        delimiter=None,
        header="infer",
        names=None,
        index_col=None,
        usecols=None,
        squeeze=False,
        prefix=None,
        mangle_dupe_cols=True,
        dtype=None,
        engine=None,
        converters=None,
        true_values=None,
        false_values=None,
        skipinitialspace=False,
        skiprows=None,
        nrows=None,
        na_values=None,
        keep_default_na=True,
        na_filter=True,
        verbose=False,
        skip_blank_lines=True,
        parse_dates=False,
        infer_datetime_format=False,
        keep_date_col=False,
        date_parser=None,
        dayfirst=False,
        cache_dates=True,
        iterator=False,
        chunksize=None,
        compression="infer",
        thousands=None,
        decimal=b".",
        lineterminator=None,
        quotechar='"',
        quoting=0,
        escapechar=None,
        comment=None,
        encoding=None,
        dialect=None,
        error_bad_lines=True,
        warn_bad_lines=True,
        skipfooter=0,
        doublequote=True,
        delim_whitespace=False,
        low_memory=True,
        memory_map=False,
        float_precision=None,
        storage_options=None,
    ):
        items = locals().copy()
        mykwargs = {k: items[k] for k in items if k in cls.arg_keys}
        eng = str(engine).lower().strip()
        try:
            if eng in ["pandas", "c"]:
                return cls._read(**mykwargs)

            if isinstance(dtype, dict):
                column_types = {
                    c: cls._dtype_to_arrow(t)
                    for c, t in dtype.items()
                }
            else:
                column_types = cls._dtype_to_arrow(dtype)

            if (type(parse_dates) is list) and type(column_types) is dict:
                for c in parse_dates:
                    column_types[c] = pa.timestamp("s")

            if names:
                if header == 0:
                    skiprows = skiprows + 1 if skiprows is not None else 1
                elif header is None or header == "infer":
                    pass
                else:
                    raise NotImplementedError(
                        "read_csv with 'arrow' engine and provided 'names' parameter supports only 0, None and 'infer' header values"
                    )
            else:
                if header == 0 or header == "infer":
                    pass
                else:
                    raise NotImplementedError(
                        "read_csv with 'arrow' engine without 'names' parameter provided supports only 0 and 'infer' header values"
                    )

            if delimiter is None:
                delimiter = sep

            if delim_whitespace and delimiter != ",":
                raise ValueError(
                    "Specified a delimiter and delim_whitespace=True; you can only specify one."
                )

            usecols_md = cls._prepare_pyarrow_usecols(mykwargs)

            po = ParseOptions(
                delimiter="\\s+" if delim_whitespace else delimiter,
                quote_char=quotechar,
                double_quote=doublequote,
                escape_char=escapechar,
                newlines_in_values=False,
                ignore_empty_lines=skip_blank_lines,
            )
            co = ConvertOptions(
                check_utf8=None,
                column_types=column_types,
                null_values=None,
                true_values=None,
                false_values=None,
                # timestamp fields should be handled as strings if parse_dates
                # wasn't passed explicitly as an array or a dict
                timestamp_parsers=[""]
                if isinstance(parse_dates, bool) else None,
                strings_can_be_null=None,
                include_columns=usecols_md,
                include_missing_columns=None,
                auto_dict_encode=None,
                auto_dict_max_cardinality=None,
            )
            ro = ReadOptions(
                use_threads=True,
                block_size=None,
                skip_rows=skiprows,
                column_names=names,
                autogenerate_column_names=None,
            )

            at = read_csv(
                filepath_or_buffer,
                read_options=ro,
                parse_options=po,
                convert_options=co,
            )

            return cls.from_arrow(at)
        except (pa.ArrowNotImplementedError, NotImplementedError):
            if eng in ["arrow"]:
                raise

            ErrorMessage.default_to_pandas("`read_csv`")
            return cls._read(**mykwargs)
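Example #26's fast path hands parsing to PyArrow and only falls back to pandas on `NotImplementedError`. The bare bones of that flow look roughly like the sketch below; the file name and column types are made up for illustration, and Modin would build partitions from the Arrow table instead of calling `to_pandas`.

import pyarrow as pa
from pyarrow import csv

read_opts = csv.ReadOptions(use_threads=True)
parse_opts = csv.ParseOptions(delimiter=",")
convert_opts = csv.ConvertOptions(column_types={"id": pa.int64()})  # hypothetical schema

table = csv.read_csv(
    "data.csv",  # placeholder path
    read_options=read_opts,
    parse_options=parse_opts,
    convert_options=convert_opts,
)
df = table.to_pandas()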
Example #27
    def read_json(
        cls,
        path_or_buf=None,
        orient=None,
        typ="frame",
        dtype=True,
        convert_axes=True,
        convert_dates=True,
        keep_default_dates=True,
        numpy=False,
        precise_float=False,
        date_unit=None,
        encoding=None,
        lines=False,
        chunksize=None,
        compression="infer",
    ):
        kwargs = {
            "path_or_buf": path_or_buf,
            "orient": orient,
            "typ": typ,
            "dtype": dtype,
            "convert_axes": convert_axes,
            "convert_dates": convert_dates,
            "keep_default_dates": keep_default_dates,
            "numpy": numpy,
            "precise_float": precise_float,
            "date_unit": date_unit,
            "encoding": encoding,
            "lines": lines,
            "chunksize": chunksize,
            "compression": compression,
        }
        if cls.read_json_remote_task is None:
            return super(RayIO, cls).read_json(**kwargs)

        if not lines:
            ErrorMessage.default_to_pandas(
                "`read_json` only optimized with `lines=True`")
            return super(RayIO, cls).read_json(**kwargs)
        else:
            # TODO: Pick up the columns in an optimized way from all data
            # All rows must be read because some rows may have missing data
            # Currently assumes all rows have the same columns
            from io import BytesIO

            columns = pandas.read_json(
                BytesIO(b"" + open(path_or_buf, "rb").readline()),
                lines=True).columns
            kwargs["columns"] = columns
            empty_pd_df = pandas.DataFrame(columns=columns)

            path_or_buf = kwargs.pop("path_or_buf")

            with file_open(path_or_buf, "rb",
                           kwargs.get("compression", "infer")) as f:
                total_bytes = file_size(f)
                num_partitions = cls.frame_mgr_cls._compute_num_partitions()
                num_splits = min(len(columns), num_partitions)
                chunk_size = max(1, (total_bytes - f.tell()) // num_partitions)

                partition_ids = []
                index_ids = []
                dtypes_ids = []

                column_chunksize = compute_chunksize(empty_pd_df,
                                                     num_splits,
                                                     axis=1)
                if column_chunksize > len(columns):
                    column_widths = [len(columns)]
                    num_splits = 1
                else:
                    column_widths = [
                        column_chunksize if i != num_splits - 1 else
                        len(columns) - (column_chunksize * (num_splits - 1))
                        for i in range(num_splits)
                    ]

                while f.tell() < total_bytes:
                    start = f.tell()
                    f.seek(chunk_size, os.SEEK_CUR)
                    f.readline()
                    partition_id = cls.read_json_remote_task._remote(
                        args=(path_or_buf, num_splits, start, f.tell(),
                              kwargs),
                        num_return_vals=num_splits + 3,
                    )
                    partition_ids.append(partition_id[:-3])
                    index_ids.append(partition_id[-3])
                    dtypes_ids.append(partition_id[-2])

            row_lengths = ray.get(index_ids)
            new_index = pandas.RangeIndex(sum(row_lengths))

            dtypes = (pandas.concat(ray.get(dtypes_ids), axis=1).apply(
                lambda row: find_common_type(row.values),
                axis=1).squeeze(axis=0))

            partition_ids = [[
                cls.frame_partition_cls(
                    partition_ids[i][j],
                    length=row_lengths[i],
                    width=column_widths[j],
                ) for j in range(len(partition_ids[i]))
            ] for i in range(len(partition_ids))]

            if isinstance(dtypes, pandas.Series):
                dtypes.index = columns
            else:
                dtypes = pandas.Series(dtypes, index=columns)

            new_query_compiler = cls.query_compiler_cls(
                cls.frame_mgr_cls(np.array(partition_ids)),
                new_index,
                columns,
                dtypes=dtypes,
            )
            return new_query_compiler
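The core of the `lines=True` branch is the byte-splitting loop: jump ahead by a fixed chunk size, then read to the end of the current line so every partition starts on a record boundary. A simplified, standalone version of that loop is shown below ("data.jsonl" is a placeholder path).

import os

splits = []
with open("data.jsonl", "rb") as f:
    f.seek(0, os.SEEK_END)
    total_bytes = f.tell()
    f.seek(0)
    chunk_size = max(1, total_bytes // 4)  # pretend we want 4 partitions
    while f.tell() < total_bytes:
        start = f.tell()
        f.seek(chunk_size, os.SEEK_CUR)
        f.readline()  # finish the partial line so the next split starts cleanly
        splits.append((start, min(f.tell(), total_bytes)))

print(splits)  # [(start, end), ...] byte ranges, one per worker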
Example #28
    def _compute_index_grouped(self, numerical=False):
        """
        Construct an index of group IDs.

        Parameters
        ----------
        numerical : bool, default: False
            Whether the group indices should be positional (True) or label-based (False).

        Returns
        -------
        dict
            A dict of {group name -> group indices} values.

        See Also
        --------
        pandas.core.groupby.GroupBy.groups
        """
        # We end up using pure pandas to compute group indices, so raising a warning
        ErrorMessage.default_to_pandas("Group indices computation")

        # Splitting level-by and column-by since we serialize them in a different ways
        by = None
        level = []
        if self._level is not None:
            level = self._level
            if not isinstance(level, list):
                level = [level]
        elif isinstance(self._by, list):
            by = []
            for o in self._by:
                if hashable(o) and o in self._query_compiler.get_index_names(
                        self._axis):
                    level.append(o)
                else:
                    by.append(o)
        else:
            by = self._by

        is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)
        # `dropna` param is the only one that matters for the group indices result
        dropna = self._kwargs.get("dropna", True)

        if hasattr(self._by, "columns") and is_multi_by:
            by = list(self._by.columns)

        if is_multi_by:
            # Because we are doing a collect (to_pandas) here and then groupby, we
            # end up using pandas implementation. Add the warning so the user is
            # aware.
            ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
            if isinstance(by, list) and all(
                    is_label(self._df, o, self._axis) for o in by):
                pandas_df = self._df._query_compiler.getitem_column_array(
                    by).to_pandas()
            else:
                by = try_cast_to_pandas(by, squeeze=True)
                pandas_df = self._df._to_pandas()
            by = wrap_into_list(by, level)
            groupby_obj = pandas_df.groupby(by=by, dropna=dropna)
            return groupby_obj.indices if numerical else groupby_obj.groups
        else:
            if isinstance(self._by, type(self._query_compiler)):
                by = self._by.to_pandas().squeeze().values
            elif self._by is None:
                index = self._query_compiler.get_axis(self._axis)
                levels_to_drop = [
                    i for i, name in enumerate(index.names)
                    if name not in level and i not in level
                ]
                by = index.droplevel(levels_to_drop)
                if isinstance(by, pandas.MultiIndex):
                    by = by.reorder_levels(level)
            else:
                by = self._by
            axis_labels = self._query_compiler.get_axis(self._axis)
            if numerical:
                # Since we want positional indices of the groups, we want to group
                # on a `RangeIndex`, not on the actual index labels
                axis_labels = pandas.RangeIndex(len(axis_labels))
            # `pandas.Index.groupby` doesn't take any parameters except `by`.
            # Have to convert an Index to a Series to be able to process `dropna=False`:
            if dropna:
                return axis_labels.groupby(by)
            else:
                groupby_obj = axis_labels.to_series().groupby(by,
                                                              dropna=dropna)
                return groupby_obj.indices if numerical else groupby_obj.groups
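The `numerical` flag only switches between the two dictionaries a pandas groupby already exposes: `groups` maps group keys to index labels, while `indices` maps them to positions. A quick illustration:

import pandas

df = pandas.DataFrame(
    {"key": ["a", "b", "a"], "val": [1, 2, 3]}, index=["r0", "r1", "r2"]
)
gb = df.groupby("key")
print(gb.groups)   # group key -> index labels, e.g. 'a' -> ['r0', 'r2']
print(gb.indices)  # group key -> positions, e.g. 'a' -> array([0, 2])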
Example #29
def read_orc(path: FilePathOrBuffer,
             columns: Optional[List[str]] = None,
             **kwargs) -> DataFrame:
    ErrorMessage.default_to_pandas("read_orc")
    return DataFrame(pandas.read_orc(path, columns, **kwargs))
Example #30
    def _index_grouped(self):
        """
        Implement [METHOD_NAME].

        TODO: Add more details for this docstring template.

        Parameters
        ----------
        What arguments does this function have.
        [
        PARAMETER_NAME: PARAMETERS TYPES
            Description.
        ]

        Returns
        -------
        What this returns (if anything)
        """
        if self._index_grouped_cache is None:
            # Splitting level-by and column-by since we serialize them in a different ways
            by = None
            level = []
            if self._level is not None:
                level = self._level
                if not isinstance(level, list):
                    level = [level]
            elif isinstance(self._by, list):
                by = []
                for o in self._by:
                    if hashable(o) and o in self._query_compiler.get_index_names(
                        self._axis
                    ):
                        level.append(o)
                    else:
                        by.append(o)
            else:
                by = self._by

            is_multi_by = self._is_multi_by or (by is not None and len(level) > 0)

            if hasattr(self._by, "columns") and is_multi_by:
                by = list(self._by.columns)

            if is_multi_by:
                # Because we are doing a collect (to_pandas) here and then groupby, we
                # end up using pandas implementation. Add the warning so the user is
                # aware.
                ErrorMessage.catch_bugs_and_request_email(self._axis == 1)
                ErrorMessage.default_to_pandas("Groupby with multiple columns")
                if isinstance(by, list) and all(
                    is_label(self._df, o, self._axis) for o in by
                ):
                    pandas_df = self._df._query_compiler.getitem_column_array(
                        by
                    ).to_pandas()
                else:
                    by = try_cast_to_pandas(by, squeeze=True)
                    pandas_df = self._df._to_pandas()
                by = wrap_into_list(by, level)
                self._index_grouped_cache = pandas_df.groupby(by=by).groups
            else:
                if isinstance(self._by, type(self._query_compiler)):
                    by = self._by.to_pandas().squeeze().values
                elif self._by is None:
                    index = self._query_compiler.get_axis(self._axis)
                    levels_to_drop = [
                        i
                        for i, name in enumerate(index.names)
                        if name not in level and i not in level
                    ]
                    by = index.droplevel(levels_to_drop)
                    if isinstance(by, pandas.MultiIndex):
                        by = by.reorder_levels(level)
                else:
                    by = self._by
                if self._axis == 0:
                    self._index_grouped_cache = self._index.groupby(by)
                else:
                    self._index_grouped_cache = self._columns.groupby(by)
        return self._index_grouped_cache