Example no. 1
    def _update_factory(cls, _):
        """
        Update and prepare factory with a new one specified via Modin config.

        Parameters
        ----------
        _ : object
            This parameter serves compatibility purposes.
            It does not affect the result.
        """
        factory_name = get_current_backend() + "Factory"
        try:
            cls.__factory = getattr(factories, factory_name)
        except AttributeError:
            if not IsExperimental.get():
                # allow missing factories in experimental mode only
                if hasattr(factories, "Experimental" + factory_name):
                    msg = (
                        "{0} on {1} is only accessible through the experimental API.\nRun "
                        "`import modin.experimental.pandas as pd` to use {0} on {1}."
                    )
                else:
                    msg = (
                        "Cannot find a factory for partition '{}' and execution engine '{}'. "
                        "Potential reason might be incorrect environment variable value for "
                        f"{Backend.varname} or {Engine.varname}"
                    )
                raise FactoryNotFoundError(msg.format(Backend.get(), Engine.get()))
            cls.__factory = StubFactory.set_failing_name(factory_name)
        else:
            cls.__factory.prepare()
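A note on wiring (not part of the snippet above, a hedged sketch): in the other examples, hooks like this are registered through Modin's config subscription API, e.g. `Engine.subscribe(_update_engine)`, so the factory is re-resolved whenever the engine setting changes. A minimal sketch of that wiring, assuming a dispatcher class like the `FactoryDispatcher` used in the later examples:

from modin.config import Engine

# Hypothetical wiring: re-run the hook whenever the Engine config value
# changes; the second argument of _update_factory is ignored by design.
Engine.subscribe(FactoryDispatcher._update_factory)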
Example no. 2
def simulate_cloud(request):
    mode = request.config.getoption("--simulate-cloud").lower()
    if mode == "off":
        yield
        return

    if mode not in ("normal", "experimental"):
        raise ValueError(f"Unsupported --simulate-cloud mode: {mode}")
    assert IsExperimental.get(), "Simulated cloud must be started in experimental mode"

    from modin.experimental.cloud import create_cluster, get_connection
    import modin.pandas.test.utils

    with create_cluster("local", cluster_type="local"):
        get_connection().teleport(set_experimental_env)(mode)
        with Patcher(
            get_connection(),
            (modin.pandas.test.utils, "assert_index_equal"),
            (modin.pandas.test.utils, "assert_series_equal"),
            (modin.pandas.test.utils, "assert_frame_equal"),
            (modin.pandas.test.utils, "assert_extension_array_equal"),
            (modin.pandas.test.utils, "assert_empty_frame_equal"),
        ):
            yield
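For this fixture to work, the `--simulate-cloud` option it reads must be registered with pytest. A minimal sketch of that registration in conftest.py (only the option name comes from the fixture above; the default and help text are assumptions):

def pytest_addoption(parser):
    # Hypothetical conftest.py hook; "off" matches the early-exit branch
    # of the fixture above.
    parser.addoption(
        "--simulate-cloud",
        default="off",
        help="simulate cloud for testing: off|normal|experimental",
    )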
Example no. 3
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
    partition_column=None,
    lower_bound=None,
    upper_bound=None,
    max_sessions=None,
):
    """Read SQL query or database table into a DataFrame.

    Args:
        sql: String or SQLAlchemy Selectable (select or text object); SQL query to be executed or a table name.
        con: SQLAlchemy connectable (engine/connection), database string URI, or DBAPI2 connection (fallback mode).
        index_col: Column(s) to set as index (MultiIndex).
        coerce_float: Attempts to convert values of non-string, non-numeric objects (like decimal.Decimal) to
                      floating point, useful for SQL result sets.
        params: List of parameters to pass to execute method. The syntax used
                to pass parameters is database driver dependent. Check your
                database driver documentation for which of the five syntax styles,
                described in PEP 249's paramstyle, is supported.
        parse_dates:
                     - List of column names to parse as dates.
                     - Dict of ``{column_name: format string}`` where format string is
                       strftime compatible in case of parsing string times, or is one of
                       (D, s, ns, ms, us) in case of parsing integer timestamps.
                     - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
                       to the keyword arguments of :func:`pandas.to_datetime`
                       Especially useful with databases without native Datetime support,
                       such as SQLite.
        columns: List of column names to select from SQL table (only used when reading a table).
        chunksize: If specified, return an iterator where `chunksize` is the number of rows to include in each chunk.
        partition_column: Column used to split the data between the workers (MUST be an INTEGER column).
        lower_bound: The minimum value to be requested from the partition_column.
        upper_bound: The maximum value to be requested from the partition_column.
        max_sessions: The maximum number of simultaneous connections allowed.

    Returns:
        Modin DataFrame.
    """
    Engine.subscribe(_update_engine)
    assert IsExperimental.get(), "This only works in experimental mode"
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return DataFrame(query_compiler=EngineDispatcher.read_sql(**kwargs))
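The `inspect.getargvalues(inspect.currentframe())` idiom above (repeated in the following examples) forwards every named parameter of the enclosing function as keyword arguments without restating them. A self-contained illustration with made-up names:

import inspect

def add(a, b=2):
    # getargvalues returns ArgInfo(args, varargs, keywords, locals);
    # at this point the frame's locals hold exactly the named parameters.
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return kwargs

assert add(1) == {"a": 1, "b": 2}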
Example no. 4
def read_pickle_distributed(
    filepath_or_buffer,
    compression: Optional[str] = "infer",
    storage_options: StorageOptions = None,
):
    """
    Load pickled pandas object from files.

    This experimental feature provides parallel reading from multiple pickle files which are
    defined by glob pattern. The files must contain parts of one dataframe, which can be
    obtained, for example, by `to_pickle_distributed` function.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        File path, URL, or buffer where the pickled object will be loaded from.
        URLs are accepted and are not limited to S3 and GCS.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default: 'infer'
        If 'infer' and 'path_or_url' is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
        compression). If 'infer' and 'path_or_url' is not path-like, then use
        None (= no decompression).
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will be parsed by
        fsspec, e.g., starting "s3://", "gcs://". An error will be raised if providing
        this argument with a non-fsspec URL. See the fsspec and backend storage
        implementation docs for the set of allowed keys and values.

    Returns
    -------
    unpickled : same type as object stored in file

    Notes
    -----
    The number of partitions is equal to the number of input files.
    """
    Engine.subscribe(_update_engine)
    assert IsExperimental.get(), "This only works in experimental mode"
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return DataFrame(
        query_compiler=FactoryDispatcher.read_pickle_distributed(**kwargs)
    )
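A hedged usage sketch based on the docstring above; the path and pattern are made up, and the glob contract follows the `to_pickle_distributed` counterpart it references:

import modin.experimental.pandas as pd

df = pd.DataFrame({"a": range(10)})
# Hypothetical round trip: write one file per partition following the
# glob pattern, then read them back as one partition per matched file.
df.to_pickle_distributed("/tmp/part*.pkl")
restored = pd.read_pickle_distributed("/tmp/part*.pkl")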
Example no. 5
def read_custom_text(
    filepath_or_buffer,
    columns,
    custom_parser,
    compression="infer",
    nrows: Optional[int] = None,
    is_quoting=True,
):
    """
    Load custom text data from file.

    Parameters
    ----------
    filepath_or_buffer : str
        File path where the custom text data will be loaded from.
    columns : list or callable(file-like object, **kwargs) -> list
        Column names of list type, or a callable that creates column names from the
        opened file and the passed `kwargs`.
    custom_parser : callable(file-like object, **kwargs) -> pandas.DataFrame
        Function that takes as input a part of the `filepath_or_buffer` file loaded into
        memory in file-like object form.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default: 'infer'
        If 'infer' and 'path_or_url' is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
        compression). If 'infer' and 'path_or_url' is not path-like, then use
        None (= no decompression).
    nrows : int, optional
        Number of rows to read.
    is_quoting : bool, default: True
        Whether or not to consider quotes.

    Returns
    -------
    modin.DataFrame
    """
    Engine.subscribe(_update_engine)
    assert IsExperimental.get(), "This only works in experimental mode"
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return DataFrame(
        query_compiler=FactoryDispatcher.read_custom_text(**kwargs)
    )
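A hedged usage sketch; the file path, delimiter, and column names are made up, and the parser signature follows the docstring above:

import pandas
import modin.experimental.pandas as pd

cols = ["col1", "col2"]

def parse_part(file_like, **kwargs):
    # Hypothetical parser: each chunk arrives as a file-like object of
    # complete lines; split them on ";".
    rows = [line.decode("utf-8").rstrip("\n").split(";") for line in file_like]
    return pandas.DataFrame(rows, columns=cols)

df = pd.read_custom_text("data.txt", columns=cols, custom_parser=parse_part)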
Example no. 6
    def _update_engine(cls, _):
        factory_name = get_current_backend() + "Factory"
        try:
            cls.__engine = getattr(factories, factory_name)
        except AttributeError:
            if not IsExperimental.get():
                # allow missing factories in experimental mode only
                if hasattr(factories, "Experimental" + factory_name):
                    msg = (
                        "{0} on {1} is only accessible through the experimental API.\nRun "
                        "`import modin.experimental.pandas as pd` to use {0} on {1}."
                    )
                else:
                    msg = (
                        "Cannot find a factory for partition '{}' and execution engine '{}'. "
                        "Potential reason might be incorrect environment variable value for "
                        f"{Backend.varname} or {Engine.varname}"
                    )
                raise FactoryNotFoundError(msg.format(Backend.get(), Engine.get()))
            cls.__engine = StubFactory.set_failing_name(factory_name)
        else:
            cls.__engine.prepare()
Example no. 7
def simulate_cloud(request):
    mode = request.config.getoption("--simulate-cloud").lower()
    if mode == "off":
        yield
        return

    if mode not in ("normal", "experimental"):
        raise ValueError(f"Unsupported --simulate-cloud mode: {mode}")
    assert IsExperimental.get(), "Simulated cloud must be started in experimental mode"

    from modin.experimental.cloud import create_cluster, get_connection
    import pandas._testing
    import pandas._libs.testing as cyx_testing

    with create_cluster("local", cluster_type="local"):
        get_connection().teleport(set_experimental_env)(mode)
        with Patcher(
            get_connection(),
            (pandas._testing, "assert_class_equal"),
            (cyx_testing, "assert_almost_equal"),
        ):
            yield
Example no. 8
def set_experimental_env(mode):
    from modin.config import IsExperimental

    IsExperimental.put(mode == "experimental")
Example no. 9
        Returns
        -------
        generator
            Generator expression yielding ``(group_key, Series)`` tuples for
            iteration over the groups.
        """
        group_ids = self._index_grouped.keys()
        if self._axis == 0:
            return (
                (
                    k,
                    Series(
                        query_compiler=self._query_compiler.getitem_row_array(
                            self._index.get_indexer_for(self._index_grouped[k].unique())
                        )
                    ),
                )
                for k in (sorted(group_ids) if self._sort else group_ids)
            )
        else:
            return (
                (
                    k,
                    Series(
                        query_compiler=self._query_compiler.getitem_column_array(
                            self._index_grouped[k].unique()
                        )
                    ),
                )
                for k in (sorted(group_ids) if self._sort else group_ids)
            )


if IsExperimental.get():
    from modin.experimental.cloud.meta_magic import make_wrapped_class

    make_wrapped_class(DataFrameGroupBy, "make_dataframe_groupby_wrapper")
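The generators above are what back plain Python iteration over a groupby object. A small usage sketch with made-up data:

import modin.pandas as pd

df = pd.DataFrame({"key": ["a", "b", "a"], "val": [1, 2, 3]})
# Iterating yields (group_key, Series) pairs, sorted by key when
# sort=True (the default), matching the generators above.
for key, group in df.groupby("key")["val"]:
    print(key, group.tolist())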
Example no. 10
  performance.

* Although experimental backends and engines are available through the
  `modin.pandas` module when the environment variable `MODIN_EXPERIMENTAL=true`
  is set, experimental I/O functions are available only through the
  `modin.experimental.pandas` module.

Examples
--------
>>> import modin.experimental.pandas as pd
>>> df = pd.read_csv_glob("data*.csv")
"""

from modin.config import IsExperimental

IsExperimental.put(True)

# import numpy_wrap as early as possible to intercept all "import numpy" statements
# in the user code
from .numpy_wrap import _CAUGHT_NUMPY  # noqa F401
from modin.pandas import *  # noqa F401, F403
from .io_exp import (  # noqa F401
    read_sql,
    read_csv_glob,
    read_pickle_distributed,
    to_pickle_distributed,
)
import warnings

setattr(DataFrame, "to_pickle_distributed", to_pickle_distributed)  # noqa: F405

warnings.warn(
    "Thank you for using the Modin Experimental pandas API."
)
Example no. 11
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
    partition_column: Optional[str] = None,
    lower_bound: Optional[int] = None,
    upper_bound: Optional[int] = None,
    max_sessions: Optional[int] = None,
) -> DataFrame:
    """
    General documentation is available in `modin.pandas.read_sql`.

    This experimental feature provides distributed reading of a SQL query or database table.

    Parameters
    ----------
    sql : str or SQLAlchemy Selectable (select or text object)
        SQL query to be executed or a table name.
    con : SQLAlchemy connectable, str, or sqlite3 connection
        Using SQLAlchemy makes it possible to use any DB supported by that
        library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible
        for engine disposal and connection closure for the SQLAlchemy
        connectable; str connections are closed automatically. See
        `here <https://docs.sqlalchemy.org/en/13/core/connections.html>`_.
    index_col : str or list of str, optional
        Column(s) to set as index(MultiIndex).
    coerce_float : bool, default: True
        Attempts to convert values of non-string, non-numeric objects (like
        decimal.Decimal) to floating point, useful for SQL result sets.
    params : list, tuple or dict, optional
        List of parameters to pass to execute method. The syntax used to pass
        parameters is database driver dependent. Check your database driver
        documentation for which of the five syntax styles, described in PEP 249's
        paramstyle, is supported. E.g. for psycopg2, which uses the %(name)s
        style, pass ``params={'name': 'value'}``.
    parse_dates : list or dict, optional
        - List of column names to parse as dates.
        - Dict of ``{column_name: format string}`` where format string is
          strftime compatible in case of parsing string times, or is one of
          (D, s, ns, ms, us) in case of parsing integer timestamps.
        - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
          to the keyword arguments of :func:`pandas.to_datetime`
          Especially useful with databases without native Datetime support,
          such as SQLite.
    columns : list, optional
        List of column names to select from SQL table (only used when reading
        a table).
    chunksize : int, optional
        If specified, return an iterator where `chunksize` is the
        number of rows to include in each chunk.
    partition_column : str, optional
        Column used to split the data between the workers (MUST be an INTEGER column).
    lower_bound : int, optional
        The minimum value to be requested from the partition_column.
    upper_bound : int, optional
        The maximum value to be requested from the partition_column.
    max_sessions : int, optional
        The maximum number of simultaneous connections allowed.

    Returns
    -------
    modin.DataFrame
    """
    Engine.subscribe(_update_engine)
    assert IsExperimental.get(), "This only works in experimental mode"
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return DataFrame(query_compiler=FactoryDispatcher.read_sql(**kwargs))
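A hedged usage sketch of the partitioned read; the connection URI, table, and column names are made up, and `col1` stands in for the INTEGER partitioning column the docstring requires:

import modin.experimental.pandas as pd

df = pd.read_sql(
    "SELECT * FROM test_table",                      # query or table name
    "postgresql://user:password@localhost:5432/db",  # hypothetical URI
    partition_column="col1",  # must be an INTEGER column
    lower_bound=0,
    upper_bound=100000,
    max_sessions=10,
)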