Example #1
def test_factory_switch():
    Engine.put("Test")
    assert FactoryDispatcher.get_factory() == PandasOnTestFactory
    assert FactoryDispatcher.get_factory().io_cls == "Foo"
    Engine.put("Python")  # revert engine to default

    Backend.put("Test")
    assert FactoryDispatcher.get_factory() == TestOnPythonFactory
    assert FactoryDispatcher.get_factory().io_cls == "Bar"
    Backend.put("Pandas")  # revert backend to default
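
For context, this is the switching mechanism the test above exercises. A minimal usage sketch, assuming Modin is installed and the chosen engine is available:

from modin.config import Engine

Engine.put("Dask")  # or "Ray" / "Python"; typically set before the first DataFrame is built

import modin.pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})  # now executed on the selected engine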
Example #2
def to_pickle_distributed(
    self,
    filepath_or_buffer: FilePathOrBuffer,
    compression: CompressionOptions = "infer",
    protocol: int = pickle.HIGHEST_PROTOCOL,
    storage_options: StorageOptions = None,
):
    """
    Pickle (serialize) object to file.

    If `*` is present in the filename, each partition is written to its own
    separate file; otherwise the default pandas implementation is used.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        File path where the pickled object will be stored.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default: 'infer'
        A string representing the compression to use in the output file. By
        default, infers from the file extension in the specified path. If
        compression mode is 'infer' and `filepath_or_buffer` is path-like, the
        compression mode is detected from the following extensions:
        '.gz', '.bz2', '.zip' or '.xz' (otherwise no compression is used).
        If a dict is given and the mode is 'zip' or inferred as 'zip', the
        other entries are passed as additional compression options.
    protocol : int, default: pickle.HIGHEST_PROTOCOL
        Int which indicates which protocol should be used by the pickler,
        default HIGHEST_PROTOCOL (see [1]_ paragraph 12.1.2). The possible
        values are 0, 1, 2, 3, 4, 5. A negative value for the protocol
        parameter is equivalent to setting its value to HIGHEST_PROTOCOL.

        .. [1] https://docs.python.org/3/library/pickle.html
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will be parsed by
        fsspec, e.g., starting "s3://", "gcs://". An error will be raised if providing
        this argument with a non-fsspec URL. See the fsspec and backend storage
        implementation docs for the set of allowed keys and values.
    """
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    obj = self
    Engine.subscribe(_update_engine)
    if isinstance(self, DataFrame):
        obj = self._query_compiler
    FactoryDispatcher.to_pickle_distributed(
        obj,
        filepath_or_buffer=filepath_or_buffer,
        compression=compression,
        protocol=protocol,
        storage_options=storage_options,
    )
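
A hypothetical usage sketch of the `*` convention described in the docstring above (experimental mode and a configured engine are assumed; file names are illustrative):

import modin.experimental.pandas as pd

df = pd.DataFrame({"a": range(10)})
df.to_pickle_distributed("out_*.pkl")  # "*" present: one pickle file per partition
df.to_pickle_distributed("out.pkl")    # no "*": falls back to the pandas implementation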
Example #3
def from_non_pandas(df, index, columns, dtype):
    """
    Convert a non-pandas DataFrame into Modin DataFrame.

    Parameters
    ----------
    df : object
        Non-pandas DataFrame.
    index : object
        Index for non-pandas DataFrame.
    columns : object
        Columns for non-pandas DataFrame.
    dtype : type
        Data type to force.

    Returns
    -------
    modin.pandas.DataFrame
        Converted DataFrame.
    """
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    new_qc = FactoryDispatcher.from_non_pandas(df, index, columns, dtype)
    if new_qc is not None:
        from .dataframe import DataFrame

        return DataFrame(query_compiler=new_qc)
    return new_qc
Example #4
def read_json(
    path_or_buf=None,
    orient=None,
    typ="frame",
    dtype=None,
    convert_axes=None,
    convert_dates=True,
    keep_default_dates=True,
    numpy=False,
    precise_float=False,
    date_unit=None,
    encoding=None,
    encoding_errors="strict",
    lines=False,
    chunksize=None,
    compression="infer",
    nrows: Optional[int] = None,
    storage_options: StorageOptions = None,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=FactoryDispatcher.read_json(**kwargs))
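
`_, _, _, kwargs = inspect.getargvalues(inspect.currentframe())` is a recurring idiom in these readers: `getargvalues` returns `(args, varargs, keywords, locals)`, and at the top of a function the frame's locals hold exactly the named parameters, so the last element doubles as a kwargs dict. A self-contained sketch:

import inspect

def example(a, b=1, c=None):
    # Only the locals mapping is needed; it contains just the parameters here.
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return kwargs

print(example(10))  # {'a': 10, 'b': 1, 'c': None}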
Example #5
def _read(**kwargs) -> DataFrame:
    """
    General documentation is available in `modin.pandas.read_csv`.

    This experimental feature provides parallel reading from multiple CSV files
    defined by a glob pattern. It works for local files only!

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments in `modin.pandas.read_csv`.

    Returns
    -------
    modin.DataFrame
    """
    Engine.subscribe(_update_engine)

    try:
        pd_obj = FactoryDispatcher.read_csv_glob(**kwargs)
    except AttributeError:
        raise AttributeError(
            "read_csv_glob() is only implemented for pandas on Ray.")

    # This happens when `read_csv` returns a TextFileReader object for iterating through
    if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
        reader = pd_obj.read
        pd_obj.read = lambda *args, **kwargs: DataFrame(query_compiler=reader(
            *args, **kwargs))
        return pd_obj

    return DataFrame(query_compiler=pd_obj)
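
A hypothetical call into this glob reader (experimental mode with the Ray engine assumed, per the error message above; the pattern is illustrative):

import modin.experimental.pandas as pd

# Every file matching the glob is read in parallel into one DataFrame.
df = pd.read_csv("data/part-*.csv")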
Example #6
def _read(**kwargs):
    """
    Read csv file from local disk.

    Parameters
    ----------
    **kwargs : dict
        Keyword arguments in pandas.read_csv.

    Returns
    -------
    modin.pandas.DataFrame
    """
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    pd_obj = FactoryDispatcher.read_csv(**kwargs)
    # This happens when `read_csv` returns a TextFileReader object for iterating through
    if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
        reader = pd_obj.read
        pd_obj.read = lambda *args, **kwargs: DataFrame(
            query_compiler=reader(*args, **kwargs)
        )
        return pd_obj
    return DataFrame(query_compiler=pd_obj)
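
When `chunksize` is passed, the dispatcher hands back a pandas `TextFileReader`, and the monkey-patched `read` above makes each chunk come back as a Modin DataFrame. A usage sketch (file name is illustrative):

import modin.pandas as pd

reader = pd.read_csv("big.csv", chunksize=1000)  # patched TextFileReader
for chunk in reader:  # iteration goes through the patched read()
    print(type(chunk), len(chunk))  # modin.pandas.DataFrame chunks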
Example #7
def read_clipboard(sep=r"\s+", **kwargs):  # pragma: no cover
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    kwargs.update(kwargs.pop("kwargs", {}))

    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=FactoryDispatcher.read_clipboard(**kwargs))
Example #8
def read_spss(
    path: Union[str, pathlib.Path],
    usecols: Union[Sequence[str], type(None)] = None,
    convert_categoricals: bool = True,
):
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=FactoryDispatcher.read_spss(
        path, usecols, convert_categoricals))
Example #9
def read_pickle(
    filepath_or_buffer: FilePathOrBuffer,
    compression: Optional[str] = "infer",
    storage_options: StorageOptions = None,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=FactoryDispatcher.read_pickle(**kwargs))
Example #10
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    if kwargs.get("chunksize") is not None:
        ErrorMessage.default_to_pandas("Parameters provided [chunksize]")
        df_gen = pandas.read_sql(**kwargs)
        return (DataFrame(query_compiler=FactoryDispatcher.from_pandas(df))
                for df in df_gen)
    return DataFrame(query_compiler=FactoryDispatcher.read_sql(**kwargs))
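
With `chunksize` set, this function deliberately defaults to pandas and wraps each chunk on the way out, yielding a generator of Modin DataFrames. A sketch of the caller's view (connection string is hypothetical):

import modin.pandas as pd

for chunk in pd.read_sql("SELECT * FROM t", "sqlite:///example.db", chunksize=500):
    print(len(chunk))  # each chunk is a modin.pandas.DataFrame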
Example #11
def read_feather(
    path,
    columns=None,
    use_threads: bool = True,
    storage_options: StorageOptions = None,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=FactoryDispatcher.read_feather(**kwargs))
Example #12
def read_sas(
    filepath_or_buffer,
    format=None,
    index=None,
    encoding=None,
    chunksize=None,
    iterator=False,
):  # pragma: no cover
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=FactoryDispatcher.read_sas(**kwargs))
Example #13
def read_sql_query(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    chunksize=None,
    dtype=None,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=FactoryDispatcher.read_sql_query(**kwargs))
Example #14
def to_pickle(
    obj: Any,
    filepath_or_buffer: Union[str, pathlib.Path],
    compression: Optional[str] = "infer",
    protocol: int = pickle.HIGHEST_PROTOCOL,
    storage_options: StorageOptions = None,
):
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    if isinstance(obj, DataFrame):
        obj = obj._query_compiler
    return FactoryDispatcher.to_pickle(obj,
                                       filepath_or_buffer,
                                       compression=compression,
                                       protocol=protocol)
Example #15
def from_pandas(df):
    """
    Convert a pandas DataFrame to a Modin DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The pandas DataFrame to convert.

    Returns
    -------
    modin.pandas.DataFrame
        A new Modin DataFrame object.
    """
    from modin.data_management.factories.dispatcher import FactoryDispatcher
    from .dataframe import DataFrame

    return DataFrame(query_compiler=FactoryDispatcher.from_pandas(df))
Example #16
def from_arrow(at):
    """
    Convert an Arrow Table to a Modin DataFrame.

    Parameters
    ----------
    at : Arrow Table
        The Arrow Table to convert from.

    Returns
    -------
    DataFrame
        A new Modin DataFrame object.
    """
    from modin.data_management.factories.dispatcher import FactoryDispatcher
    from .dataframe import DataFrame

    return DataFrame(query_compiler=FactoryDispatcher.from_arrow(at))
Example #17
def read_parquet(
    path,
    engine: str = "auto",
    columns=None,
    storage_options: StorageOptions = None,
    use_nullable_dtypes: bool = False,
    **kwargs,
):
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=FactoryDispatcher.read_parquet(
        path=path,
        engine=engine,
        columns=columns,
        storage_options=storage_options,
        use_nullable_dtypes=use_nullable_dtypes,
        **kwargs,
    ))
Example #18
def read_hdf(
    path_or_buf,
    key=None,
    mode: str = "r",
    errors: str = "strict",
    where=None,
    start: Optional[int] = None,
    stop: Optional[int] = None,
    columns=None,
    iterator=False,
    chunksize: Optional[int] = None,
    **kwargs,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    kwargs.update(kwargs.pop("kwargs", {}))

    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=FactoryDispatcher.read_hdf(**kwargs))
Example #19
def read_fwf(
    filepath_or_buffer: Union[str, pathlib.Path, IO[AnyStr]],
    colspecs="infer",
    widths=None,
    infer_nrows=100,
    **kwds,
):
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    kwargs.update(kwargs.pop("kwds", {}))
    pd_obj = FactoryDispatcher.read_fwf(**kwargs)
    # This happens when `read_fwf` returns a TextFileReader object for iterating through
    if isinstance(pd_obj, pandas.io.parsers.TextFileReader):
        reader = pd_obj.read
        pd_obj.read = lambda *args, **kwargs: DataFrame(query_compiler=reader(
            *args, **kwargs))
        return pd_obj
    return DataFrame(query_compiler=pd_obj)
Example #20
def read_stata(
    filepath_or_buffer,
    convert_dates=True,
    convert_categoricals=True,
    index_col=None,
    convert_missing=False,
    preserve_dtypes=True,
    columns=None,
    order_categoricals=True,
    chunksize=None,
    iterator=False,
    compression="infer",
    storage_options: StorageOptions = None,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=FactoryDispatcher.read_stata(**kwargs))
Example #21
def read_excel(
    io,
    sheet_name=0,
    header=0,
    names=None,
    index_col=None,
    usecols=None,
    squeeze=False,
    dtype=None,
    engine=None,
    converters=None,
    true_values=None,
    false_values=None,
    skiprows=None,
    nrows=None,
    na_values=None,
    keep_default_na=True,
    na_filter=True,
    verbose=False,
    parse_dates=False,
    date_parser=None,
    thousands=None,
    comment=None,
    skipfooter=0,
    convert_float=None,
    mangle_dupe_cols=True,
    storage_options: StorageOptions = None,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    intermediate = FactoryDispatcher.read_excel(**kwargs)
    if isinstance(intermediate, (OrderedDict, dict)):
        parsed = type(intermediate)()
        for key in intermediate.keys():
            parsed[key] = DataFrame(query_compiler=intermediate.get(key))
        return parsed
    else:
        return DataFrame(query_compiler=intermediate)
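
The dict branch above handles multi-sheet reads: with `sheet_name=None` pandas parses every sheet, so the dispatcher returns a mapping that is re-wrapped sheet by sheet. A sketch (workbook name is hypothetical):

import modin.pandas as pd

sheets = pd.read_excel("book.xlsx", sheet_name=None)  # dict: sheet name -> DataFrame
for name, frame in sheets.items():
    print(name, frame.shape)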
Example #22
def read_pickle_distributed(
    filepath_or_buffer: FilePathOrBuffer,
    compression: Optional[str] = "infer",
    storage_options: StorageOptions = None,
):
    """
    Load pickled pandas object from files.

    In experimental mode, `*` can be used in the filename. The files must contain
    parts of one DataFrame, which can be produced, for example, by the
    `to_pickle_distributed` function.
    Note: the number of partitions is equal to the number of input files.

    Parameters
    ----------
    filepath_or_buffer : str, path object or file-like object
        File path, URL, or buffer where the pickled object will be loaded from.
        URLs are accepted and are not limited to S3 and GCS.
    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default: 'infer'
        If 'infer' and `path_or_url` is path-like, then detect compression from
        the following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
        compression). If 'infer' and `path_or_url` is not path-like, then use
        None (i.e. no decompression).
    storage_options : dict, optional
        Extra options that make sense for a particular storage connection, e.g.
        host, port, username, password, etc., if using a URL that will be parsed by
        fsspec, e.g., starting "s3://", "gcs://". An error will be raised if providing
        this argument with a non-fsspec URL. See the fsspec and backend storage
        implementation docs for the set of allowed keys and values.

    Returns
    -------
    unpickled : same type as object stored in file
    """
    Engine.subscribe(_update_engine)
    assert IsExperimental.get(), "This only works in experimental mode"
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return DataFrame(query_compiler=FactoryDispatcher.read_pickle_distributed(
        **kwargs))
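
A hypothetical round trip with the distributed pickle pair (experimental mode assumed; the file pattern is illustrative):

import modin.experimental.pandas as pd

df = pd.DataFrame({"a": range(1000)})
df.to_pickle_distributed("parts_*.pkl")          # one file per partition
df2 = pd.read_pickle_distributed("parts_*.pkl")  # one partition per file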
Example #23
def read_gbq(
    query: str,
    project_id: Optional[str] = None,
    index_col: Optional[str] = None,
    col_order: Optional[List[str]] = None,
    reauth: bool = False,
    auth_local_webserver: bool = False,
    dialect: Optional[str] = None,
    location: Optional[str] = None,
    configuration: Optional[Dict[str, Any]] = None,
    credentials=None,
    use_bqstorage_api: Optional[bool] = None,
    progress_bar_type: Optional[str] = None,
    max_results: Optional[int] = None,
) -> DataFrame:
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    kwargs.update(kwargs.pop("kwargs", {}))

    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=FactoryDispatcher.read_gbq(**kwargs))
Example #24
def read_html(
    io,
    match=".+",
    flavor=None,
    header=None,
    index_col=None,
    skiprows=None,
    attrs=None,
    parse_dates=False,
    thousands=",",
    encoding=None,
    decimal=".",
    converters=None,
    na_values=None,
    keep_default_na=True,
    displayed_only=True,
):
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())

    from modin.data_management.factories.dispatcher import FactoryDispatcher

    Engine.subscribe(_update_engine)
    return DataFrame(query_compiler=FactoryDispatcher.read_html(**kwargs))
Example #25
def from_partitions(partitions,
                    axis,
                    index=None,
                    columns=None,
                    row_lengths=None,
                    column_widths=None):
    """
    Create DataFrame from remote partitions.

    Parameters
    ----------
    partitions : list
        A list of Ray.ObjectRef/Dask.Future to partitions depending on the engine used.
        Or a list of tuples of Ray.ObjectRef/Dask.Future to node ip addresses and partitions
        depending on the engine used (i.e. ``[(Ray.ObjectRef/Dask.Future, Ray.ObjectRef/Dask.Future), ...]``).
    axis : {None, 0 or 1}
        The ``axis`` parameter identifies what kind of partitions are passed.
        You have to set:

        * ``axis=0`` if you want to create DataFrame from row partitions
        * ``axis=1`` if you want to create DataFrame from column partitions
        * ``axis=None`` if you want to create DataFrame from 2D list of partitions
    index : sequence, optional
        The index for the DataFrame. Is computed if not provided.
    columns : sequence, optional
        The columns for the DataFrame. Is computed if not provided.
    row_lengths : list, optional
        The length of each partition in the rows. The "height" of
        each of the block partitions. Is computed if not provided.
    column_widths : list, optional
        The width of each partition in the columns. The "width" of
        each of the block partitions. Is computed if not provided.

    Returns
    -------
    modin.pandas.DataFrame
        DataFrame instance created from remote partitions.

    Notes
    -----
    Pass `index`, `columns`, `row_lengths` and `column_widths` to avoid triggering
    extra computations of the metadata when creating a DataFrame.
    """
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    factory = FactoryDispatcher.get_factory()

    partition_class = factory.io_cls.frame_cls._partition_mgr_cls._partition_class
    partition_frame_class = factory.io_cls.frame_cls
    partition_mgr_class = factory.io_cls.frame_cls._partition_mgr_cls

    # Since we store partitions of Modin DataFrame as a 2D NumPy array we need to place
    # passed partitions to 2D NumPy array to pass it to internal Modin Frame class.
    # `axis=None` - convert 2D list to 2D NumPy array
    if axis is None:
        if isinstance(partitions[0][0], tuple):
            parts = np.array(
                [[partition_class(partition, ip=ip) for ip, partition in row]
                 for row in partitions])
        else:
            parts = np.array(
                [[partition_class(partition) for partition in row]
                 for row in partitions])
    # `axis=0` - place row partitions to 2D NumPy array so that each row of the array is one row partition.
    elif axis == 0:
        if isinstance(partitions[0], tuple):
            parts = np.array([[partition_class(partition, ip=ip)]
                              for ip, partition in partitions])
        else:
            parts = np.array([[partition_class(partition)]
                              for partition in partitions])
    # `axis=1` - place column partitions to 2D NumPy array so that each column of the array is one column partition.
    elif axis == 1:
        if isinstance(partitions[0], tuple):
            parts = np.array([[
                partition_class(partition, ip=ip)
                for ip, partition in partitions
            ]])
        else:
            parts = np.array(
                [[partition_class(partition) for partition in partitions]])
    else:
        raise ValueError(
            f"Got unacceptable value of axis {axis}. Possible values are {0}, {1} or {None}."
        )

    labels_axis_to_sync = None
    if index is None:
        labels_axis_to_sync = 1
        index = partition_mgr_class.get_indices(0, parts,
                                                lambda df: df.axes[0])

    if columns is None:
        labels_axis_to_sync = 0 if labels_axis_to_sync is None else -1
        columns = partition_mgr_class.get_indices(1, parts,
                                                  lambda df: df.axes[1])

    frame = partition_frame_class(
        parts,
        index,
        columns,
        row_lengths=row_lengths,
        column_widths=column_widths,
    )

    if labels_axis_to_sync != -1:
        frame.synchronize_labels(axis=labels_axis_to_sync)

    return DataFrame(query_compiler=PandasQueryCompiler(frame))
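
A minimal usage sketch for the Ray engine (assuming Ray is initialized and the partitions already live in the object store):

import pandas
import ray
from modin.distributed.dataframe.pandas import from_partitions

parts = [ray.put(pandas.DataFrame({"a": [i, i + 1]})) for i in range(4)]
df = from_partitions(parts, axis=0)  # stack the four row partitions vertically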
Example #26
def read_sql(
    sql,
    con,
    index_col=None,
    coerce_float=True,
    params=None,
    parse_dates=None,
    columns=None,
    chunksize=None,
    partition_column: Optional[str] = None,
    lower_bound: Optional[int] = None,
    upper_bound: Optional[int] = None,
    max_sessions: Optional[int] = None,
) -> DataFrame:
    """
    General documentation is available in `modin.pandas.read_sql`.

    This experimental feature provides distributed reading from a sql file.

    Parameters
    ----------
    sql : str or SQLAlchemy Selectable (select or text object)
        SQL query to be executed or a table name.
    con : SQLAlchemy connectable, str, or sqlite3 connection
        Using SQLAlchemy makes it possible to use any DB supported by that
        library. If a DBAPI2 object, only sqlite3 is supported. The user is responsible
        for engine disposal and connection closure for the SQLAlchemy
        connectable; str connections are closed automatically. See
        `here <https://docs.sqlalchemy.org/en/13/core/connections.html>`_.
    index_col : str or list of str, optional
        Column(s) to set as index (MultiIndex).
    coerce_float : bool, default: True
        Attempts to convert values of non-string, non-numeric objects (like
        decimal.Decimal) to floating point, useful for SQL result sets.
    params : list, tuple or dict, optional
        List of parameters to pass to execute method. The syntax used to pass
        parameters is database driver dependent. Check your database driver
        documentation for which of the five syntax styles, described in PEP 249's
        paramstyle, is supported. E.g. for psycopg2, uses %(name)s so use
        params={'name': 'value'}.
    parse_dates : list or dict, optional
        - List of column names to parse as dates.
        - Dict of ``{column_name: format string}`` where format string is
          strftime compatible in case of parsing string times, or is one of
          (D, s, ns, ms, us) in case of parsing integer timestamps.
        - Dict of ``{column_name: arg dict}``, where the arg dict corresponds
          to the keyword arguments of :func:`pandas.to_datetime`
          Especially useful with databases without native Datetime support,
          such as SQLite.
    columns : list, optional
        List of column names to select from SQL table (only used when reading
        a table).
    chunksize : int, optional
        If specified, return an iterator where `chunksize` is the
        number of rows to include in each chunk.
    partition_column : str, optional
        Column used to share the data between the workers (MUST be an INTEGER column).
    lower_bound : int, optional
        The minimum value to be requested from the partition_column.
    upper_bound : int, optional
        The maximum value to be requested from the partition_column.
    max_sessions : int, optional
        The maximum number of simultaneous connections to use.

    Returns
    -------
    modin.DataFrame
    """
    Engine.subscribe(_update_engine)
    assert IsExperimental.get(), "This only works in experimental mode"
    _, _, _, kwargs = inspect.getargvalues(inspect.currentframe())
    return DataFrame(query_compiler=FactoryDispatcher.read_sql(**kwargs))
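
A hypothetical invocation showing the partitioning knobs (connection string and table are illustrative; experimental mode assumed):

import modin.experimental.pandas as pd

df = pd.read_sql(
    "SELECT * FROM events",
    "postgresql://user:pass@host/db",  # hypothetical connection
    partition_column="id",             # must be an INTEGER column
    lower_bound=0,
    upper_bound=1_000_000,
    max_sessions=8,                    # cap on simultaneous connections
)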
Example #27
def from_partitions(partitions, axis):
    """
    Create DataFrame from remote partitions.

    Parameters
    ----------
    partitions : list
        A list of Ray.ObjectRef/Dask.Future to partitions depending on the engine used.
        Or a list of tuples of Ray.ObjectRef/Dask.Future to node ip addresses and partitions
        depending on the engine used (i.e. ``[(Ray.ObjectRef/Dask.Future, Ray.ObjectRef/Dask.Future), ...]``).
    axis : {None, 0 or 1}
        The ``axis`` parameter identifies what kind of partitions are passed.
        You have to set:

        * ``axis=0`` if you want to create DataFrame from row partitions
        * ``axis=1`` if you want to create DataFrame from column partitions
        * ``axis=None`` if you want to create DataFrame from 2D list of partitions

    Returns
    -------
    modin.pandas.DataFrame
        DataFrame instance created from remote partitions.
    """
    from modin.data_management.factories.dispatcher import FactoryDispatcher

    factory = FactoryDispatcher.get_factory()

    partition_class = factory.io_cls.frame_cls._partition_mgr_cls._partition_class
    partition_frame_class = factory.io_cls.frame_cls
    partition_mgr_class = factory.io_cls.frame_cls._partition_mgr_cls

    # Since we store partitions of Modin DataFrame as a 2D NumPy array we need to place
    # passed partitions to 2D NumPy array to pass it to internal Modin Frame class.
    # `axis=None` - convert 2D list to 2D NumPy array
    if axis is None:
        if isinstance(partitions[0][0], tuple):
            parts = np.array(
                [
                    [partition_class(partition, ip=ip) for ip, partition in row]
                    for row in partitions
                ]
            )
        else:
            parts = np.array(
                [
                    [partition_class(partition) for partition in row]
                    for row in partitions
                ]
            )
    # `axis=0` - place row partitions to 2D NumPy array so that each row of the array is one row partition.
    elif axis == 0:
        if isinstance(partitions[0], tuple):
            parts = np.array(
                [[partition_class(partition, ip=ip)] for ip, partition in partitions]
            )
        else:
            parts = np.array([[partition_class(partition)] for partition in partitions])
    # `axis=1` - place column partitions to 2D NumPy array so that each column of the array is one column partition.
    elif axis == 1:
        if isinstance(partitions[0], tuple):
            parts = np.array(
                [[partition_class(partition, ip=ip) for ip, partition in partitions]]
            )
        else:
            parts = np.array([[partition_class(partition) for partition in partitions]])
    else:
        raise ValueError(
            f"Got unacceptable value of axis {axis}. Possible values are {0}, {1} or {None}."
        )

    index = partition_mgr_class.get_indices(0, parts, lambda df: df.axes[0])
    columns = partition_mgr_class.get_indices(1, parts, lambda df: df.axes[1])
    return DataFrame(
        query_compiler=PandasQueryCompiler(partition_frame_class(parts, index, columns))
    )
Example #28
def test_set_backends():
    set_backends("Bar", "Foo")
    assert FactoryDispatcher.get_factory() == FooOnBarFactory
Example #29
def test_default_factory():
    assert issubclass(FactoryDispatcher.get_factory(), factories.BaseFactory)
    assert FactoryDispatcher.get_factory().io_cls
Example #30
import numpy as np
import pandas
import pytest

import modin.pandas as pd
from modin.distributed.dataframe.pandas import unwrap_partitions, from_partitions
from modin.config import Engine, NPartitions
from modin.pandas.test.utils import df_equals
from modin.pandas.indexing import compute_sliced_len
from modin.data_management.factories.dispatcher import FactoryDispatcher

PartitionClass = (
    FactoryDispatcher.get_factory().io_cls.frame_cls._partition_mgr_cls._partition_class
)

if Engine.get() == "Ray":
    import ray

    put_func = ray.put
    get_func = ray.get
    FutureType = ray.ObjectRef
elif Engine.get() == "Dask":
    from distributed.client import default_client
    from distributed import Future

    put_func = lambda x: default_client().scatter(x)  # noqa: E731
    get_func = lambda x: x.result()  # noqa: E731
    FutureType = Future
elif Engine.get() == "Python":