Esempio n. 1
0
def test_from_partitions(axis, index, columns, row_lengths, column_widths):
    """
    Check that ``from_partitions`` rebuilds the frame produced by ``pandas.concat``.

    Two identical pandas frames are handed to the active engine (Ray or Dask)
    and assembled back with ``from_partitions``. The ``index``, ``columns``,
    ``row_lengths`` and ``column_widths`` hints are forwarded only when the
    corresponding test parameter asks for them; otherwise ``None`` is passed.

    Parameters
    ----------
    axis : {None, 0, 1}
        Axis forwarded to ``from_partitions``; ``None`` is compared against a
        concatenation along axis 1.
    index : str or None
        The string "index" requests passing the expected index.
    columns : str or None
        The string "columns" requests passing the expected columns.
    row_lengths : object or None
        Any non-``None`` value requests passing explicit row lengths.
    column_widths : object or None
        Any non-``None`` value requests passing explicit column widths.
    """
    num_rows, num_cols = 2**16, 2**8
    data = np.random.randint(0, 100, size=(num_rows, num_cols))
    df1, df2 = pandas.DataFrame(data), pandas.DataFrame(data)
    concat_axis = 1 if axis is None else axis
    expected_df = pandas.concat([df1, df2], axis=concat_axis)

    index = expected_df.index if index == "index" else None
    columns = expected_df.columns if columns == "columns" else None

    # Along axis 0 the two frames are stacked row-wise, otherwise column-wise
    if row_lengths is not None:
        row_lengths = [num_rows, num_rows] if axis == 0 else [num_rows]
    if column_widths is not None:
        column_widths = [num_cols] if axis == 0 else [num_cols, num_cols]

    frames = [df1, df2]
    if Engine.get() == "Ray":
        futures = [ray.put(frame) for frame in frames]
        if axis is None:
            # a 2D layout is expected when no axis is given
            futures = [futures]
    if Engine.get() == "Dask":
        futures = default_client().scatter(frames, hash=False)
        if axis is None:
            futures = [futures]

    actual_df = from_partitions(
        futures,
        axis,
        index=index,
        columns=columns,
        row_lengths=row_lengths,
        column_widths=column_widths,
    )
    df_equals(expected_df, actual_df)
Esempio n. 2
0
def test_unwrap_partitions(axis):
    """
    Check that ``unwrap_partitions`` exposes the objects backing a Modin frame.

    With ``axis=None`` every unwrapped entry is compared with the ``oid``
    (Ray) or ``future`` (Dask) attribute of the matching internal partition.
    For any other ``axis`` the unwrapped axis partitions are fetched with
    ``get_func`` and compared with axis partitions built directly through the
    frame's partition manager.

    Parameters
    ----------
    axis : {None, 0, 1}
        Value forwarded to ``unwrap_partitions``.
    """
    data = np.random.randint(0, 100, size=(2**16, 2**8))
    df = pd.DataFrame(data)
    modin_frame = df._query_compiler._modin_frame

    if axis is None:
        expected_partitions = modin_frame._partitions
        actual_partitions = np.array(unwrap_partitions(df, axis=axis))
        # Both grid dimensions must agree; the second comparison used to
        # compare the expected shape with itself and could never fail.
        assert (expected_partitions.shape[0] == actual_partitions.shape[0] and
                expected_partitions.shape[1] == actual_partitions.shape[1])
        for row_idx in range(expected_partitions.shape[0]):
            for col_idx in range(expected_partitions.shape[1]):
                if Engine.get() == "Ray":
                    assert (expected_partitions[row_idx][col_idx].oid ==
                            actual_partitions[row_idx][col_idx])
                if Engine.get() == "Dask":
                    assert (expected_partitions[row_idx][col_idx].future ==
                            actual_partitions[row_idx][col_idx])
    else:
        # ``axis ^ 1`` flips 0 <-> 1 before building the reference partitions
        expected_axis_partitions = modin_frame._partition_mgr_cls.axis_partition(
            modin_frame._partitions, axis ^ 1)
        expected_axis_partitions = [
            axis_partition.force_materialization().unwrap(squeeze=True)
            for axis_partition in expected_axis_partitions
        ]
        actual_axis_partitions = unwrap_partitions(df, axis=axis)
        assert len(expected_axis_partitions) == len(actual_axis_partitions)
        for item_idx in range(len(expected_axis_partitions)):
            if Engine.get() in ["Ray", "Dask"]:
                df_equals(
                    get_func(expected_axis_partitions[item_idx]),
                    get_func(actual_axis_partitions[item_idx]),
                )
Esempio n. 3
0
        def __update_engine(self, _):
            """
            Swap in the numpy module that matches the current engine.

            For engines listed in ``REMOTE_ENGINES`` the ``"numpy"`` entry of the
            cloud connection's ``modules`` is passed to ``__swap_numpy``;
            otherwise ``__swap_numpy`` is called without arguments.

            Parameters
            ----------
            _ : object
                Unused.
            """
            if Engine.get() not in REMOTE_ENGINES:
                self.__swap_numpy()
                return

            # imported lazily: only needed when a remote engine is selected
            from modin.experimental.cloud import get_connection

            remote_numpy = get_connection().modules["numpy"]
            self.__swap_numpy(remote_numpy)
Esempio n. 4
0
def test_from_sql_distributed(make_sql_connection):  # noqa: F811
    """
    Compare partitioned ``pd.read_sql`` with ``pandas.read_sql`` on Ray.

    The same table is read once through a query string and once through the
    table name; both results must equal the pandas frame. On any engine other
    than Ray the test body does nothing.

    Parameters
    ----------
    make_sql_connection : callable
        Fixture called with a file name and a table name to obtain a connection.
    """
    if Engine.get() != "Ray":
        return

    filename = "test_from_sql_distributed.db"
    table = "test_from_sql_distributed"
    conn = make_sql_connection(filename, table)
    query = "select * from {0}".format(table)

    # options shared by both partitioned reads
    distributed_options = {
        "partition_column": "col1",
        "lower_bound": 0,
        "upper_bound": 6,
        "max_sessions": 2,
    }

    pandas_df = pandas.read_sql(query, conn)
    modin_df_from_query = pd.read_sql(query, conn, **distributed_options)
    modin_df_from_table = pd.read_sql(table, conn, **distributed_options)

    df_equals(modin_df_from_query, pandas_df)
    df_equals(modin_df_from_table, pandas_df)
Esempio n. 5
0
    def update_class(_):
        """
        Rebind ``result.__real_cls__`` to match the current engine.

        For engines listed in ``REMOTE_ENGINES`` the attribute is set to the
        object produced by calling the ``rpyc_proxy`` member named
        ``rpyc_wrapper_name`` with ``result``; otherwise it is set to
        ``result`` itself.

        Parameters
        ----------
        _ : object
            Unused.
        """
        if Engine.get() not in REMOTE_ENGINES:
            result.__real_cls__ = result
            return

        # imported lazily: only needed when a remote engine is selected
        from . import rpyc_proxy

        make_wrapper = getattr(rpyc_proxy, rpyc_wrapper_name)
        result.__real_cls__ = make_wrapper(result)
Esempio n. 6
0
 def _read_sql(cls, **kwargs):
     """
     Read SQL data through ``cls.io_cls.read_sql``.

     On every engine except Ray the partitioning options
     (``partition_column``, ``lower_bound``, ``upper_bound``, ``max_sessions``)
     are removed from `kwargs` before the call; a warning is issued for each
     removed option whose value is not ``None``.

     Parameters
     ----------
     **kwargs : dict
         Arguments forwarded to ``cls.io_cls.read_sql``.

     Returns
     -------
     object
         Whatever ``cls.io_cls.read_sql`` returns.
     """
     if Engine.get() != "Ray":
         distributed_only_options = (
             "partition_column",
             "lower_bound",
             "upper_bound",
             "max_sessions",
         )
         for option in distributed_only_options:
             if option not in kwargs:
                 continue
             # the option is dropped either way; warn only when it was actually set
             if kwargs.pop(option) is not None:
                 warnings.warn(
                     "Distributed read_sql() was only implemented for Ray engine."
                 )
     return cls.io_cls.read_sql(**kwargs)
Esempio n. 7
0
    def _update_factory(cls, _):
        """
        Update and prepare factory with a new one specified via Modin config.

        Parameters
        ----------
        _ : object
            This parameter serves the compatibility purpose.
            Does not affect the result.

        Raises
        ------
        FactoryNotFoundError
            If no matching factory exists and experimental mode is off.
        """
        factory_name = get_current_backend() + "Factory"
        try:
            cls.__factory = getattr(factories, factory_name)
        except AttributeError:
            if IsExperimental.get():
                # missing factories are allowed in experimental mode only
                cls.__factory = StubFactory.set_failing_name(factory_name)
                return

            experimental_only = hasattr(factories, "Experimental" + factory_name)
            if experimental_only:
                msg = (
                    "{0} on {1} is only accessible through the experimental API.\nRun "
                    "`import modin.experimental.pandas as pd` to use {0} on {1}."
                )
            else:
                msg = (
                    "Cannot find a factory for partition '{}' and execution engine '{}'. "
                    "Potential reason might be incorrect environment variable value for "
                    f"{Backend.varname} or {Engine.varname}")
            raise FactoryNotFoundError(msg.format(Backend.get(), Engine.get()))
        else:
            cls.__factory.prepare()
Esempio n. 8
0
def test_from_partitions(axis):
    """
    Check that ``from_partitions`` rebuilds the frame produced by ``pandas.concat``.

    Two identical pandas frames are handed to the active engine (Ray or Dask)
    and assembled back with ``from_partitions``.

    Parameters
    ----------
    axis : {None, 0, 1}
        Axis forwarded to ``from_partitions``; ``None`` is compared against a
        concatenation along axis 1.
    """
    data = np.random.randint(0, 100, size=(2**16, 2**8))
    df1, df2 = pandas.DataFrame(data), pandas.DataFrame(data)
    concat_axis = 1 if axis is None else axis
    expected_df = pandas.concat([df1, df2], axis=concat_axis)

    frames = [df1, df2]
    if Engine.get() == "Ray":
        futures = [ray.put(frame) for frame in frames]
        if axis is None:
            # a 2D layout is expected when no axis is given
            futures = [futures]
    if Engine.get() == "Dask":
        futures = get_client().scatter(frames, hash=False)
        if axis is None:
            futures = [futures]

    actual_df = from_partitions(futures, axis)
    df_equals(expected_df, actual_df)
Esempio n. 9
0
def train(
    params: Dict,
    dtrain: DMatrix,
    *args,
    evals=(),
    num_actors: Optional[int] = None,
    evals_result: Optional[Dict] = None,
    **kwargs,
):
    """
    Run distributed training of XGBoost model.

    During work it evenly distributes `dtrain` between workers according
    to IP addresses partitions (in case of not even distribution of `dtrain`
    over nodes, some partitions will be re-distributed between nodes),
    runs xgb.train on each worker for subset of `dtrain` and reduces training results
    of each worker using Rabit Context.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : modin.experimental.xgboost.DMatrix
        Data to be trained against.
    *args : iterable
        Other parameters for `xgboost.train`.
    evals : list of pairs (modin.experimental.xgboost.DMatrix, str), default: empty
        List of validation sets for which metrics will be evaluated during training.
        Validation metrics will help us track the performance of the model.
    num_actors : int, optional
        Number of actors for training. If unspecified, this value will be
        computed automatically.
    evals_result : dict, optional
        Dict to store evaluation results in.
    **kwargs : dict
        Other parameters are the same as `xgboost.train`.

    Returns
    -------
    modin.experimental.xgboost.Booster
        A trained booster.

    Raises
    ------
    ValueError
        If the current engine is not Ray.
    """
    LOGGER.info("Training started")

    # Only the Ray engine is supported; bail out before importing its backend
    if Engine.get() != "Ray":
        raise ValueError("Current version supports only Ray engine.")
    from .xgboost_ray import _train

    assert isinstance(
        dtrain, DMatrix
    ), f"Type of `dtrain` is {type(dtrain)}, but expected {DMatrix}."

    training_output = _train(
        dtrain, num_actors, params, *args, evals=evals, **kwargs
    )
    # The history is copied out only when the caller supplied a dict for it
    if isinstance(evals_result, dict):
        evals_result.update(training_output["history"])

    LOGGER.info("Training finished")
    booster_file = training_output["booster"]
    return Booster(model_file=booster_file)
Esempio n. 10
0
def train(
    params: Dict,
    dtrain: DMatrix,
    *args,
    evals=(),
    num_actors: Optional[int] = None,
    evals_result: Optional[Dict] = None,
    **kwargs,
):
    """
    Train XGBoost model.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : DMatrix
        Data to be trained against.
    *args : iterable
        Extra positional arguments forwarded to the training backend.
    evals : list of pairs (DMatrix, string)
        List of validation sets for which metrics will be evaluated during training.
        Validation metrics will help us track the performance of the model.
    num_actors : int, default: None
        Number of actors for training. If it's None, this value will be
        computed automatically.
    evals_result : dict, default: None
        Dict to store evaluation results in.
    **kwargs : dict
        Other parameters are the same as `xgboost.train`.

    Returns
    -------
    modin.experimental.xgboost.Booster
        A trained booster.

    Raises
    ------
    ValueError
        If the current engine is not Ray.
    """
    LOGGER.info("Training started")

    # Only the Ray engine is supported; bail out before importing its backend
    if Engine.get() != "Ray":
        raise ValueError("Current version supports only Ray engine.")
    from .xgboost_ray import _train

    assert isinstance(
        dtrain, DMatrix
    ), f"Type of `dtrain` is {type(dtrain)}, but expected {DMatrix}."

    training_output = _train(
        dtrain, num_actors, params, *args, evals=evals, **kwargs
    )
    # The history is copied out only when the caller supplied a dict for it
    if isinstance(evals_result, dict):
        evals_result.update(training_output["history"])

    LOGGER.info("Training finished")
    booster_file = training_output["booster"]
    return Booster(model_file=booster_file)
Esempio n. 11
0
def predict(
    model,
    data: ModinDMatrix,
    nthread: Optional[int] = cpu_count(),
    evenly_data_distribution: Optional[bool] = True,
    **kwargs,
):
    """
    Run prediction with a trained booster.

    Parameters
    ----------
    model : xgb.Booster or dict
        The trained model: a Booster or a dictionary returned by
        `modin.experimental.xgboost.train` (its ``"booster"`` entry is used).
    data : ModinDMatrix
        Input data used for prediction.
    nthread : int
        Number of threads for using in each node. By default it is equal to
        number of threads on master node.
    evenly_data_distribution : boolean, default True
        Whether make evenly distribution of partitions between nodes or not.
        In case `False` minimal datatransfer between nodes will be provided
        but the data may not be evenly distributed.
    **kwargs : dict
        Extra keyword arguments forwarded to the prediction backend.

    Returns
    -------
    numpy.array
        Array with prediction results.

    Raises
    ------
    ValueError
        If the current engine is not Ray.
    TypeError
        If `model` is neither an ``xgb.Booster`` nor a dict.
    """
    LOGGER.info("Prediction started")

    # Only the Ray engine is supported; bail out before importing its backend
    if Engine.get() != "Ray":
        raise ValueError("Current version supports only Ray engine.")
    from .xgboost_ray import _predict

    if not isinstance(model, (xgb.Booster, dict)):
        raise TypeError(
            f"Expected types for `model` xgb.Booster or dict, but presented type is {type(model)}"
        )
    booster = model if isinstance(model, xgb.Booster) else model["booster"]

    prediction = _predict(
        booster, data, nthread, evenly_data_distribution, **kwargs
    )
    LOGGER.info("Prediction finished")

    return prediction
Esempio n. 12
0
def train(
    params: Dict,
    dtrain: ModinDMatrix,
    *args,
    evals=(),
    nthread: Optional[int] = cpu_count(),
    **kwargs,
):
    """
    Train XGBoost model.

    Parameters
    ----------
    params : dict
        Booster params.
    dtrain : ModinDMatrix
        Data to be trained against.
    *args : iterable
        Extra positional arguments forwarded to the training backend.
    evals : list of pairs (ModinDMatrix, string)
        List of validation sets for which metrics will be evaluated during training.
        Validation metrics will help us track the performance of the model.
    nthread : int, default: number of threads on master node
        Number of threads for using in each node.
    **kwargs : dict
        Other parameters are the same as `xgboost.train` except for
        `evals_result`, which is returned as part of function return value
        instead of argument.

    Returns
    -------
    dict
        A dictionary containing trained booster and evaluation history.
        `history` field is the same as `eval_result` from `xgboost.train`.

        .. code-block:: python

            {'booster': xgboost.Booster,
             'history': {'train': {'logloss': ['0.48253', '0.35953']},
                         'eval': {'logloss': ['0.480385', '0.357756']}}}

    Raises
    ------
    ValueError
        If the current engine is not Ray.
    """
    LOGGER.info("Training started")

    # Only the Ray engine is supported; bail out before importing its backend
    if Engine.get() != "Ray":
        raise ValueError("Current version supports only Ray engine.")
    from .xgboost_ray import _train

    training_output = _train(
        dtrain, nthread, params, *args, evals=evals, **kwargs
    )
    LOGGER.info("Training finished")
    return training_output
Esempio n. 13
0
def test_from_sql_distributed(make_sql_connection):  # noqa: F811
    """
    Compare partitioned ``pd.read_sql`` with ``pandas.read_sql`` on Ray.

    The test is currently marked as an expected failure on Ray (GH#2194) and
    does nothing on other engines.

    Parameters
    ----------
    make_sql_connection : callable
        Fixture called with a file name and a table name to obtain a connection.
    """
    if Engine.get() != "Ray":
        return

    # ``pytest.xfail`` stops the test here; the statements below record the
    # intended check and will run again once the xfail is removed.
    pytest.xfail("Distributed read_sql is broken, see GH#2194")
    filename = "test_from_sql_distributed.db"
    table = "test_from_sql_distributed"
    conn = make_sql_connection(filename, table)
    query = "select * from {0}".format(table)

    # options shared by both partitioned reads
    distributed_options = {
        "partition_column": "col1",
        "lower_bound": 0,
        "upper_bound": 6,
    }

    pandas_df = pandas.read_sql(query, conn)
    modin_df_from_query = pd.read_sql(query, conn, **distributed_options)
    modin_df_from_table = pd.read_sql(table, conn, **distributed_options)

    df_equals(modin_df_from_query, pandas_df)
    df_equals(modin_df_from_table, pandas_df)
Esempio n. 14
0
    def predict(
        self,
        data: DMatrix,
        num_actors: Optional[int] = None,
        **kwargs,
    ):
        """
        Run distributed prediction with a trained booster.

        During work it evenly distributes `data` between workers,
        runs xgb.predict on each worker for subset of `data` and creates
        Modin DataFrame with prediction results.

        Parameters
        ----------
        data : modin.experimental.xgboost.DMatrix
            Input data used for prediction.
        num_actors : int, optional
            Number of actors for prediction. If unspecified, this value will be
            computed automatically.
        **kwargs : dict
            Other parameters are the same as `xgboost.Booster.predict`.

        Returns
        -------
        modin.pandas.DataFrame
            Modin DataFrame with prediction results.

        Raises
        ------
        ValueError
            If the current engine is not Ray.
        """
        LOGGER.info("Prediction started")

        # Only the Ray engine is supported; bail out before importing its backend
        if Engine.get() != "Ray":
            raise ValueError("Current version supports only Ray engine.")
        from .xgboost_ray import _predict

        assert isinstance(
            data, DMatrix
        ), f"Type of `data` is {type(data)}, but expected {DMatrix}."

        # the backend receives a copy of this booster, not the booster itself
        booster_copy = self.copy()
        prediction = _predict(booster_copy, data, num_actors, **kwargs)
        LOGGER.info("Prediction finished")

        return prediction
Esempio n. 15
0
def predict(
    model,
    data: ModinDMatrix,
    nthread: Optional[int] = cpu_count(),
    **kwargs,
):
    """
    Run prediction with a trained booster.

    Parameters
    ----------
    model : xgb.Booster or dict
        The trained model: a Booster or a dictionary returned by
        `modin.experimental.xgboost.train` (its ``"booster"`` entry is used).
    data : ModinDMatrix
        Input data used for prediction.
    nthread : int, default: number of threads on master node
        Number of threads for using in each node.
    **kwargs : dict
        Extra keyword arguments forwarded to the prediction backend.

    Returns
    -------
    modin.pandas.DataFrame
        Modin DataFrame with prediction results.

    Raises
    ------
    ValueError
        If the current engine is not Ray.
    TypeError
        If `model` is neither an ``xgb.Booster`` nor a dict.
    """
    LOGGER.info("Prediction started")

    # Only the Ray engine is supported; bail out before importing its backend
    if Engine.get() != "Ray":
        raise ValueError("Current version supports only Ray engine.")
    from .xgboost_ray import _predict

    if not isinstance(model, (xgb.Booster, dict)):
        raise TypeError(
            f"Expected types for `model` xgb.Booster or dict, but presented type is {type(model)}"
        )
    booster = model if isinstance(model, xgb.Booster) else model["booster"]

    prediction = _predict(booster, data, nthread, **kwargs)
    LOGGER.info("Prediction finished")

    return prediction
Esempio n. 16
0
 def _update_engine(cls, _):
     """
     Look up the factory for the current backend and prepare it.

     When the factory is missing, a stub is installed in experimental mode;
     otherwise ``FactoryNotFoundError`` is raised.

     Parameters
     ----------
     _ : object
         Unused.

     Raises
     ------
     FactoryNotFoundError
         If no matching factory exists and experimental mode is off.
     """
     factory_name = get_current_backend() + "Factory"
     try:
         cls.__engine = getattr(factories, factory_name)
     except AttributeError:
         if IsExperimental.get():
             # missing factories are allowed in experimental mode only
             cls.__engine = StubFactory.set_failing_name(factory_name)
             return

         experimental_only = hasattr(factories, "Experimental" + factory_name)
         if experimental_only:
             msg = (
                 "{0} on {1} is only accessible through the experimental API.\nRun "
                 "`import modin.experimental.pandas as pd` to use {0} on {1}."
             )
         else:
             msg = (
                 "Cannot find a factory for partition '{}' and execution engine '{}'. "
                 "Potential reason might be incorrect environment variable value for "
                 f"{Backend.varname} or {Engine.varname}"
             )
         raise FactoryNotFoundError(msg.format(Backend.get(), Engine.get()))
     else:
         cls.__engine.prepare()
Esempio n. 17
0
    def predict(
        self,
        data: DMatrix,
        num_actors: Optional[int] = None,
        **kwargs,
    ):
        """
        Run prediction with a trained booster.

        Parameters
        ----------
        data : DMatrix
            Input data used for prediction.
        num_actors : int, default: None
            Number of actors for prediction. If it's None, this value will be
            computed automatically.
        **kwargs : dict
            Other parameters are the same as `xgboost.Booster.predict`.

        Returns
        -------
        modin.pandas.DataFrame
            Modin DataFrame with prediction results.

        Raises
        ------
        ValueError
            If the current engine is not Ray.
        """
        LOGGER.info("Prediction started")

        # Only the Ray engine is supported; bail out before importing its backend
        if Engine.get() != "Ray":
            raise ValueError("Current version supports only Ray engine.")
        from .xgboost_ray import _predict

        assert isinstance(
            data, DMatrix
        ), f"Type of `data` is {type(data)}, but expected {DMatrix}."

        # the backend receives a copy of this booster, not the booster itself
        booster_copy = self.copy()
        prediction = _predict(booster_copy, data, num_actors, **kwargs)
        LOGGER.info("Prediction finished")

        return prediction
Esempio n. 18
0
    def predict(
        self,
        data: DMatrix,
        **kwargs,
    ):
        """
        Run distributed prediction with a trained booster.

        During execution it runs ``xgb.predict`` on each worker for subset of `data`
        and creates Modin DataFrame with prediction results.

        Parameters
        ----------
        data : modin.experimental.xgboost.DMatrix
            Input data used for prediction.
        **kwargs : dict
            Other parameters are the same as for ``xgboost.Booster.predict``.

        Returns
        -------
        modin.pandas.DataFrame
            Modin DataFrame with prediction results.

        Raises
        ------
        ValueError
            If the current engine is not Ray.
        """
        LOGGER.info("Prediction started")

        # Only the Ray engine is supported; bail out before importing its backend
        if Engine.get() != "Ray":
            raise ValueError("Current version supports only Ray engine.")
        from .xgboost_ray import _predict

        assert isinstance(
            data, DMatrix
        ), f"Type of `data` is {type(data)}, but expected {DMatrix}."

        # the backend receives a copy of this booster, not the booster itself
        booster_copy = self.copy()
        prediction = _predict(booster_copy, data, **kwargs)
        LOGGER.info("Prediction finished")

        return prediction
Esempio n. 19
0
    def predict(
        self,
        data: DMatrix,
        nthread: Optional[int] = cpu_count(),
        **kwargs,
    ):
        """
        Run prediction with a trained booster.

        Parameters
        ----------
        data : DMatrix
            Input data used for prediction.
        nthread : int, default: number of threads on master node
            Number of threads for using in each node.
        **kwargs : dict
            Other parameters are the same as `xgboost.Booster.predict`.

        Returns
        -------
        modin.pandas.DataFrame
            Modin DataFrame with prediction results.

        Raises
        ------
        ValueError
            If the current engine is not Ray.
        """
        LOGGER.info("Prediction started")

        # Only the Ray engine is supported; bail out before importing its backend
        if Engine.get() != "Ray":
            raise ValueError("Current version supports only Ray engine.")
        from .xgboost_ray import _predict

        assert isinstance(
            data, DMatrix
        ), f"Type of `data` is {type(data)}, but expected {DMatrix}."

        # the backend receives a copy of this booster, not the booster itself
        booster_copy = self.copy()
        prediction = _predict(booster_copy, data, nthread, **kwargs)
        LOGGER.info("Prediction finished")

        return prediction
Esempio n. 20
0
# Licensed to Modin Development Team under one or more contributor license agreements.
# See the NOTICE file distributed with this work for additional information regarding
# copyright ownership.  The Modin Development Team licenses this file to you under the
# Apache License, Version 2.0 (the "License"); you may not use this file except in
# compliance with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import pytest
from modin.config import Engine

import modin.experimental.xgboost as xgb


@pytest.mark.skipif(
    Engine.get() == "Ray",
    reason="This test doesn't make sense on Ray backend.",
)
@pytest.mark.parametrize("func", ["train", "predict"])
def test_backend(func):
    """
    Call an xgboost entry point on a non-Ray engine.

    A ``ValueError`` from the call is tolerated; any other exception fails the
    test. The test also passes when nothing is raised.

    Parameters
    ----------
    func : str
        Name of the ``modin.experimental.xgboost`` attribute to call.
    """
    xgb_entry_point = getattr(xgb, func)
    try:
        xgb_entry_point({}, xgb.ModinDMatrix(None, None))
    except ValueError:
        # tolerated outcome on engines other than Ray
        pass
Esempio n. 21
0
import pandas
import pytest
import modin.experimental.pandas as pd
from modin.config import Engine
from modin.pandas.test.test_io import (  # noqa: F401
    df_equals,
    eval_io,
    make_sql_connection,
    _make_csv_file,
    teardown_test_files,
)
from modin.pandas.test.utils import get_unique_filename


@pytest.mark.skipif(
    Engine.get() == "Dask",
    reason="Dask does not have experimental API",
)
def test_from_sql_distributed(make_sql_connection):  # noqa: F811
    if Engine.get() == "Ray":
        pytest.xfail("Distributed read_sql is broken, see GH#2194")
        filename = "test_from_sql_distributed.db"
        table = "test_from_sql_distributed"
        conn = make_sql_connection(filename, table)
        query = "select * from {0}".format(table)

        pandas_df = pandas.read_sql(query, conn)
        modin_df_from_query = pd.read_sql(
            query, conn, partition_column="col1", lower_bound=0, upper_bound=6
        )
        modin_df_from_table = pd.read_sql(
Esempio n. 22
0
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import pandas
import pytest
import modin.experimental.pandas as pd
from modin.config import Engine
from modin.pandas.test.test_io import (  # noqa: F401
    df_equals,
    make_sql_connection,
)


@pytest.mark.skipif(
    Engine.get() == "Dask",
    reason="Dask does not have experimental API",
)
def test_from_sql_distributed(make_sql_connection):  # noqa: F811
    if Engine.get() == "Ray":
        pytest.xfail("Distributed read_sql is broken, see GH#2194")
        filename = "test_from_sql_distributed.db"
        table = "test_from_sql_distributed"
        conn = make_sql_connection(filename, table)
        query = "select * from {0}".format(table)

        pandas_df = pandas.read_sql(query, conn)
        modin_df_from_query = pd.read_sql(
            query, conn, partition_column="col1", lower_bound=0, upper_bound=6
        )
        modin_df_from_table = pd.read_sql(
Esempio n. 23
0
import numpy as np
import pandas
import pytest

import modin.pandas as pd
from modin.distributed.dataframe.pandas import unwrap_partitions, from_partitions
from modin.config import Engine, NPartitions
from modin.pandas.test.utils import df_equals
from modin.pandas.indexing import compute_sliced_len
from modin.data_management.factories.dispatcher import FactoryDispatcher

# Partition class reached through the factory currently selected by the dispatcher
PartitionClass = (FactoryDispatcher.get_factory().io_cls.frame_cls.
                  _partition_mgr_cls._partition_class)

# Engine-specific helpers: ``put_func`` hands an object over to the engine and
# ``get_func`` fetches the value back; ``FutureType`` is the engine's future class.
if Engine.get() == "Ray":
    import ray

    put_func = ray.put
    get_func = ray.get
    FutureType = ray.ObjectRef
elif Engine.get() == "Dask":
    from distributed.client import default_client
    from distributed import Future

    put_func = lambda x: default_client().scatter(x)  # noqa: E731
    get_func = lambda x: x.result()  # noqa: E731
    FutureType = Future
elif Engine.get() == "Python":
    # No remote storage on the Python engine: both helpers return their argument.
    # NOTE(review): no ``FutureType`` is assigned in the visible part of this
    # branch — confirm whether it is defined further down.
    put_func = lambda x: x  # noqa: E731
    get_func = lambda x: x  # noqa: E731
Esempio n. 24
0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import numpy as np
import pandas
import pytest

import modin.pandas as pd
from modin.distributed.dataframe.pandas import unwrap_partitions, from_partitions
from modin.config import Engine, NPartitions
from modin.pandas.test.utils import df_equals

# Import only the client library of the engine that is actually selected
if Engine.get() == "Ray":
    import ray
if Engine.get() == "Dask":
    from distributed.client import get_client

# Set the NPartitions config value to 4 for this module
NPartitions.put(4)


@pytest.mark.parametrize("axis", [None, 0, 1])
def test_unwrap_partitions(axis):
    data = np.random.randint(0, 100, size=(2**16, 2**8))
    df = pd.DataFrame(data)

    if axis is None:
        expected_partitions = df._query_compiler._modin_frame._partitions
        actual_partitions = np.array(unwrap_partitions(df, axis=axis))
Esempio n. 25
0
    def predict(
        self,
        data: DMatrix,
        **kwargs,
    ):
        """
        Run distributed prediction with a trained booster.

        During execution it runs ``xgb.predict`` on each worker for subset of `data`
        and creates Modin DataFrame with prediction results.

        Parameters
        ----------
        data : modin.experimental.xgboost.DMatrix
            Input data used for prediction.
        **kwargs : dict
            Other parameters are the same as for ``xgboost.Booster.predict``.

        Returns
        -------
        modin.pandas.DataFrame
            Modin DataFrame with prediction results.

        Raises
        ------
        ValueError
            If the current engine is not Ray, or if both the booster and `data`
            define ``feature_names`` and the two differ.
        """
        LOGGER.info("Prediction started")

        if Engine.get() == "Ray":
            from .xgboost_ray import _predict
        else:
            raise ValueError("Current version supports only Ray engine.")

        assert isinstance(
            data, DMatrix
        ), f"Type of `data` is {type(data)}, but expected {DMatrix}."

        if (
            self.feature_names is not None
            and data.feature_names is not None
            and self.feature_names != data.feature_names
        ):
            data_missing = set(self.feature_names) - set(data.feature_names)
            self_missing = set(data.feature_names) - set(self.feature_names)

            # Format the header before appending the name lists: feature names
            # may contain '{' or '}', which must not be parsed as format fields.
            msg = "feature_names mismatch: {0} {1}".format(
                self.feature_names, data.feature_names
            )

            if data_missing:
                msg += (
                    "\nexpected "
                    + ", ".join(str(s) for s in data_missing)
                    + " in input data"
                )

            if self_missing:
                msg += (
                    "\ntraining data did not have the following fields: "
                    + ", ".join(str(s) for s in self_missing)
                )

            raise ValueError(msg)

        # the backend receives a copy of this booster, not the booster itself
        result = _predict(self.copy(), data, **kwargs)
        LOGGER.info("Prediction finished")

        return result
Esempio n. 26
0
import pytest
import modin.experimental.pandas as pd
from modin.config import Engine
from modin.utils import get_current_execution
from modin.pandas.test.utils import (
    df_equals,
    get_unique_filename,
    teardown_test_files,
    test_data,
)
from modin.test.test_utils import warns_that_defaulting_to_pandas
from modin.pandas.test.utils import parse_dates_values_by_id, time_parsing_csv_path


@pytest.mark.skipif(
    Engine.get() == "Dask",
    reason="Dask does not have experimental API",
)
def test_from_sql_distributed(make_sql_connection):  # noqa: F811
    if Engine.get() == "Ray":
        filename = "test_from_sql_distributed.db"
        table = "test_from_sql_distributed"
        conn = make_sql_connection(filename, table)
        query = "select * from {0}".format(table)

        pandas_df = pandas.read_sql(query, conn)
        modin_df_from_query = pd.read_sql(
            query,
            conn,
            partition_column="col1",
            lower_bound=0,
Esempio n. 27
0
# NOTE(review): presumably the exclusive upper bound for generated random data — confirm
RAND_HIGH = 100
# Random generator with a fixed seed
random_state = np.random.RandomState(seed=42)

# Take the partition count from Modin's config when it provides ``NPartitions``,
# otherwise fall back to ``pd.DEFAULT_NPARTITIONS``
try:
    from modin.config import NPartitions

    NPARTITIONS = NPartitions.get()
except ImportError:
    NPARTITIONS = pd.DEFAULT_NPARTITIONS

# Benchmark settings: implementation, dataset size and engine
try:
    from modin.config import TestDatasetSize, AsvImplementation, Engine

    ASV_USE_IMPL = AsvImplementation.get()
    # a falsy dataset size setting falls back to "Small"
    ASV_DATASET_SIZE = TestDatasetSize.get() or "Small"
    ASV_USE_ENGINE = Engine.get()
except ImportError:
    # The same benchmarking code can be run for different versions of Modin, so in
    # case of an error importing important variables, we'll just use predefined values
    ASV_USE_IMPL = os.environ.get("MODIN_ASV_USE_IMPL", "modin")
    ASV_DATASET_SIZE = os.environ.get("MODIN_TEST_DATASET_SIZE", "Small")
    ASV_USE_ENGINE = os.environ.get("MODIN_ENGINE", "Ray")

# Normalize to lower case so the checks below are case-insensitive
ASV_USE_IMPL = ASV_USE_IMPL.lower()
ASV_DATASET_SIZE = ASV_DATASET_SIZE.lower()
ASV_USE_ENGINE = ASV_USE_ENGINE.lower()

# Reject unsupported settings at import time
assert ASV_USE_IMPL in ("modin", "pandas")
assert ASV_DATASET_SIZE in ("big", "small")
assert ASV_USE_ENGINE in ("ray", "dask", "python")
Esempio n. 28
0
# compliance with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import pytest
from modin.config import Engine

import modin.experimental.xgboost as xgb
import modin.pandas as pd


@pytest.mark.skipif(
    Engine.get() == "Ray",
    reason="This test doesn't make sense on Ray backend.",
)
@pytest.mark.skipif(
    Engine.get() == "Python",
    reason="This test doesn't make sense on not distributed backend (see issue #2938).",
)
def test_backend():
    """
    Call ``xgb.train`` on an engine other than Ray or Python.

    A ``ValueError`` from the call is tolerated; any other exception fails the
    test. The test also passes when nothing is raised.
    """
    try:
        features, labels = pd.DataFrame([0]), pd.DataFrame([0])
        xgb.train({}, xgb.DMatrix(features, labels))
    except ValueError:
        # tolerated outcome on the engines this test runs on
        pass