Beispiel #1
0
def _update_engine(publisher: Parameter):
    """
    Set up the execution engine named by `publisher`.

    Parameters
    ----------
    publisher : Parameter
        Config parameter whose current value is the engine name.

    Raises
    ------
    ImportError
        If the engine name is not handled here and is not listed in
        ``_NOINIT_ENGINES``.

    Notes
    -----
    The engine is recorded in ``_is_first_update`` as already updated once
    the function completes without raising.
    """
    global dask_client
    from modin.config import Backend, CpuCount

    engine = publisher.get()

    if engine == "Ray":
        from modin.engines.ray.utils import initialize_ray

        if Backend.get() == "Omnisci":
            # The OmniSci backend uses a single worker per node, and that
            # worker is allowed to use every core.
            CpuCount.put(1)
            os.environ["OMP_NUM_THREADS"] = str(multiprocessing.cpu_count())
        if _is_first_update.get("Ray", True):
            initialize_ray()

    elif engine == "Dask":
        if _is_first_update.get("Dask", True):
            from modin.engines.dask.utils import initialize_dask

            initialize_dask()

    elif engine == "Cloudray":
        from modin.experimental.cloud import get_connection

        conn = get_connection()
        if not _is_first_update.get("Cloudray", True):
            # Already initialized remotely: only re-select the backends there.
            get_connection().modules["modin"].set_backends("Ray", Backend.get())
        else:

            @conn.teleport
            def init_remote_ray(partition):
                from ray import ray_constants
                import modin
                from modin.engines.ray.utils import initialize_ray

                modin.set_backends("Ray", partition)
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray(Backend.get())
            # Import the dispatcher module now so the IO class is initialized
            # here and does not skew read_csv() timings later on.
            import modin.data_management.factories.dispatcher  # noqa: F401

    elif engine == "Cloudpython":
        from modin.experimental.cloud import get_connection

        get_connection().modules["modin"].set_backends("Python")

    elif engine not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(engine))

    _is_first_update[engine] = False
Beispiel #2
0
def test_engine_switch():
    """
    Check that changing ``Engine`` or ``Backend`` switches the dispatcher's factory.

    Each config change is reverted in a ``finally`` clause so that a failing
    assertion does not leave the "Test" value active for later tests.
    """
    try:
        Engine.put("Test")
        assert EngineDispatcher.get_engine() == PandasOnTestFactory
        assert EngineDispatcher.get_engine().io_cls == "Foo"
    finally:
        Engine.put("Python")  # revert engine to default

    try:
        Backend.put("Test")
        assert EngineDispatcher.get_engine() == TestOnPythonFactory
        assert EngineDispatcher.get_engine().io_cls == "Bar"
    finally:
        Backend.put("Pandas")  # revert backend to default
Beispiel #3
0
    def _update_factory(cls, _):
        """
        Refresh the stored factory from the current Modin configuration.

        Parameters
        ----------
        _ : object
            Ignored; accepted only for signature compatibility.

        Raises
        ------
        FactoryNotFoundError
            If no factory with the expected name exists and
            ``IsExperimental`` is not set.
        """
        factory_name = get_current_backend() + "Factory"
        try:
            cls.__factory = getattr(factories, factory_name)
        except AttributeError:
            if IsExperimental.get():
                # Missing factories are tolerated in experimental mode only:
                # store what `StubFactory.set_failing_name` returns instead.
                cls.__factory = StubFactory.set_failing_name(factory_name)
                return
            if hasattr(factories, "Experimental" + factory_name):
                template = (
                    "{0} on {1} is only accessible through the experimental API.\nRun "
                    "`import modin.experimental.pandas as pd` to use {0} on {1}."
                )
            else:
                template = (
                    "Cannot find a factory for partition '{}' and execution engine '{}'. "
                    "Potential reason might be incorrect environment variable value for "
                    f"{Backend.varname} or {Engine.varname}"
                )
            raise FactoryNotFoundError(template.format(Backend.get(), Engine.get()))
        # Reached only when the lookup succeeded.
        cls.__factory.prepare()
Beispiel #4
0
    def read(cls, *args, **kwargs):
        """
        Read data according to the passed `args` and `kwargs`.

        Parameters
        ----------
        *args : iterable
            Positional arguments to be passed into `_read` function.
        **kwargs : dict
            Keyword arguments to be passed into `_read` function.

        Returns
        -------
        query_compiler : BaseQueryCompiler
            Query compiler with imported data for further processing.

        Raises
        ------
        NotImplementedError
            If the current backend is neither "Pandas" nor "Cudf".

        Notes
        -----
        `read` is a high-level function that calls the `_read` function specific
        to the defined backend, engine and dispatcher class with the passed
        parameters, and then performs some postprocessing on the resulting
        query_compiler object: columns reported as categorical are re-cast to
        their reported dtype via ``astype``.
        """
        query_compiler = cls._read(*args, **kwargs)
        # TODO (devin-petersohn): Make this section more general for non-pandas kernel
        # implementations.
        backend = Backend.get()
        if backend == "Pandas":
            import pandas as kernel_lib
        elif backend == "Cudf":
            import cudf as kernel_lib
        else:
            raise NotImplementedError(
                f"Post-processing of read results is not implemented for backend {backend!r}"
            )

        if not hasattr(query_compiler, "dtypes"):
            return query_compiler

        # Read `dtypes` once and reuse it for both the check and the mapping.
        dtypes = query_compiler.dtypes
        if not any(isinstance(t, kernel_lib.CategoricalDtype) for t in dtypes):
            return query_compiler
        return query_compiler.astype(
            {
                col: dtypes[col]
                for col in dtypes.index
                if isinstance(dtypes[col], kernel_lib.CategoricalDtype)
            }
        )
Beispiel #5
0
    def read(cls, *args, **kwargs):
        """
        Read data via ``cls._read`` and re-cast categorical columns.

        Parameters
        ----------
        *args : iterable
            Positional arguments forwarded to ``cls._read``.
        **kwargs : dict
            Keyword arguments forwarded to ``cls._read``.

        Returns
        -------
        object
            The object returned by ``cls._read``, or the result of calling
            ``astype`` on it when it exposes ``dtypes`` containing at least
            one categorical dtype.

        Raises
        ------
        NotImplementedError
            If the current backend is neither "Pandas" nor "Cudf".
        """
        query_compiler = cls._read(*args, **kwargs)
        # TODO (devin-petersohn): Make this section more general for non-pandas kernel
        # implementations.
        backend = Backend.get()
        if backend == "Pandas":
            import pandas as kernel_lib
        elif backend == "Cudf":
            import cudf as kernel_lib
        else:
            raise NotImplementedError(
                f"Post-processing of read results is not implemented for backend {backend!r}"
            )

        if not hasattr(query_compiler, "dtypes"):
            return query_compiler

        # Read `dtypes` once and reuse it for both the check and the mapping.
        dtypes = query_compiler.dtypes
        if not any(isinstance(t, kernel_lib.CategoricalDtype) for t in dtypes):
            return query_compiler
        return query_compiler.astype(
            {
                col: dtypes[col]
                for col in dtypes.index
                if isinstance(dtypes[col], kernel_lib.CategoricalDtype)
            }
        )
Beispiel #6
0
 def _update_engine(cls, _):
     """
     Refresh the stored engine factory from the current Modin configuration.

     Parameters
     ----------
     _ : object
         Ignored; accepted only for signature compatibility.

     Raises
     ------
     FactoryNotFoundError
         If no factory with the expected name exists and
         ``IsExperimental`` is not set.
     """
     factory_name = get_current_backend() + "Factory"
     try:
         cls.__engine = getattr(factories, factory_name)
     except AttributeError:
         if IsExperimental.get():
             # Missing factories are tolerated in experimental mode only:
             # store what `StubFactory.set_failing_name` returns instead.
             cls.__engine = StubFactory.set_failing_name(factory_name)
             return
         if hasattr(factories, "Experimental" + factory_name):
             template = (
                 "{0} on {1} is only accessible through the experimental API.\nRun "
                 "`import modin.experimental.pandas as pd` to use {0} on {1}."
             )
         else:
             template = (
                 "Cannot find a factory for partition '{}' and execution engine '{}'. "
                 "Potential reason might be incorrect environment variable value for "
                 f"{Backend.varname} or {Engine.varname}"
             )
         raise FactoryNotFoundError(template.format(Backend.get(), Engine.get()))
     # Reached only when the lookup succeeded.
     cls.__engine.prepare()
Beispiel #7
0
    def read_sql(cls, **kwargs):
        """Forward `kwargs` to ``_read_sql`` of the factory held in ``cls.__engine`` and return its result."""
        return cls.__engine._read_sql(**kwargs)

    @classmethod
    def read_fwf(cls, **kwargs):
        """Forward `kwargs` to ``_read_fwf`` of the factory held in ``cls.__engine`` and return its result."""
        return cls.__engine._read_fwf(**kwargs)

    @classmethod
    def read_sql_table(cls, **kwargs):
        """Forward `kwargs` to ``_read_sql_table`` of the factory held in ``cls.__engine`` and return its result."""
        return cls.__engine._read_sql_table(**kwargs)

    @classmethod
    def read_sql_query(cls, **kwargs):
        """Forward `kwargs` to ``_read_sql_query`` of the factory held in ``cls.__engine`` and return its result."""
        return cls.__engine._read_sql_query(**kwargs)

    @classmethod
    def read_spss(cls, **kwargs):
        """Forward `kwargs` to ``_read_spss`` of the factory held in ``cls.__engine`` and return its result."""
        return cls.__engine._read_spss(**kwargs)

    @classmethod
    def to_sql(cls, *args, **kwargs):
        """Forward all arguments to ``_to_sql`` of the factory held in ``cls.__engine`` and return its result."""
        return cls.__engine._to_sql(*args, **kwargs)

    @classmethod
    def to_pickle(cls, *args, **kwargs):
        """Forward all arguments to ``_to_pickle`` of the factory held in ``cls.__engine`` and return its result."""
        return cls.__engine._to_pickle(*args, **kwargs)


# Register `EngineDispatcher._update_engine` as a subscriber of both config
# parameters (presumably so it is invoked when either value changes — confirm
# against `Parameter.subscribe`).
Engine.subscribe(EngineDispatcher._update_engine)
Backend.subscribe(EngineDispatcher._update_engine)
Beispiel #8
0
def _update_engine(publisher: Parameter):
    """
    Set up the execution engine named by `publisher` and refresh partition defaults.

    Parameters
    ----------
    publisher : Parameter
        Config parameter whose current value is the engine name.

    Raises
    ------
    ImportError
        If the engine name is not handled here and is not listed in
        ``_NOINIT_ENGINES``.

    Notes
    -----
    Updates the module globals ``num_cpus`` (Ray and Cloudray branches only),
    ``dask_client`` (Dask branch) and ``DEFAULT_NPARTITIONS``, which is set to
    ``max(4, int(num_cpus))``. Branches that do not assign ``num_cpus`` reuse
    its previous global value.
    """
    global DEFAULT_NPARTITIONS, dask_client, num_cpus
    from modin.config import Backend, CpuCount

    engine = publisher.get()

    if engine == "Ray":
        import ray
        from modin.engines.ray.utils import initialize_ray

        if Backend.get() == "Omnisci":
            # The OmniSci backend uses a single worker per node, and that
            # worker is allowed to use every core.
            CpuCount.put(1)
            os.environ["OMP_NUM_THREADS"] = str(multiprocessing.cpu_count())
        if _is_first_update.get("Ray", True):
            initialize_ray()
        num_cpus = ray.cluster_resources()["CPU"]

    elif engine == "Dask":  # pragma: no cover
        from distributed.client import get_client

        on_main_thread = threading.current_thread().name == "MainThread"
        if on_main_thread and _is_first_update.get("Dask", True):
            import warnings

            warnings.warn("The Dask Engine for Modin is experimental.")

            try:
                # Reuse an existing client when there is one.
                dask_client = get_client()
            except ValueError:
                from distributed import Client

                dask_client = Client(n_workers=CpuCount.get())

    elif engine == "Cloudray":
        from modin.experimental.cloud import get_connection

        conn = get_connection()
        remote_ray = conn.modules["ray"]
        if not _is_first_update.get("Cloudray", True):
            # Already initialized remotely: only re-select the backends there.
            get_connection().modules["modin"].set_backends("Ray", Backend.get())
        else:

            @conn.teleport
            def init_remote_ray(partition):
                from ray import ray_constants
                import modin
                from modin.engines.ray.utils import initialize_ray

                modin.set_backends("Ray", partition)
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray(Backend.get())
            # Import the dispatcher module now so the IO class is initialized
            # here and does not skew read_csv() timings later on.
            import modin.data_management.factories.dispatcher  # noqa: F401

        num_cpus = remote_ray.cluster_resources()["CPU"]

    elif engine == "Cloudpython":
        from modin.experimental.cloud import get_connection

        get_connection().modules["modin"].set_backends("Python")

    elif engine not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(engine))

    _is_first_update[engine] = False
    DEFAULT_NPARTITIONS = max(4, int(num_cpus))
Beispiel #9
0
def initialize_ray(
    override_is_cluster=False,
    override_redis_address: str = None,
    override_redis_password: str = None,
):
    """
    Initialize Ray based on parameters, ``modin.config`` variables and internal defaults.

    Parameters
    ----------
    override_is_cluster : bool, default: False
        Whether to override the detection of Modin being run in a cluster
        and always assume this runs on cluster head node.
        This also overrides Ray worker detection and always runs the initialization
        function (runs from main thread only by default).
        If not specified, ``modin.config.IsRayCluster`` variable is used.
    override_redis_address : str, optional
        What Redis address to connect to when running in Ray cluster.
        If not specified, ``modin.config.RayRedisAddress`` is used.
    override_redis_password : str, optional
        What password to use when connecting to Redis.
        If not specified, ``modin.config.RayRedisPassword`` is used.

    Notes
    -----
    Regardless of whether ``ray.init`` was called here, the function runs
    ``_move_stdlib_ahead_of_site_packages`` locally and on all workers, runs
    ``_import_pandas`` on all workers, and sets ``NPartitions`` to the
    cluster's GPU count (Cudf backend) or CPU count (any other backend).
    """
    import ray

    # Initialize when Ray is not running yet, or when the caller forces cluster mode.
    if not ray.is_initialized() or override_is_cluster:
        cluster = override_is_cluster or IsRayCluster.get()
        redis_address = override_redis_address or RayRedisAddress.get()
        redis_password = override_redis_password or RayRedisPassword.get()

        if cluster:
            # We only start ray in a cluster setting for the head node.
            ray.init(
                address=redis_address or "auto",
                include_dashboard=False,
                ignore_reinit_error=True,
                _redis_password=redis_password,
            )
        else:
            from modin.error_message import ErrorMessage

            # This string is intentionally formatted this way. We want it indented in
            # the warning message.
            ErrorMessage.not_initialized(
                "Ray",
                """
    import ray
    ray.init()
""",
            )
            object_store_memory = Memory.get()
            # No explicit memory setting: derive the object store size from the system.
            if object_store_memory is None:
                virtual_memory = psutil.virtual_memory().total
                if sys.platform.startswith("linux"):
                    # On Linux the budget is the space fstatvfs reports as
                    # available in /dev/shm (block size * available blocks).
                    shm_fd = os.open("/dev/shm", os.O_RDONLY)
                    try:
                        shm_stats = os.fstatvfs(shm_fd)
                        system_memory = shm_stats.f_bsize * shm_stats.f_bavail
                        # Warn when /dev/shm offers less than ~99% of half of total RAM.
                        if system_memory / (virtual_memory / 2) < 0.99:
                            warnings.warn(
                                f"The size of /dev/shm is too small ({system_memory} bytes). The required size "
                                f"at least half of RAM ({virtual_memory // 2} bytes). Please, delete files in /dev/shm or "
                                "increase size of /dev/shm with --shm-size in Docker. Also, you can set "
                                "the required memory size for each Ray worker in bytes to MODIN_MEMORY environment variable."
                            )
                    finally:
                        # Always release the descriptor, even if fstatvfs or the warning raised.
                        os.close(shm_fd)
                else:
                    system_memory = virtual_memory
                # Take 60% of the budget, rounded down to a whole multiple of 1e9 bytes.
                object_store_memory = int(0.6 * system_memory // 1e9 * 1e9)
                # A result of 0 means 60% of the budget was below 1e9 bytes;
                # in that case just use the default in ray.
                if object_store_memory == 0:
                    object_store_memory = None
            else:
                object_store_memory = int(object_store_memory)

            ray_init_kwargs = {
                "num_cpus": CpuCount.get(),
                "num_gpus": GpuCount.get(),
                "include_dashboard": False,
                "ignore_reinit_error": True,
                "object_store_memory": object_store_memory,
                "address": redis_address,
                "_redis_password": redis_password,
                "_memory": object_store_memory,
            }
            ray.init(**ray_init_kwargs)

        if Backend.get() == "Cudf":
            from modin.engines.ray.cudf_on_ray.frame.gpu_manager import GPUManager
            from modin.engines.ray.cudf_on_ray.frame.partition_manager import (
                GPU_MANAGERS, )

            # Check that GPU_MANAGERS is empty because _update_engine can be called multiple times
            if not GPU_MANAGERS:
                # One remote GPUManager per configured GPU index.
                for i in range(GpuCount.get()):
                    GPU_MANAGERS.append(GPUManager.remote(i))
    # The steps below run on every call, including when Ray was already initialized.
    _move_stdlib_ahead_of_site_packages()
    ray.worker.global_worker.run_function_on_all_workers(
        _move_stdlib_ahead_of_site_packages)
    ray.worker.global_worker.run_function_on_all_workers(_import_pandas)
    num_cpus = int(ray.cluster_resources()["CPU"])
    num_gpus = int(ray.cluster_resources().get("GPU", 0))
    # Partition count follows the GPU count for Cudf and the CPU count otherwise.
    if Backend.get() == "Cudf":
        NPartitions._put(num_gpus)
    else:
        NPartitions._put(num_cpus)
Beispiel #10
0
def test_aggregate_error_checking():
    """Check the warnings and the error raised by ``aggregate`` for three kinds of input."""
    modin_df = pd.DataFrame(test_data["float_nan_data"])

    # A per-column mapping of reductions must emit a UserWarning.
    with pytest.warns(UserWarning):
        per_column_funcs = {
            modin_df.columns[0]: "sum",
            modin_df.columns[1]: "mean",
        }
        modin_df.aggregate(per_column_funcs)

    # So must the "cumproduct" function name.
    with pytest.warns(UserWarning):
        modin_df.aggregate("cumproduct")

    # An unknown function name must be rejected.
    with pytest.raises(ValueError):
        modin_df.aggregate("NOT_EXISTS")


@pytest.mark.xfail(
    Backend.get() == "Pandas",
    reason="DataFrame.apply(dict) raises an exception because of a bug in its"
    "implementation for pandas backend, this prevents us from catching the desired"
    "exception. You can track this bug at:"
    "https://github.com/modin-project/modin/issues/3221",
)
@pytest.mark.parametrize(
    "func",
    agg_func_values + agg_func_except_values,
    ids=agg_func_keys + agg_func_except_keys,
)
def test_apply_key_error(func):
    if not (is_list_like(func) or callable(func) or isinstance(func, str)):
        pytest.xfail(
            reason="Because index materialization is expensive Modin first"
            "checks the validity of the function itself and only then the engine level"
Beispiel #11
0
        return cls.__factory._read_sql_table(**kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._read_sql_query)
    def read_sql_query(cls, **kwargs):
        # Pure delegation to the factory held in `cls.__factory`. No docstring is
        # written here: the `_inherit_docstrings` decorator above presumably
        # supplies one from `BaseFactory._read_sql_query` (NOTE(review): confirm).
        return cls.__factory._read_sql_query(**kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._read_spss)
    def read_spss(cls, **kwargs):
        # Pure delegation to the factory held in `cls.__factory`. No docstring is
        # written here: the `_inherit_docstrings` decorator above presumably
        # supplies one from `BaseFactory._read_spss` (NOTE(review): confirm).
        return cls.__factory._read_spss(**kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._to_sql)
    def to_sql(cls, *args, **kwargs):
        # Pure delegation to the factory held in `cls.__factory`. No docstring is
        # written here: the `_inherit_docstrings` decorator above presumably
        # supplies one from `BaseFactory._to_sql` (NOTE(review): confirm).
        return cls.__factory._to_sql(*args, **kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._to_pickle)
    def to_pickle(cls, *args, **kwargs):
        # Pure delegation to the factory held in `cls.__factory`. No docstring is
        # written here: the `_inherit_docstrings` decorator above presumably
        # supplies one from `BaseFactory._to_pickle` (NOTE(review): confirm).
        return cls.__factory._to_pickle(*args, **kwargs)

    @classmethod
    @_inherit_docstrings(factories.BaseFactory._to_csv)
    def to_csv(cls, *args, **kwargs):
        # Pure delegation to the factory held in `cls.__factory`. No docstring is
        # written here: the `_inherit_docstrings` decorator above presumably
        # supplies one from `BaseFactory._to_csv` (NOTE(review): confirm).
        return cls.__factory._to_csv(*args, **kwargs)


# Register `FactoryDispatcher._update_factory` as a subscriber of both config
# parameters (presumably so it is invoked when either value changes — confirm
# against `Parameter.subscribe`).
Engine.subscribe(FactoryDispatcher._update_factory)
Backend.subscribe(FactoryDispatcher._update_factory)
Beispiel #12
0
import modin.pandas as pd

# Partition count: use `modin.config.NPartitions` when it can be imported,
# otherwise fall back to the `DEFAULT_NPARTITIONS` attribute of `modin.pandas`.
try:
    from modin.config import NPartitions

    NPARTITIONS = NPartitions.get()
except ImportError:
    NPARTITIONS = pd.DEFAULT_NPARTITIONS

# Benchmark settings: read them from `modin.config` when all four variables
# can be imported, otherwise from the corresponding environment variables.
try:
    from modin.config import TestDatasetSize, AsvImplementation, Engine, Backend

    ASV_USE_IMPL = AsvImplementation.get()
    # A falsy dataset size falls back to "Small".
    ASV_DATASET_SIZE = TestDatasetSize.get() or "Small"
    ASV_USE_ENGINE = Engine.get()
    ASV_USE_BACKEND = Backend.get()
except ImportError:
    # The same benchmarking code can be run for different versions of Modin, so in
    # case of an error importing important variables, we'll just use predefined values
    ASV_USE_IMPL = os.environ.get("MODIN_ASV_USE_IMPL", "modin")
    ASV_DATASET_SIZE = os.environ.get("MODIN_TEST_DATASET_SIZE", "Small")
    ASV_USE_ENGINE = os.environ.get("MODIN_ENGINE", "Ray")
    ASV_USE_BACKEND = os.environ.get("MODIN_BACKEND", "Pandas")

# Normalize to lower case so the checks below are case-insensitive.
ASV_USE_IMPL = ASV_USE_IMPL.lower()
ASV_DATASET_SIZE = ASV_DATASET_SIZE.lower()
ASV_USE_ENGINE = ASV_USE_ENGINE.lower()
ASV_USE_BACKEND = ASV_USE_BACKEND.lower()

# Fail at import time on an unsupported implementation or dataset size.
assert ASV_USE_IMPL in ("modin", "pandas")
assert ASV_DATASET_SIZE in ("big", "small")
Beispiel #13
0
# Unless required by applicable law or agreed to in writing, software distributed under
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import os
import pandas
import numpy as np
import pyarrow
import pytest

from modin.config import IsExperimental, Engine, Backend

# Configure Modin before `modin.pandas` is imported below: experimental mode
# on, engine "ray", backend "omnisci".
IsExperimental.put(True)
Engine.put("ray")
Backend.put("omnisci")

import modin.pandas as pd
from modin.pandas.test.utils import (
    df_equals,
    bool_arg_values,
    to_pandas,
    test_data_values,
    test_data_keys,
    generate_multiindex,
    eval_general,
)


def set_execution_mode(frame, mode, recursive=False):
    if isinstance(frame, (pd.Series, pd.DataFrame)):
Beispiel #14
0
def initialize_ray(
    override_is_cluster=False,
    override_redis_address: str = None,
    override_redis_password: str = None,
):
    """
    Initialize Ray based on parameters, config variables and internal defaults.

    Parameters
    ----------
    override_is_cluster : bool, optional
        Whether to override the detection of Modin being run in a cluster
        and always assume this runs on cluster head node.
        This also overrides Ray worker detection and always runs the function,
        not only from main thread.
        If not specified, $MODIN_RAY_CLUSTER env variable is used.
    override_redis_address : str, optional
        What Redis address to connect to when running in Ray cluster.
        If not specified, $MODIN_REDIS_ADDRESS is used.
    override_redis_password : str, optional
        What password to use when connecting to Redis.
        If not specified, a new random one is generated.

    Notes
    -----
    Worker setup (``_move_stdlib_ahead_of_site_packages``, ``_import_pandas``)
    and GPU manager creation happen only when this call actually performed the
    initialization. ``NPartitions`` is updated on every call: to the cluster's
    GPU count for the Cudf backend, to its CPU count otherwise.
    """
    import ray

    # Initialize when Ray is not running yet, or when the caller forces cluster mode.
    if not ray.is_initialized() or override_is_cluster:
        import secrets

        cluster = override_is_cluster or IsRayCluster.get()
        redis_address = override_redis_address or RayRedisAddress.get()
        # Without an explicit password, generate a random 32-byte hex token.
        redis_password = override_redis_password or secrets.token_hex(32)

        if cluster:
            # We only start ray in a cluster setting for the head node.
            ray.init(
                address=redis_address or "auto",
                include_dashboard=False,
                ignore_reinit_error=True,
                _redis_password=redis_password,
                # 100 is above logging.CRITICAL (50); presumably this silences
                # Ray's log output — confirm.
                logging_level=100,
            )
        else:
            from modin.error_message import ErrorMessage

            # This string is intentionally formatted this way. We want it indented in
            # the warning message.
            ErrorMessage.not_initialized(
                "Ray",
                """
    import ray
    ray.init()
""",
            )
            object_store_memory = Memory.get()
            plasma_directory = RayPlasmaDir.get()
            if IsOutOfCore.get():
                # Out-of-core mode: default the plasma directory to the system temp dir.
                if plasma_directory is None:
                    from tempfile import gettempdir

                    plasma_directory = gettempdir()
                # We may have already set the memory from the environment variable, we don't
                # want to overwrite that value if we have.
                if object_store_memory is None:
                    # Round down to the nearest Gigabyte.
                    try:
                        system_memory = ray._private.utils.get_system_memory()
                    except AttributeError:  # Compatibility with Ray <= 1.2
                        system_memory = ray.utils.get_system_memory()
                    mem_bytes = system_memory // 10**9 * 10**9
                    # Default to 8x memory for out of core
                    object_store_memory = 8 * mem_bytes
            # In case anything failed above, we can still improve the memory for Modin.
            if object_store_memory is None:
                # Round down to the nearest Gigabyte.
                try:
                    system_memory = ray._private.utils.get_system_memory()
                except AttributeError:  # Compatibility with Ray <= 1.2
                    system_memory = ray.utils.get_system_memory()
                # Take 60% of system memory, rounded down to a whole multiple of 10**9 bytes.
                object_store_memory = int(0.6 * system_memory // 10**9 * 10**9)
                # A result of 0 means 60% of system memory was below 10**9 bytes;
                # in that case just use the default in ray.
                if object_store_memory == 0:
                    object_store_memory = None
            else:
                object_store_memory = int(object_store_memory)

            ray_init_kwargs = {
                "num_cpus": CpuCount.get(),
                "num_gpus": GpuCount.get(),
                "include_dashboard": False,
                "ignore_reinit_error": True,
                "_plasma_directory": plasma_directory,
                "object_store_memory": object_store_memory,
                "address": redis_address,
                "_redis_password": redis_password,
                "logging_level": 100,
                "_memory": object_store_memory,
                "_lru_evict": True,
            }
            from packaging import version

            # setting of `_lru_evict` parameter raises DeprecationWarning since ray 2.0.0.dev0
            if version.parse(ray.__version__) >= version.parse("2.0.0.dev0"):
                ray_init_kwargs.pop("_lru_evict")
            ray.init(**ray_init_kwargs)

        # Apply the stdlib-path helper locally first, then on every worker.
        _move_stdlib_ahead_of_site_packages()
        ray.worker.global_worker.run_function_on_all_workers(
            _move_stdlib_ahead_of_site_packages)

        ray.worker.global_worker.run_function_on_all_workers(_import_pandas)

        if Backend.get() == "Cudf":
            from modin.engines.ray.cudf_on_ray.frame.gpu_manager import GPUManager
            from modin.engines.ray.cudf_on_ray.frame.partition_manager import (
                GPU_MANAGERS, )

            # Check that GPU_MANAGERS is empty because _update_engine can be called multiple times
            if not GPU_MANAGERS:
                # One remote GPUManager per configured GPU index.
                for i in range(GpuCount.get()):
                    GPU_MANAGERS.append(GPUManager.remote(i))

    # Runs on every call, including when Ray was already initialized.
    num_cpus = int(ray.cluster_resources()["CPU"])
    num_gpus = int(ray.cluster_resources().get("GPU", 0))
    # Partition count follows the GPU count for Cudf and the CPU count otherwise.
    if Backend.get() == "Cudf":
        NPartitions._put(num_gpus)
    else:
        NPartitions._put(num_cpus)