Example #1
0
def simulate_cloud(request):
    """Fixture that optionally runs the test under a simulated local cloud.

    Reads the ``--simulate-cloud`` pytest option; ``"off"`` is a no-op,
    while ``"normal"``/``"experimental"`` spin up a local cluster and patch
    modin's pandas comparison helpers so they execute remotely.
    """
    mode = request.config.getoption("--simulate-cloud").lower()
    if mode == "off":
        # Nothing to set up — just run the test as-is.
        yield
        return

    if mode not in ("normal", "experimental"):
        raise ValueError(f"Unsupported --simulate-cloud mode: {mode}")
    assert IsExperimental.get(), "Simulated cloud must be started in experimental mode"

    from modin.experimental.cloud import create_cluster, get_connection
    import modin.pandas.test.utils

    # Comparison helpers that must be patched to run on the remote side.
    helper_names = (
        "assert_index_equal",
        "assert_series_equal",
        "assert_frame_equal",
        "assert_extension_array_equal",
        "assert_empty_frame_equal",
    )
    with create_cluster("local", cluster_type="local"):
        get_connection().teleport(set_experimental_env)(mode)
        targets = [(modin.pandas.test.utils, name) for name in helper_names]
        with Patcher(get_connection(), *targets):
            yield
Example #2
0
def _update_engine(publisher: Parameter):
    """Initialize (or re-initialize) the execution engine named by *publisher*.

    Dispatches on the engine name: starts Ray or Dask locally, or configures
    a remote "Cloudray"/"Cloudpython" connection. Raises ImportError for an
    unrecognized engine unless it is listed in ``_NOINIT_ENGINES``.
    """
    global dask_client
    from modin.config import Backend, CpuCount

    if publisher.get() == "Ray":
        from modin.engines.ray.utils import initialize_ray

        # With OmniSci backend there is only a single worker per node
        # and we allow it to work on all cores.
        if Backend.get() == "Omnisci":
            CpuCount.put(1)
            os.environ["OMP_NUM_THREADS"] = str(multiprocessing.cpu_count())
        # Ray only needs to be started once per process.
        if _is_first_update.get("Ray", True):
            initialize_ray()

    elif publisher.get() == "Dask":
        if _is_first_update.get("Dask", True):
            from modin.engines.dask.utils import initialize_dask

            initialize_dask()
    elif publisher.get() == "Cloudray":
        from modin.experimental.cloud import get_connection

        conn = get_connection()
        if _is_first_update.get("Cloudray", True):

            # Executed on the remote node: set the backend there and start Ray.
            @conn.teleport
            def init_remote_ray(partition):
                from ray import ray_constants
                import modin
                from modin.engines.ray.utils import initialize_ray

                modin.set_backends("Ray", partition)
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=
                    f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.
                    REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray(Backend.get())
            # import FactoryDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.data_management.factories.dispatcher  # noqa: F401
        else:
            # Connection already initialized; just switch the remote backend.
            get_connection().modules["modin"].set_backends(
                "Ray", Backend.get())
    elif publisher.get() == "Cloudpython":
        from modin.experimental.cloud import get_connection

        get_connection().modules["modin"].set_backends("Python")

    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(
            publisher.get()))

    # Remember that this engine has been initialized at least once.
    _is_first_update[publisher.get()] = False
Example #3
0
        def __update_engine(self, _):
            """Swap the wrapped numpy module whenever the engine changes.

            Uses the remote cluster's numpy when a remote engine is active;
            otherwise restores the local one.
            """
            if Engine.get() not in REMOTE_ENGINES:
                # Local engine: fall back to the local numpy.
                self.__swap_numpy()
                return

            from modin.experimental.cloud import get_connection

            self.__swap_numpy(get_connection().modules["numpy"])
Example #4
0
        def __update_engine(self, _):
            """Swap the wrapped numpy module whenever the engine changes.

            Uses the cluster-side numpy while on the "Cloudray" engine;
            otherwise restores the local one.
            """
            on_cloud = execution_engine.get() == "Cloudray"
            if on_cloud:
                from modin.experimental.cloud import get_connection

                remote_numpy = get_connection().modules["numpy"]
                self.__swap_numpy(remote_numpy)
            else:
                self.__swap_numpy()
Example #5
0
def simulate_cloud(request):
    """Fixture that optionally runs the test under a simulated local cloud.

    ``--simulate-cloud off`` leaves the test untouched; ``normal`` or
    ``experimental`` start a local cluster and patch pandas' comparison
    helpers to execute remotely.
    """
    mode = request.config.getoption("--simulate-cloud").lower()
    if mode == "off":
        # No simulation requested — run the test directly.
        yield
        return

    if mode not in ("normal", "experimental"):
        raise ValueError(f"Unsupported --simulate-cloud mode: {mode}")
    assert IsExperimental.get(), "Simulated cloud must be started in experimental mode"

    from modin.experimental.cloud import create_cluster, get_connection
    import pandas._testing
    import pandas._libs.testing as cyx_testing

    # (module, attribute) pairs whose helpers must run on the remote side.
    to_patch = (
        (pandas._testing, "assert_class_equal"),
        (cyx_testing, "assert_almost_equal"),
    )
    with create_cluster("local", cluster_type="local"):
        get_connection().teleport(set_experimental_env)(mode)
        with Patcher(get_connection(), *to_patch):
            yield
Example #6
0
def simulate_cloud(request):
    """Fixture that optionally runs the test under a simulated local cloud.

    ``--simulate-cloud off`` is a no-op; ``normal``/``experimental`` force
    experimental mode locally, spawn a local cluster, and propagate the mode
    to the remote side.
    """
    mode = request.config.getoption("--simulate-cloud").lower()
    if mode == "off":
        yield
        return
    if mode not in ("normal", "experimental"):
        raise ValueError(f"Unsupported --simulate-cloud mode: {mode}")
    os.environ["MODIN_EXPERIMENTAL"] = "True"

    from modin.experimental.cloud import create_cluster, get_connection

    def set_env(mode):
        # Runs on the remote node, hence the local import.
        import os

        os.environ["MODIN_EXPERIMENTAL"] = (
            "True" if mode == "experimental" else "False"
        )

    with create_cluster("local", __spawner__="local"):
        get_connection().teleport(set_env)(mode)
        yield
Example #7
0
    def prepare(cls):
        """Build an IO class that proxies ``read_*`` calls through the cloud connection.

        Wraps the remote factory's ``io_cls`` in a facade that delivers
        reader arguments to the remote side before invoking the reader.
        Stores the facade on ``cls.io_cls``.
        """
        # query_compiler import is needed so remote PandasQueryCompiler
        # has an imported local counterpart;
        # if there isn't such counterpart rpyc generates some bogus
        # class type which raises TypeError()
        # upon checking its isinstance() or issubclass()
        import modin.backends.pandas.query_compiler  # noqa: F401
        from modin.experimental.cloud import get_connection

        # import a numpy overrider if it wasn't already imported
        import modin.experimental.pandas.numpy_wrap  # noqa: F401

        class WrappedIO:
            # Facade over the remote IO class: reader methods have their
            # args/kwargs delivered to the remote side first.
            def __init__(self, conn, factory):
                self.__conn = conn
                # Resolve the factory's counterpart module/class on the remote node.
                remote_factory = getattr(conn.modules[factory.__module__],
                                         factory.__name__)
                remote_factory.prepare()
                self.__io_cls = remote_factory.io_cls
                # Names of all reader methods declared directly on BaseIO.
                self.__reads = {
                    name
                    for name in BaseIO.__dict__ if name.startswith("read_")
                }
                # Cache of per-name wrapper functions built lazily below.
                self.__wrappers = {}

            def __getattr__(self, name):
                if name in self.__reads:
                    try:
                        wrap = self.__wrappers[name]
                    except KeyError:

                        # Build and cache a wrapper that ships args/kwargs to
                        # the remote side before calling the original reader.
                        def wrap(*a,
                                 _original=getattr(self.__io_cls, name),
                                 **kw):
                            a, kw = self.__conn.deliver(a, kw)
                            return _original(*a, **kw)

                        self.__wrappers[name] = wrap
                else:
                    # Non-reader attributes are fetched from the remote IO
                    # class directly, without argument delivery.
                    wrap = getattr(self.__io_cls, name)
                return wrap

        cls.io_cls = WrappedIO(get_connection(), cls.wrapped_factory)
Example #8
0
def _update_engine(publisher: Parameter):
    """Initialize the engine named by *publisher* and refresh partition defaults.

    Starts Ray/Dask locally or configures a remote Cloudray/Cloudpython
    connection, then recomputes ``DEFAULT_NPARTITIONS`` from the detected
    CPU count. Raises ImportError for an unknown engine not listed in
    ``_NOINIT_ENGINES``.
    """
    global DEFAULT_NPARTITIONS, dask_client, num_cpus
    from modin.config import Backend, CpuCount

    if publisher.get() == "Ray":
        import ray
        from modin.engines.ray.utils import initialize_ray

        # With OmniSci backend there is only a single worker per node
        # and we allow it to work on all cores.
        if Backend.get() == "Omnisci":
            CpuCount.put(1)
            os.environ["OMP_NUM_THREADS"] = str(multiprocessing.cpu_count())
        if _is_first_update.get("Ray", True):
            initialize_ray()
        num_cpus = ray.cluster_resources()["CPU"]
    elif publisher.get() == "Dask":  # pragma: no cover
        from distributed.client import get_client

        # Only attach/create a Dask client once, and only from the main thread.
        if threading.current_thread(
        ).name == "MainThread" and _is_first_update.get("Dask", True):
            import warnings

            warnings.warn("The Dask Engine for Modin is experimental.")

            try:
                # Reuse an already running client when one exists.
                dask_client = get_client()
            except ValueError:
                from distributed import Client

                dask_client = Client(n_workers=CpuCount.get())

    elif publisher.get() == "Cloudray":
        from modin.experimental.cloud import get_connection

        conn = get_connection()
        remote_ray = conn.modules["ray"]
        if _is_first_update.get("Cloudray", True):

            # Executed on the remote node: set the backend there and start Ray.
            @conn.teleport
            def init_remote_ray(partition):
                from ray import ray_constants
                import modin
                from modin.engines.ray.utils import initialize_ray

                modin.set_backends("Ray", partition)
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=
                    f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.
                    REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray(Backend.get())
            # import EngineDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.data_management.factories.dispatcher  # noqa: F401
        else:
            # Connection already initialized; just switch the remote backend.
            get_connection().modules["modin"].set_backends(
                "Ray", Backend.get())

        num_cpus = remote_ray.cluster_resources()["CPU"]
    elif publisher.get() == "Cloudpython":
        from modin.experimental.cloud import get_connection

        get_connection().modules["modin"].set_backends("Python")

    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(
            publisher.get()))

    _is_first_update[publisher.get()] = False
    # Partition count follows the CPU count but never drops below 4.
    DEFAULT_NPARTITIONS = max(4, int(num_cpus))
Example #9
0
def _update_engine(publisher: Publisher):
    """Initialize the engine named by *publisher* and refresh partition defaults.

    Starts Ray or Dask locally, or configures the remote "Cloudray"
    connection, then recomputes ``DEFAULT_NPARTITIONS`` from the detected
    CPU count. Raises ImportError for an unknown engine not listed in
    ``_NOINIT_ENGINES``.
    """
    global DEFAULT_NPARTITIONS, dask_client, num_cpus

    if publisher.get() == "Ray":
        import ray
        from modin.engines.ray.utils import initialize_ray

        if _is_first_update.get("Ray", True):
            initialize_ray()
        num_cpus = ray.cluster_resources()["CPU"]
    elif publisher.get() == "Dask":  # pragma: no cover
        from distributed.client import get_client

        # Only attach/create a Dask client once, and only from the main thread.
        if threading.current_thread(
        ).name == "MainThread" and _is_first_update.get("Dask", True):
            import warnings

            warnings.warn("The Dask Engine for Modin is experimental.")

            try:
                # Reuse an already running client when one exists.
                dask_client = get_client()
            except ValueError:
                from distributed import Client

                # No running client — spin up a local one sized by
                # MODIN_CPUS (or the machine's CPU count).
                num_cpus = (os.environ.get("MODIN_CPUS", None)
                            or multiprocessing.cpu_count())
                dask_client = Client(n_workers=int(num_cpus))

    elif publisher.get() == "Cloudray":
        from modin.experimental.cloud import get_connection
        import rpyc

        conn: rpyc.ClassicService = get_connection()
        remote_ray = conn.modules["ray"]
        if _is_first_update.get("Cloudray", True):

            # Executed on the remote node: set the backend there and start Ray.
            @conn.teleport
            def init_remote_ray():
                from ray import ray_constants
                import modin
                from modin.engines.ray.utils import initialize_ray

                modin.set_backends("Ray")
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=
                    f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.
                    REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray()
            # import EngineDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.data_management.dispatcher  # noqa: F401

        num_cpus = remote_ray.cluster_resources()["CPU"]

    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(
            publisher.get()))

    _is_first_update[publisher.get()] = False
    # Partition count follows the CPU count but never drops below 4.
    DEFAULT_NPARTITIONS = max(4, int(num_cpus))
Example #10
0
    "aws_credentials",
    cluster_name="rayscale-test",
    region="eu-north-1",
    zone="eu-north-1b",
    image="ami-00e1e82d7d4ca80d3",
    **cluster_params,
)
with test_cluster:
    data_file = "https://modin-datasets.s3.amazonaws.com/trips_data.csv"
    if USE_OMNISCI:
        # Workaround for GH#2099
        from modin.experimental.cloud import get_connection

        data_file, remote_data_file = "/tmp/trips_data.csv", data_file
        get_connection().modules["subprocess"].check_call(
            ["wget", remote_data_file, "-O", data_file]
        )

        # Omniscripts check for files being present when given local file paths,
        # so replace "glob" there with a remote one
        import utils.utils

        utils.utils.glob = get_connection().modules["glob"]

    parameters = {
        "data_file": data_file,
        # "data_file": "s3://modin-datasets/trips_data.csv",
        "dfiles_num": 1,
        "validation": False,
        "no_ibis": True,
        "no_pandas": False,
Example #11
0
import modin.experimental.pandas as pd  # noqa: F401
from modin.experimental.cloud import create_cluster, get_connection

from mortgage import run_benchmark
from mortgage.mortgage_pandas import etl_pandas

# Spin up an AWS-backed cluster for the mortgage benchmark.
test_cluster = create_cluster(
    "aws",
    "aws_credentials",
    cluster_name="rayscale-test",
    region="eu-north-1",
    zone="eu-north-1b",
    image="ami-00e1e82d7d4ca80d3",
)
with test_cluster:
    conn = get_connection()
    # Make the ETL step use the cluster-side numpy module instead of the
    # local one.
    np = conn.modules["numpy"]
    etl_pandas.__globals__["np"] = np

    # Benchmark configuration (consumed by run_benchmark).
    parameters = {
        "data_file": "https://modin-datasets.s3.amazonaws.com/mortgage",
        # "data_file": "s3://modin-datasets/mortgage",
        "dfiles_num": 1,
        "no_ml": True,
        "validation": False,
        "no_ibis": True,
        "no_pandas": False,
        "pandas_mode": "Modin_on_ray",
        "ray_tmpdir": "/tmp",
        "ray_memory": 1024 * 1024 * 1024,
    }
Example #12
0
    cluster_name="rayscale-test",
    region="eu-central-1",
    zone="eu-central-1b",
    image="ami-05f7491af5eef733a",
    **cluster_params,
)
with test_cluster:
    if USE_OMNISCI:
        from modin.experimental.cloud import get_connection

        # We should move omniscripts trigger in remote conext
        # https://github.com/intel-ai/omniscripts/blob/7d4599bcacf51de876952c658048571d32275ac1/taxi/taxibench_pandas_ibis.py#L482
        import modin.experimental.engines.omnisci_on_native.frame.omnisci_worker

        OmnisciServer = (get_connection().modules[
            "modin.experimental.engines.omnisci_on_native.frame.omnisci_worker"]
                         .OmnisciServer)
        modin.experimental.engines.omnisci_on_native.frame.omnisci_worker.OmnisciServer = (
            OmnisciServer)

        # Omniscripts check for files being present when given local file paths,
        # so replace "glob" there with a remote one
        import utils.utils

        utils.utils.glob = get_connection().modules["glob"]

    parameters = {
        "data_file": "s3://modin-datasets/cloud/taxi/trips_xaa.csv",
        "dfiles_num": 1,
        "validation": False,
        "no_ibis": True,
Example #13
0
def _update_engine(publisher: Parameter):
    """Initialize (or re-initialize) the execution engine named by *publisher*.

    Also reconciles the engine with the storage format: Omnisci storage
    implies the "Native" engine (and vice versa) when the other setting was
    left at its default. Raises ImportError for an unknown engine not
    listed in ``_NOINIT_ENGINES``.
    """
    global dask_client
    from modin.config import StorageFormat, CpuCount
    from modin.config.envvars import IsExperimental
    from modin.config.pubsub import ValueSource

    # Omnisci storage with a defaulted engine forces the Native engine;
    # a Native engine with defaulted storage forces Omnisci. Both imply
    # experimental mode.
    if (StorageFormat.get() == "Omnisci"
            and publisher.get_value_source() == ValueSource.DEFAULT):
        publisher.put("Native")
        IsExperimental.put(True)
    elif (publisher.get() == "Native"
          and StorageFormat.get_value_source() == ValueSource.DEFAULT):
        StorageFormat.put("Omnisci")
        IsExperimental.put(True)
    elif publisher.get() == "Ray":
        if _is_first_update.get("Ray", True):
            from modin.core.execution.ray.common.utils import initialize_ray

            initialize_ray()
    elif publisher.get() == "Native":
        # With OmniSci storage format there is only a single worker per node
        # and we allow it to work on all cores.
        if StorageFormat.get() == "Omnisci":
            os.environ["OMP_NUM_THREADS"] = str(CpuCount.get())
        else:
            raise ValueError(
                f"Storage format should be 'Omnisci' with 'Native' engine, but provided {StorageFormat.get()}."
            )
    elif publisher.get() == "Dask":
        if _is_first_update.get("Dask", True):
            from modin.core.execution.dask.common.utils import initialize_dask

            initialize_dask()
    elif publisher.get() == "Cloudray":
        from modin.experimental.cloud import get_connection

        conn = get_connection()
        if _is_first_update.get("Cloudray", True):

            # Executed on the remote node: set execution there and start Ray.
            @conn.teleport
            def init_remote_ray(partition):
                from ray import ray_constants
                import modin
                from modin.core.execution.ray.common.utils import initialize_ray

                modin.set_execution("Ray", partition)
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=
                    f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.
                    REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray(StorageFormat.get())
            # import FactoryDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.core.execution.dispatching.factories.dispatcher  # noqa: F401
        else:
            # Connection already initialized; just switch the remote execution.
            get_connection().modules["modin"].set_execution(
                "Ray", StorageFormat.get())
    elif publisher.get() == "Cloudpython":
        from modin.experimental.cloud import get_connection

        get_connection().modules["modin"].set_execution("Python")
    elif publisher.get() == "Cloudnative":
        from modin.experimental.cloud import get_connection

        assert (
            StorageFormat.get() == "Omnisci"
        ), f"Storage format should be 'Omnisci' with 'Cloudnative' engine, but provided {StorageFormat.get()}."
        get_connection().modules["modin"].set_execution("Native", "OmniSci")

    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(
            publisher.get()))

    # Remember that this engine has been initialized at least once.
    _is_first_update[publisher.get()] = False