def test_factory_switch():
    """
    Check that ``FactoryDispatcher`` follows ``Engine`` and ``StorageFormat`` changes.

    Each configuration change is undone in a ``finally`` clause so that a failing
    assertion cannot leak the "Test" engine / storage format into other tests.
    """
    Engine.put("Test")
    try:
        assert FactoryDispatcher.get_factory() == PandasOnTestFactory
        assert FactoryDispatcher.get_factory().io_cls == "Foo"
    finally:
        Engine.put("Python")  # revert engine to default

    StorageFormat.put("Test")
    try:
        assert FactoryDispatcher.get_factory() == TestOnPythonFactory
        assert FactoryDispatcher.get_factory().io_cls == "Bar"
    finally:
        StorageFormat.put("Pandas")  # revert storage format to default
def _update_factory(cls, _):
    """
    Select the factory matching the current execution and prepare it.

    Parameters
    ----------
    _ : object
        Present for signature compatibility only; the value is ignored.

    Raises
    ------
    FactoryNotFoundError
        If no factory with the computed name exists and experimental mode
        is off, or if the computed name is the retired
        ``ExperimentalOmnisciOnRayFactory``.
    """
    target = get_current_execution() + "Factory"
    try:
        cls.__factory = getattr(factories, target)
    except AttributeError:
        if target == "ExperimentalOmnisciOnRayFactory":
            raise FactoryNotFoundError(
                "OmniSci storage format no longer needs Ray engine; "
                "please specify MODIN_ENGINE='native'"
            )
        if IsExperimental.get():
            # Only experimental mode tolerates a missing factory: a stub
            # carrying the missing name is installed in its place.
            cls.__factory = StubFactory.set_failing_name(target)
            return
        if hasattr(factories, "Experimental" + target):
            template = (
                "{0} on {1} is only accessible through the experimental API.\nRun "
                "`import modin.experimental.pandas as pd` to use {0} on {1}."
            )
        else:
            template = (
                "Cannot find a factory for partition '{}' and execution engine '{}'. "
                "Potential reason might be incorrect environment variable value for "
                f"{StorageFormat.varname} or {Engine.varname}"
            )
        raise FactoryNotFoundError(
            template.format(StorageFormat.get(), Engine.get())
        )
    # Reached only when the lookup succeeded; every failure path above
    # either raises or returns.
    cls.__factory.prepare()
def read(cls, *args, **kwargs):
    """
    Read data via ``cls._read`` and post-process the resulting query compiler.

    Parameters
    ----------
    *args : iterable
        Positional arguments forwarded to ``_read``.
    **kwargs : dict
        Keyword arguments forwarded to ``_read``.

    Returns
    -------
    query_compiler : BaseQueryCompiler
        The query compiler produced by ``_read``; if it exposes ``dtypes``
        containing categorical dtypes, the result of ``astype`` on those
        columns is returned instead.

    Raises
    ------
    NotImplementedError
        If the storage format is neither "Pandas" nor "Cudf".
    """
    query_compiler = cls._read(*args, **kwargs)
    # TODO (devin-petersohn): Make this section more general for non-pandas kernel
    # implementations.
    storage_format = StorageFormat.get()
    if storage_format == "Pandas":
        import pandas as kernel_lib
    elif storage_format == "Cudf":
        import cudf as kernel_lib
    else:
        raise NotImplementedError("FIXME")

    if not hasattr(query_compiler, "dtypes"):
        return query_compiler
    categorical_cls = kernel_lib.CategoricalDtype
    if not any(
        isinstance(dtype, categorical_cls) for dtype in query_compiler.dtypes
    ):
        return query_compiler

    # Collect the categorical columns and pass them through ``astype``.
    dtypes = query_compiler.dtypes
    categorical_columns = {}
    for label in dtypes.index:
        if isinstance(dtypes[label], categorical_cls):
            categorical_columns[label] = dtypes[label]
    return query_compiler.astype(categorical_columns)
with warns_that_defaulting_to_pandas(): modin_df.aggregate({ modin_df.columns[0]: "sum", modin_df.columns[1]: "mean" }) with warns_that_defaulting_to_pandas(): modin_df.aggregate("cumproduct") with pytest.raises(ValueError): modin_df.aggregate("NOT_EXISTS") @pytest.mark.xfail( StorageFormat.get() == "Pandas", reason="DataFrame.apply(dict) raises an exception because of a bug in its" + "implementation for pandas storage format, this prevents us from catching the desired" + "exception. You can track this bug at:" + "https://github.com/modin-project/modin/issues/3221", ) @pytest.mark.parametrize( "func", agg_func_values + agg_func_except_values, ids=agg_func_keys + agg_func_except_keys, ) def test_apply_key_error(func): if not (is_list_like(func) or callable(func) or isinstance(func, str)): pytest.xfail( reason="Because index materialization is expensive Modin first" +
# NOTE(review): the wrappers below read ``cls.__factory`` and are referenced as
# ``FactoryDispatcher.<name>`` further down, so they presumably live in the body
# of ``FactoryDispatcher`` -- restore class-body indentation when splicing back.
# Each wrapper only forwards its arguments to the same-named private method of
# the currently selected factory. They deliberately have no docstring of their
# own: the ``_inherit_docstrings`` decorator is given the factory method to
# take documentation from.
@classmethod
@_inherit_docstrings(factories.BaseFactory._to_pickle)
def to_pickle(cls, *args, **kwargs):
    return cls.__factory._to_pickle(*args, **kwargs)

@classmethod
@_inherit_docstrings(
    factories.ExperimentalPandasOnRayFactory._to_pickle_distributed)
def to_pickle_distributed(cls, *args, **kwargs):
    return cls.__factory._to_pickle_distributed(*args, **kwargs)

# Unlike its siblings, this wrapper accepts keyword arguments only.
@classmethod
@_inherit_docstrings(
    factories.ExperimentalPandasOnRayFactory._read_custom_text)
def read_custom_text(cls, **kwargs):
    return cls.__factory._read_custom_text(**kwargs)

@classmethod
@_inherit_docstrings(factories.BaseFactory._to_csv)
def to_csv(cls, *args, **kwargs):
    return cls.__factory._to_csv(*args, **kwargs)

@classmethod
@_inherit_docstrings(factories.BaseFactory._to_parquet)
def to_parquet(cls, *args, **kwargs):
    return cls.__factory._to_parquet(*args, **kwargs)


# Register ``_update_factory`` as a subscriber of both config parameters.
# NOTE(review): presumably the callback fires whenever either value changes,
# so the selected factory stays in sync -- confirm against ``subscribe``.
Engine.subscribe(FactoryDispatcher._update_factory)
StorageFormat.subscribe(FactoryDispatcher._update_factory)
def initialize_ray(
    override_is_cluster=False,
    override_redis_address: str = None,
    override_redis_password: str = None,
):
    """
    Initialize Ray based on parameters, ``modin.config`` variables and internal defaults.

    After Ray is up, the helper functions ``_move_stdlib_ahead_of_site_packages``
    and ``_import_pandas`` are scheduled on all workers and ``NPartitions`` is
    set from the cluster's CPU (or, for the "Cudf" storage format, GPU) count.

    Parameters
    ----------
    override_is_cluster : bool, default: False
        Whether to override the detection of Modin being run in a cluster
        and always assume this runs on cluster head node.
        This also overrides Ray worker detection and always runs the initialization
        function (runs from main thread only by default).
        If not specified, ``modin.config.IsRayCluster`` variable is used.
    override_redis_address : str, optional
        What Redis address to connect to when running in Ray cluster.
        If not specified, ``modin.config.RayRedisAddress`` is used.
    override_redis_password : str, optional
        What password to use when connecting to Redis.
        If not specified, ``modin.config.RayRedisPassword`` is used.
    """
    # Ray is (re)initialized only when it is not running yet, unless the caller
    # forces cluster mode.
    if not ray.is_initialized() or override_is_cluster:
        cluster = override_is_cluster or IsRayCluster.get()
        redis_address = override_redis_address or RayRedisAddress.get()
        # Password precedence: when no override was passed and the configured
        # password still has its default source, cluster mode uses Ray's own
        # default password; otherwise the override wins, then the config value.
        redis_password = (
            (ray.ray_constants.REDIS_DEFAULT_PASSWORD
             if cluster else RayRedisPassword.get())
            if override_redis_password is None
            and RayRedisPassword.get_value_source() == ValueSource.DEFAULT else
            override_redis_password or RayRedisPassword.get())
        if cluster:
            # We only start ray in a cluster setting for the head node.
            ray.init(
                address=redis_address or "auto",
                include_dashboard=False,
                ignore_reinit_error=True,
                _redis_password=redis_password,
            )
        else:
            from modin.error_message import ErrorMessage

            # This string is intentionally formatted this way. We want it indented in
            # the warning message.
            # NOTE(review): the literal below shows no line breaks or indentation
            # here, which contradicts the comment above -- confirm its layout.
            ErrorMessage.not_initialized(
                "Ray",
                """ import ray ray.init() """,
            )
            object_store_memory = Memory.get()
            # In case anything failed above, we can still improve the memory for Modin.
            if object_store_memory is None:
                virtual_memory = psutil.virtual_memory().total
                if sys.platform.startswith("linux"):
                    # On Linux the budget is based on the space available in
                    # /dev/shm (block size * available blocks), not on total RAM.
                    shm_fd = os.open("/dev/shm", os.O_RDONLY)
                    try:
                        shm_stats = os.fstatvfs(shm_fd)
                        system_memory = shm_stats.f_bsize * shm_stats.f_bavail
                        # Warn when /dev/shm offers less than ~half of RAM.
                        if system_memory / (virtual_memory / 2) < 0.99:
                            warnings.warn(
                                f"The size of /dev/shm is too small ({system_memory} bytes). The required size "
                                f"at least half of RAM ({virtual_memory // 2} bytes). Please, delete files in /dev/shm or "
                                "increase size of /dev/shm with --shm-size in Docker. Also, you can set "
                                "the required memory size for each Ray worker in bytes to MODIN_MEMORY environment variable."
                            )
                    finally:
                        # Always release the descriptor, even if fstatvfs fails.
                        os.close(shm_fd)
                else:
                    system_memory = virtual_memory
                # 60% of the measured memory, rounded down to a multiple of 1e9 bytes.
                object_store_memory = int(0.6 * system_memory // 1e9 * 1e9)
                # If the memory pool is smaller than 2GB, just use the default in ray.
                # NOTE(review): as written, the value is 0 only when 60% of the
                # measured memory is below 1e9 bytes -- confirm the intended threshold.
                if object_store_memory == 0:
                    object_store_memory = None
            else:
                object_store_memory = int(object_store_memory)

            ray_init_kwargs = {
                "num_cpus": CpuCount.get(),
                "num_gpus": GpuCount.get(),
                "include_dashboard": False,
                "ignore_reinit_error": True,
                "object_store_memory": object_store_memory,
                "_redis_password": redis_password,
                "_memory": object_store_memory,
            }
            ray.init(**ray_init_kwargs)

        if StorageFormat.get() == "Cudf":
            from modin.core.execution.ray.implementations.cudf_on_ray.frame.gpu_manager import (
                GPUManager,
            )
            from modin.core.execution.ray.implementations.cudf_on_ray.frame.partition_manager import (
                GPU_MANAGERS,
            )

            # Check that GPU_MANAGERS is empty because _update_engine can be called multiple times
            if not GPU_MANAGERS:
                # One remote GPUManager per configured GPU index.
                for i in range(GpuCount.get()):
                    GPU_MANAGERS.append(GPUManager.remote(i))
    # Apply the helper locally first, then schedule it (and the pandas import
    # helper) on every Ray worker.
    _move_stdlib_ahead_of_site_packages()
    ray.worker.global_worker.run_function_on_all_workers(
        _move_stdlib_ahead_of_site_packages)
    ray.worker.global_worker.run_function_on_all_workers(_import_pandas)
    # Derive the partition count from what the cluster actually reports.
    num_cpus = int(ray.cluster_resources()["CPU"])
    num_gpus = int(ray.cluster_resources().get("GPU", 0))
    if StorageFormat.get() == "Cudf":
        NPartitions._put(num_gpus)
    else:
        NPartitions._put(num_cpus)
import modin.pandas as pd

# Number of partitions: prefer the config variable, fall back to the
# attribute exposed by ``modin.pandas`` when the import is unavailable.
try:
    from modin.config import NPartitions

    NPARTITIONS = NPartitions.get()
except ImportError:
    NPARTITIONS = pd.DEFAULT_NPARTITIONS

# Benchmark settings: read them from ``modin.config`` when available.
try:
    from modin.config import (
        TestDatasetSize,
        AsvImplementation,
        Engine,
        StorageFormat,
    )

    ASV_USE_IMPL = AsvImplementation.get()
    ASV_DATASET_SIZE = TestDatasetSize.get() or "Small"
    ASV_USE_ENGINE = Engine.get()
    ASV_USE_STORAGE_FORMAT = StorageFormat.get()
except ImportError:
    # The same benchmarking code can be run for different versions of Modin,
    # so if the config variables cannot be imported, read the environment
    # directly and fall back to predefined values.
    ASV_USE_IMPL = os.environ.get("MODIN_ASV_USE_IMPL", "modin")
    ASV_DATASET_SIZE = os.environ.get("MODIN_TEST_DATASET_SIZE", "Small")
    ASV_USE_ENGINE = os.environ.get("MODIN_ENGINE", "Ray")
    ASV_USE_STORAGE_FORMAT = os.environ.get("MODIN_STORAGE_FORMAT", "Pandas")

# Normalize every setting to lower case before validating it.
ASV_USE_IMPL, ASV_DATASET_SIZE, ASV_USE_ENGINE, ASV_USE_STORAGE_FORMAT = (
    ASV_USE_IMPL.lower(),
    ASV_DATASET_SIZE.lower(),
    ASV_USE_ENGINE.lower(),
    ASV_USE_STORAGE_FORMAT.lower(),
)

assert ASV_USE_IMPL in ("modin", "pandas")
assert ASV_DATASET_SIZE in ("big", "small")
def test_syncronous_mode():
    """Check that ``mean()`` runs while ``BenchmarkMode`` is enabled."""
    assert BenchmarkMode.get()
    # On Omnisci storage, transpose() defaults to Pandas.
    if StorageFormat.get() == "Omnisci":
        expectation = warns_that_defaulting_to_pandas()
    else:
        expectation = nullcontext()
    with expectation:
        pd.DataFrame(test_data_values[0]).mean()
def _update_engine(publisher: Parameter):
    """
    Perform engine-specific initialization for the engine named by `publisher`.

    Parameters
    ----------
    publisher : Parameter
        Config parameter whose current value names the execution engine.
        NOTE(review): presumably ``modin.config.Engine`` -- confirm at the
        subscription site.

    Raises
    ------
    ValueError
        If the engine is "Native" and the storage format is not "Omnisci".
    AssertionError
        If the engine is "Cloudnative" and the storage format is not "Omnisci".
    ImportError
        If the engine matches no branch and is not listed in ``_NOINIT_ENGINES``.
    """
    global dask_client
    from modin.config import StorageFormat, CpuCount
    from modin.config.envvars import IsExperimental
    from modin.config.pubsub import ValueSource

    # The first two branches reconcile defaults: an Omnisci storage format with
    # a defaulted engine switches the engine to "Native", and a "Native" engine
    # with a defaulted storage format switches the storage format to "Omnisci".
    # Both also turn experimental mode on.
    if (StorageFormat.get() == "Omnisci"
            and publisher.get_value_source() == ValueSource.DEFAULT):
        publisher.put("Native")
        IsExperimental.put(True)
    elif (publisher.get() == "Native"
          and StorageFormat.get_value_source() == ValueSource.DEFAULT):
        StorageFormat.put("Omnisci")
        IsExperimental.put(True)
    elif publisher.get() == "Ray":
        # Initialize Ray only the first time this engine is selected.
        if _is_first_update.get("Ray", True):
            from modin.core.execution.ray.common.utils import initialize_ray

            initialize_ray()
    elif publisher.get() == "Native":
        # With OmniSci storage format there is only a single worker per node
        # and we allow it to work on all cores.
        if StorageFormat.get() == "Omnisci":
            os.environ["OMP_NUM_THREADS"] = str(CpuCount.get())
        else:
            raise ValueError(
                f"Storage format should be 'Omnisci' with 'Native' engine, but provided {StorageFormat.get()}."
            )
    elif publisher.get() == "Dask":
        # Initialize Dask only the first time this engine is selected.
        if _is_first_update.get("Dask", True):
            from modin.core.execution.dask.common.utils import initialize_dask

            initialize_dask()
    elif publisher.get() == "Cloudray":
        from modin.experimental.cloud import get_connection

        conn = get_connection()
        if _is_first_update.get("Cloudray", True):

            # NOTE(review): ``conn.teleport`` presumably makes this function
            # execute on the remote side of the connection -- confirm.
            @conn.teleport
            def init_remote_ray(partition):
                from ray import ray_constants
                import modin
                from modin.core.execution.ray.common.utils import initialize_ray

                modin.set_execution("Ray", partition)
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=
                    f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.
                    REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray(StorageFormat.get())
            # import FactoryDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.core.execution.dispatching.factories.dispatcher  # noqa: F401
        else:
            # Later updates only re-select the execution through the
            # connection's "modin" module.
            get_connection().modules["modin"].set_execution(
                "Ray", StorageFormat.get())
    elif publisher.get() == "Cloudpython":
        from modin.experimental.cloud import get_connection

        get_connection().modules["modin"].set_execution("Python")
    elif publisher.get() == "Cloudnative":
        from modin.experimental.cloud import get_connection

        assert (
            StorageFormat.get() == "Omnisci"
        ), f"Storage format should be 'Omnisci' with 'Cloudnative' engine, but provided {StorageFormat.get()}."
        get_connection().modules["modin"].set_execution("Native", "OmniSci")
    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(
            publisher.get()))

    # Record that this engine has been processed once, so the one-time
    # initializers above are skipped on subsequent updates.
    _is_first_update[publisher.get()] = False