Beispiel #1
0
def _update_engine(publisher: Publisher):
    """Bring up the engine named by ``publisher`` and refresh the partition count.

    Args:
        publisher: Object whose ``get()`` returns the engine name. The names
            handled here are ``"Ray"``, ``"Dask"`` and ``"Python"``.

    Raises:
        ImportError: If the engine name is none of the three handled names.

    Side effects:
        Rebinds the module globals ``DEFAULT_NPARTITIONS`` and (for Dask)
        ``dask_client``, and records the engine in ``_is_first_update``.
    """
    global DEFAULT_NPARTITIONS, dask_client

    engine = publisher.get()
    # Falls back to the current partition count when no branch overrides it.
    cpu_count = DEFAULT_NPARTITIONS

    if engine == "Ray":
        import ray

        first_ray_update = _is_first_update.get("Ray", True)
        if first_ray_update:
            initialize_ray()
        cpu_count = ray.cluster_resources()["CPU"]
    elif engine == "Dask":  # pragma: no cover
        from distributed.client import get_client

        on_main_thread = threading.current_thread().name == "MainThread"
        if on_main_thread and _is_first_update.get("Dask", True):
            import warnings

            warnings.warn("The Dask Engine for Modin is experimental.")

        try:
            # Reuse a client that already exists, if any.
            dask_client = get_client()
        except ValueError:
            from distributed import Client

            # MODIN_CPUS (when set and non-empty) wins over the local CPU count.
            cpu_count = (os.environ.get("MODIN_CPUS")
                         or multiprocessing.cpu_count())
            dask_client = Client(n_workers=int(cpu_count))
    elif engine != "Python":
        raise ImportError("Unrecognized execution engine: {}.".format(engine))

    _is_first_update[engine] = False
    # Never go below four partitions.
    DEFAULT_NPARTITIONS = max(4, int(cpu_count))
Beispiel #2
0
def initialize_dask():
    """Make sure a Dask distributed client exists.

    If ``get_client()`` raises ``ValueError``, ``ErrorMessage.not_initialized``
    is called with the engine name and a code snippet, and a new ``Client``
    is started with ``CpuCount.get()`` workers. Nothing is returned.
    """
    from distributed.client import get_client

    try:
        get_client()
    except ValueError:
        from distributed import Client

        # The snippet is deliberately indented: it is passed on verbatim as
        # an indented code sample.
        init_snippet = """
    from distributed import Client

    client = Client()
"""
        ErrorMessage.not_initialized("Dask", init_snippet)
        worker_count = CpuCount.get()
        Client(n_workers=worker_count)
Beispiel #3
0
    def put(cls, obj):
        """Scatter ``obj`` through the current Dask client and wrap the result.

        Args:
            obj: The object handed to ``client.scatter`` (with ``hash=False``).

        Returns:
            An instance of ``cls`` constructed from the value returned by
            ``client.scatter``.
        """
        scattered = get_client().scatter(obj, hash=False)
        return cls(scattered)
Beispiel #4
0
    def apply(self, func, **kwargs):
        """Apply some callable function to the data in this partition.

        The pickled ``func`` and its ``kwargs`` are appended to a copy of this
        partition's call queue, and the whole queue is submitted to the Dask
        client together with ``self.future``.

        Note: ``kwargs`` are passed along as-is; only ``func`` is pickled here.

        Args:
            func: The callable to apply (may already be correctly formatted).
            **kwargs: Keyword arguments stored next to ``func`` in the queue.

        Returns:
            A new ``PandasOnDaskFramePartition`` built from element 0 of the
            submitted result, with element 1 passed as ``ip``.
        """
        func = pkl.dumps(func)
        call_queue = self.call_queue + [[func, kwargs]]
        # Look the client up once instead of once per submission.
        client = get_client()
        future = client.submit(apply_list_of_funcs,
                               call_queue,
                               self.future,
                               pure=False)
        # Bind ``i`` as a default argument so each lambda carries its own
        # index, even if it is serialized or run after the comprehension has
        # moved on (avoids the late-binding closure pitfall).
        futures = [
            client.submit(lambda l, i=i: l[i], future) for i in range(2)
        ]
        return PandasOnDaskFramePartition(futures[0], ip=futures[1])
Beispiel #5
0
def extract_ddf_partitions(ddf):
    """Return the mapping: worker -> [list of futures].

    The delayed partitions of ``ddf`` are computed on the current client and
    waited on; each resulting future is then filed under the first worker
    that ``client.who_has`` reports for its key.
    """
    client = get_client()
    parts = client.compute(ddf.to_delayed())
    wait(parts)

    part_by_key = {str(part.key): part for part in parts}
    worker_to_parts = defaultdict(list)
    for key, holders in client.who_has(parts).items():
        # Several workers may hold the same part; use the first one listed.
        worker_to_parts[first(holders)].append(part_by_key[key])
    return worker_to_parts
Beispiel #6
0
def test_from_partitions(axis):
    """Check that ``from_partitions`` matches ``pandas.concat`` on two frames.

    Two identical random frames are placed on the active engine (Ray or Dask).
    When ``axis`` is ``None`` the pair is wrapped in an outer list and the
    expected frame is concatenated along axis 1; otherwise the flat pair and
    the given ``axis`` are used.
    """
    data = np.random.randint(0, 100, size=(2**16, 2**8))
    df1 = pandas.DataFrame(data)
    df2 = pandas.DataFrame(data)
    concat_axis = 1 if axis is None else axis
    expected_df = pandas.concat([df1, df2], axis=concat_axis)

    nested = axis is None
    if Engine.get() == "Ray":
        refs = [ray.put(df1), ray.put(df2)]
        futures = [refs] if nested else refs
    if Engine.get() == "Dask":
        scattered = get_client().scatter([df1, df2], hash=False)
        futures = [scattered] if nested else scattered

    actual_df = from_partitions(futures, axis)
    df_equals(expected_df, actual_df)
Beispiel #7
0
 def deploy_axis_func(cls, axis, func, num_splits, kwargs,
                      maintain_partitioning, *partitions):
     """Submit ``PandasFrameAxisPartition.deploy_axis_func`` to the Dask client.

     Args:
         axis, func, num_splits, kwargs, maintain_partitioning: Forwarded
             unchanged, in this order, to the submitted function.
         *partitions: Forwarded as the trailing positional arguments.

     Returns:
         The single result future when ``num_splits == 1``; otherwise a list
         of ``num_splits`` futures, the i-th of which indexes element ``i``
         of the result.
     """
     client = get_client()
     axis_result = client.submit(
         PandasFrameAxisPartition.deploy_axis_func,
         axis,
         func,
         num_splits,
         kwargs,
         maintain_partitioning,
         *partitions,
         pure=False,
     )
     if num_splits == 1:
         return axis_result
     # We have to do this to split it back up. It is already split, but we need to
     # get futures for each.
     # ``i`` is bound as a default argument so every lambda keeps its own
     # index regardless of when it is serialized or executed.
     return [
         client.submit(lambda l, i=i: l[i], axis_result, pure=False)
         for i in range(num_splits)
     ]
Beispiel #8
0
def initialize_dask():
    """Obtain (or start) a Dask client and publish a default partition count.

    If ``get_client()`` raises ``ValueError``, ``ErrorMessage.not_initialized``
    is called with the engine name and a code snippet, and a new ``Client``
    is started with ``CpuCount.get()`` workers. In both cases the number of
    entries returned by ``client.ncores()`` is handed to
    ``NPartitions.put_if_default``.
    """
    from distributed.client import get_client

    try:
        client = get_client()
    except ValueError:
        from distributed import Client

        # The snippet is deliberately indented: it is passed on verbatim as
        # an indented code sample.
        init_snippet = """
    from distributed import Client

    client = Client()
"""
        ErrorMessage.not_initialized("Dask", init_snippet)
        client = Client(n_workers=CpuCount.get())

    # NOTE(review): presumably ``ncores()`` has one entry per worker, so this
    # counts workers rather than cores -- confirm against distributed's docs.
    NPartitions.put_if_default(len(client.ncores()))
Beispiel #9
0
 def deploy_func_between_two_axis_partitions(cls, axis, func, num_splits,
                                             len_of_left, other_shape,
                                             kwargs, *partitions):
     """Submit the two-axis-partition deploy function to the Dask client.

     Args:
         axis, func, num_splits, len_of_left, other_shape, kwargs: Forwarded
             unchanged, in this order, to
             ``PandasFrameAxisPartition.deploy_func_between_two_axis_partitions``.
         *partitions: Forwarded as the trailing positional arguments.

     Returns:
         A list of ``num_splits`` futures, the i-th of which indexes element
         ``i`` of the submitted result.
     """
     client = get_client()
     axis_result = client.submit(
         PandasFrameAxisPartition.deploy_func_between_two_axis_partitions,
         axis,
         func,
         num_splits,
         len_of_left,
         other_shape,
         kwargs,
         *partitions,
         pure=False,
     )
     # We have to do this to split it back up. It is already split, but we need to
     # get futures for each.
     # ``i`` is bound as a default argument so every lambda keeps its own
     # index regardless of when it is serialized or executed.
     return [
         client.submit(lambda l, i=i: l[i], axis_result, pure=False)
         for i in range(num_splits)
     ]
    def sample(
        self,
        label: str | None = None,
        *,
        client: Client | None = None,
        measure: str = "process",
        interval: float = 0.5,
    ):
        """Context manager that records memory usage in the cluster.

        The synchronous implementation is used for a synchronous client and
        the asynchronous one for an asynchronous client.

        The samples are recorded in ``self.samples[<label>]``.

        Parameters
        ==========
        label: str, optional
            Tag under which the samples are stored in the self.samples dict.
            Default: automatically generate a random label
        client: Client, optional
            Client used to connect to the scheduler.
            Default: use the global client
        measure: str, optional
            One of the measures from :class:`distributed.scheduler.MemoryState`.
            Default: sample process memory
        interval: float, optional
            Sampling interval, in seconds.
            Default: 0.5
        """
        if not client:
            from distributed.client import get_client

            client = get_client()

        # Pick the implementation that matches the client's mode, then
        # forward the same four arguments to it.
        sampler = (self._sample_async
                   if client.asynchronous else self._sample_sync)
        return sampler(label, client, measure, interval)
Beispiel #11
0
    def deploy_axis_func(cls, axis, func, num_splits, kwargs,
                         maintain_partitioning, *partitions):
        """Submit ``PandasFrameAxisPartition.deploy_axis_func`` to the Dask client.

        Args:
            axis, func, num_splits, kwargs, maintain_partitioning: Forwarded
                unchanged, in this order, to the submitted function.
                ``kwargs`` is also inspected for a ``"_lengths"`` entry.
            *partitions: Forwarded as the trailing positional arguments.

        Returns:
            A list of futures, the i-th of which indexes element ``i`` of the
            submitted result. The list has ``len(kwargs["_lengths"])`` entries
            when that value is present and non-empty, otherwise
            ``num_splits`` entries.
        """
        client = get_client()
        axis_result = client.submit(
            PandasFrameAxisPartition.deploy_axis_func,
            axis,
            func,
            num_splits,
            kwargs,
            maintain_partitioning,
            *partitions,
            pure=False,
        )

        lengths = kwargs.get("_lengths", None)
        result_num_splits = len(lengths) if lengths else num_splits

        # We have to do this to split it back up. It is already split, but we need to
        # get futures for each.
        # ``i`` is bound as a default argument so every lambda keeps its own
        # index regardless of when it is serialized or executed.
        return [
            client.submit(lambda l, i=i: l[i], axis_result, pure=False)
            for i in range(result_num_splits)
        ]
Beispiel #12
0
def _update_engine(publisher: Parameter):
    """Initialize the engine named by ``publisher`` and refresh ``DEFAULT_NPARTITIONS``.

    Args:
        publisher: Object whose ``get()`` returns the engine name. Branches
            exist for ``"Ray"``, ``"Dask"``, ``"Cloudray"`` and
            ``"Cloudpython"``; names in ``_NOINIT_ENGINES`` need no setup.

    Raises:
        ImportError: If the engine name matches no branch and is not in
            ``_NOINIT_ENGINES``.

    Side effects:
        May rebind the module globals ``DEFAULT_NPARTITIONS``, ``dask_client``
        and ``num_cpus``, and records the engine in ``_is_first_update``.
    """
    global DEFAULT_NPARTITIONS, dask_client, num_cpus
    from modin.config import Backend, CpuCount

    if publisher.get() == "Ray":
        import ray
        from modin.engines.ray.utils import initialize_ray

        # With OmniSci backend there is only a single worker per node
        # and we allow it to work on all cores.
        if Backend.get() == "Omnisci":
            CpuCount.put(1)
            os.environ["OMP_NUM_THREADS"] = str(multiprocessing.cpu_count())
        # Ray is only initialized the first time this engine is selected.
        if _is_first_update.get("Ray", True):
            initialize_ray()
        num_cpus = ray.cluster_resources()["CPU"]
    elif publisher.get() == "Dask":  # pragma: no cover
        from distributed.client import get_client

        # Client setup happens only on the main thread and only on the first
        # Dask update. This branch never assigns ``num_cpus``, so the value
        # already stored in the module global is what gets used below.
        if threading.current_thread(
        ).name == "MainThread" and _is_first_update.get("Dask", True):
            import warnings

            warnings.warn("The Dask Engine for Modin is experimental.")

            try:
                # Reuse an existing client if there is one.
                dask_client = get_client()
            except ValueError:
                from distributed import Client

                dask_client = Client(n_workers=CpuCount.get())

    elif publisher.get() == "Cloudray":
        from modin.experimental.cloud import get_connection

        conn = get_connection()
        remote_ray = conn.modules["ray"]
        if _is_first_update.get("Cloudray", True):

            # NOTE(review): presumably ``conn.teleport`` ships this function to
            # the remote side so it runs there -- confirm against rpyc docs.
            @conn.teleport
            def init_remote_ray(partition):
                from ray import ray_constants
                import modin
                from modin.engines.ray.utils import initialize_ray

                modin.set_backends("Ray", partition)
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=
                    f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.
                    REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray(Backend.get())
            # import EngineDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.data_management.factories.dispatcher  # noqa: F401
        else:
            # On later updates only the remote modin backends are re-set.
            get_connection().modules["modin"].set_backends(
                "Ray", Backend.get())

        num_cpus = remote_ray.cluster_resources()["CPU"]
    elif publisher.get() == "Cloudpython":
        from modin.experimental.cloud import get_connection

        # ``num_cpus`` is not assigned in this branch either.
        get_connection().modules["modin"].set_backends("Python")

    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(
            publisher.get()))

    _is_first_update[publisher.get()] = False
    # Never go below four partitions.
    DEFAULT_NPARTITIONS = max(4, int(num_cpus))
Beispiel #13
0
            move_stdlib_ahead_of_site_packages)


# Module-level engine bootstrap: initialize the engine named by
# ``execution_engine`` and derive the default partition count from it.
if execution_engine == "Ray":
    import ray

    initialize_ray()
    num_cpus = ray.cluster_resources()["CPU"]
elif execution_engine == "Dask":  # pragma: no cover
    from distributed.client import get_client
    import warnings

    # The Dask client is only set up when this module runs on the main thread.
    if threading.current_thread().name == "MainThread":
        warnings.warn("The Dask Engine for Modin is experimental.")
        try:
            # Reuse an existing client if there is one.
            client = get_client()
        except ValueError:
            from distributed import Client
            import multiprocessing

            num_cpus = multiprocessing.cpu_count()
            client = Client(n_workers=num_cpus)
elif execution_engine != "Python":
    raise ImportError(
        "Unrecognized execution engine: {}.".format(execution_engine))

# NOTE(review): ``num_cpus`` is only assigned above on the Ray path and when a
# new Dask client is created; every other path assumes it was defined earlier
# in this module -- confirm.
DEFAULT_NPARTITIONS = max(4, int(num_cpus))

__all__ = [
    "DataFrame",
    "Series",
Beispiel #14
0
def _update_engine(publisher: Publisher):
    """Initialize the engine named by ``publisher`` and refresh ``DEFAULT_NPARTITIONS``.

    Args:
        publisher: Object whose ``get()`` returns the engine name. Branches
            exist for ``"Ray"``, ``"Dask"`` and ``"Cloudray"``; names in
            ``_NOINIT_ENGINES`` need no setup.

    Raises:
        ImportError: If the engine name matches no branch and is not in
            ``_NOINIT_ENGINES``.

    Side effects:
        May rebind the module globals ``DEFAULT_NPARTITIONS``, ``dask_client``
        and ``num_cpus``, and records the engine in ``_is_first_update``.
    """
    global DEFAULT_NPARTITIONS, dask_client, num_cpus

    if publisher.get() == "Ray":
        import ray
        from modin.engines.ray.utils import initialize_ray

        # Ray is only initialized the first time this engine is selected.
        if _is_first_update.get("Ray", True):
            initialize_ray()
        num_cpus = ray.cluster_resources()["CPU"]
    elif publisher.get() == "Dask":  # pragma: no cover
        from distributed.client import get_client

        # Client setup happens only on the main thread and only on the first
        # Dask update. ``num_cpus`` is reassigned only when a new client has
        # to be created; otherwise the existing module global is used below.
        if threading.current_thread(
        ).name == "MainThread" and _is_first_update.get("Dask", True):
            import warnings

            warnings.warn("The Dask Engine for Modin is experimental.")

            try:
                # Reuse an existing client if there is one.
                dask_client = get_client()
            except ValueError:
                from distributed import Client

                # MODIN_CPUS (when set and non-empty) wins over the local
                # CPU count.
                num_cpus = (os.environ.get("MODIN_CPUS", None)
                            or multiprocessing.cpu_count())
                dask_client = Client(n_workers=int(num_cpus))

    elif publisher.get() == "Cloudray":
        from modin.experimental.cloud import get_connection
        import rpyc

        conn: rpyc.ClassicService = get_connection()
        remote_ray = conn.modules["ray"]
        if _is_first_update.get("Cloudray", True):

            # NOTE(review): presumably ``conn.teleport`` ships this function to
            # the remote side so it runs there -- confirm against rpyc docs.
            @conn.teleport
            def init_remote_ray():
                from ray import ray_constants
                import modin
                from modin.engines.ray.utils import initialize_ray

                modin.set_backends("Ray")
                initialize_ray(
                    override_is_cluster=True,
                    override_redis_address=
                    f"localhost:{ray_constants.DEFAULT_PORT}",
                    override_redis_password=ray_constants.
                    REDIS_DEFAULT_PASSWORD,
                )

            init_remote_ray()
            # import EngineDispatcher here to initialize IO class
            # so it doesn't skew read_csv() timings later on
            import modin.data_management.dispatcher  # noqa: F401

        num_cpus = remote_ray.cluster_resources()["CPU"]

    elif publisher.get() not in _NOINIT_ENGINES:
        raise ImportError("Unrecognized execution engine: {}.".format(
            publisher.get()))

    _is_first_update[publisher.get()] = False
    # Never go below four partitions.
    DEFAULT_NPARTITIONS = max(4, int(num_cpus))