Beispiel #1
0
def test_factory_switch():
    Engine.put("Test")
    assert FactoryDispatcher.get_factory() == PandasOnTestFactory
    assert FactoryDispatcher.get_factory().io_cls == "Foo"
    Engine.put("Python")  # revert engine to default

    StorageFormat.put("Test")
    assert FactoryDispatcher.get_factory() == TestOnPythonFactory
    assert FactoryDispatcher.get_factory().io_cls == "Bar"
    StorageFormat.put("Pandas")  # revert engine to default
Beispiel #2
0
def test_set_execution():
    set_execution("Bar", "Foo")
    assert FactoryDispatcher.get_factory() == FooOnBarFactory
Beispiel #3
0
def from_partitions(partitions,
                    axis,
                    index=None,
                    columns=None,
                    row_lengths=None,
                    column_widths=None):
    """
    Create DataFrame from remote partitions.

    Parameters
    ----------
    partitions : list
        A list of Ray.ObjectRef/Dask.Future to partitions depending on the engine used.
        Or a list of tuples of Ray.ObjectRef/Dask.Future to node ip addresses and partitions
        depending on the engine used (i.e. ``[(Ray.ObjectRef/Dask.Future, Ray.ObjectRef/Dask.Future), ...]``).
    axis : {None, 0 or 1}
        The ``axis`` parameter is used to identify what are the partitions passed.
        You have to set:

        * ``axis=0`` if you want to create DataFrame from row partitions
        * ``axis=1`` if you want to create DataFrame from column partitions
        * ``axis=None`` if you want to create DataFrame from 2D list of partitions
    index : sequence, optional
        The index for the DataFrame. Is computed if not provided.
    columns : sequence, optional
        The columns for the DataFrame. Is computed if not provided.
    row_lengths : list, optional
        The length of each partition in the rows. The "height" of
        each of the block partitions. Is computed if not provided.
    column_widths : list, optional
        The width of each partition in the columns. The "width" of
        each of the block partitions. Is computed if not provided.

    Returns
    -------
    modin.pandas.DataFrame
        DataFrame instance created from remote partitions.

    Notes
    -----
    Pass `index`, `columns`, `row_lengths` and `column_widths` to avoid triggering
    extra computations of the metadata when creating a DataFrame.
    """
    from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

    factory = FactoryDispatcher.get_factory()

    partition_class = factory.io_cls.frame_cls._partition_mgr_cls._partition_class
    partition_frame_class = factory.io_cls.frame_cls
    partition_mgr_class = factory.io_cls.frame_cls._partition_mgr_cls

    # Since we store partitions of Modin DataFrame as a 2D NumPy array we need to place
    # passed partitions to 2D NumPy array to pass it to internal Modin Frame class.
    # `axis=None` - convert 2D list to 2D NumPy array
    if axis is None:
        if isinstance(partitions[0][0], tuple):
            parts = np.array(
                [[partition_class(partition, ip=ip) for ip, partition in row]
                 for row in partitions])
        else:
            parts = np.array(
                [[partition_class(partition) for partition in row]
                 for row in partitions])
    # `axis=0` - place row partitions to 2D NumPy array so that each row of the array is one row partition.
    elif axis == 0:
        if isinstance(partitions[0], tuple):
            parts = np.array([[partition_class(partition, ip=ip)]
                              for ip, partition in partitions])
        else:
            parts = np.array([[partition_class(partition)]
                              for partition in partitions])
    # `axis=1` - place column partitions to 2D NumPy array so that each column of the array is one column partition.
    elif axis == 1:
        if isinstance(partitions[0], tuple):
            parts = np.array([[
                partition_class(partition, ip=ip)
                for ip, partition in partitions
            ]])
        else:
            parts = np.array(
                [[partition_class(partition) for partition in partitions]])
    else:
        raise ValueError(
            f"Got unacceptable value of axis {axis}. Possible values are {0}, {1} or {None}."
        )

    labels_axis_to_sync = None
    if index is None:
        labels_axis_to_sync = 1
        index = partition_mgr_class.get_indices(0, parts,
                                                lambda df: df.axes[0])

    if columns is None:
        labels_axis_to_sync = 0 if labels_axis_to_sync is None else -1
        columns = partition_mgr_class.get_indices(1, parts,
                                                  lambda df: df.axes[1])

    frame = partition_frame_class(
        parts,
        index,
        columns,
        row_lengths=row_lengths,
        column_widths=column_widths,
    )

    if labels_axis_to_sync != -1:
        frame.synchronize_labels(axis=labels_axis_to_sync)

    return DataFrame(query_compiler=PandasQueryCompiler(frame))
Beispiel #4
0
def test_default_factory():
    assert issubclass(FactoryDispatcher.get_factory(), factories.BaseFactory)
    assert FactoryDispatcher.get_factory().io_cls
Beispiel #5
0
# the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific language
# governing permissions and limitations under the License.

import numpy as np
import pandas
import pytest

import modin.pandas as pd
from modin.distributed.dataframe.pandas import unwrap_partitions, from_partitions
from modin.config import Engine, NPartitions
from modin.pandas.test.utils import df_equals
from modin.pandas.indexing import compute_sliced_len
from modin.core.execution.dispatching.factories.dispatcher import FactoryDispatcher

PartitionClass = (FactoryDispatcher.get_factory().io_cls.frame_cls.
                  _partition_mgr_cls._partition_class)

if Engine.get() == "Ray":
    import ray

    put_func = ray.put
    get_func = ray.get
    FutureType = ray.ObjectRef
elif Engine.get() == "Dask":
    from distributed.client import default_client
    from distributed import Future

    put_func = lambda x: default_client().scatter(x)  # noqa: E731
    get_func = lambda x: x.result()  # noqa: E731
    FutureType = Future
elif Engine.get() == "Python":