Example #1
0
def test_compare(align_axis, keep_shape, keep_equal):
    """Check that ``compare`` gives the same result in Modin and pandas.

    Two random frames and two short string series are compared in both
    argument orders, forwarding the parametrized ``align_axis``,
    ``keep_shape`` and ``keep_equal`` options to every ``compare`` call.
    """
    compare_options = dict(
        align_axis=align_axis,
        keep_shape=keep_shape,
        keep_equal=keep_equal,
    )

    def _assert_same_compare(modin_left, modin_right, pandas_left, pandas_right):
        # Run the Modin call first, then pandas, and require equal output.
        modin_result = modin_left.compare(modin_right, **compare_options)
        pandas_result = pandas_left.compare(pandas_right, **compare_options)
        assert to_pandas(modin_result).equals(pandas_result)

    column_labels = list("abcdefghij")
    frame_data1 = random_state.randn(100, 10)
    frame_data2 = random_state.randn(100, 10)
    pandas_df = pandas.DataFrame(frame_data1, columns=column_labels)
    pandas_df2 = pandas.DataFrame(frame_data2, columns=column_labels)
    modin_df = pd.DataFrame(frame_data1, columns=column_labels)
    modin_df2 = pd.DataFrame(frame_data2, columns=column_labels)

    _assert_same_compare(modin_df, modin_df2, pandas_df, pandas_df2)
    _assert_same_compare(modin_df2, modin_df, pandas_df2, pandas_df)

    series_data1 = ["a", "b", "c", "d", "e"]
    series_data2 = ["a", "a", "c", "b", "e"]
    pandas_series1 = pandas.Series(series_data1)
    pandas_series2 = pandas.Series(series_data2)
    modin_series1 = pd.Series(series_data1)
    modin_series2 = pd.Series(series_data2)

    _assert_same_compare(modin_series1, modin_series2, pandas_series1, pandas_series2)
    _assert_same_compare(modin_series2, modin_series1, pandas_series2, pandas_series1)
Example #2
0
File: io.py Project: yangl235/modin
                def return_handler(*args, **kwargs):
                    """
                    Call the parent-class attribute `item` with pandas arguments.

                    Returns
                    -------
                    A Modin DataFrame when the underlying call returns a pandas
                    DataFrame, otherwise whatever the underlying call returned.

                    Notes
                    -----
                    Every positional and keyword argument that is a Modin
                    DataFrame is converted with ``to_pandas`` before the call;
                    all other arguments are passed through untouched.
                    """
                    from modin.utils import to_pandas

                    def _as_pandas(value):
                        # Only Modin DataFrames need converting.
                        if isinstance(value, DataFrame):
                            return to_pandas(value)
                        return value

                    # Skip the default-to-pandas message for underscore-prefixed
                    # (internal) attributes so it is not emitted constantly.
                    if item[0] != "_":
                        ErrorMessage.default_to_pandas("`{}`".format(item))

                    pandas_args = [_as_pandas(arg) for arg in args]
                    pandas_kwargs = {
                        key: _as_pandas(value) for key, value in kwargs.items()
                    }
                    method = super(ExcelFile, self).__getattribute__(item)
                    result = method(*pandas_args, **pandas_kwargs)
                    if isinstance(result, pandas.DataFrame):
                        return DataFrame(result)
                    return result
Example #3
0
def merge_ordered(
        left,
        right,
        on=None,
        left_on=None,
        right_on=None,
        left_by=None,
        right_by=None,
        fill_method=None,
        suffixes=("_x", "_y"),
        how: str = "outer",
) -> DataFrame:
    """
    Perform an ordered merge by defaulting to ``pandas.merge_ordered``.

    Parameters
    ----------
    left : DataFrame
        Modin DataFrame; converted to pandas before the merge.
    right : DataFrame or other
        Converted to pandas when it is a Modin DataFrame, otherwise passed
        to pandas as-is.
    on, left_on, right_on, left_by, right_by, fill_method, suffixes, how
        Forwarded unchanged to ``pandas.merge_ordered``.

    Returns
    -------
    DataFrame
        Modin DataFrame wrapping the pandas result.

    Raises
    ------
    ValueError
        If `left` is not a Modin DataFrame.
    """
    if not isinstance(left, DataFrame):
        # Report the type of the argument that actually failed the check.
        raise ValueError(
            "can not merge DataFrame with instance of type {}".format(
                type(left)))
    ErrorMessage.default_to_pandas("`merge_ordered`")
    if isinstance(right, DataFrame):
        right = to_pandas(right)
    return DataFrame(
        pandas.merge_ordered(
            to_pandas(left),
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_by=left_by,
            right_by=right_by,
            fill_method=fill_method,
            suffixes=suffixes,
            how=how,
        ))
Example #4
0
 def wrap_func(*args, **kwargs):
     """Call ``func`` with every Modin DataFrame argument converted to pandas."""

     def _unwrap(value):
         # Non-DataFrame values are forwarded unchanged.
         return to_pandas(value) if isinstance(value, DataFrame) else value

     pandas_args = tuple(_unwrap(arg) for arg in args)
     pandas_kwargs = {name: _unwrap(value) for name, value in kwargs.items()}
     return func(*pandas_args, **pandas_kwargs)
Example #5
0
def lreshape(data: DataFrame, groups, dropna=True, label=None):
    """
    Reshape `data` by defaulting to ``pandas.lreshape``.

    `data` must be a Modin DataFrame; it is converted to pandas, reshaped
    with the given `groups`, `dropna` and `label`, and the result is wrapped
    back into a Modin DataFrame. A ``ValueError`` is raised for any other
    input type.
    """
    if isinstance(data, DataFrame):
        ErrorMessage.default_to_pandas("`lreshape`")
        reshaped = pandas.lreshape(
            to_pandas(data), groups, dropna=dropna, label=label)
        return DataFrame(reshaped)
    raise ValueError(
        "can not lreshape with instance of type {}".format(type(data)))
Example #6
0
def lreshape(data: DataFrame, groups, dropna=True, label=None):
    """
    Reshape wide-format data to long. Generalized inverse of ``DataFrame.pivot``.

    `groups` maps each new column name to the list of existing column names
    that are "melted" under that new name during the reshape.

    Parameters
    ----------
    data : DataFrame
        The wide-format DataFrame.
    groups : dict
        Dictionary in the form: `{new_name : list_of_columns}`.
    dropna : bool, default: True
        Whether to exclude columns whose entries are all NaN.
    label : optional
        Deprecated parameter.

    Returns
    -------
    DataFrame
        Reshaped DataFrame.
    """
    is_modin_frame = isinstance(data, DataFrame)
    if not is_modin_frame:
        message = "can not lreshape with instance of type {}".format(type(data))
        raise ValueError(message)

    # No native implementation: convert, delegate to pandas, wrap the result.
    ErrorMessage.default_to_pandas("`lreshape`")
    pandas_frame = to_pandas(data)
    pandas_result = pandas.lreshape(
        pandas_frame, groups, dropna=dropna, label=label)
    return DataFrame(pandas_result)
Example #7
0
def _to_pandas_from_modin(obj):
    """
    Return the pandas form of a Modin DataFrame or Series.

    Objects of any other type are returned unchanged.
    """
    from modin.utils import to_pandas
    from modin.pandas import DataFrame, Series

    is_modin_object = isinstance(obj, (DataFrame, Series))
    return to_pandas(obj) if is_modin_object else obj
Example #8
0
def merge_asof(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes=("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    """
    Perform an asof merge by defaulting to ``pandas.merge_asof``.

    Parameters
    ----------
    left : DataFrame
        Modin DataFrame; converted to pandas before the merge.
    right : DataFrame or other
        Converted to pandas when it is a Modin DataFrame, otherwise passed
        to pandas as-is.
    on, left_on, right_on, left_index, right_index, by, left_by, right_by,
    suffixes, tolerance, allow_exact_matches, direction
        Forwarded unchanged to ``pandas.merge_asof``.

    Returns
    -------
    DataFrame
        Modin DataFrame wrapping the pandas result.

    Raises
    ------
    ValueError
        If `left` is not a Modin DataFrame.
    """
    if not isinstance(left, DataFrame):
        # Report the type of the argument that actually failed the check.
        raise ValueError(
            "can not merge DataFrame with instance of type {}".format(
                type(left)))
    ErrorMessage.default_to_pandas("`merge_asof`")
    if isinstance(right, DataFrame):
        right = to_pandas(right)
    return DataFrame(
        pandas.merge_asof(
            to_pandas(left),
            right,
            on=on,
            left_on=left_on,
            right_on=right_on,
            left_index=left_index,
            right_index=right_index,
            by=by,
            left_by=left_by,
            right_by=right_by,
            suffixes=suffixes,
            tolerance=tolerance,
            allow_exact_matches=allow_exact_matches,
            direction=direction,
        ))
def test_where():
    """Check ``DataFrame.where`` in Modin against pandas for several `other` values."""
    column_labels = list("abcdefghij")
    frame_data = random_state.randn(100, 10)
    pandas_df = pandas.DataFrame(frame_data, columns=column_labels)
    modin_df = pd.DataFrame(frame_data, columns=column_labels)
    pandas_cond_df = pandas_df % 5 < 2
    modin_cond_df = modin_df % 5 < 2

    def _assert_matches(expected, actual):
        # Element-wise equality must hold in every column.
        assert all((to_pandas(actual) == expected).all())

    # Replacement values taken from a whole frame.
    expected = pandas_df.where(pandas_cond_df, -pandas_df)
    actual = modin_df.where(modin_cond_df, -modin_df)
    _assert_matches(expected, actual)

    # Replacement values taken from a pandas row (axis=1) and column (axis=0).
    for other, axis in ((pandas_df.loc[3], 1), (pandas_df["e"], 0)):
        expected = pandas_df.where(pandas_cond_df, other, axis=axis)
        actual = modin_df.where(modin_cond_df, other, axis=axis)
        _assert_matches(expected, actual)

    # Scalar replacement with a freshly computed condition.
    expected = pandas_df.where(pandas_df < 2, True)
    actual = modin_df.where(modin_df < 2, True)
    _assert_matches(expected, actual)
Example #10
0
def modin_df_almost_equals_pandas(modin_df, pandas_df):
    """Assert that a Modin object is equal, or nearly equal, to a pandas one.

    Categorical columns are first checked with ``df_categories_equals`` and
    then excluded. The remaining data passes when the objects are equal,
    when the largest (signed) element-wise difference is below 0.0001, or
    when both sides are entirely NaN.

    NOTE(review): the difference is not taken in absolute value, so a large
    negative difference also satisfies the threshold — confirm this is
    intended.
    """
    df_categories_equals(modin_df._to_pandas(), pandas_df)

    def _without_categoricals(obj):
        # Objects lacking ``select_dtypes`` are left untouched.
        if hasattr(obj, "select_dtypes"):
            return obj.select_dtypes(exclude=["category"])
        return obj

    converted = _without_categoricals(to_pandas(modin_df))
    pandas_df = _without_categoricals(pandas_df)

    largest_gap = (converted - pandas_df).max()
    if isinstance(largest_gap, pandas.Series):
        # Reduce the per-column maxima to a single value.
        largest_gap = largest_gap.max()

    assert (
        converted.equals(pandas_df)
        or largest_gap < 0.0001
        or (all(converted.isna().all()) and all(pandas_df.isna().all()))
    )
Example #11
0
def wide_to_long(df: DataFrame,
                 stubnames,
                 i,
                 j,
                 sep: str = "",
                 suffix: str = r"\d+") -> DataFrame:
    """
    Reshape `df` by defaulting to ``pandas.wide_to_long``.

    `df` must be a Modin DataFrame; it is converted to pandas, passed to
    ``pandas.wide_to_long`` together with `stubnames`, `i`, `j`, `sep` and
    `suffix`, and the result is wrapped back into a Modin DataFrame. A
    ``ValueError`` is raised for any other input type.
    """
    if isinstance(df, DataFrame):
        ErrorMessage.default_to_pandas("`wide_to_long`")
        pandas_result = pandas.wide_to_long(
            to_pandas(df), stubnames, i, j, sep=sep, suffix=suffix)
        return DataFrame(pandas_result)
    raise ValueError(
        "can not wide_to_long with instance of type {}".format(type(df)))
Example #12
0
def test_merge_asof_merge_options():
    """Check ``pd.merge_asof`` against ``pandas.merge_asof`` for several options.

    Every case merges quotes with trades on ``"time"``; the cases cover
    ``left_by``/``right_by``, ``by``, ``tolerance``, ``direction`` and
    ``allow_exact_matches``.
    """
    quote_times = (
        "2016-05-25 13:30:00.023",
        "2016-05-25 13:30:00.023",
        "2016-05-25 13:30:00.030",
        "2016-05-25 13:30:00.041",
        "2016-05-25 13:30:00.048",
        "2016-05-25 13:30:00.049",
        "2016-05-25 13:30:00.072",
        "2016-05-25 13:30:00.075",
    )
    trade_times = (
        "2016-05-25 13:30:00.023",
        "2016-05-25 13:30:00.038",
        "2016-05-25 13:30:00.048",
        "2016-05-25 13:30:00.048",
        "2016-05-25 13:30:00.048",
    )
    modin_quotes = pd.DataFrame(
        {
            "time": [pd.Timestamp(stamp) for stamp in quote_times],
            "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"],
            "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
            "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
        }
    )
    modin_trades = pd.DataFrame(
        {
            "time": [pd.Timestamp(stamp) for stamp in trade_times],
            "ticker2": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
            "price": [51.95, 51.95, 720.77, 720.92, 98.0],
            "quantity": [75, 155, 100, 100, 100],
        }
    )
    pandas_quotes = to_pandas(modin_quotes)
    pandas_trades = to_pandas(modin_trades)

    def _assert_same_merge(**merge_options):
        # pandas result first, Modin result second, both merged on "time".
        df_equals(
            pandas.merge_asof(
                pandas_quotes, pandas_trades, on="time", **merge_options
            ),
            pd.merge_asof(
                modin_quotes, modin_trades, on="time", **merge_options
            ),
        )

    # left_by + right_by
    _assert_same_merge(left_by="ticker", right_by="ticker2")

    # The remaining cases group with a single `by`, so give both trades
    # frames a "ticker" column copied from "ticker2".
    pandas_trades["ticker"] = pandas_trades["ticker2"]
    modin_trades["ticker"] = modin_trades["ticker2"]

    # Just by
    _assert_same_merge(by="ticker")

    # Tolerance
    _assert_same_merge(by="ticker", tolerance=pd.Timedelta("2ms"))

    # Direction
    _assert_same_merge(by="ticker", direction="forward")

    # Allow exact matches
    _assert_same_merge(
        by="ticker",
        tolerance=pd.Timedelta("10ms"),
        allow_exact_matches=False,
    )
Example #13
0
def df_equals(df1, df2):
    """Assert that df1 and df2 are equal.

    Modin DataFrames and Series are converted to pandas before comparison.
    Lists and tuples of frames/series are compared element by element.

    Args:
        df1: (pandas or modin DataFrame or series) dataframe to test if equal.
        df2: (pandas or modin DataFrame or series) dataframe to test if equal.

    Returns:
        None. A mismatch is reported by raising ``AssertionError``.
    """
    # Raises AttributeError if modin's groupby object is not imported like this
    from modin.pandas.groupby import DataFrameGroupBy

    groupby_types = (pandas.core.groupby.DataFrameGroupBy, DataFrameGroupBy)

    # The typing behavior of how pandas treats its index is not consistent when the
    # length of the DataFrame or Series is 0, so we just verify that the contents are
    # the same.
    if (hasattr(df1, "index") and hasattr(df2, "index") and len(df1) == 0
            and len(df2) == 0):
        if type(df1).__name__ == type(df2).__name__:
            if hasattr(df1, "name") and hasattr(
                    df2, "name") and df1.name == df2.name:
                return
            if (hasattr(df1, "columns") and hasattr(df2, "columns")
                    and df1.columns.equals(df2.columns)):
                return
        assert False, "Empty objects differ in type, name or columns"

    if isinstance(df1, (list, tuple)) and all(
            isinstance(d, (pd.DataFrame, pd.Series, pandas.DataFrame,
                           pandas.Series)) for d in df1):
        assert isinstance(df2, type(df1)), "Different type of collection"
        assert len(df1) == len(df2), "Different length result"
        # Compare eagerly: returning a generator expression here would leave
        # the element comparisons unexecuted unless a caller consumed it.
        for d1, d2 in zip(df1, df2):
            df_equals(d1, d2)
        return

    # Convert to pandas
    if isinstance(df1, (pd.DataFrame, pd.Series)):
        df1 = to_pandas(df1)
    if isinstance(df2, (pd.DataFrame, pd.Series)):
        df2 = to_pandas(df2)

    if isinstance(df1, pandas.DataFrame) and isinstance(df2, pandas.DataFrame):
        if (df1.empty and not df2.empty) or (df2.empty and not df1.empty):
            assert False, "One of the passed frames is empty, when other isn't"
        elif df1.empty and df2.empty and type(df1) != type(df2):
            assert (
                False
            ), f"Empty frames have different types: {type(df1)} != {type(df2)}"

    if isinstance(df1, pandas.DataFrame) and isinstance(df2, pandas.DataFrame):
        assert_frame_equal(
            df1,
            df2,
            check_dtype=False,
            check_datetimelike_compat=True,
            check_index_type=False,
            check_column_type=False,
            check_categorical=False,
        )
        df_categories_equals(df1, df2)
    elif isinstance(df1, pandas.Index) and isinstance(df2, pandas.Index):
        assert_index_equal(df1, df2)
    elif isinstance(df1, pandas.Series) and isinstance(df2, pandas.Series):
        # This branch handles every Series pair, empty ones included.
        assert_series_equal(df1,
                            df2,
                            check_dtype=False,
                            check_series_type=False)
    elif isinstance(df1, groupby_types) and isinstance(df2, groupby_types):
        # Groups are compared pairwise: same key, then same group contents.
        for g1, g2 in zip(df1, df2):
            assert g1[0] == g2[0]
            df_equals(g1[1], g2[1])
    elif isinstance(df1, pandas.core.arrays.numpy_.PandasArray):
        assert isinstance(df2, pandas.core.arrays.numpy_.PandasArray)
        assert df1 == df2
    elif isinstance(df1, np.recarray) and isinstance(df2, np.recarray):
        np.testing.assert_array_equal(df1, df2)
    else:
        # Fall back to an approximate comparison only when `!=` reports a
        # difference.
        if df1 != df2:
            np.testing.assert_almost_equal(df1, df2)
Example #14
0
def test_boxplot(data):
    """Check that ``boxplot`` on a Modin frame equals ``boxplot`` on its pandas conversion."""
    modin_frame = pd.DataFrame(data)
    pandas_frame = pandas.DataFrame(data)  # noqa F841

    modin_plot = modin_frame.boxplot()
    pandas_plot = to_pandas(modin_frame).boxplot()
    assert modin_plot == pandas_plot
Example #15
0
 }),
 ("expanding", None),
 ("corrwith", lambda df: {
     "other": df
 }),
 ("explode", lambda df: {
     "column": df.columns[0]
 }),
 ("ewm", lambda df: {
     "com": 0.5
 }),
 ("from_dict", lambda df: {
     "data": None
 }),
 ("from_records", lambda df: {
     "data": to_pandas(df)
 }),
 ("hist", lambda df: {
     "column": "int_col"
 }),
 ("infer_objects", None),
 ("interpolate", None),
 ("lookup", lambda df: {
     "row_labels": [0],
     "col_labels": ["int_col"]
 }),
 ("mask", lambda df: {
     "cond": df != 0
 }),
 ("pct_change", None),
 ("__getstate__", None),
 def time_to_pandas(self, shape, cpus):
     """Convert ``self.data`` with ``to_pandas``; `shape` and `cpus` are not used here."""
     # No explicit wait is needed afterwards: to_pandas is already synchronous.
     frame = self.data
     to_pandas(frame)
 def time_to_pandas(self, shape, cpus):
     """Convert ``self.data`` with ``to_pandas`` and pass the result to ``execute``."""
     converted = to_pandas(self.data)
     execute(converted)
Example #18
0
def merge_asof(
    left,
    right,
    on=None,
    left_on=None,
    right_on=None,
    left_index: bool = False,
    right_index: bool = False,
    by=None,
    left_by=None,
    right_by=None,
    suffixes=("_x", "_y"),
    tolerance=None,
    allow_exact_matches: bool = True,
    direction: str = "backward",
) -> DataFrame:
    """
    Perform an asof merge of `left` and `right`.

    Tricky argument combinations (`left_index` together with `right_on`, or
    any non-string `by`/`left_by`/`right_by`) are delegated wholesale to
    ``pandas.merge_asof``. Otherwise only the key columns are sent to
    ``pandas.merge_asof`` to learn which label of `right` matches each row
    of `left`; `right` is then reindexed to that mapping and merged with
    `left` on the index.

    Parameters
    ----------
    left : DataFrame
        Left Modin DataFrame.
    right : DataFrame
        Right frame; converted with ``to_pandas`` on the fallback path when
        it is a Modin DataFrame.
    on, left_on, right_on : optional
        Column(s) to match on; `on` is shorthand for setting both sides.
    left_index, right_index : bool, default: False
        Match on the index of the corresponding frame instead of a column.
    by, left_by, right_by : optional
        Column(s) to group by before matching; `by` sets both sides.
    suffixes : tuple, default: ("_x", "_y")
        Suffixes passed to the final merge for overlapping column names.
    tolerance, allow_exact_matches, direction
        Forwarded to ``pandas.merge_asof``.

    Returns
    -------
    DataFrame
        The merged Modin DataFrame.

    Raises
    ------
    ValueError
        If `left` is not a Modin DataFrame, if index flags are combined with
        the corresponding `on` arguments, if `on` is combined with
        `left_on`/`right_on`, if `by` is combined with `left_by`/`right_by`,
        or if no matching key is specified for a side.
    """
    if not isinstance(left, DataFrame):
        # Report the type of the argument that actually failed the check.
        raise ValueError(
            "can not merge DataFrame with instance of type {}".format(
                type(left)))
    ErrorMessage.default_to_pandas("`merge_asof`")

    # As of Pandas 1.2 these should raise an error; before that it did
    # something likely random:
    if ((on and (left_index or right_index)) or (left_on and left_index)
            or (right_on and right_index)):
        raise ValueError(
            "Can't combine left/right_index with left/right_on or on.")

    # Pandas fallbacks for tricky cases:
    if (
        # No idea how this works or why it does what it does; and in fact
        # there's a Pandas bug suggesting it's wrong:
        # https://github.com/pandas-dev/pandas/issues/33463
        (left_index and right_on is not None)
        # This is the case where by is a list of columns. If we're copying lots
        # of columns out of Pandas, maybe not worth trying our path, it's not
        # clear it's any better:
        or not isinstance(by, (str, type(None)))
        or not isinstance(left_by, (str, type(None)))
        or not isinstance(right_by, (str, type(None)))
    ):
        if isinstance(right, DataFrame):
            right = to_pandas(right)
        return DataFrame(
            pandas.merge_asof(
                to_pandas(left),
                right,
                on=on,
                left_on=left_on,
                right_on=right_on,
                left_index=left_index,
                right_index=right_index,
                by=by,
                left_by=left_by,
                right_by=right_by,
                suffixes=suffixes,
                tolerance=tolerance,
                allow_exact_matches=allow_exact_matches,
                direction=direction,
            ))

    left_column = None
    right_column = None

    if on is not None:
        if left_on is not None or right_on is not None:
            raise ValueError(
                "If 'on' is set, 'left_on' and 'right_on' can't be set.")
        left_on = on
        right_on = on

    if left_on is not None:
        left_column = to_pandas(left[left_on])
    elif left_index:
        left_column = left.index
    else:
        raise ValueError("Need some sort of 'on' spec")

    if right_on is not None:
        right_column = to_pandas(right[right_on])
    elif right_index:
        right_column = right.index
    else:
        raise ValueError("Need some sort of 'on' spec")

    # If we haven't set these by now, there's a bug in this function.
    assert left_column is not None
    assert right_column is not None

    if by is not None:
        if left_by is not None or right_by is not None:
            raise ValueError(
                "Can't have both 'by' and 'left_by' or 'right_by'")
        left_by = right_by = by

    # List of columns case should have been handled by direct Pandas fallback
    # earlier:
    assert isinstance(left_by, (str, type(None)))
    assert isinstance(right_by, (str, type(None)))

    left_pandas_limited = {"on": left_column}
    right_pandas_limited = {"on": right_column, "right_labels": right.index}
    extra_kwargs = {}  # extra arguments to Pandas merge_asof

    if left_by is not None or right_by is not None:
        # Both sides are stored under the common name "by" so a single `by`
        # argument can be passed to pandas.
        extra_kwargs["by"] = "by"
        left_pandas_limited["by"] = to_pandas(left[left_by])
        right_pandas_limited["by"] = to_pandas(right[right_by])

    # 1. Construct Pandas DataFrames with just the 'on' and optional 'by'
    # columns, and the index as another column.
    left_pandas_limited = pandas.DataFrame(left_pandas_limited,
                                           index=left.index)
    right_pandas_limited = pandas.DataFrame(right_pandas_limited)

    # 2. Use Pandas' merge_asof to figure out how to map labels on left to
    # labels on the right.
    merged = pandas.merge_asof(
        left_pandas_limited,
        right_pandas_limited,
        on="on",
        direction=direction,
        allow_exact_matches=allow_exact_matches,
        tolerance=tolerance,
        **extra_kwargs,
    )
    # Now merged["right_labels"] shows which labels from right map to left's index.

    # 3. Re-index right using the merged["right_labels"]; at this point right
    # should be same length and (semantically) same order as left:
    right_subset = right.reindex(index=pandas.Index(merged["right_labels"]))
    if not right_index:
        right_subset.drop(columns=[right_on], inplace=True)
    if right_by is not None and left_by == right_by:
        right_subset.drop(columns=[right_by], inplace=True)
    right_subset.index = left.index

    # 4. Merge left and the new shrunken right:
    result = merge(
        left,
        right_subset,
        left_index=True,
        right_index=True,
        suffixes=suffixes,
        how="left",
    )

    # 5. Clean up to match Pandas output:
    if left_on is not None and right_index:
        result.insert(
            # In theory this could use get_indexer_for(), but that causes an error:
            list(result.columns).index(left_on + suffixes[0]),
            left_on,
            result[left_on + suffixes[0]],
        )
    if not left_index and not right_index:
        result.index = pandas.RangeIndex(start=0, stop=len(result))

    return result
Example #19
0
# Set the NPartitions configuration value to 4 for this module
# (presumably the number of partitions Modin splits frames into — confirm).
NPartitions.put(4)

# Force matplotlib to not use any Xwindows backend.
matplotlib.use("Agg")


@pytest.mark.parametrize(
    "op, make_args",
    [
        ("align", lambda df: {"other": df}),
        ("expanding", None),
        ("corrwith", lambda df: {"other": df}),
        ("explode", lambda df: {"column": df.columns[0]}),
        ("ewm", lambda df: {"com": 0.5}),
        ("from_dict", lambda df: {"data": None}),
        ("from_records", lambda df: {"data": to_pandas(df)}),
        ("hist", lambda df: {"column": "int_col"}),
        ("infer_objects", None),
        ("interpolate", None),
        ("lookup", lambda df: {"row_labels": [0], "col_labels": ["int_col"]}),
        ("mask", lambda df: {"cond": df != 0}),
        ("pct_change", None),
        ("to_xarray", None),
        ("flags", None),
        ("set_flags", lambda df: {"allows_duplicate_labels": False}),
    ],
)
def test_ops_defaulting_to_pandas(op, make_args):
    modin_df = pd.DataFrame(test_data_diff_dtype).drop(["str_col", "bool_col"], axis=1)
    with pytest.warns(UserWarning):
        operation = getattr(modin_df, op)