Example #1
def test_hasattr_sparse(is_sparse_data):
    modin_df, pandas_df = (
        create_test_dfs(pandas.arrays.SparseArray(test_data["float_nan_data"].values()))
        if is_sparse_data
        else create_test_dfs(test_data["float_nan_data"])
    )
    eval_general(modin_df, pandas_df, lambda df: hasattr(df, "sparse"))
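Every example on this page relies on the same small set of helpers from Modin's test utilities (modin/pandas/test/utils.py): create_test_dfs builds the same frame once with Modin and once with pandas, df_equals compares a Modin result against a pandas result, and eval_general runs one operation on both frames and compares the outputs. As a rough orientation only, a minimal sketch of these helpers might look as follows; the real implementations additionally handle Series and scalar results, expected exceptions, and callable keyword arguments.

import modin.pandas as pd
import pandas
from modin.utils import to_pandas


def create_test_dfs(*args, **kwargs):
    # Build the same DataFrame twice: once with Modin, once with plain pandas.
    return pd.DataFrame(*args, **kwargs), pandas.DataFrame(*args, **kwargs)


def df_equals(modin_result, pandas_result):
    # Materialize the Modin result and compare it with the pandas result.
    pandas.testing.assert_frame_equal(to_pandas(modin_result), pandas_result)


def eval_general(modin_df, pandas_df, operation, comparator=df_equals, **kwargs):
    # Apply the same operation to both frames and compare the outputs.
    comparator(operation(modin_df, **kwargs), operation(pandas_df, **kwargs))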
Example #2
def test_duplicate_indexes():
    data = [0, 1, 2, 3, 4, 5]
    modin_df1, pandas_df1 = create_test_dfs(
        {"a": data, "b": data}, index=[0, 1, 2, 0, 1, 2]
    )
    modin_df2, pandas_df2 = create_test_dfs({"a": data, "b": data})
    df_equals(modin_df1 / modin_df2, pandas_df1 / pandas_df2)
Example #3
def test_cov(min_periods, ddof):
    eval_general(
        *create_test_dfs(test_data["int_data"]),
        lambda df: df.cov(min_periods=min_periods, ddof=ddof),
    )
    # The Modin result may differ slightly from the pandas result
    # due to floating-point arithmetic.
    eval_general(
        *create_test_dfs(test_data["float_nan_data"]),
        lambda df: df.cov(min_periods=min_periods),
        comparator=modin_df_almost_equals_pandas,
    )
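modin_df_almost_equals_pandas is a looser comparator from the same test utilities; its exact implementation is not shown here, but conceptually it only needs to tolerate small floating-point differences instead of requiring exact equality. A hypothetical sketch, assuming a relative-tolerance comparison:

import pandas
from modin.utils import to_pandas


def modin_df_almost_equals_pandas(modin_df, pandas_df):
    # Sketch only; the real helper may differ. Allow benign floating-point
    # deviations between the Modin and pandas results.
    pandas.testing.assert_frame_equal(
        to_pandas(modin_df), pandas_df, check_exact=False, rtol=1e-5
    )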
Example #4
def test_reduction_specific(fn, numeric_only, axis):
    if fn == "mean" and axis == 1:
        pytest.skip("See issue #2313 for details")
    eval_general(
        *create_test_dfs(test_data_diff_dtype),
        lambda df: getattr(df, fn)(numeric_only=numeric_only, axis=axis),
    )
Example #5
def test_unstack(data, is_multi_idx, is_multi_col):
    modin_df, pandas_df = create_test_dfs(data)

    if is_multi_idx:
        index = generate_multiindex(len(pandas_df),
                                    nlevels=4,
                                    is_tree_like=True)
    else:
        index = pandas_df.index

    if is_multi_col:
        columns = generate_multiindex(len(pandas_df.columns),
                                      nlevels=3,
                                      is_tree_like=True)
    else:
        columns = pandas_df.columns

    pandas_df.columns = modin_df.columns = columns
    pandas_df.index = modin_df.index = index

    df_equals(modin_df.unstack(), pandas_df.unstack())
    df_equals(modin_df.unstack(level=1), pandas_df.unstack(level=1))
    if is_multi_idx:
        df_equals(modin_df.unstack(level=[0, 1]),
                  pandas_df.unstack(level=[0, 1]))
        df_equals(modin_df.unstack(level=[0, 1, 2]),
                  pandas_df.unstack(level=[0, 1, 2]))
        df_equals(modin_df.unstack(level=[0, 1, 2, 3]),
                  pandas_df.unstack(level=[0, 1, 2, 3]))
Example #6
def test_apply_udf(data, func):
    eval_general(
        *create_test_dfs(data),
        lambda df, *args, **kwargs: df.apply(*args, **kwargs),
        func=func,
        other=lambda df: df,
    )
Example #7
def test_loc_series():
    md_df, pd_df = create_test_dfs({"a": [1, 2], "b": [3, 4]})

    pd_df.loc[pd_df["a"] > 1, "b"] = np.log(pd_df["b"])
    md_df.loc[md_df["a"] > 1, "b"] = np.log(md_df["b"])

    df_equals(pd_df, md_df)
Example #8
def test_prod_specific(min_count, numeric_only):
    if min_count == 5 and numeric_only:
        pytest.xfail("see #1953 for details")
    eval_general(
        *create_test_dfs(test_data_diff_dtype),
        lambda df: df.prod(min_count=min_count, numeric_only=numeric_only),
    )
Example #9
def test_prod(
    data,
    axis,
    skipna,
    is_transposed,
    method,
):
    eval_general(
        *create_test_dfs(data),
        lambda df, *args, **kwargs: getattr(df.T if is_transposed else df, method)(
            axis=axis, skipna=skipna
        ),
    )

    # test for issue #1953
    arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]]
    modin_df = pd.DataFrame(
        [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays)
    pandas_df = pandas.DataFrame(
        [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]], index=arrays)
    modin_result = modin_df.prod(level=0)
    pandas_result = pandas_df.prod(level=0)
    df_equals(modin_result, pandas_result)
Example #10
def test_min_max_mean(data, axis, skipna, numeric_only, is_transposed, method):
    eval_general(
        *create_test_dfs(data),
        lambda df: getattr((df.T if is_transposed else df), method)(
            axis=axis, skipna=skipna, numeric_only=numeric_only
        ),
    )
Example #11
def test_all_any(data, axis, skipna, is_transposed, method):
    eval_general(
        *create_test_dfs(data),
        lambda df: getattr((df.T if is_transposed else df), method)(
            axis=axis, skipna=skipna, bool_only=None
        ),
    )
Example #12
def test_idxmin_idxmax(data, axis, skipna, is_transposed, method):
    eval_general(
        *create_test_dfs(data),
        lambda df: getattr((df.T if is_transposed else df), method)(
            axis=axis, skipna=skipna
        ),
    )
Example #13
def test_describe_specific(exclude, include):
    eval_general(
        *create_test_dfs(test_data_diff_dtype),
        lambda df: df.drop("str_col", axis=1).describe(
            exclude=exclude, include=include
        ),
    )
Example #14
def test___setitem__partitions_aligning():
    # from issue #2390
    modin_df = pd.DataFrame({"a": [1, 2, 3]})
    pandas_df = pandas.DataFrame({"a": [1, 2, 3]})
    modin_df["b"] = pd.Series([4, 5, 6, 7, 8])
    pandas_df["b"] = pandas.Series([4, 5, 6, 7, 8])
    df_equals(modin_df, pandas_df)

    # from issue #2442
    data = {"a": [1, 2, 3, 4]}
    # Index with duplicated timestamp
    index = pandas.to_datetime(
        ["2020-02-06", "2020-02-06", "2020-02-22", "2020-03-26"])

    md_df, pd_df = create_test_dfs(data, index=index)
    # Setting new column
    pd_df["b"] = pandas.Series(np.arange(4))
    md_df["b"] = pd.Series(np.arange(4))
    df_equals(md_df, pd_df)

    # Setting existing column
    pd_df["b"] = pandas.Series(np.arange(4))
    md_df["b"] = pd.Series(np.arange(4))
    df_equals(md_df, pd_df)

    pd_df["a"] = pandas.Series(np.arange(4))
    md_df["a"] = pd.Series(np.arange(4))
    df_equals(md_df, pd_df)
Example #15
def test_comparison_except(data, op, other):
    # `eq` and `ne` are excluded from this test for now because
    # pandas raises an exception here while Modin does not.
    with pytest.raises(AssertionError):
        eval_general(
            *create_test_dfs(data),
            lambda df: getattr(df, op)(other),
        )
Example #16
def test_loc_assignment(index, columns):
    md_df, pd_df = create_test_dfs(index=index, columns=columns)
    for i, ind in enumerate(index):
        for j, col in enumerate(columns):
            value_to_assign = int(str(i) + str(j))
            md_df.loc[ind][col] = value_to_assign
            pd_df.loc[ind][col] = value_to_assign
    df_equals(md_df, pd_df)
Example #17
def test_melt(data, id_vars, value_vars):
    eval_general(
        *create_test_dfs(data),
        lambda df, *args, **kwargs: df.melt(*args, **kwargs).sort_values(
            ["variable", "value"]).reset_index(drop=True),
        id_vars=id_vars,
        value_vars=value_vars,
    )
Example #18
def test_kurt_kurtosis(axis, skipna, numeric_only, method):
    data = test_data["float_nan_data"]

    eval_general(
        *create_test_dfs(data),
        lambda df: getattr(df, method)(
            axis=axis, skipna=skipna, numeric_only=numeric_only
        ),
    )
Example #19
def loc_iter_dfs():
    columns = ["col1", "col2", "col3"]
    index = ["row1", "row2", "row3"]
    return create_test_dfs(
        {col: ([idx] * len(index)) for idx, col in enumerate(columns)},
        columns=columns,
        index=index,
    )
Example #20
def test_describe_dtypes():
    data = {
        "col1": list("abc"),
        "col2": list("abc"),
        "col3": list("abc"),
        "col4": [1, 2, 3],
    }
    eval_general(*create_test_dfs(data), lambda df: df.describe())
Example #21
def test_aligning_partitions():
    data = [0, 1, 2, 3, 4, 5]
    modin_df1, _ = create_test_dfs({"a": data, "b": data})
    modin_df = modin_df1.loc[:2]

    modin_df2 = modin_df.append(modin_df)

    modin_df2["c"] = modin_df1["b"]
    repr(modin_df2)
Example #22
def test_pivot_table_dropna(data):
    eval_general(
        *create_test_dfs(data),
        operation=lambda df, *args, **kwargs: df.pivot_table(*args, **kwargs),
        index=lambda df: df.columns[0],
        columns=lambda df: df.columns[1],
        values=lambda df: df.columns[-1],
        dropna=False,
    )
Example #23
def test_pivot(data, index, columns, values):
    eval_general(
        *create_test_dfs(data),
        lambda df, *args, **kwargs: df.pivot(*args, **kwargs),
        index=index,
        columns=columns,
        values=values,
        check_exception_type=None,
    )
Example #24
def test_resample_getitem(columns):
    index = pandas.date_range("1/1/2013", periods=9, freq="T")
    data = {
        "price": range(9),
        "volume": range(10, 19),
    }
    eval_general(
        *create_test_dfs(data, index=index),
        lambda df: df.resample("3T")[columns].mean(),
    )
Example #25
def test_multiindex_from_frame(data, sortorder):
    modin_df, pandas_df = create_test_dfs(data)

    def call_from_frame(df):
        if type(df).__module__.startswith("pandas"):
            return pandas.MultiIndex.from_frame(df, sortorder)
        else:
            return pd.MultiIndex.from_frame(df, sortorder)

    eval_general(modin_df, pandas_df, call_from_frame, comparator=assert_index_equal)
Example #26
def test_agg_dict():
    md_df, pd_df = create_test_dfs(test_data_values[0])
    agg_dict = {pd_df.columns[0]: "sum", pd_df.columns[-1]: ("sum", "count")}
    eval_general(md_df, pd_df, lambda df: df.agg(agg_dict), raising_exceptions=True)

    agg_dict = {
        "new_col1": (pd_df.columns[0], "sum"),
        "new_col2": (pd_df.columns[-1], "count"),
    }
    eval_general(md_df, pd_df, lambda df: df.agg(**agg_dict), raising_exceptions=True)
Example #27
def test_value_counts_categorical():
    # from issue #3571
    data = np.array(["a"] * 50000 + ["b"] * 10000 + ["c"] * 1000)
    random_state = np.random.RandomState(seed=42)
    random_state.shuffle(data)

    eval_general(
        *create_test_dfs({"col1": data, "col2": data}, dtype="category"),
        lambda df: df.value_counts(),
    )
Example #28
def test_agg_apply_axis_names(axis, func, op):
    # An AssertionError may arise when the index/columns of the
    # Modin and pandas results do not match.
    # See pandas issue 36189 for details.
    try:
        eval_general(
            *create_test_dfs(test_data["int_data"]),
            lambda df: getattr(df, op)(func, axis),
        )
    except AssertionError:
        pass
Example #29
def test_apply_args(axis, args):
    def apply_func(series, y):
        try:
            return series + y
        except TypeError:
            return series.map(str) + str(y)

    eval_general(
        *create_test_dfs(test_data["int_data"]),
        lambda df: df.apply(apply_func, axis=axis, args=args),
    )
Example #30
def test_explode_all_partitions(column, ignore_index):
    # Test explode with enough rows to fill all partitions. explode should
    # expand every row in the input data into two rows. It's especially
    # important that the input data has list-like elements that must be
    # expanded at the boundaries of the partitions, e.g. at row 31.
    num_rows = NPartitions.get() * MinPartitionSize.get()
    data = {"A": [[3, 4]] * num_rows, "C": [["a", "b"]] * num_rows}
    eval_general(
        *create_test_dfs(data),
        lambda df: df.explode(column, ignore_index=ignore_index),
    )