Example #1
def test_value_counts(normalize, bins, dropna):
    # We sort indices for Modin and pandas result because of issue #1650
    values = np.array([3, 1, 2, 3, 4, np.nan])
    with warns_that_defaulting_to_pandas():
        modin_result = sort_index_for_equal_values(
            pd.value_counts(values, normalize=normalize, ascending=False),
            False)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, normalize=normalize, ascending=False),
        False)
    df_equals(modin_result, pandas_result)

    with warns_that_defaulting_to_pandas():
        modin_result = sort_index_for_equal_values(
            pd.value_counts(values, bins=bins, ascending=False), False)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, bins=bins, ascending=False), False)
    df_equals(modin_result, pandas_result)

    with warns_that_defaulting_to_pandas():
        modin_result = sort_index_for_equal_values(
            pd.value_counts(values, dropna=dropna, ascending=True), True)
    pandas_result = sort_index_for_equal_values(
        pandas.value_counts(values, dropna=dropna, ascending=True), True)
    df_equals(modin_result, pandas_result)
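Note: every example in this listing uses the `warns_that_defaulting_to_pandas` context manager from Modin's test utilities, and Example #1 also uses `sort_index_for_equal_values`; neither helper is shown here. A minimal, hedged sketch of what they might look like, assuming the warning is a plain UserWarning whose message mentions defaulting to pandas and that tied counts only need a deterministic order:

import pytest


def warns_that_defaulting_to_pandas():
    # Sketch only: assume Modin emits a UserWarning mentioning "defaulting to
    # pandas" whenever an operation falls back to pandas.
    return pytest.warns(UserWarning, match="[Dd]efaulting to pandas")


def sort_index_for_equal_values(result, ascending):
    # Sketch only: order entries that share the same count by their index label
    # so the Modin and pandas value_counts outputs compare deterministically.
    return result.sort_index().sort_values(ascending=ascending, kind="mergesort")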
Example #2
def test_merge_asof():
    left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]})
    right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]})

    with warns_that_defaulting_to_pandas():
        df = pd.merge_asof(left, right, on="a")
        assert isinstance(df, pd.DataFrame)

    with warns_that_defaulting_to_pandas():
        df = pd.merge_asof(left, right, on="a", allow_exact_matches=False)
        assert isinstance(df, pd.DataFrame)

    with warns_that_defaulting_to_pandas():
        df = pd.merge_asof(left, right, on="a", direction="forward")
        assert isinstance(df, pd.DataFrame)

    with warns_that_defaulting_to_pandas():
        df = pd.merge_asof(left, right, on="a", direction="nearest")
        assert isinstance(df, pd.DataFrame)

    left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10])
    right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7])

    with warns_that_defaulting_to_pandas():
        df = pd.merge_asof(left, right, left_index=True, right_index=True)
        assert isinstance(df, pd.DataFrame)

    with pytest.raises(ValueError):
        pd.merge_asof(
            {"left_val": ["a", "b", "c"]},
            {"right_val": [1, 2, 3, 6, 7]},
            left_index=True,
            right_index=True,
        )
Example #3
def test_aggregate_error_checking():
    modin_df = pd.DataFrame(test_data["float_nan_data"])

    with warns_that_defaulting_to_pandas():
        modin_df.aggregate({
            modin_df.columns[0]: "sum",
            modin_df.columns[1]: "mean"
        })

    with warns_that_defaulting_to_pandas():
        modin_df.aggregate("cumproduct")

    with pytest.raises(ValueError):
        modin_df.aggregate("NOT_EXISTS")
Example #4
def test_simple_import(data_has_nulls):
    """Test that ``modin.pandas.utils.from_dataframe`` works properly."""
    data = get_data_of_all_types(data_has_nulls)

    modin_df_producer = pd.DataFrame(data)
    internal_modin_df_producer = modin_df_producer.__dataframe__()
    # Our configuration in pytest.ini requires that we explicitly catch all
    # instances of defaulting to pandas; this one raises a warning on `.from_dataframe`.
    with warns_that_defaulting_to_pandas():
        modin_df_consumer = from_dataframe(modin_df_producer)
        internal_modin_df_consumer = from_dataframe(internal_modin_df_producer)

    # TODO: the following assertions verify that `from_dataframe` doesn't return
    # the same object untouched due to an optimization branch. It actually should
    # return the same object, but that logic is not implemented yet, so the
    # assertions pass for now. Once we have another implementation of the protocol,
    # the producer's type should be replaced with one different from the consumer,
    # as these assertions may start failing shortly.
    assert modin_df_producer is not modin_df_consumer
    assert internal_modin_df_producer is not internal_modin_df_consumer
    assert (modin_df_producer._query_compiler._modin_frame
            is not modin_df_consumer._query_compiler._modin_frame)

    df_equals(modin_df_producer, modin_df_consumer)
    df_equals(modin_df_producer, internal_modin_df_consumer)
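The pytest.ini configuration mentioned in the comment above is not part of this listing. Assuming it simply escalates the defaulting-to-pandas warning to an error, its effect is roughly equivalent to the sketch below, so any fallback not wrapped in `warns_that_defaulting_to_pandas()` fails the test (the exact message pattern is an assumption):

import warnings

# Rough equivalent of the assumed pytest.ini filterwarnings entry: any
# "defaulting to pandas" UserWarning that is not explicitly caught becomes an error.
warnings.filterwarnings(
    "error", message=".*[Dd]efaulting to pandas.*", category=UserWarning
)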
Example #5
def test_sql_query():
    from modin.experimental.sql import query

    # Modin can't read_csv from a buffer.
    with warns_that_defaulting_to_pandas():
        df = pd.read_csv(io.StringIO(titanic_snippet))
    sql = "SELECT survived, p_class, count(passenger_id) as count FROM (SELECT * FROM titanic WHERE survived = 1) as t1 GROUP BY survived, p_class"
    with warns_that_defaulting_to_pandas():
        query_result = query(sql, titanic=df)
    expected_df = (df[df.survived == 1].groupby(["survived", "p_class"]).agg(
        {"passenger_id": "count"}).reset_index())
    assert query_result.shape == expected_df.shape
    values_left = expected_df.dropna().values
    values_right = query_result.dropna().values
    assert (values_left == values_right).all()
Example #6
def test_crosstab():
    a = np.array(
        [
            "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo",
            "foo", "foo"
        ],
        dtype=object,
    )
    b = np.array(
        [
            "one", "one", "one", "two", "one", "one", "one", "two", "two",
            "two", "one"
        ],
        dtype=object,
    )
    c = np.array(
        [
            "dull",
            "dull",
            "shiny",
            "dull",
            "dull",
            "shiny",
            "shiny",
            "dull",
            "shiny",
            "shiny",
            "shiny",
        ],
        dtype=object,
    )

    with warns_that_defaulting_to_pandas():
        df = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"])
        assert isinstance(df, pd.DataFrame)

    foo = pd.Categorical(["a", "b"], categories=["a", "b", "c"])
    bar = pd.Categorical(["d", "e"], categories=["d", "e", "f"])

    with warns_that_defaulting_to_pandas():
        df = pd.crosstab(foo, bar)
        assert isinstance(df, pd.DataFrame)

    with warns_that_defaulting_to_pandas():
        df = pd.crosstab(foo, bar, dropna=False)
        assert isinstance(df, pd.DataFrame)
Example #7
def test_asfreq():
    index = pd.date_range("1/1/2000", periods=4, freq="T")
    series = pd.Series([0.0, None, 2.0, 3.0], index=index)
    df = pd.DataFrame({"s": series})
    with warns_that_defaulting_to_pandas():
        # We are only testing that this defaults to pandas, so we will just check for
        # the warning
        df.asfreq(freq="30S")
Example #8
def test_math_functions_level(op):
    modin_df = pd.DataFrame(test_data["int_data"])
    modin_df.index = pandas.MultiIndex.from_tuples([(i // 4, i // 2, i)
                                                    for i in modin_df.index])

    # Defaults to pandas
    with warns_that_defaulting_to_pandas():
        # Operation against self for sanity check
        getattr(modin_df, op)(modin_df, axis=0, level=1)
Example #9
def test_sql_extension():
    import modin.experimental.sql  # noqa: F401

    # Modin can't read_csv from a buffer.
    with warns_that_defaulting_to_pandas():
        df = pd.read_csv(io.StringIO(titanic_snippet))

    expected_df = df[df["survived"] == 1][["passenger_id", "survived"]]

    sql = "SELECT passenger_id, survived WHERE survived = 1"
    # DataFrame.convert_dtypes defaults to pandas.
    with warns_that_defaulting_to_pandas():
        query_result = df.sql(sql)
    assert list(query_result.columns) == ["passenger_id", "survived"]
    values_left = expected_df.values
    values_right = query_result.values
    assert values_left.shape == values_right.shape
    assert (values_left == values_right).all()
Example #10
def test_median_skew_std_var_sem_1953(method):
    # See #1953 for details
    arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]]
    data = [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]]
    modin_df = pd.DataFrame(data, index=arrays)
    pandas_df = pandas.DataFrame(data, index=arrays)

    # These shouldn't default to pandas: follow up on
    # https://github.com/modin-project/modin/issues/1953
    with warns_that_defaulting_to_pandas():
        eval_general(modin_df, pandas_df, lambda df: getattr(df, method)(level=0))
Example #11
def test_merge_asof_suffixes():
    """Suffix variations are handled the same as Pandas."""
    left = {"a": [1, 5, 10]}
    right = {"a": [2, 3, 6]}
    pandas_left, pandas_right = (pandas.DataFrame(left),
                                 pandas.DataFrame(right))
    modin_left, modin_right = pd.DataFrame(left), pd.DataFrame(right)
    for suffixes in [("a", "b"), (False, "c"), ("d", False)]:
        pandas_merged = pandas.merge_asof(
            pandas_left,
            pandas_right,
            left_index=True,
            right_index=True,
            suffixes=suffixes,
        )
        with warns_that_defaulting_to_pandas():
            modin_merged = pd.merge_asof(
                modin_left,
                modin_right,
                left_index=True,
                right_index=True,
                suffixes=suffixes,
            )
        df_equals(pandas_merged, modin_merged)

    with pytest.raises(ValueError):
        pandas.merge_asof(
            pandas_left,
            pandas_right,
            left_index=True,
            right_index=True,
            suffixes=(False, False),
        )
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        modin_merged = pd.merge_asof(
            modin_left,
            modin_right,
            left_index=True,
            right_index=True,
            suffixes=(False, False),
        )
Example #12
def test_ops_defaulting_to_pandas(op, make_args):
    modin_df = pd.DataFrame(test_data_diff_dtype).drop(["str_col", "bool_col"],
                                                       axis=1)
    with warns_that_defaulting_to_pandas():
        operation = getattr(modin_df, op)
        if make_args is not None:
            operation(**make_args(modin_df))
        else:
            try:
                operation()
            # `except` handles non-callable attributes
            except TypeError:
                pass
Example #13
def test_multi_level_comparison(data, op):
    modin_df_multi_level = pd.DataFrame(data)

    new_idx = pandas.MultiIndex.from_tuples([
        (i // 4, i // 2, i) for i in modin_df_multi_level.index
    ])
    modin_df_multi_level.index = new_idx

    # Defaults to pandas
    with warns_that_defaulting_to_pandas():
        # Operation against self for sanity check
        getattr(modin_df_multi_level, op)(modin_df_multi_level,
                                          axis=0,
                                          level=1)
Example #14
def test_sort_multiindex(sort_remaining):
    data = test_data["int_data"]
    modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data)

    for index in ["index", "columns"]:
        new_index = generate_multiindex(len(getattr(modin_df, index)))
        for df in [modin_df, pandas_df]:
            setattr(df, index, new_index)

    for kwargs in [{"level": 0}, {"axis": 0}, {"axis": 1}]:
        with warns_that_defaulting_to_pandas():
            df_equals(
                modin_df.sort_index(sort_remaining=sort_remaining, **kwargs),
                pandas_df.sort_index(sort_remaining=sort_remaining, **kwargs),
            )
Example #15
def test_wide_to_long():
    data = pd.DataFrame({
        "hr1": [514, 573],
        "hr2": [545, 526],
        "team": ["Red Sox", "Yankees"],
        "year1": [2007, 2008],
        "year2": [2008, 2008],
    })

    with warns_that_defaulting_to_pandas():
        df = pd.wide_to_long(data, ["hr", "year"], "team", "index")
        assert isinstance(df, pd.DataFrame)

    with pytest.raises(ValueError):
        pd.wide_to_long(data.to_numpy(), ["hr", "year"], "team", "index")
Example #16
def test_distributed_pickling(filename, compression):
    data = test_data["int_data"]
    df = pd.DataFrame(data)

    filename_param = filename
    if compression:
        filename = f"{filename}.gz"

    with (warns_that_defaulting_to_pandas()
          if filename_param == test_default_to_pickle_filename else nullcontext()):
        df.to_pickle_distributed(filename, compression=compression)
        pickled_df = pd.read_pickle_distributed(filename,
                                                compression=compression)
    df_equals(pickled_df, df)

    pickle_files = glob.glob(filename)
    teardown_test_files(pickle_files)
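Example #16 receives `filename` and `compression` as test parameters and compares `filename` against `test_default_to_pickle_filename`; none of these are defined in this listing. A hypothetical parametrization, using illustrative values that are assumptions rather than the actual Modin fixtures, might look like:

import pytest

# Hypothetical value; the real test suite's filenames may differ.
test_default_to_pickle_filename = "test_default_to_pickle.pkl"


@pytest.mark.parametrize("compression", [None, "gzip"])
@pytest.mark.parametrize(
    "filename", [test_default_to_pickle_filename, "test_distributed_pickle.pkl"]
)
def test_distributed_pickling(filename, compression):
    ...  # body as in Example #16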
Example #17
def test_merge_ordered():
    data_a = {
        "key": list("aceace"),
        "lvalue": [1, 2, 3, 1, 2, 3],
        "group": list("aaabbb"),
    }
    data_b = {"key": list("bcd"), "rvalue": [1, 2, 3]}

    modin_df_a = pd.DataFrame(data_a)
    modin_df_b = pd.DataFrame(data_b)

    with warns_that_defaulting_to_pandas():
        df = pd.merge_ordered(modin_df_a,
                              modin_df_b,
                              fill_method="ffill",
                              left_by="group")
        assert isinstance(df, pd.DataFrame)

    with pytest.raises(ValueError):
        pd.merge_ordered(data_a, data_b, fill_method="ffill", left_by="group")
Example #18
def test_buffer_of_chunked_at(data_has_nulls, n_chunks):
    """Test that getting buffers of physically chunked column works properly."""
    data = get_data_of_all_types(
        # For simplicity, include only primitive types so the test can use a single
        # function to export every column instead of branching on the column's dtype
        has_nulls=data_has_nulls,
        include_dtypes=["bool", "int", "uint", "float"],
    )

    pd_df = pandas.DataFrame(data)
    pd_chunks = split_df_into_chunks(pd_df, n_chunks)

    chunked_at = pa.concat_tables(
        [pa.Table.from_pandas(pd_df) for pd_df in pd_chunks])
    md_df = from_arrow(chunked_at)

    protocol_df = md_df.__dataframe__()
    for i, col in enumerate(protocol_df.get_columns()):
        assert col.num_chunks() > 1
        assert len(col._pyarrow_table.column(0).chunks) > 1

        buffers = col.get_buffers()
        data_buff, data_dtype = buffers["data"]
        result = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size)
        result = set_nulls(result, col, buffers["validity"])

        # Our configuration in pytest.ini requires that we explicitly catch all
        # instances of defaulting to pandas; this one raises a warning on `.to_numpy()`.
        with warns_that_defaulting_to_pandas():
            reference = md_df.iloc[:, i].to_numpy()

        np.testing.assert_array_equal(reference, result)

    protocol_df = md_df.__dataframe__(allow_copy=False)
    for i, col in enumerate(protocol_df.get_columns()):
        assert col.num_chunks() > 1
        assert len(col._pyarrow_table.column(0).chunks) > 1

        # Expect an exception when combining the chunks would require a copy
        with pytest.raises(RuntimeError):
            col.get_buffers()
Example #19
def test_merge_asof_on_variations():
    """on=,left_on=,right_on=,right_index=,left_index= options match Pandas."""
    left = {"a": [1, 5, 10], "left_val": ["a", "b", "c"]}
    left_index = [6, 8, 12]
    right = {"a": [1, 2, 3, 6, 7], "right_val": ["d", "e", "f", "g", "h"]}
    right_index = [6, 7, 8, 9, 15]
    pandas_left, pandas_right = (
        pandas.DataFrame(left, index=left_index),
        pandas.DataFrame(right, index=right_index),
    )
    modin_left, modin_right = (
        pd.DataFrame(left, index=left_index),
        pd.DataFrame(right, index=right_index),
    )
    for on_arguments in [
        {"on": "a"},
        {"left_on": "a", "right_on": "a"},
        {"left_on": "a", "right_index": True},
        {"left_index": True, "right_on": "a"},
        {"left_index": True, "right_index": True},
    ]:
        pandas_merged = pandas.merge_asof(pandas_left, pandas_right,
                                          **on_arguments)
        with warns_that_defaulting_to_pandas():
            modin_merged = pd.merge_asof(modin_left, modin_right,
                                         **on_arguments)
        df_equals(pandas_merged, modin_merged)
Example #20
def test_get_dummies():
    s = pd.Series(list("abca"))
    with warns_that_defaulting_to_pandas():
        pd.get_dummies(s)

    s1 = ["a", "b", np.nan]
    with warns_that_defaulting_to_pandas():
        pd.get_dummies(s1)

    with warns_that_defaulting_to_pandas():
        pd.get_dummies(s1, dummy_na=True)

    data = {"A": ["a", "b", "a"], "B": ["b", "a", "c"], "C": [1, 2, 3]}
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)

    modin_result = pd.get_dummies(modin_df, prefix=["col1", "col2"])
    pandas_result = pandas.get_dummies(pandas_df, prefix=["col1", "col2"])
    df_equals(modin_result, pandas_result)
    assert modin_result._to_pandas().columns.equals(pandas_result.columns)
    assert modin_result.shape == pandas_result.shape

    modin_result = pd.get_dummies(pd.DataFrame(pd.Series(list("abcdeabac"))))
    pandas_result = pandas.get_dummies(
        pandas.DataFrame(pandas.Series(list("abcdeabac"))))
    df_equals(modin_result, pandas_result)
    assert modin_result._to_pandas().columns.equals(pandas_result.columns)
    assert modin_result.shape == pandas_result.shape

    with pytest.raises(NotImplementedError):
        pd.get_dummies(modin_df, prefix=["col1", "col2"], sparse=True)

    with warns_that_defaulting_to_pandas():
        pd.get_dummies(pd.Series(list("abcaa")))

    with warns_that_defaulting_to_pandas():
        pd.get_dummies(pd.Series(list("abcaa")), drop_first=True)

    with warns_that_defaulting_to_pandas():
        pd.get_dummies(pd.Series(list("abc")), dtype=float)

    with warns_that_defaulting_to_pandas():
        pd.get_dummies(1)
Example #21
def test_lreshape():
    data = pd.DataFrame({
        "hr1": [514, 573],
        "hr2": [545, 526],
        "team": ["Red Sox", "Yankees"],
        "year1": [2007, 2008],
        "year2": [2008, 2008],
    })

    with warns_that_defaulting_to_pandas():
        df = pd.lreshape(data, {
            "year": ["year1", "year2"],
            "hr": ["hr1", "hr2"]
        })
        assert isinstance(df, pd.DataFrame)

    with pytest.raises(ValueError):
        pd.lreshape(data.to_numpy(), {
            "year": ["year1", "year2"],
            "hr": ["hr1", "hr2"]
        })
Example #22
def test_merge_asof_bad_arguments():
    left = {"a": [1, 5, 10], "b": [5, 7, 9]}
    right = {"a": [2, 3, 6], "b": [6, 5, 20]}
    pandas_left, pandas_right = (pandas.DataFrame(left),
                                 pandas.DataFrame(right))
    modin_left, modin_right = pd.DataFrame(left), pd.DataFrame(right)

    # Can't mix by with left_by/right_by
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pandas.merge_asof(pandas_left,
                          pandas_right,
                          on="a",
                          by="b",
                          left_by="can't do with by")
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pd.merge_asof(modin_left,
                      modin_right,
                      on="a",
                      by="b",
                      left_by="can't do with by")
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pandas.merge_asof(pandas_left,
                          pandas_right,
                          by="b",
                          on="a",
                          right_by="can't do with by")
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pd.merge_asof(modin_left,
                      modin_right,
                      by="b",
                      on="a",
                      right_by="can't do with by")

    # Can't mix on with left_on/right_on
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pandas.merge_asof(pandas_left,
                          pandas_right,
                          on="a",
                          left_on="can't do with by")
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pd.merge_asof(modin_left,
                      modin_right,
                      on="a",
                      left_on="can't do with by")
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pandas.merge_asof(pandas_left,
                          pandas_right,
                          on="a",
                          right_on="can't do with by")
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pd.merge_asof(modin_left,
                      modin_right,
                      on="a",
                      right_on="can't do with by")

    # Can't mix left_index with left_on or on, similarly for right.
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pd.merge_asof(modin_left, modin_right, on="a", right_index=True)
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pd.merge_asof(modin_left,
                      modin_right,
                      left_on="a",
                      right_on="a",
                      right_index=True)
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pd.merge_asof(modin_left, modin_right, on="a", left_index=True)
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pd.merge_asof(modin_left,
                      modin_right,
                      left_on="a",
                      right_on="a",
                      left_index=True)

    # Need both left and right
    with pytest.raises(
            Exception):  # Pandas bug, didn't validate inputs sufficiently
        pandas.merge_asof(pandas_left, pandas_right, left_on="a")
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pd.merge_asof(modin_left, modin_right, left_on="a")
    with pytest.raises(
            Exception):  # Pandas bug, didn't validate inputs sufficiently
        pandas.merge_asof(pandas_left, pandas_right, right_on="a")
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pd.merge_asof(modin_left, modin_right, right_on="a")
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pandas.merge_asof(pandas_left, pandas_right)
    with pytest.raises(ValueError), warns_that_defaulting_to_pandas():
        pd.merge_asof(modin_left, modin_right)
Example #23
def test_syncronous_mode():
    assert BenchmarkMode.get()
    # On Omnisci storage, transpose() defaults to Pandas.
    with (warns_that_defaulting_to_pandas()
          if StorageFormat.get() == "Omnisci" else nullcontext()):
        pd.DataFrame(test_data_values[0]).mean()
Example #24
def test___round__():
    data = test_data_values[0]
    with warns_that_defaulting_to_pandas():
        pd.DataFrame(data).__round__()
Example #25
def test_drop():
    frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}
    simple = pandas.DataFrame(frame_data)
    modin_simple = pd.DataFrame(frame_data)
    df_equals(modin_simple.drop("A", axis=1), simple[["B"]])
    df_equals(modin_simple.drop(["A", "B"], axis="columns"), simple[[]])
    df_equals(modin_simple.drop([0, 1, 3], axis=0), simple.loc[[2], :])
    df_equals(modin_simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :])

    pytest.raises(ValueError, modin_simple.drop, 5)
    pytest.raises(ValueError, modin_simple.drop, "C", 1)
    pytest.raises(ValueError, modin_simple.drop, [1, 5])
    pytest.raises(ValueError, modin_simple.drop, ["A", "C"], 1)

    # errors = 'ignore'
    df_equals(modin_simple.drop(5, errors="ignore"), simple)
    df_equals(modin_simple.drop([0, 5], errors="ignore"),
              simple.loc[[1, 2, 3], :])
    df_equals(modin_simple.drop("C", axis=1, errors="ignore"), simple)
    df_equals(modin_simple.drop(["A", "C"], axis=1, errors="ignore"),
              simple[["B"]])

    # non-unique
    nu_df = pandas.DataFrame(zip(range(3), range(-3, 1), list("abc")),
                             columns=["a", "a", "b"])
    modin_nu_df = pd.DataFrame(nu_df)
    df_equals(modin_nu_df.drop("a", axis=1), nu_df[["b"]])
    df_equals(modin_nu_df.drop("b", axis="columns"), nu_df["a"])
    df_equals(modin_nu_df.drop([]), nu_df)

    nu_df = nu_df.set_index(pandas.Index(["X", "Y", "X"]))
    nu_df.columns = list("abc")
    modin_nu_df = pd.DataFrame(nu_df)
    df_equals(modin_nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :])
    df_equals(modin_nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :])

    # inplace cache issue
    frame_data = random_state.randn(10, 3)
    df = pandas.DataFrame(frame_data, columns=list("abc"))
    modin_df = pd.DataFrame(frame_data, columns=list("abc"))
    expected = df[~(df.b > 0)]
    modin_df.drop(labels=df[df.b > 0].index, inplace=True)
    df_equals(modin_df, expected)

    midx = pd.MultiIndex(
        levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]],
        codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
    )
    df = pd.DataFrame(
        index=midx,
        columns=["big", "small"],
        data=[
            [45, 30],
            [200, 100],
            [1.5, 1],
            [30, 20],
            [250, 150],
            [1.5, 0.8],
            [320, 250],
            [1, 0.8],
            [0.3, 0.2],
        ],
    )
    with warns_that_defaulting_to_pandas():
        df.drop(index="length", level=1)
Example #26
def test___repr__():
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 100))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 99))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 101))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 102))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)
    assert repr(pandas_df) == repr(modin_df)

    # The __repr__ method has a different code path depending on whether the
    # number of rows is > 60, and another depending on whether the number of
    # columns is > 20. The previous test cases already cover the case where
    # cols > 20 and rows > 60; the cases that follow exercise the other three
    # combinations.
    # rows <= 60, cols > 20
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(10, 100))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    assert repr(pandas_df) == repr(modin_df)

    # rows <= 60, cols <= 20
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(10, 10))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    assert repr(pandas_df) == repr(modin_df)

    # rows > 60, cols <= 20
    frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(100, 10))
    pandas_df = pandas.DataFrame(frame_data)
    modin_df = pd.DataFrame(frame_data)

    assert repr(pandas_df) == repr(modin_df)

    # Empty
    pandas_df = pandas.DataFrame(columns=["col{}".format(i) for i in range(100)])
    modin_df = pd.DataFrame(columns=["col{}".format(i) for i in range(100)])

    assert repr(pandas_df) == repr(modin_df)

    # From Issue #1705
    string_data = """"time","device_id","lat","lng","accuracy","activity_1","activity_1_conf","activity_2","activity_2_conf","activity_3","activity_3_conf"
"2016-08-26 09:00:00.206",2,60.186805,24.821049,33.6080017089844,"STILL",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:05.428",5,60.192928,24.767222,5,"WALKING",62,"ON_BICYCLE",29,"RUNNING",6
"2016-08-26 09:00:05.818",1,60.166382,24.700443,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:15.816",1,60.166254,24.700671,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5
"2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0
"2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0"""
    pandas_df = pandas.read_csv(io.StringIO(string_data))
    with warns_that_defaulting_to_pandas():
        modin_df = pd.read_csv(io.StringIO(string_data))
    assert repr(pandas_df) == repr(modin_df)
Example #27
def test___finalize__():
    data = test_data_values[0]
    with warns_that_defaulting_to_pandas():
        pd.DataFrame(data).__finalize__(None)
Example #28
def test_merge():
    frame_data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 0, 1],
        "col4": [2, 4, 5, 6],
    }

    modin_df = pd.DataFrame(frame_data)
    pandas_df = pandas.DataFrame(frame_data)

    frame_data2 = {"col1": [0, 1, 2], "col2": [1, 5, 6]}
    modin_df2 = pd.DataFrame(frame_data2)
    pandas_df2 = pandas.DataFrame(frame_data2)

    join_types = ["outer", "inner"]
    for how in join_types:
        with (warns_that_defaulting_to_pandas()
              if how == "outer" else contextlib.nullcontext()):
            modin_result = pd.merge(modin_df, modin_df2, how=how)
        pandas_result = pandas.merge(pandas_df, pandas_df2, how=how)
        df_equals(modin_result, pandas_result)

        # left_on and right_index
        with warns_that_defaulting_to_pandas():
            modin_result = pd.merge(modin_df,
                                    modin_df2,
                                    how=how,
                                    left_on="col1",
                                    right_index=True)
        pandas_result = pandas.merge(pandas_df,
                                     pandas_df2,
                                     how=how,
                                     left_on="col1",
                                     right_index=True)
        df_equals(modin_result, pandas_result)

        # left_index and right_on
        with warns_that_defaulting_to_pandas():
            modin_result = pd.merge(modin_df,
                                    modin_df2,
                                    how=how,
                                    left_index=True,
                                    right_on="col1")
        pandas_result = pandas.merge(pandas_df,
                                     pandas_df2,
                                     how=how,
                                     left_index=True,
                                     right_on="col1")
        df_equals(modin_result, pandas_result)

        # left_on and right_on col1
        if how == "outer":
            warning_catcher = warns_that_defaulting_to_pandas()
        else:
            warning_catcher = contextlib.nullcontext()
        with warning_catcher:
            modin_result = pd.merge(modin_df,
                                    modin_df2,
                                    how=how,
                                    left_on="col1",
                                    right_on="col1")
        pandas_result = pandas.merge(pandas_df,
                                     pandas_df2,
                                     how=how,
                                     left_on="col1",
                                     right_on="col1")
        df_equals(modin_result, pandas_result)

        # left_on and right_on col2
        if how == "outer":
            warning_catcher = warns_that_defaulting_to_pandas()
        else:
            warning_catcher = contextlib.nullcontext()
        with warning_catcher:
            modin_result = pd.merge(modin_df,
                                    modin_df2,
                                    how=how,
                                    left_on="col2",
                                    right_on="col2")
        pandas_result = pandas.merge(pandas_df,
                                     pandas_df2,
                                     how=how,
                                     left_on="col2",
                                     right_on="col2")
        df_equals(modin_result, pandas_result)

        # left_index and right_index
        modin_result = pd.merge(modin_df,
                                modin_df2,
                                how=how,
                                left_index=True,
                                right_index=True)
        pandas_result = pandas.merge(pandas_df,
                                     pandas_df2,
                                     how=how,
                                     left_index=True,
                                     right_index=True)
        df_equals(modin_result, pandas_result)

    s = pd.Series(frame_data.get("col1"))
    with pytest.raises(ValueError):
        pd.merge(s, modin_df2)

    with pytest.raises(TypeError):
        pd.merge("Non-valid type", modin_df2)
Example #29
def test_empty_dataframe():
    df = pd.DataFrame(columns=["a", "b"])
    with warns_that_defaulting_to_pandas():
        df[(df.a == 1) & (df.b == 2)]
Example #30
def test_merge_asof_merge_options():
    modin_quotes = pd.DataFrame({
        "time": [
            pd.Timestamp("2016-05-25 13:30:00.023"),
            pd.Timestamp("2016-05-25 13:30:00.023"),
            pd.Timestamp("2016-05-25 13:30:00.030"),
            pd.Timestamp("2016-05-25 13:30:00.041"),
            pd.Timestamp("2016-05-25 13:30:00.048"),
            pd.Timestamp("2016-05-25 13:30:00.049"),
            pd.Timestamp("2016-05-25 13:30:00.072"),
            pd.Timestamp("2016-05-25 13:30:00.075"),
        ],
        "ticker":
        ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"],
        "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01],
        "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03],
    })
    modin_trades = pd.DataFrame({
        "time": [
            pd.Timestamp("2016-05-25 13:30:00.023"),
            pd.Timestamp("2016-05-25 13:30:00.038"),
            pd.Timestamp("2016-05-25 13:30:00.048"),
            pd.Timestamp("2016-05-25 13:30:00.048"),
            pd.Timestamp("2016-05-25 13:30:00.048"),
        ],
        "ticker2": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"],
        "price": [51.95, 51.95, 720.77, 720.92, 98.0],
        "quantity": [75, 155, 100, 100, 100],
    })
    pandas_quotes, pandas_trades = (to_pandas(modin_quotes),
                                    to_pandas(modin_trades))

    # left_by + right_by
    with warns_that_defaulting_to_pandas():
        modin_result = pd.merge_asof(
            modin_quotes,
            modin_trades,
            on="time",
            left_by="ticker",
            right_by="ticker2",
        )
    df_equals(
        pandas.merge_asof(
            pandas_quotes,
            pandas_trades,
            on="time",
            left_by="ticker",
            right_by="ticker2",
        ),
        modin_result,
    )

    # Just by:
    pandas_trades["ticker"] = pandas_trades["ticker2"]
    modin_trades["ticker"] = modin_trades["ticker2"]
    with warns_that_defaulting_to_pandas():
        modin_result = pd.merge_asof(
            modin_quotes,
            modin_trades,
            on="time",
            by="ticker",
        )
    df_equals(
        pandas.merge_asof(
            pandas_quotes,
            pandas_trades,
            on="time",
            by="ticker",
        ),
        modin_result,
    )

    # Tolerance
    with warns_that_defaulting_to_pandas():
        modin_result = pd.merge_asof(
            modin_quotes,
            modin_trades,
            on="time",
            by="ticker",
            tolerance=pd.Timedelta("2ms"),
        )
    df_equals(
        pandas.merge_asof(
            pandas_quotes,
            pandas_trades,
            on="time",
            by="ticker",
            tolerance=pd.Timedelta("2ms"),
        ),
        modin_result,
    )

    # Direction
    with warns_that_defaulting_to_pandas():
        modin_result = pd.merge_asof(
            modin_quotes,
            modin_trades,
            on="time",
            by="ticker",
            direction="forward",
        )
    df_equals(
        pandas.merge_asof(
            pandas_quotes,
            pandas_trades,
            on="time",
            by="ticker",
            direction="forward",
        ),
        modin_result,
    )

    # Allow exact matches
    with warns_that_defaulting_to_pandas():
        modin_result = pd.merge_asof(
            modin_quotes,
            modin_trades,
            on="time",
            by="ticker",
            tolerance=pd.Timedelta("10ms"),
            allow_exact_matches=False,
        )
    df_equals(
        pandas.merge_asof(
            pandas_quotes,
            pandas_trades,
            on="time",
            by="ticker",
            tolerance=pd.Timedelta("10ms"),
            allow_exact_matches=False,
        ),
        modin_result,
    )