def test_value_counts(normalize, bins, dropna): # We sort indices for Modin and pandas result because of issue #1650 values = np.array([3, 1, 2, 3, 4, np.nan]) with warns_that_defaulting_to_pandas(): modin_result = sort_index_for_equal_values( pd.value_counts(values, normalize=normalize, ascending=False), False) pandas_result = sort_index_for_equal_values( pandas.value_counts(values, normalize=normalize, ascending=False), False) df_equals(modin_result, pandas_result) with warns_that_defaulting_to_pandas(): modin_result = sort_index_for_equal_values( pd.value_counts(values, bins=bins, ascending=False), False) pandas_result = sort_index_for_equal_values( pandas.value_counts(values, bins=bins, ascending=False), False) df_equals(modin_result, pandas_result) with warns_that_defaulting_to_pandas(): modin_result = sort_index_for_equal_values( pd.value_counts(values, dropna=dropna, ascending=True), True) pandas_result = sort_index_for_equal_values( pandas.value_counts(values, dropna=dropna, ascending=True), True) df_equals(modin_result, pandas_result)
def test_merge_asof(): left = pd.DataFrame({"a": [1, 5, 10], "left_val": ["a", "b", "c"]}) right = pd.DataFrame({"a": [1, 2, 3, 6, 7], "right_val": [1, 2, 3, 6, 7]}) with warns_that_defaulting_to_pandas(): df = pd.merge_asof(left, right, on="a") assert isinstance(df, pd.DataFrame) with warns_that_defaulting_to_pandas(): df = pd.merge_asof(left, right, on="a", allow_exact_matches=False) assert isinstance(df, pd.DataFrame) with warns_that_defaulting_to_pandas(): df = pd.merge_asof(left, right, on="a", direction="forward") assert isinstance(df, pd.DataFrame) with warns_that_defaulting_to_pandas(): df = pd.merge_asof(left, right, on="a", direction="nearest") assert isinstance(df, pd.DataFrame) left = pd.DataFrame({"left_val": ["a", "b", "c"]}, index=[1, 5, 10]) right = pd.DataFrame({"right_val": [1, 2, 3, 6, 7]}, index=[1, 2, 3, 6, 7]) with warns_that_defaulting_to_pandas(): df = pd.merge_asof(left, right, left_index=True, right_index=True) assert isinstance(df, pd.DataFrame) with pytest.raises(ValueError): pd.merge_asof( {"left_val": ["a", "b", "c"]}, {"right_val": [1, 2, 3, 6, 7]}, left_index=True, right_index=True, )
def test_aggregate_error_checking(): modin_df = pd.DataFrame(test_data["float_nan_data"]) with warns_that_defaulting_to_pandas(): modin_df.aggregate({ modin_df.columns[0]: "sum", modin_df.columns[1]: "mean" }) with warns_that_defaulting_to_pandas(): modin_df.aggregate("cumproduct") with pytest.raises(ValueError): modin_df.aggregate("NOT_EXISTS")
def test_simple_import(data_has_nulls): """Test that ``modin.pandas.utils.from_dataframe`` works properly.""" data = get_data_of_all_types(data_has_nulls) modin_df_producer = pd.DataFrame(data) internal_modin_df_producer = modin_df_producer.__dataframe__() # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, this one raises a warning on `.from_dataframe` with warns_that_defaulting_to_pandas(): modin_df_consumer = from_dataframe(modin_df_producer) internal_modin_df_consumer = from_dataframe(internal_modin_df_producer) # TODO: the following assertions verify that `from_dataframe` doesn't return # the same object untouched due to optimization branching, it actually should # do so but the logic is not implemented yet, so the assertions are passing # for now. It's required to replace the producer's type with a different one # to consumer when we have some other implementation of the protocol as the # assertions may start failing shortly. assert modin_df_producer is not modin_df_consumer assert internal_modin_df_producer is not internal_modin_df_consumer assert (modin_df_producer._query_compiler._modin_frame is not modin_df_consumer._query_compiler._modin_frame) df_equals(modin_df_producer, modin_df_consumer) df_equals(modin_df_producer, internal_modin_df_consumer)
def test_sql_query(): from modin.experimental.sql import query # Modin can't read_csv from a buffer. with warns_that_defaulting_to_pandas(): df = pd.read_csv(io.StringIO(titanic_snippet)) sql = "SELECT survived, p_class, count(passenger_id) as count FROM (SELECT * FROM titanic WHERE survived = 1) as t1 GROUP BY survived, p_class" with warns_that_defaulting_to_pandas(): query_result = query(sql, titanic=df) expected_df = (df[df.survived == 1].groupby(["survived", "p_class"]).agg({ "passenger_id": "count" }).reset_index()) assert query_result.shape == expected_df.shape values_left = expected_df.dropna().values values_right = query_result.dropna().values assert (values_left == values_right).all()
def test_crosstab(): a = np.array( [ "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar", "foo", "foo", "foo" ], dtype=object, ) b = np.array( [ "one", "one", "one", "two", "one", "one", "one", "two", "two", "two", "one" ], dtype=object, ) c = np.array( [ "dull", "dull", "shiny", "dull", "dull", "shiny", "shiny", "dull", "shiny", "shiny", "shiny", ], dtype=object, ) with warns_that_defaulting_to_pandas(): df = pd.crosstab(a, [b, c], rownames=["a"], colnames=["b", "c"]) assert isinstance(df, pd.DataFrame) foo = pd.Categorical(["a", "b"], categories=["a", "b", "c"]) bar = pd.Categorical(["d", "e"], categories=["d", "e", "f"]) with warns_that_defaulting_to_pandas(): df = pd.crosstab(foo, bar) assert isinstance(df, pd.DataFrame) with warns_that_defaulting_to_pandas(): df = pd.crosstab(foo, bar, dropna=False) assert isinstance(df, pd.DataFrame)
def test_asfreq(): index = pd.date_range("1/1/2000", periods=4, freq="T") series = pd.Series([0.0, None, 2.0, 3.0], index=index) df = pd.DataFrame({"s": series}) with warns_that_defaulting_to_pandas(): # We are only testing that this defaults to pandas, so we will just check for # the warning df.asfreq(freq="30S")
def test_math_functions_level(op): modin_df = pd.DataFrame(test_data["int_data"]) modin_df.index = pandas.MultiIndex.from_tuples([(i // 4, i // 2, i) for i in modin_df.index]) # Defaults to pandas with warns_that_defaulting_to_pandas(): # Operation against self for sanity check getattr(modin_df, op)(modin_df, axis=0, level=1)
def test_sql_extension(): import modin.experimental.sql # noqa: F401 # Modin can't read_csv from a buffer. with warns_that_defaulting_to_pandas(): df = pd.read_csv(io.StringIO(titanic_snippet)) expected_df = df[df["survived"] == 1][["passenger_id", "survived"]] sql = "SELECT passenger_id, survived WHERE survived = 1" # DataFrame.convert_dtypes defaults to pandas. with warns_that_defaulting_to_pandas(): query_result = df.sql(sql) assert list(query_result.columns) == ["passenger_id", "survived"] values_left = expected_df.values values_right = query_result.values assert values_left.shape == values_right.shape assert (values_left == values_right).all()
def test_median_skew_std_var_sem_1953(method): # See #1953 for details arrays = [["1", "1", "2", "2"], ["1", "2", "3", "4"]] data = [[1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4], [1, 2, 3, 4]] modin_df = pd.DataFrame(data, index=arrays) pandas_df = pandas.DataFrame(data, index=arrays) # These shouldn't default to pandas: follow up on # https://github.com/modin-project/modin/issues/1953 with warns_that_defaulting_to_pandas(): eval_general(modin_df, pandas_df, lambda df: getattr(df, method)(level=0))
def test_merge_asof_suffixes(): """Suffix variations are handled the same as Pandas.""" left = {"a": [1, 5, 10]} right = {"a": [2, 3, 6]} pandas_left, pandas_right = (pandas.DataFrame(left), pandas.DataFrame(right)) modin_left, modin_right = pd.DataFrame(left), pd.DataFrame(right) for suffixes in [("a", "b"), (False, "c"), ("d", False)]: pandas_merged = pandas.merge_asof( pandas_left, pandas_right, left_index=True, right_index=True, suffixes=suffixes, ) with warns_that_defaulting_to_pandas(): modin_merged = pd.merge_asof( modin_left, modin_right, left_index=True, right_index=True, suffixes=suffixes, ) df_equals(pandas_merged, modin_merged) with pytest.raises(ValueError): pandas.merge_asof( pandas_left, pandas_right, left_index=True, right_index=True, suffixes=(False, False), ) with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): modin_merged = pd.merge_asof( modin_left, modin_right, left_index=True, right_index=True, suffixes=(False, False), )
def test_ops_defaulting_to_pandas(op, make_args): modin_df = pd.DataFrame(test_data_diff_dtype).drop(["str_col", "bool_col"], axis=1) with warns_that_defaulting_to_pandas(): operation = getattr(modin_df, op) if make_args is not None: operation(**make_args(modin_df)) else: try: operation() # `except` for non callable attributes except TypeError: pass
def test_multi_level_comparison(data, op): modin_df_multi_level = pd.DataFrame(data) new_idx = pandas.MultiIndex.from_tuples([ (i // 4, i // 2, i) for i in modin_df_multi_level.index ]) modin_df_multi_level.index = new_idx # Defaults to pandas with warns_that_defaulting_to_pandas(): # Operation against self for sanity check getattr(modin_df_multi_level, op)(modin_df_multi_level, axis=0, level=1)
def test_sort_multiindex(sort_remaining): data = test_data["int_data"] modin_df, pandas_df = pd.DataFrame(data), pandas.DataFrame(data) for index in ["index", "columns"]: new_index = generate_multiindex(len(getattr(modin_df, index))) for df in [modin_df, pandas_df]: setattr(df, index, new_index) for kwargs in [{"level": 0}, {"axis": 0}, {"axis": 1}]: with warns_that_defaulting_to_pandas(): df_equals( modin_df.sort_index(sort_remaining=sort_remaining, **kwargs), pandas_df.sort_index(sort_remaining=sort_remaining, **kwargs), )
def test_wide_to_long(): data = pd.DataFrame({ "hr1": [514, 573], "hr2": [545, 526], "team": ["Red Sox", "Yankees"], "year1": [2007, 2008], "year2": [2008, 2008], }) with warns_that_defaulting_to_pandas(): df = pd.wide_to_long(data, ["hr", "year"], "team", "index") assert isinstance(df, pd.DataFrame) with pytest.raises(ValueError): pd.wide_to_long(data.to_numpy(), ["hr", "year"], "team", "index")
def test_distributed_pickling(filename, compression): data = test_data["int_data"] df = pd.DataFrame(data) filename_param = filename if compression: filename = f"{filename}.gz" with (warns_that_defaulting_to_pandas() if filename_param == test_default_to_pickle_filename else nullcontext()): df.to_pickle_distributed(filename, compression=compression) pickled_df = pd.read_pickle_distributed(filename, compression=compression) df_equals(pickled_df, df) pickle_files = glob.glob(filename) teardown_test_files(pickle_files)
def test_merge_ordered(): data_a = { "key": list("aceace"), "lvalue": [1, 2, 3, 1, 2, 3], "group": list("aaabbb"), } data_b = {"key": list("bcd"), "rvalue": [1, 2, 3]} modin_df_a = pd.DataFrame(data_a) modin_df_b = pd.DataFrame(data_b) with warns_that_defaulting_to_pandas(): df = pd.merge_ordered(modin_df_a, modin_df_b, fill_method="ffill", left_by="group") assert isinstance(df, pd.DataFrame) with pytest.raises(ValueError): pd.merge_ordered(data_a, data_b, fill_method="ffill", left_by="group")
def test_buffer_of_chunked_at(data_has_nulls, n_chunks): """Test that getting buffers of physically chunked column works properly.""" data = get_data_of_all_types( # For the simplicity of the test include only primitive types, so the test can use # only one function to export a column instead of if-elsing to find a type-according one has_nulls=data_has_nulls, include_dtypes=["bool", "int", "uint", "float"], ) pd_df = pandas.DataFrame(data) pd_chunks = split_df_into_chunks(pd_df, n_chunks) chunked_at = pa.concat_tables( [pa.Table.from_pandas(pd_df) for pd_df in pd_chunks]) md_df = from_arrow(chunked_at) protocol_df = md_df.__dataframe__() for i, col in enumerate(protocol_df.get_columns()): assert col.num_chunks() > 1 assert len(col._pyarrow_table.column(0).chunks) > 1 buffers = col.get_buffers() data_buff, data_dtype = buffers["data"] result = buffer_to_ndarray(data_buff, data_dtype, col.offset, col.size) result = set_nulls(result, col, buffers["validity"]) # Our configuration in pytest.ini requires that we explicitly catch all # instances of defaulting to pandas, this one raises a warning on `.to_numpy()` with warns_that_defaulting_to_pandas(): reference = md_df.iloc[:, i].to_numpy() np.testing.assert_array_equal(reference, result) protocol_df = md_df.__dataframe__(allow_copy=False) for i, col in enumerate(protocol_df.get_columns()): assert col.num_chunks() > 1 assert len(col._pyarrow_table.column(0).chunks) > 1 # Catch exception on attempt of doing a copy due to chunks combining with pytest.raises(RuntimeError): col.get_buffers()
def test_merge_asof_on_variations(): """on=,left_on=,right_on=,right_index=,left_index= options match Pandas.""" left = {"a": [1, 5, 10], "left_val": ["a", "b", "c"]} left_index = [6, 8, 12] right = {"a": [1, 2, 3, 6, 7], "right_val": ["d", "e", "f", "g", "h"]} right_index = [6, 7, 8, 9, 15] pandas_left, pandas_right = ( pandas.DataFrame(left, index=left_index), pandas.DataFrame(right, index=right_index), ) modin_left, modin_right = ( pd.DataFrame(left, index=left_index), pd.DataFrame(right, index=right_index), ) for on_arguments in [ { "on": "a" }, { "left_on": "a", "right_on": "a" }, { "left_on": "a", "right_index": True }, { "left_index": True, "right_on": "a" }, { "left_index": True, "right_index": True }, ]: pandas_merged = pandas.merge_asof(pandas_left, pandas_right, **on_arguments) with warns_that_defaulting_to_pandas(): modin_merged = pd.merge_asof(modin_left, modin_right, **on_arguments) df_equals(pandas_merged, modin_merged)
def test_get_dummies(): s = pd.Series(list("abca")) with warns_that_defaulting_to_pandas(): pd.get_dummies(s) s1 = ["a", "b", np.nan] with warns_that_defaulting_to_pandas(): pd.get_dummies(s1) with warns_that_defaulting_to_pandas(): pd.get_dummies(s1, dummy_na=True) data = {"A": ["a", "b", "a"], "B": ["b", "a", "c"], "C": [1, 2, 3]} modin_df = pd.DataFrame(data) pandas_df = pandas.DataFrame(data) modin_result = pd.get_dummies(modin_df, prefix=["col1", "col2"]) pandas_result = pandas.get_dummies(pandas_df, prefix=["col1", "col2"]) df_equals(modin_result, pandas_result) assert modin_result._to_pandas().columns.equals(pandas_result.columns) assert modin_result.shape == pandas_result.shape modin_result = pd.get_dummies(pd.DataFrame(pd.Series(list("abcdeabac")))) pandas_result = pandas.get_dummies( pandas.DataFrame(pandas.Series(list("abcdeabac")))) df_equals(modin_result, pandas_result) assert modin_result._to_pandas().columns.equals(pandas_result.columns) assert modin_result.shape == pandas_result.shape with pytest.raises(NotImplementedError): pd.get_dummies(modin_df, prefix=["col1", "col2"], sparse=True) with warns_that_defaulting_to_pandas(): pd.get_dummies(pd.Series(list("abcaa"))) with warns_that_defaulting_to_pandas(): pd.get_dummies(pd.Series(list("abcaa")), drop_first=True) with warns_that_defaulting_to_pandas(): pd.get_dummies(pd.Series(list("abc")), dtype=float) with warns_that_defaulting_to_pandas(): pd.get_dummies(1)
def test_lreshape(): data = pd.DataFrame({ "hr1": [514, 573], "hr2": [545, 526], "team": ["Red Sox", "Yankees"], "year1": [2007, 2008], "year2": [2008, 2008], }) with warns_that_defaulting_to_pandas(): df = pd.lreshape(data, { "year": ["year1", "year2"], "hr": ["hr1", "hr2"] }) assert isinstance(df, pd.DataFrame) with pytest.raises(ValueError): pd.lreshape(data.to_numpy(), { "year": ["year1", "year2"], "hr": ["hr1", "hr2"] })
def test_merge_asof_bad_arguments(): left = {"a": [1, 5, 10], "b": [5, 7, 9]} right = {"a": [2, 3, 6], "b": [6, 5, 20]} pandas_left, pandas_right = (pandas.DataFrame(left), pandas.DataFrame(right)) modin_left, modin_right = pd.DataFrame(left), pd.DataFrame(right) # Can't mix by with left_by/right_by with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pandas.merge_asof(pandas_left, pandas_right, on="a", by="b", left_by="can't do with by") with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pd.merge_asof(modin_left, modin_right, on="a", by="b", left_by="can't do with by") with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pandas.merge_asof(pandas_left, pandas_right, by="b", on="a", right_by="can't do with by") with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pd.merge_asof(modin_left, modin_right, by="b", on="a", right_by="can't do with by") # Can't mix on with left_on/right_on with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pandas.merge_asof(pandas_left, pandas_right, on="a", left_on="can't do with by") with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pd.merge_asof(modin_left, modin_right, on="a", left_on="can't do with by") with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pandas.merge_asof(pandas_left, pandas_right, on="a", right_on="can't do with by") with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pd.merge_asof(modin_left, modin_right, on="a", right_on="can't do with by") # Can't mix left_index with left_on or on, similarly for right. with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pd.merge_asof(modin_left, modin_right, on="a", right_index=True) with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pd.merge_asof(modin_left, modin_right, left_on="a", right_on="a", right_index=True) with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pd.merge_asof(modin_left, modin_right, on="a", left_index=True) with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pd.merge_asof(modin_left, modin_right, left_on="a", right_on="a", left_index=True) # Need both left and right with pytest.raises( Exception): # Pandas bug, didn't validate inputs sufficiently pandas.merge_asof(pandas_left, pandas_right, left_on="a") with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pd.merge_asof(modin_left, modin_right, left_on="a") with pytest.raises( Exception): # Pandas bug, didn't validate inputs sufficiently pandas.merge_asof(pandas_left, pandas_right, right_on="a") with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pd.merge_asof(modin_left, modin_right, right_on="a") with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pandas.merge_asof(pandas_left, pandas_right) with pytest.raises(ValueError), warns_that_defaulting_to_pandas(): pd.merge_asof(modin_left, modin_right)
def test_syncronous_mode(): assert BenchmarkMode.get() # On Omnisci storage, transpose() defaults to Pandas. with (warns_that_defaulting_to_pandas() if StorageFormat.get() == "Omnisci" else nullcontext()): pd.DataFrame(test_data_values[0]).mean()
def test___round__(): data = test_data_values[0] with warns_that_defaulting_to_pandas(): pd.DataFrame(data).__round__()
def test_drop(): frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]} simple = pandas.DataFrame(frame_data) modin_simple = pd.DataFrame(frame_data) df_equals(modin_simple.drop("A", axis=1), simple[["B"]]) df_equals(modin_simple.drop(["A", "B"], axis="columns"), simple[[]]) df_equals(modin_simple.drop([0, 1, 3], axis=0), simple.loc[[2], :]) df_equals(modin_simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :]) pytest.raises(ValueError, modin_simple.drop, 5) pytest.raises(ValueError, modin_simple.drop, "C", 1) pytest.raises(ValueError, modin_simple.drop, [1, 5]) pytest.raises(ValueError, modin_simple.drop, ["A", "C"], 1) # errors = 'ignore' df_equals(modin_simple.drop(5, errors="ignore"), simple) df_equals(modin_simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :]) df_equals(modin_simple.drop("C", axis=1, errors="ignore"), simple) df_equals(modin_simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]]) # non-unique nu_df = pandas.DataFrame(zip(range(3), range(-3, 1), list("abc")), columns=["a", "a", "b"]) modin_nu_df = pd.DataFrame(nu_df) df_equals(modin_nu_df.drop("a", axis=1), nu_df[["b"]]) df_equals(modin_nu_df.drop("b", axis="columns"), nu_df["a"]) df_equals(modin_nu_df.drop([]), nu_df) nu_df = nu_df.set_index(pandas.Index(["X", "Y", "X"])) nu_df.columns = list("abc") modin_nu_df = pd.DataFrame(nu_df) df_equals(modin_nu_df.drop("X", axis="rows"), nu_df.loc[["Y"], :]) df_equals(modin_nu_df.drop(["X", "Y"], axis=0), nu_df.loc[[], :]) # inplace cache issue frame_data = random_state.randn(10, 3) df = pandas.DataFrame(frame_data, columns=list("abc")) modin_df = pd.DataFrame(frame_data, columns=list("abc")) expected = df[~(df.b > 0)] modin_df.drop(labels=df[df.b > 0].index, inplace=True) df_equals(modin_df, expected) midx = pd.MultiIndex( levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]], codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]], ) df = pd.DataFrame( index=midx, columns=["big", "small"], data=[ [45, 30], [200, 100], [1.5, 1], [30, 20], [250, 150], [1.5, 0.8], [320, 250], [1, 0.8], [0.3, 0.2], ], ) with warns_that_defaulting_to_pandas(): df.drop(index="length", level=1)
def test___repr__(): frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 100)) pandas_df = pandas.DataFrame(frame_data) modin_df = pd.DataFrame(frame_data) assert repr(pandas_df) == repr(modin_df) frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 99)) pandas_df = pandas.DataFrame(frame_data) modin_df = pd.DataFrame(frame_data) assert repr(pandas_df) == repr(modin_df) frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 101)) pandas_df = pandas.DataFrame(frame_data) modin_df = pd.DataFrame(frame_data) assert repr(pandas_df) == repr(modin_df) frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(1000, 102)) pandas_df = pandas.DataFrame(frame_data) modin_df = pd.DataFrame(frame_data) assert repr(pandas_df) == repr(modin_df) # ___repr___ method has a different code path depending on # whether the number of rows is >60; and a different code path # depending on the number of columns is >20. # Previous test cases already check the case when cols>20 # and rows>60. The cases that follow exercise the other three # combinations. # rows <= 60, cols > 20 frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(10, 100)) pandas_df = pandas.DataFrame(frame_data) modin_df = pd.DataFrame(frame_data) assert repr(pandas_df) == repr(modin_df) # rows <= 60, cols <= 20 frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(10, 10)) pandas_df = pandas.DataFrame(frame_data) modin_df = pd.DataFrame(frame_data) assert repr(pandas_df) == repr(modin_df) # rows > 60, cols <= 20 frame_data = random_state.randint(RAND_LOW, RAND_HIGH, size=(100, 10)) pandas_df = pandas.DataFrame(frame_data) modin_df = pd.DataFrame(frame_data) assert repr(pandas_df) == repr(modin_df) # Empty pandas_df = pandas.DataFrame(columns=["col{}".format(i) for i in range(100)]) modin_df = pd.DataFrame(columns=["col{}".format(i) for i in range(100)]) assert repr(pandas_df) == repr(modin_df) # From Issue #1705 string_data = """"time","device_id","lat","lng","accuracy","activity_1","activity_1_conf","activity_2","activity_2_conf","activity_3","activity_3_conf" "2016-08-26 09:00:00.206",2,60.186805,24.821049,33.6080017089844,"STILL",75,"IN_VEHICLE",5,"ON_BICYCLE",5 "2016-08-26 09:00:05.428",5,60.192928,24.767222,5,"WALKING",62,"ON_BICYCLE",29,"RUNNING",6 "2016-08-26 09:00:05.818",1,60.166382,24.700443,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5 "2016-08-26 09:00:15.816",1,60.166254,24.700671,3,"WALKING",75,"IN_VEHICLE",5,"ON_BICYCLE",5 "2016-08-26 09:00:16.413",5,60.193055,24.767427,5,"WALKING",85,"ON_BICYCLE",15,"UNKNOWN",0 "2016-08-26 09:00:20.578",3,60.152996,24.745216,3.90000009536743,"STILL",69,"IN_VEHICLE",31,"UNKNOWN",0""" pandas_df = pandas.read_csv(io.StringIO(string_data)) with warns_that_defaulting_to_pandas(): modin_df = pd.read_csv(io.StringIO(string_data)) assert repr(pandas_df) == repr(modin_df)
def test___finalize__(): data = test_data_values[0] with warns_that_defaulting_to_pandas(): pd.DataFrame(data).__finalize__(None)
def test_merge(): frame_data = { "col1": [0, 1, 2, 3], "col2": [4, 5, 6, 7], "col3": [8, 9, 0, 1], "col4": [2, 4, 5, 6], } modin_df = pd.DataFrame(frame_data) pandas_df = pandas.DataFrame(frame_data) frame_data2 = {"col1": [0, 1, 2], "col2": [1, 5, 6]} modin_df2 = pd.DataFrame(frame_data2) pandas_df2 = pandas.DataFrame(frame_data2) join_types = ["outer", "inner"] for how in join_types: with warns_that_defaulting_to_pandas( ) if how == "outer" else contextlib.nullcontext(): modin_result = pd.merge(modin_df, modin_df2, how=how) pandas_result = pandas.merge(pandas_df, pandas_df2, how=how) df_equals(modin_result, pandas_result) # left_on and right_index with warns_that_defaulting_to_pandas(): modin_result = pd.merge(modin_df, modin_df2, how=how, left_on="col1", right_index=True) pandas_result = pandas.merge(pandas_df, pandas_df2, how=how, left_on="col1", right_index=True) df_equals(modin_result, pandas_result) # left_index and right_on with warns_that_defaulting_to_pandas(): modin_result = pd.merge(modin_df, modin_df2, how=how, left_index=True, right_on="col1") pandas_result = pandas.merge(pandas_df, pandas_df2, how=how, left_index=True, right_on="col1") df_equals(modin_result, pandas_result) # left_on and right_on col1 if how == "outer": warning_catcher = warns_that_defaulting_to_pandas() else: warning_catcher = contextlib.nullcontext() with warning_catcher: modin_result = pd.merge(modin_df, modin_df2, how=how, left_on="col1", right_on="col1") pandas_result = pandas.merge(pandas_df, pandas_df2, how=how, left_on="col1", right_on="col1") df_equals(modin_result, pandas_result) # left_on and right_on col2 if how == "outer": warning_catcher = warns_that_defaulting_to_pandas() else: warning_catcher = contextlib.nullcontext() with warning_catcher: modin_result = pd.merge(modin_df, modin_df2, how=how, left_on="col2", right_on="col2") pandas_result = pandas.merge(pandas_df, pandas_df2, how=how, left_on="col2", right_on="col2") df_equals(modin_result, pandas_result) # left_index and right_index modin_result = pd.merge(modin_df, modin_df2, how=how, left_index=True, right_index=True) pandas_result = pandas.merge(pandas_df, pandas_df2, how=how, left_index=True, right_index=True) df_equals(modin_result, pandas_result) s = pd.Series(frame_data.get("col1")) with pytest.raises(ValueError): pd.merge(s, modin_df2) with pytest.raises(TypeError): pd.merge("Non-valid type", modin_df2)
def test_empty_dataframe(): df = pd.DataFrame(columns=["a", "b"]) with warns_that_defaulting_to_pandas(): df[(df.a == 1) & (df.b == 2)]
def test_merge_asof_merge_options(): modin_quotes = pd.DataFrame({ "time": [ pd.Timestamp("2016-05-25 13:30:00.023"), pd.Timestamp("2016-05-25 13:30:00.023"), pd.Timestamp("2016-05-25 13:30:00.030"), pd.Timestamp("2016-05-25 13:30:00.041"), pd.Timestamp("2016-05-25 13:30:00.048"), pd.Timestamp("2016-05-25 13:30:00.049"), pd.Timestamp("2016-05-25 13:30:00.072"), pd.Timestamp("2016-05-25 13:30:00.075"), ], "ticker": ["GOOG", "MSFT", "MSFT", "MSFT", "GOOG", "AAPL", "GOOG", "MSFT"], "bid": [720.50, 51.95, 51.97, 51.99, 720.50, 97.99, 720.50, 52.01], "ask": [720.93, 51.96, 51.98, 52.00, 720.93, 98.01, 720.88, 52.03], }) modin_trades = pd.DataFrame({ "time": [ pd.Timestamp("2016-05-25 13:30:00.023"), pd.Timestamp("2016-05-25 13:30:00.038"), pd.Timestamp("2016-05-25 13:30:00.048"), pd.Timestamp("2016-05-25 13:30:00.048"), pd.Timestamp("2016-05-25 13:30:00.048"), ], "ticker2": ["MSFT", "MSFT", "GOOG", "GOOG", "AAPL"], "price": [51.95, 51.95, 720.77, 720.92, 98.0], "quantity": [75, 155, 100, 100, 100], }) pandas_quotes, pandas_trades = to_pandas(modin_quotes), to_pandas( modin_trades) # left_by + right_by with warns_that_defaulting_to_pandas(): modin_result = pd.merge_asof( modin_quotes, modin_trades, on="time", left_by="ticker", right_by="ticker2", ) df_equals( pandas.merge_asof( pandas_quotes, pandas_trades, on="time", left_by="ticker", right_by="ticker2", ), modin_result, ) # Just by: pandas_trades["ticker"] = pandas_trades["ticker2"] modin_trades["ticker"] = modin_trades["ticker2"] with warns_that_defaulting_to_pandas(): modin_result = pd.merge_asof( modin_quotes, modin_trades, on="time", by="ticker", ) df_equals( pandas.merge_asof( pandas_quotes, pandas_trades, on="time", by="ticker", ), modin_result, ) # Tolerance with warns_that_defaulting_to_pandas(): modin_result = pd.merge_asof( modin_quotes, modin_trades, on="time", by="ticker", tolerance=pd.Timedelta("2ms"), ) df_equals( pandas.merge_asof( pandas_quotes, pandas_trades, on="time", by="ticker", tolerance=pd.Timedelta("2ms"), ), modin_result, ) # Direction with warns_that_defaulting_to_pandas(): modin_result = pd.merge_asof( modin_quotes, modin_trades, on="time", by="ticker", direction="forward", ) df_equals( pandas.merge_asof( pandas_quotes, pandas_trades, on="time", by="ticker", direction="forward", ), modin_result, ) # Allow exact matches with warns_that_defaulting_to_pandas(): modin_result = pd.merge_asof( modin_quotes, modin_trades, on="time", by="ticker", tolerance=pd.Timedelta("10ms"), allow_exact_matches=False, ) df_equals( pandas.merge_asof( pandas_quotes, pandas_trades, on="time", by="ticker", tolerance=pd.Timedelta("10ms"), allow_exact_matches=False, ), modin_result, )