def test_drop_duplicates_with_missing_index_values():
    """Verify sort_values + drop_duplicates when the index labels are non-contiguous."""
    data = {
        "columns": ["value", "time", "id"],
        # Index labels deliberately have gaps (16-19 and 28-31 are missing).
        "index": [
            4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
            14, 15, 20, 21, 22, 23, 24, 25, 26, 27,
            32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
        ],
        "data": [
            ["3", 1279213398000.0, 88.0],
            ["3", 1279204682000.0, 88.0],
            ["0", 1245772835000.0, 448.0],
            ["0", 1270564258000.0, 32.0],
            ["0", 1267106669000.0, 118.0],
            ["7", 1300621123000.0, 5.0],
            ["0", 1251130752000.0, 957.0],
            ["0", 1311683506000.0, 62.0],
            ["9", 1283692698000.0, 89.0],
            ["9", 1270234253000.0, 64.0],
            ["0", 1285088818000.0, 50.0],
            ["0", 1218212725000.0, 695.0],
            ["2", 1383933968000.0, 348.0],
            ["0", 1368227625000.0, 257.0],
            ["1", 1454514093000.0, 446.0],
            ["1", 1428497427000.0, 134.0],
            ["1", 1459184936000.0, 568.0],
            ["1", 1502293302000.0, 599.0],
            ["1", 1491833358000.0, 829.0],
            ["1", 1485431534000.0, 806.0],
            ["8", 1351800505000.0, 101.0],
            ["0", 1357247721000.0, 916.0],
            ["0", 1335804423000.0, 370.0],
            ["24", 1327547726000.0, 720.0],
            ["0", 1332334140000.0, 415.0],
            ["0", 1309543100000.0, 30.0],
            ["18", 1309541141000.0, 30.0],
            ["0", 1298979435000.0, 48.0],
            ["14", 1276098160000.0, 59.0],
            ["0", 1233936302000.0, 109.0],
        ],
    }
    pandas_df = pandas.DataFrame(data["data"], index=data["index"], columns=data["columns"])
    modin_df = pd.DataFrame(data["data"], index=data["index"], columns=data["columns"])
    # Sort first, then deduplicate on "id"; modin must match pandas exactly.
    modin_result = modin_df.sort_values(["id", "time"]).drop_duplicates(["id"])
    pandas_result = pandas_df.sort_values(["id", "time"]).drop_duplicates(["id"])
    df_equals(modin_result, pandas_result)
def test_merge(test_data, test_data2):
    """Exercise DataFrame.merge: on/left_on/right_on/index joins, sort, and regressions."""

    def build(frame_cls, index_cls, raw):
        # Frame with col0..colN columns and a 1-based index named "key".
        return frame_cls(
            raw,
            columns=["col{}".format(i) for i in range(raw.shape[1])],
            index=index_cls([i for i in range(1, raw.shape[0] + 1)], name="key"),
        )

    modin_df = build(pd.DataFrame, pd.Index, test_data)
    pandas_df = build(pandas.DataFrame, pandas.Index, test_data)
    modin_df2 = build(pd.DataFrame, pd.Index, test_data2)
    pandas_df2 = build(pandas.DataFrame, pandas.Index, test_data2)

    for how in ("left", "inner"):
        # Single key without sort, composite key with sort (as in the original pairing).
        for on, sort in zip(["col33", ["col33", "col34"]], [False, True]):
            df_equals(
                modin_df.merge(modin_df2, how=how, on=on, sort=sort),
                pandas_df.merge(pandas_df2, how=how, on=on, sort=sort),
            )
            df_equals(
                modin_df.merge(
                    modin_df2, how=how, left_on="key", right_on="key", sort=sort
                ),
                pandas_df.merge(
                    pandas_df2, how=how, left_on="key", right_on="key", sort=sort
                ),
            )

    # Test for issue #1771
    modin_df = pd.DataFrame({"name": np.arange(40)})
    modin_df2 = pd.DataFrame({"name": [39], "position": [0]})
    pandas_df = pandas.DataFrame({"name": np.arange(40)})
    pandas_df2 = pandas.DataFrame({"name": [39], "position": [0]})
    df_equals(
        modin_df.merge(modin_df2, on="name", how="inner"),
        pandas_df.merge(pandas_df2, on="name", how="inner"),
    )

    frame_data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 0, 1],
        "col4": [2, 4, 5, 6],
    }
    modin_df = pd.DataFrame(frame_data)
    pandas_df = pandas.DataFrame(frame_data)

    frame_data2 = {"col1": [0, 1, 2], "col2": [1, 5, 6]}
    modin_df2 = pd.DataFrame(frame_data2)
    pandas_df2 = pandas.DataFrame(frame_data2)

    # Every supported combination of key/index specifications.
    key_specs = [
        {},  # defaults
        {"left_on": "col1", "right_index": True},
        {"left_index": True, "right_on": "col1"},
        {"left_on": "col1", "right_on": "col1"},
        {"left_on": "col2", "right_on": "col2"},
        {"left_index": True, "right_index": True},
    ]
    for how in ("outer", "inner"):
        for spec in key_specs:
            df_equals(
                modin_df.merge(modin_df2, how=how, **spec),
                pandas_df.merge(pandas_df2, how=how, **spec),
            )

    # An unnamed Series cannot be merged; a named one is promoted to a DataFrame.
    s = pd.Series(frame_data2.get("col1"))
    with pytest.raises(ValueError):
        modin_df.merge(s)

    s = pd.Series(frame_data2.get("col1"), name="col1")
    df_equals(modin_df.merge(s), modin_df.merge(modin_df2[["col1"]]))

    with pytest.raises(TypeError):
        modin_df.merge("Non-valid type")
def test_join(test_data, test_data2):
    """Exercise DataFrame.join across join types, `on` keys, and list-of-frames joins."""

    def build(frame_cls, index_cls, raw):
        # Frame with col0..colN columns and a 1-based index named "key".
        return frame_cls(
            raw,
            columns=["col{}".format(i) for i in range(raw.shape[1])],
            index=index_cls([i for i in range(1, raw.shape[0] + 1)], name="key"),
        )

    modin_df = build(pd.DataFrame, pd.Index, test_data)
    pandas_df = build(pandas.DataFrame, pandas.Index, test_data)
    modin_df2 = build(pd.DataFrame, pd.Index, test_data2)
    pandas_df2 = build(pandas.DataFrame, pandas.Index, test_data2)

    for how in ("inner", "left", "right", "outer"):
        # "col33" without sort, "col34" with sort (same pairing as before).
        for on, sort in zip(["col33", "col34"], [False, True]):
            shared = dict(
                how=how, on=on, sort=sort, lsuffix="_caller", rsuffix="_other"
            )
            df_equals(
                modin_df.join(modin_df2, **shared),
                pandas_df.join(pandas_df2, **shared),
            )

    frame_data = {
        "col1": [0, 1, 2, 3],
        "col2": [4, 5, 6, 7],
        "col3": [8, 9, 0, 1],
        "col4": [2, 4, 5, 6],
    }
    modin_df = pd.DataFrame(frame_data)
    pandas_df = pandas.DataFrame(frame_data)

    frame_data2 = {"col5": [0], "col6": [1]}
    modin_df2 = pd.DataFrame(frame_data2)
    pandas_df2 = pandas.DataFrame(frame_data2)

    for how in ("left", "right", "outer", "inner"):
        df_equals(
            modin_df.join(modin_df2, how=how),
            pandas_df.join(pandas_df2, how=how),
        )

    frame_data3 = {"col7": [1, 2, 3, 5, 6, 7, 8]}
    modin_df3 = pd.DataFrame(frame_data3)
    pandas_df3 = pandas.DataFrame(frame_data3)

    # Joining against a list of frames.
    for how in ("left", "outer", "inner"):
        df_equals(
            modin_df.join([modin_df2, modin_df3], how=how),
            pandas_df.join([pandas_df2, pandas_df3], how=how),
        )
def test_rename_multiindex():
    """Compare DataFrame.rename against pandas on MultiIndex rows/columns,
    with and without a target ``level`` (by position and by name)."""
    tuples_index = [("foo1", "bar1"), ("foo2", "bar2")]
    tuples_columns = [("fizz1", "buzz1"), ("fizz2", "buzz2")]
    index = pandas.MultiIndex.from_tuples(tuples_index, names=["foo", "bar"])
    columns = pandas.MultiIndex.from_tuples(tuples_columns, names=["fizz", "buzz"])
    frame_data = [(0, 0), (1, 1)]
    df = pandas.DataFrame(frame_data, index=index, columns=columns)
    modin_df = pd.DataFrame(frame_data, index=index, columns=columns)

    # without specifying level -> across all levels
    renamed = df.rename(
        index={"foo1": "foo3", "bar2": "bar3"},
        columns={"fizz1": "fizz3", "buzz2": "buzz3"},
    )
    modin_renamed = modin_df.rename(
        index={"foo1": "foo3", "bar2": "bar3"},
        columns={"fizz1": "fizz3", "buzz2": "buzz3"},
    )
    tm.assert_index_equal(renamed.index, modin_renamed.index)

    renamed = df.rename(
        index={"foo1": "foo3", "bar2": "bar3"},
        columns={"fizz1": "fizz3", "buzz2": "buzz3"},
    )
    tm.assert_index_equal(renamed.columns, modin_renamed.columns)
    assert renamed.index.names == modin_renamed.index.names
    assert renamed.columns.names == modin_renamed.columns.names

    # with specifying a level (dict mapper): level 0 by position and by name...
    renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0)
    modin_renamed = modin_df.rename(
        columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=0
    )
    tm.assert_index_equal(renamed.columns, modin_renamed.columns)
    renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz")
    modin_renamed = modin_df.rename(
        columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="fizz"
    )
    tm.assert_index_equal(renamed.columns, modin_renamed.columns)

    # ...then level 1 by position and by name.
    renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1)
    modin_renamed = modin_df.rename(
        columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level=1
    )
    tm.assert_index_equal(renamed.columns, modin_renamed.columns)
    renamed = df.rename(columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz")
    modin_renamed = modin_df.rename(
        columns={"fizz1": "fizz3", "buzz2": "buzz3"}, level="buzz"
    )
    tm.assert_index_equal(renamed.columns, modin_renamed.columns)

    # function mapper with a level
    func = str.upper
    renamed = df.rename(columns=func, level=0)
    modin_renamed = modin_df.rename(columns=func, level=0)
    tm.assert_index_equal(renamed.columns, modin_renamed.columns)
    renamed = df.rename(columns=func, level="fizz")
    modin_renamed = modin_df.rename(columns=func, level="fizz")
    tm.assert_index_equal(renamed.columns, modin_renamed.columns)

    renamed = df.rename(columns=func, level=1)
    modin_renamed = modin_df.rename(columns=func, level=1)
    tm.assert_index_equal(renamed.columns, modin_renamed.columns)
    renamed = df.rename(columns=func, level="buzz")
    modin_renamed = modin_df.rename(columns=func, level="buzz")
    tm.assert_index_equal(renamed.columns, modin_renamed.columns)

    # index rename with a level
    renamed = df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0)
    modin_renamed = modin_df.rename(index={"foo1": "foo3", "bar2": "bar3"}, level=0)
    tm.assert_index_equal(modin_renamed.index, renamed.index)
def test_sample(data, axis):
    """Compare DataFrame.sample against pandas: invalid-argument errors,
    explicit weights, frac/n sampling, and RandomState reproducibility."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    # n and frac together are rejected
    with pytest.raises(ValueError):
        modin_df.sample(n=3, frac=0.4, axis=axis)
    # weights naming a column that does not exist
    with pytest.raises(KeyError):
        modin_df.sample(frac=0.5, weights="CoLuMn_No_ExIsT", axis=0)
    # a column label is not valid weights when sampling columns
    with pytest.raises(ValueError):
        modin_df.sample(frac=0.5, weights=modin_df.columns[0], axis=1)
    # weights list one element short of the sampled axis
    with pytest.raises(ValueError):
        modin_df.sample(
            frac=0.5, weights=[0.5 for _ in range(len(modin_df.index[:-1]))], axis=0
        )
    with pytest.raises(ValueError):
        modin_df.sample(
            frac=0.5,
            weights=[0.5 for _ in range(len(modin_df.columns[:-1]))],
            axis=1,
        )
    # negative sample size
    with pytest.raises(ValueError):
        modin_df.sample(n=-3, axis=axis)
    # empty weights Series
    with pytest.raises(ValueError):
        modin_df.sample(frac=0.2, weights=pandas.Series(), axis=axis)

    # Normalize a string axis ("index"/"columns") to its numeric position.
    if isinstance(axis, str):
        num_axis = pandas.DataFrame()._get_axis_number(axis)
    else:
        num_axis = axis

    # weights that sum to 1
    sums = sum(i % 2 for i in range(len(modin_df.axes[num_axis])))
    weights = [i % 2 / sums for i in range(len(modin_df.axes[num_axis]))]
    modin_result = modin_df.sample(
        frac=0.5, random_state=42, weights=weights, axis=axis
    )
    pandas_result = pandas_df.sample(
        frac=0.5, random_state=42, weights=weights, axis=axis
    )
    df_equals(modin_result, pandas_result)

    # weights that don't sum to 1
    weights = [i % 2 for i in range(len(modin_df.axes[num_axis]))]
    modin_result = modin_df.sample(
        frac=0.5, random_state=42, weights=weights, axis=axis
    )
    pandas_result = pandas_df.sample(
        frac=0.5, random_state=42, weights=weights, axis=axis
    )
    df_equals(modin_result, pandas_result)

    # n=0 yields an empty sample
    modin_result = modin_df.sample(n=0, axis=axis)
    pandas_result = pandas_df.sample(n=0, axis=axis)
    df_equals(modin_result, pandas_result)

    modin_result = modin_df.sample(frac=0.5, random_state=42, axis=axis)
    pandas_result = pandas_df.sample(frac=0.5, random_state=42, axis=axis)
    df_equals(modin_result, pandas_result)

    modin_result = modin_df.sample(n=2, random_state=42, axis=axis)
    pandas_result = pandas_df.sample(n=2, random_state=42, axis=axis)
    df_equals(modin_result, pandas_result)

    # issue #1692, numpy RandomState object
    # We must create a new random state for each iteration because the values that
    # are selected will be impacted if the object has already been used.
    random_state = np.random.RandomState(42)
    modin_result = modin_df.sample(frac=0.5, random_state=random_state, axis=axis)
    random_state = np.random.RandomState(42)
    pandas_result = pandas_df.sample(frac=0.5, random_state=random_state, axis=axis)
    df_equals(modin_result, pandas_result)
def test_keys(data):
    """DataFrame.keys() must return the same index as pandas."""
    modin_frame, pandas_frame = pd.DataFrame(data), pandas.DataFrame(data)
    df_equals(modin_frame.keys(), pandas_frame.keys())
def test_loc_multi_index():
    """Compare .loc lookups on MultiIndex columns (from CSV) and MultiIndex rows."""
    # Four header rows become a 4-level column MultiIndex.
    modin_df = pd.read_csv(
        "modin/pandas/test/data/blah.csv", header=[0, 1, 2, 3], index_col=0
    )
    pandas_df = pandas.read_csv(
        "modin/pandas/test/data/blah.csv", header=[0, 1, 2, 3], index_col=0
    )

    df_equals(modin_df.loc[1], pandas_df.loc[1])
    # Partial column-key lookups at increasing depth.
    df_equals(modin_df.loc[1, "Presidents"], pandas_df.loc[1, "Presidents"])
    df_equals(
        modin_df.loc[1, ("Presidents", "Pure mentions")],
        pandas_df.loc[1, ("Presidents", "Pure mentions")],
    )
    assert (
        modin_df.loc[1, ("Presidents", "Pure mentions", "IND", "all")]
        == pandas_df.loc[1, ("Presidents", "Pure mentions", "IND", "all")]
    )
    df_equals(modin_df.loc[(1, 2), "Presidents"], pandas_df.loc[(1, 2), "Presidents"])

    # Now a 2-level row MultiIndex.
    tuples = [
        ("bar", "one"),
        ("bar", "two"),
        ("bar", "three"),
        ("bar", "four"),
        ("baz", "one"),
        ("baz", "two"),
        ("baz", "three"),
        ("baz", "four"),
        ("foo", "one"),
        ("foo", "two"),
        ("foo", "three"),
        ("foo", "four"),
        ("qux", "one"),
        ("qux", "two"),
        ("qux", "three"),
        ("qux", "four"),
    ]
    modin_index = pd.MultiIndex.from_tuples(tuples, names=["first", "second"])
    pandas_index = pandas.MultiIndex.from_tuples(tuples, names=["first", "second"])
    frame_data = np.random.randint(0, 100, size=(16, 100))
    modin_df = pd.DataFrame(
        frame_data,
        index=modin_index,
        columns=["col{}".format(i) for i in range(100)],
    )
    pandas_df = pandas.DataFrame(
        frame_data,
        index=pandas_index,
        columns=["col{}".format(i) for i in range(100)],
    )
    df_equals(modin_df.loc["bar", "col1"], pandas_df.loc["bar", "col1"])
    assert modin_df.loc[("bar", "one"), "col1"] == pandas_df.loc[("bar", "one"), "col1"]
    df_equals(
        modin_df.loc["bar", ("col1", "col2")],
        pandas_df.loc["bar", ("col1", "col2")],
    )

    # From issue #1456
    transposed_modin = modin_df.T
    transposed_pandas = pandas_df.T
    df_equals(
        transposed_modin.loc[transposed_modin.index[:-2], :],
        transposed_pandas.loc[transposed_pandas.index[:-2], :],
    )

    # From issue #1610
    df_equals(modin_df.loc[modin_df.index], pandas_df.loc[pandas_df.index])
    df_equals(modin_df.loc[modin_df.index[:7]], pandas_df.loc[pandas_df.index[:7]])
def test_fillna_invalid_method(data):
    """A misspelled fill method ("ffil") must raise ValueError, as in pandas."""
    modin_frame = pd.DataFrame(data)
    pandas_frame = pandas.DataFrame(data)  # noqa F841
    with pytest.raises(ValueError):
        modin_frame.fillna(method="ffil")
def test_fillna_col_reordering(data):
    """Forward-fill via fillna(method="ffill") must match pandas."""
    modin_frame, pandas_frame = pd.DataFrame(data), pandas.DataFrame(data)
    df_equals(
        modin_frame.fillna(method="ffill"),
        pandas_frame.fillna(method="ffill"),
    )
def test_ffill(data):
    """DataFrame.ffill() must match pandas."""
    modin_frame, pandas_frame = pd.DataFrame(data), pandas.DataFrame(data)
    df_equals(modin_frame.ffill(), pandas_frame.ffill())
def test_fillna_skip_certain_blocks(data):
    """fillna(np.nan) must match pandas."""
    modin_frame, pandas_frame = pd.DataFrame(data), pandas.DataFrame(data)
    # don't try to fill boolean, int blocks
    df_equals(modin_frame.fillna(np.nan), pandas_frame.fillna(np.nan))
import numpy as np #import pandas as pd import ray ray.init(huge_pages=False, plasma_directory="/localdisk/gashiman/plasma", memory=1024 * 1024 * 1024 * 200, object_store_memory=1024 * 1024 * 1024 * 200) import modin.pandas as pd df = pd.DataFrame([[2, np.datetime64('2013-08-01 08:14:37'), 1.1], [5, np.datetime64('2014-08-01 09:13:00'), 2.2], [8, np.datetime64('2015-08-01 09:48:00'), 3.3]], index=[1, 2, 3], columns=['health', 'timestamp', 'shield']) print(df) transformed = df[['health', 'timestamp', 'shield']].transform({ 'health': lambda x: x, 'timestamp': lambda x: pd.DatetimeIndex(x).year, 'shield': lambda x: x }).groupby(['health', 'timestamp', 'shield']) print(transformed) df1 = transformed.size().reset_index().sort_values(by=['timestamp', 0], ascending=[True, False]) print(df1)
import modin.pandas as pd

# Small smoke script: group-by size on a 3x3 frame, then inspect the
# resulting object's axes.
df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]],
                  index=[1, 2, 3],
                  columns=['id', 'max_speed', 'shield'])
print(df)
df1 = df.groupby(['max_speed', 'shield']).size()
print(df1)
print(df1.axes)
def test_insert_loc(data, loc):
    """Insert a copy of the first column at position ``loc`` and compare with pandas."""
    modin_frame = pd.DataFrame(data)
    pandas_frame = pandas.DataFrame(data)
    first_column = modin_frame.iloc[:, 0]
    eval_insert(modin_frame, pandas_frame, loc=loc, value=first_column)
def test_first_valid_index(data):
    """first_valid_index() must agree with pandas."""
    modin_frame, pandas_frame = pd.DataFrame(data), pandas.DataFrame(data)
    assert modin_frame.first_valid_index() == pandas_frame.first_valid_index()
def test_values(data):
    """The .values ndarray must equal pandas' (NaN-aware comparison)."""
    modin_frame, pandas_frame = pd.DataFrame(data), pandas.DataFrame(data)
    np.testing.assert_equal(modin_frame.values, pandas_frame.values)
def test_iat(data):
    """Calling .iat() is unsupported and must raise NotImplementedError."""
    modin_frame = pd.DataFrame(data)
    pandas_frame = pandas.DataFrame(data)  # noqa F841
    with pytest.raises(NotImplementedError):
        modin_frame.iat()
def test___setitem__(data):
    """Compare __setitem__ against pandas: scalar/callable/ndarray values,
    new vs. existing columns, empty frames, transposed frames, and the
    duplicated-timestamp-index regressions (#2390, #2442)."""
    eval_setitem(*create_test_dfs(data), loc=-1, value=1)
    eval_setitem(
        *create_test_dfs(data), loc=-1, value=lambda df: type(df)(df[df.columns[0]])
    )

    nrows = len(data[list(data.keys())[0]])
    arr = np.arange(nrows * 2).reshape(-1, 2)
    eval_setitem(*create_test_dfs(data), loc=-1, value=arr)
    eval_setitem(*create_test_dfs(data), col="___NON EXISTENT COLUMN", value=arr)
    eval_setitem(*create_test_dfs(data), loc=0, value=np.arange(nrows))

    # Filling every column of an initially-empty frame.
    modin_df = pd.DataFrame(columns=data.keys())
    pandas_df = pandas.DataFrame(columns=data.keys())
    for col in modin_df.columns:
        modin_df[col] = np.arange(1000)
    for col in pandas_df.columns:
        pandas_df[col] = np.arange(1000)
    df_equals(modin_df, pandas_df)

    # Test series assignment to column
    modin_df = pd.DataFrame(columns=modin_df.columns)
    pandas_df = pandas.DataFrame(columns=pandas_df.columns)
    modin_df[modin_df.columns[-1]] = modin_df[modin_df.columns[0]]
    pandas_df[pandas_df.columns[-1]] = pandas_df[pandas_df.columns[0]]
    df_equals(modin_df, pandas_df)

    # BUG FIX: the old guard `not sys.version_info.major == 3 and
    # sys.version_info.minor > 6` parsed as `(not major == 3) and (minor > 6)`,
    # which is always False on Python 3 — so this branch was never executed.
    # Per the original comment, the intent is to skip only on Python 3.6.
    if sys.version_info >= (3, 7):
        # Test 2d ndarray assignment to column
        modin_df = pd.DataFrame(data)
        pandas_df = pandas.DataFrame(data)
        modin_df["new_col"] = modin_df[[modin_df.columns[0]]].values
        pandas_df["new_col"] = pandas_df[[pandas_df.columns[0]]].values
        df_equals(modin_df, pandas_df)
        assert isinstance(modin_df["new_col"][0], type(pandas_df["new_col"][0]))

    # Transpose test
    modin_df = pd.DataFrame(data).T
    pandas_df = pandas.DataFrame(data).T

    # We default to pandas on non-string column names
    if not all(isinstance(c, str) for c in modin_df.columns):
        with pytest.warns(UserWarning):
            modin_df[modin_df.columns[0]] = 0
    else:
        modin_df[modin_df.columns[0]] = 0
    pandas_df[pandas_df.columns[0]] = 0
    df_equals(modin_df, pandas_df)

    modin_df.columns = [str(i) for i in modin_df.columns]
    pandas_df.columns = [str(i) for i in pandas_df.columns]
    modin_df[modin_df.columns[0]] = 0
    pandas_df[pandas_df.columns[0]] = 0
    df_equals(modin_df, pandas_df)

    # Chained assignment must behave the same as pandas.
    modin_df[modin_df.columns[0]][modin_df.index[0]] = 12345
    pandas_df[pandas_df.columns[0]][pandas_df.index[0]] = 12345
    df_equals(modin_df, pandas_df)

    # from issue #2390: assigning a longer Series truncates to the frame's index
    modin_df = pd.DataFrame({"a": [1, 2, 3]})
    pandas_df = pandas.DataFrame({"a": [1, 2, 3]})
    modin_df["b"] = pd.Series([4, 5, 6, 7, 8])
    pandas_df["b"] = pandas.Series([4, 5, 6, 7, 8])
    df_equals(modin_df, pandas_df)

    # from issue #2442: column assignment with a duplicated-timestamp index
    data = {"a": [1, 2, 3, 4]}
    index = pandas.to_datetime(
        ["2020-02-06", "2020-02-06", "2020-02-22", "2020-03-26"]
    )
    md_df, pd_df = create_test_dfs(data, index=index)
    # Setting new column
    pd_df["b"] = pandas.Series(np.arange(4))
    md_df["b"] = pd.Series(np.arange(4))
    df_equals(md_df, pd_df)
    # Setting existing column
    pd_df["b"] = pandas.Series(np.arange(4))
    md_df["b"] = pd.Series(np.arange(4))
    df_equals(md_df, pd_df)
def test_loc(data):
    """Compare .loc against pandas: scalar, Series, slice, list, boolean-mask
    lookups, writes, callable indexers, and missing-key errors."""
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    key1 = modin_df.columns[0]
    key2 = modin_df.columns[1]

    # Scalar
    df_equals(modin_df.loc[0, key1], pandas_df.loc[0, key1])

    # Series
    df_equals(modin_df.loc[0], pandas_df.loc[0])
    df_equals(modin_df.loc[1:, key1], pandas_df.loc[1:, key1])
    df_equals(modin_df.loc[1:2, key1], pandas_df.loc[1:2, key1])

    # DataFrame
    df_equals(modin_df.loc[[1, 2]], pandas_df.loc[[1, 2]])

    # List-like of booleans
    indices = [i % 3 == 0 for i in range(len(modin_df.index))]
    columns = [i % 5 == 0 for i in range(len(modin_df.columns))]
    modin_result = modin_df.loc[indices, columns]
    pandas_result = pandas_df.loc[indices, columns]
    df_equals(modin_result, pandas_result)

    modin_result = modin_df.loc[:, columns]
    pandas_result = pandas_df.loc[:, columns]
    df_equals(modin_result, pandas_result)

    modin_result = modin_df.loc[indices]
    pandas_result = pandas_df.loc[indices]
    df_equals(modin_result, pandas_result)

    # See issue #80
    # df_equals(modin_df.loc[[1, 2], ['col1']], pandas_df.loc[[1, 2], ['col1']])
    df_equals(modin_df.loc[1:2, key1:key2], pandas_df.loc[1:2, key1:key2])

    # From issue #421
    df_equals(modin_df.loc[:, [key2, key1]], pandas_df.loc[:, [key2, key1]])
    df_equals(modin_df.loc[[2, 1], :], pandas_df.loc[[2, 1], :])

    # From issue #1023
    key1 = modin_df.columns[0]
    key2 = modin_df.columns[-2]
    df_equals(modin_df.loc[:, key1:key2], pandas_df.loc[:, key1:key2])

    # Write Item
    modin_df_copy = modin_df.copy()
    pandas_df_copy = pandas_df.copy()
    modin_df_copy.loc[[1, 2]] = 42
    pandas_df_copy.loc[[1, 2]] = 42
    df_equals(modin_df_copy, pandas_df_copy)

    # From issue #1775: callable indexer
    df_equals(
        modin_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))],
        pandas_df.loc[lambda df: df.iloc[:, 0].isin(list(range(1000)))],
    )

    # From issue #1374: missing label raises KeyError
    with pytest.raises(KeyError):
        modin_df.loc["NO_EXIST"]
def test_rename_sanity():
    """Sanity-check DataFrame.rename on columns, index, mapper+axis, partial
    mappings, the transposed frame, and a named index."""
    source_df = pandas.DataFrame(test_data["int_data"])[["col1", "index", "col3", "col4"]]
    mapping = {"col1": "a", "index": "b", "col3": "c", "col4": "d"}

    modin_df = pd.DataFrame(source_df)
    df_equals(modin_df.rename(columns=mapping), source_df.rename(columns=mapping))

    renamed2 = source_df.rename(columns=str.lower)
    df_equals(modin_df.rename(columns=str.lower), renamed2)

    modin_df = pd.DataFrame(renamed2)
    df_equals(modin_df.rename(columns=str.upper), renamed2.rename(columns=str.upper))

    # index
    data = {"A": {"foo": 0, "bar": 1}}  # gets sorted alphabetical
    df = pandas.DataFrame(data)
    modin_df = pd.DataFrame(data)
    assert_index_equal(
        modin_df.rename(index={"foo": "bar", "bar": "foo"}).index,
        df.rename(index={"foo": "bar", "bar": "foo"}).index,
    )
    assert_index_equal(
        modin_df.rename(index=str.upper).index, df.rename(index=str.upper).index
    )

    # Using the `mapper` functionality with `axis`
    assert_index_equal(
        modin_df.rename(str.upper, axis=0).index, df.rename(str.upper, axis=0).index
    )
    assert_index_equal(
        modin_df.rename(str.upper, axis=1).columns,
        df.rename(str.upper, axis=1).columns,
    )

    # have to pass something
    with pytest.raises(TypeError):
        modin_df.rename()

    # partial columns (a dead `renamed = source_df.rename(...)` local was removed;
    # the assertion recomputes both sides directly)
    modin_df = pd.DataFrame(source_df)
    assert_index_equal(
        modin_df.rename(columns={"col3": "foo", "col4": "bar"}).index,
        source_df.rename(columns={"col3": "foo", "col4": "bar"}).index,
    )

    # other axis (dead local removed here as well)
    assert_index_equal(
        source_df.T.rename(index={"col3": "foo", "col4": "bar"}).index,
        modin_df.T.rename(index={"col3": "foo", "col4": "bar"}).index,
    )

    # index with name
    index = pandas.Index(["foo", "bar"], name="name")
    renamer = pandas.DataFrame(data, index=index)
    modin_df = pd.DataFrame(data, index=index)
    renamed = renamer.rename(index={"foo": "bar", "bar": "foo"})
    modin_renamed = modin_df.rename(index={"foo": "bar", "bar": "foo"})
    assert_index_equal(renamed.index, modin_renamed.index)
    # the index name must survive the rename
    assert renamed.index.name == modin_renamed.index.name
def test_rename_sanity():
    # NOTE(review): this redefinition shadows an earlier `test_rename_sanity` in
    # the same module, so pytest only collects this one — consider renaming one
    # of the two (interface change, so flagged rather than applied here).
    """Sanity-check DataFrame.rename on columns, index, mapper+axis, partial
    mappings, the transposed frame, and a named index (tm.getSeriesData fixture)."""
    test_data = pandas.DataFrame(tm.getSeriesData())
    mapping = {"A": "a", "B": "b", "C": "c", "D": "d"}

    modin_df = pd.DataFrame(test_data)
    df_equals(modin_df.rename(columns=mapping), test_data.rename(columns=mapping))

    renamed2 = test_data.rename(columns=str.lower)
    df_equals(modin_df.rename(columns=str.lower), renamed2)

    modin_df = pd.DataFrame(renamed2)
    df_equals(modin_df.rename(columns=str.upper), renamed2.rename(columns=str.upper))

    # index
    data = {"A": {"foo": 0, "bar": 1}}  # gets sorted alphabetical
    df = pandas.DataFrame(data)
    modin_df = pd.DataFrame(data)
    tm.assert_index_equal(
        modin_df.rename(index={"foo": "bar", "bar": "foo"}).index,
        df.rename(index={"foo": "bar", "bar": "foo"}).index,
    )
    tm.assert_index_equal(
        modin_df.rename(index=str.upper).index, df.rename(index=str.upper).index
    )

    # Using the `mapper` functionality with `axis`
    tm.assert_index_equal(
        modin_df.rename(str.upper, axis=0).index, df.rename(str.upper, axis=0).index
    )
    tm.assert_index_equal(
        modin_df.rename(str.upper, axis=1).columns,
        df.rename(str.upper, axis=1).columns,
    )

    # have to pass something
    with pytest.raises(TypeError):
        modin_df.rename()

    # partial columns (a dead `renamed = test_data.rename(...)` local was removed;
    # the assertion recomputes both sides directly)
    modin_df = pd.DataFrame(test_data)
    tm.assert_index_equal(
        modin_df.rename(columns={"C": "foo", "D": "bar"}).index,
        test_data.rename(columns={"C": "foo", "D": "bar"}).index,
    )

    # other axis (dead local removed here as well)
    tm.assert_index_equal(
        test_data.T.rename(index={"C": "foo", "D": "bar"}).index,
        modin_df.T.rename(index={"C": "foo", "D": "bar"}).index,
    )

    # index with name
    index = pandas.Index(["foo", "bar"], name="name")
    renamer = pandas.DataFrame(data, index=index)
    modin_df = pd.DataFrame(data, index=index)
    renamed = renamer.rename(index={"foo": "bar", "bar": "foo"})
    modin_renamed = modin_df.rename(index={"foo": "bar", "bar": "foo"})
    tm.assert_index_equal(renamed.index, modin_renamed.index)
    # the index name must survive the rename
    assert renamed.index.name == modin_renamed.index.name
def listing_results(self,
                    propertyTypes=["House"],  # NOTE(review): mutable default args shared across calls — consider None-sentinel
                    minBedrooms=0,
                    minBathrooms=0,
                    minCarspaces=0,
                    minPrice=0,
                    maxPrice=1000000,
                    minLandArea=0,
                    state="",
                    region="",
                    area="",
                    suburb="",
                    includeSurroundingSuburbs=False):
    '''Query the Domain residential-sale listings search API and return the
    cleaned results as a DataFrame.

    If the result count fits in one page (pageSize=100), returns a single
    cleaned page; if it exceeds one page, fetches up to 9 pages (the API
    presumably caps paging at ~1000 results — TODO confirm) and returns the
    concatenation. NOTE(review): when result count exactly equals the page
    size, neither branch runs and the function falls through returning None —
    verify whether that case can occur.
    '''
    api_listings = "listings/residential/_search"
    listings_params = {
        "listingType": "Sale",
        "propertyTypes": propertyTypes,
        "minBedrooms": minBedrooms,
        "minBathrooms": minBathrooms,
        "minCarspaces": minCarspaces,
        "minPrice": minPrice,
        "maxPrice": maxPrice,
        "minLandArea": minLandArea,
        "page:": 1,  # NOTE(review): key has a stray colon ("page:"); the loop below uses "page" — likely a typo, kept as-is
        "locations": [{
            "state": state,
            "region": region,
            "area": area,
            "suburb": suburb,
            "includeSurroundingSuburbs": includeSurroundingSuburbs
        }],
        "pageSize": 100
    }
    # Pre-built empty result frame; overwritten by clean_listings below.
    results = pd.DataFrame(columns=[
        'id', 'advertiser_type', 'advertiser_id', 'price', 'price_min',
        'price_max', 'features', 'property_type', 'bathrooms', 'bedrooms',
        'carspaces', 'region', 'suburb', 'postcode', 'address', 'latitude',
        'longitude', 'headline', 'description', 'labels', 'listingSlug'
    ])
    listings_params = json.dumps(listings_params)
    headers = {
        'X-API-Key': domain.api_secret,
        'Content-Type': 'application/json'
    }
    url = str(domain.api_base + api_listings)
    try:
        listings_call = r.post(url, data=listings_params, headers=headers)
    except Exception as e:
        # NOTE(review): on failure `listings_call` is undefined and the next
        # access raises NameError — consider re-raising or returning early.
        print('Error in API call: ', e)
    if 'errors' in listings_call.json():
        # NOTE(review): error_log is assigned but never used/returned.
        error_log = ('processing error: ', listings_call.json())
    result_count = listings_call.headers['X-Total-Count']
    pagin_count = listings_call.headers['X-Pagination-PageSize']
    print("results_count: ", result_count)
    print("pagin_count: ", pagin_count)
    if int(result_count) < int(pagin_count):
        # Everything fits on a single page.
        print('hit single processing')
        data = listings_call.json()
        results = domain.clean_listings(data)
        return results
    elif int(result_count) > int(pagin_count):
        pag_results = pd.DataFrame()
        pages = int(int(result_count) / int(pagin_count)) + 1
        if pages > 10:
            #built to mitigate the paging issue (limited to 1000 results)
            pages = 9
        else:
            pass
        print("pages calced: ", pages)
        # NOTE(review): per-page state is stashed in globals() — module-global
        # mutation; local dicts/lists would be safer. Kept as-is here.
        for i in range(0, pages):
            page = int(
                i
            ) + 1  #additonal iteration as domain api starts from 1 not 0
            print("********next loop***********")
            print("page: ", page)
            globals()['listings_params' + str(i)] = {
                "listingType": "Sale",
                "propertyTypes": propertyTypes,
                "minBedrooms": minBedrooms,
                "minBathrooms": minBathrooms,
                "minCarspaces": minCarspaces,
                "minPrice": minPrice,
                "maxPrice": maxPrice,
                "minLandArea": minLandArea,
                "page": page,
                "locations": [{
                    "state": state,
                    "region": region,
                    "area": area,
                    "suburb": suburb,
                    "includeSurroundingSuburbs": includeSurroundingSuburbs
                }],
                "pageSize": 100
            }
            globals()['listings_params' + str(i)] = json.dumps(
                globals()['listings_params' + str(i)])
            print(globals()['listings_params' + str(i)])
            headers = {
                'X-API-Key': domain.api_secret,
                'Content-Type': 'application/json'
            }
            url = str(domain.api_base + api_listings)
            try:
                globals()['listings_call' + str(i)] = r.post(
                    url,
                    data=globals()['listings_params' + str(i)],
                    headers=headers)
            except Exception as e:
                print('Error in API call in loop: ', e)
            if 'errors' in globals()['listings_call' + str(i)].json():
                error_log = ('processing error loop: ',
                             globals()['listings_call' + str(i)].json())
            print("loop: ", i)
            print(
                "total count: ",
                globals()['listings_call' + str(i)].headers['X-Total-Count'])
            print(
                "pagination number: ",
                globals()['listings_call' +
                          str(i)].headers['X-Pagination-PageNumber'])
            globals()['data' + str(i)] = globals()['listings_call' +
                                                   str(i)].json()
            globals()['df_' + str(i)] = domain.clean_listings(
                globals()['data' + str(i)])
        # Collect the per-page frames and concatenate them into one result.
        frames = []
        for x in range(0, pages):
            frames.append(globals()['df_' + str(x)])
        print("frame list length: ", len(frames))
        print(frames)
        try:
            pag_results = pd.concat(frames)
        except Exception as e:
            print('Error in append loop: ', e)
        return pag_results
def test_rename_nocopy():
    """rename(copy=False) should hand back a view: writes to the renamed frame
    must be visible through the original column."""
    base = pandas.DataFrame(tm.getSeriesData())
    modin_frame = pd.DataFrame(base)
    renamed_view = modin_frame.rename(columns={"C": "foo"}, copy=False)
    renamed_view["foo"] = 1
    assert (modin_frame["C"] == 1).all()
# Output schema for clean_listings; also used when no rows survive parsing.
LISTING_COLUMNS = [
    'id', 'advertiser_type', 'advertiser_id', 'price', 'price_min',
    'price_max', 'features', 'property_type', 'bathrooms', 'bedrooms',
    'carspaces', 'region', 'suburb', 'postcode', 'address', 'latitude',
    'longitude', 'headline', 'description', 'labels', 'listingSlug',
]


def _parse_price(display_price):
    """Normalise a display-price string into (price, price_min, price_max).

    Mapping (mirrors the original branch order):
      contains 'auction'  -> ('AUCTION', 'AUCTION', 'AUCTION')
      contains 'contact'  -> ('CONTACT', 'CONTACT', 'CONTACT')
      contains any digit  -> (original string, first parsed amount,
                              amount parsed from the remainder of the string,
                              i.e. the upper bound of an "X - Y" range)
      anything else       -> ('NA', 'NA', 'NA')
    """
    lowered = display_price.lower()
    if 'auction' in lowered:
        return 'AUCTION', 'AUCTION', 'AUCTION'
    if 'contact' in lowered:
        return 'CONTACT', 'CONTACT', 'CONTACT'
    if any(c.isdigit() for c in display_price):
        # `Price` comes from the price-parser package (imported at file top).
        first = Price.fromstring(str(display_price))
        remainder = display_price.replace(str(first.amount_text), '')
        second = Price.fromstring(str(remainder))
        return display_price, first.amount, second.amount
    return 'NA', 'NA', 'NA'


def clean_listings(data):
    """Flatten raw Domain API listing payloads into a DataFrame.

    Parameters
    ----------
    data : iterable of dict
        Items as returned by the Domain listings endpoint; each item is
        expected to carry a 'listing' mapping.

    Returns
    -------
    pd.DataFrame with LISTING_COLUMNS, one row per parseable item.
        Malformed items are skipped (best-effort, matching the original
        behaviour for occasionally incomplete API payloads).
    """
    rows = []
    for item in data:
        try:
            listing = item['listing']
            details = listing['propertyDetails']
            price, price_min, price_max = _parse_price(
                listing['priceDetails']['displayPrice'])
            rows.append({
                'id': listing['id'],
                'advertiser_type': listing['advertiser']['type'],
                'advertiser_id': listing['advertiser']['id'],
                'price': price,
                'price_min': price_min,
                'price_max': price_max,
                'features': details['features'],
                'property_type': details['propertyType'],
                'bathrooms': details['bathrooms'],
                'bedrooms': details['bedrooms'],
                'carspaces': details['carspaces'],
                'region': details['region'],
                'suburb': details['suburb'],
                'postcode': details['postcode'],
                'address': details['displayableAddress'],
                'latitude': details['latitude'],
                'longitude': details['longitude'],
                'headline': listing['headline'],
                'description': listing['summaryDescription'],
                'labels': listing['labels'],
                'listingSlug': listing['listingSlug'],
            })
        except (KeyError, TypeError, AttributeError):
            # Best-effort: skip items missing expected fields rather than
            # aborting the whole page.  Narrowed from the original bare
            # `except Exception: pass` so programming errors still surface.
            continue
    # DataFrame.append was removed in pandas 2.0 and was O(n^2) anyway;
    # build the frame once from the accumulated rows.
    return pd.DataFrame(rows, columns=LISTING_COLUMNS)
def test_tail(data, n):
    """tail(n) on a Modin frame matches pandas, including n == full length."""
    md_df = pd.DataFrame(data)
    base_df = pandas.DataFrame(data)
    df_equals(md_df.tail(n), base_df.tail(n))
    # Asking for every row is a degenerate but legal tail.
    df_equals(md_df.tail(len(md_df)), base_df.tail(len(base_df)))
def test___setitem__(data):
    """Column-assignment parity between Modin and pandas.

    Covers scalar, single-column DataFrame, 2d ndarray, and Series
    assignment to a column; assignment into an empty frame; and
    assignment on a transposed frame (including chained single-cell
    assignment).
    """
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    # Scalar broadcast, invoked through the dunder directly.
    modin_df.__setitem__(modin_df.columns[-1], 1)
    pandas_df.__setitem__(pandas_df.columns[-1], 1)
    df_equals(modin_df, pandas_df)

    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    # Single-column DataFrame assigned to an existing column.
    modin_df[modin_df.columns[-1]] = pd.DataFrame(modin_df[modin_df.columns[0]])
    pandas_df[pandas_df.columns[-1]] = pandas.DataFrame(pandas_df[pandas_df.columns[0]])
    df_equals(modin_df, pandas_df)

    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    rows = len(modin_df)
    arr = np.arange(rows * 2).reshape(-1, 2)
    # A (rows, 2) ndarray may replace an existing column...
    modin_df[modin_df.columns[-1]] = arr
    pandas_df[pandas_df.columns[-1]] = arr
    df_equals(pandas_df, modin_df)

    # ...but cannot create a brand-new column.
    with pytest.raises(ValueError, match=r"Wrong number of items passed"):
        modin_df["___NON EXISTENT COLUMN"] = arr

    modin_df[modin_df.columns[0]] = np.arange(len(modin_df))
    pandas_df[pandas_df.columns[0]] = np.arange(len(pandas_df))
    df_equals(modin_df, pandas_df)

    # Assignment into a frame that starts with columns but no rows.
    modin_df = pd.DataFrame(columns=modin_df.columns)
    pandas_df = pandas.DataFrame(columns=pandas_df.columns)
    for col in modin_df.columns:
        modin_df[col] = np.arange(1000)
    for col in pandas_df.columns:
        pandas_df[col] = np.arange(1000)
    df_equals(modin_df, pandas_df)

    # Test series assignment to column
    modin_df = pd.DataFrame(columns=modin_df.columns)
    pandas_df = pandas.DataFrame(columns=pandas_df.columns)
    modin_df[modin_df.columns[-1]] = modin_df[modin_df.columns[0]]
    pandas_df[pandas_df.columns[-1]] = pandas_df[pandas_df.columns[0]]
    df_equals(modin_df, pandas_df)

    # BUG FIX: the original guard `not sys.version_info.major == 3 and
    # sys.version_info.minor > 6` parsed as `(not major == 3) and ...`,
    # which is always False on Python 3, so this case never ran.  Per the
    # old comment the intent was to skip only Python 3.6.
    if sys.version_info[:2] > (3, 6):
        # Test 2d ndarray assignment to column
        modin_df = pd.DataFrame(data)
        pandas_df = pandas.DataFrame(data)
        modin_df["new_col"] = modin_df[[modin_df.columns[0]]].values
        pandas_df["new_col"] = pandas_df[[pandas_df.columns[0]]].values
        df_equals(modin_df, pandas_df)
        assert isinstance(modin_df["new_col"][0], type(pandas_df["new_col"][0]))

    # Transpose test
    modin_df = pd.DataFrame(data).T
    pandas_df = pandas.DataFrame(data).T
    # We default to pandas on non-string column names
    if not all(isinstance(c, str) for c in modin_df.columns):
        with pytest.warns(UserWarning):
            modin_df[modin_df.columns[0]] = 0
    else:
        modin_df[modin_df.columns[0]] = 0
    pandas_df[pandas_df.columns[0]] = 0
    df_equals(modin_df, pandas_df)

    modin_df.columns = [str(i) for i in modin_df.columns]
    pandas_df.columns = [str(i) for i in pandas_df.columns]
    modin_df[modin_df.columns[0]] = 0
    pandas_df[pandas_df.columns[0]] = 0
    df_equals(modin_df, pandas_df)

    # Chained assignment to a single cell must behave like pandas too.
    modin_df[modin_df.columns[0]][modin_df.index[0]] = 12345
    pandas_df[pandas_df.columns[0]][pandas_df.index[0]] = 12345
    df_equals(modin_df, pandas_df)
def test_sort_values(request, data, axis, ascending, na_position):
    """sort_values parity between Modin and pandas.

    Exercises single- and multi-key sorts, both out-of-place and inplace,
    for the parametrized axis/ascending/na_position combinations.  The
    four near-identical stanzas of the original are collapsed into one
    loop over (keys, inplace) with identical behaviour.
    """
    modin_df = pd.DataFrame(data)
    pandas_df = pandas.DataFrame(data)
    # Sorting along columns needs numeric data; sorting rows works on any
    # non-empty frame.
    if "empty_data" not in request.node.name and (
        (axis == 0 or axis == "over rows")
        or name_contains(request.node.name, numeric_dfs)
    ):
        # Keys come from the opposite axis of the one being sorted.
        index = modin_df.index if axis == 1 or axis == "columns" else modin_df.columns
        key = index[0]
        for by in (key, [key, index[-1]]):
            kwargs = dict(axis=axis, ascending=ascending, na_position=na_position)

            # Out-of-place sort.
            modin_result = modin_df.sort_values(by, inplace=False, **kwargs)
            pandas_result = pandas_df.sort_values(by, inplace=False, **kwargs)
            df_equals(modin_result, pandas_result)

            # Inplace sort on copies so the source frames stay pristine.
            modin_df_cp = modin_df.copy()
            pandas_df_cp = pandas_df.copy()
            modin_df_cp.sort_values(by, inplace=True, **kwargs)
            pandas_df_cp.sort_values(by, inplace=True, **kwargs)
            df_equals(modin_df_cp, pandas_df_cp)
def test___len__(data):
    """len() of a Modin frame equals len() of the equivalent pandas frame."""
    assert len(pd.DataFrame(data)) == len(pandas.DataFrame(data))
def test_tshift():
    """tshift by 4 periods on a month-end DatetimeIndex matches pandas."""
    idx = pd.date_range("1/1/2012", periods=5, freq="M")
    values = np.random.randint(0, 100, size=(len(idx), 4))
    # NOTE(review): tshift is deprecated upstream in favour of
    # shift(periods, freq=...) — worth migrating when pandas is bumped.
    df_equals(
        pd.DataFrame(values, index=idx).tshift(4),
        pandas.DataFrame(values, index=idx).tshift(4),
    )
def test_drop():
    """Modin DataFrame.drop parity with pandas across axes, error modes,
    duplicate labels, inplace drops, and MultiIndex levels."""
    frame_data = {"A": [1, 2, 3, 4], "B": [0, 1, 2, 3]}
    simple = pandas.DataFrame(frame_data)
    modin_simple = pd.DataFrame(frame_data)

    # Basic column/row drops using every spelling of the axis argument.
    df_equals(modin_simple.drop("A", axis=1), simple[["B"]])
    df_equals(modin_simple.drop(["A", "B"], axis="columns"), simple[[]])
    df_equals(modin_simple.drop([0, 1, 3], axis=0), simple.loc[[2], :])
    df_equals(modin_simple.drop([0, 3], axis="index"), simple.loc[[1, 2], :])

    # Unknown labels raise by default...
    with pytest.raises(ValueError):
        modin_simple.drop(5)
    with pytest.raises(ValueError):
        modin_simple.drop("C", 1)
    with pytest.raises(ValueError):
        modin_simple.drop([1, 5])
    with pytest.raises(ValueError):
        modin_simple.drop(["A", "C"], 1)

    # ...and are silently skipped with errors='ignore'.
    df_equals(modin_simple.drop(5, errors="ignore"), simple)
    df_equals(modin_simple.drop([0, 5], errors="ignore"), simple.loc[[1, 2, 3], :])
    df_equals(modin_simple.drop("C", axis=1, errors="ignore"), simple)
    df_equals(modin_simple.drop(["A", "C"], axis=1, errors="ignore"), simple[["B"]])

    # Duplicate column labels: dropping "a" removes both occurrences.
    dup_cols = pandas.DataFrame(
        zip(range(3), range(-3, 1), list("abc")), columns=["a", "a", "b"]
    )
    modin_dup_cols = pd.DataFrame(dup_cols)
    df_equals(modin_dup_cols.drop("a", axis=1), dup_cols[["b"]])
    df_equals(modin_dup_cols.drop("b", axis="columns"), dup_cols["a"])
    df_equals(modin_dup_cols.drop([]), dup_cols)

    # Duplicate row labels: dropping "X" removes every matching row.
    dup_rows = dup_cols.set_index(pandas.Index(["X", "Y", "X"]))
    dup_rows.columns = list("abc")
    modin_dup_rows = pd.DataFrame(dup_rows)
    df_equals(modin_dup_rows.drop("X", axis="rows"), dup_rows.loc[["Y"], :])
    df_equals(modin_dup_rows.drop(["X", "Y"], axis=0), dup_rows.loc[[], :])

    # Inplace drop driven by a boolean-filtered index (cache-invalidation case).
    rand_data = random_state.randn(10, 3)
    pandas_rand = pandas.DataFrame(rand_data, columns=list("abc"))
    modin_rand = pd.DataFrame(rand_data, columns=list("abc"))
    expected = pandas_rand[~(pandas_rand.b > 0)]
    modin_rand.drop(labels=pandas_rand[pandas_rand.b > 0].index, inplace=True)
    df_equals(modin_rand, expected)

    # Dropping by level on a MultiIndex currently defaults to pandas.
    midx = pd.MultiIndex(
        levels=[["lama", "cow", "falcon"], ["speed", "weight", "length"]],
        codes=[[0, 0, 0, 1, 1, 1, 2, 2, 2], [0, 1, 2, 0, 1, 2, 0, 1, 2]],
    )
    animals = pd.DataFrame(
        index=midx,
        columns=["big", "small"],
        data=[
            [45, 30],
            [200, 100],
            [1.5, 1],
            [30, 20],
            [250, 150],
            [1.5, 0.8],
            [320, 250],
            [1, 0.8],
            [0.3, 0.2],
        ],
    )
    with warns_that_defaulting_to_pandas():
        animals.drop(index="length", level=1)