def test_null_col(self, null_dtype):
    """Read ``test_null_col.csv`` with ``pandas`` and ``pd`` and compare.

    Column ``"c"`` is read with ``null_dtype``; columns ``"a"`` and ``"b"``
    are read as ``int64`` and summed into ``"a"`` on both frames before the
    comparison.
    """
    path = os.path.join(
        self.root, "modin/pandas/test/data", "test_null_col.csv"
    )

    def read_options():
        # Build a fresh kwargs dict per call so the two readers never share
        # the same ``names`` / ``dtype`` objects.
        return {
            "names": ["a", "b", "c"],
            "dtype": {"a": "int64", "b": "int64", "c": null_dtype},
            "skiprows": 1,
        }

    ref = pandas.read_csv(path, **read_options())
    ref["a"] = ref["a"] + ref["b"]

    exp = pd.read_csv(path, **read_options())
    exp["a"] = exp["a"] + exp["b"]

    # df_equals cannot compare empty categories
    if null_dtype == "category":
        ref["c"] = ref["c"].astype("string")
        exp = to_pandas(exp)
        exp["c"] = exp["c"].astype("string")

    df_equals(ref, exp)
def std(df, **kwargs):
    """Group ``df`` by ``"a"`` and return the rounded skew of ``"b"`` and ``"c"``.

    Despite the function name, the aggregation applied is ``"skew"``.
    A result that is not a ``pandas.DataFrame`` is converted with
    ``to_pandas`` first; both columns are then rounded to 10 decimal places
    element by element. ``kwargs`` is accepted but not used.
    """
    skewed = df.groupby("a").agg({"b": "skew", "c": "skew"})
    if not isinstance(skewed, pandas.DataFrame):
        skewed = to_pandas(skewed)

    # Round per element with the builtin ``round`` to keep exactly the same
    # numeric results as before.
    for column in ("b", "c"):
        skewed[column] = skewed[column].apply(lambda value: round(value, 10))
    return skewed
def test_usecols_csv(self):
    """Check ``read_csv`` with the arguments: names, dtype, skiprows, delimiter.

    Each keyword combination is passed both to ``pandas.read_csv`` and to
    ``pd.read_csv`` with ``engine="arrow"``, and the two results are compared.
    """
    path = os.path.join(
        self.root, "modin/pandas/test/data", "test_usecols.csv"
    )
    cases = [
        {"delimiter": ","},
        {"sep": None},
        {"skiprows": 1, "names": ["A", "B", "C", "D", "E"]},
        {"dtype": {"a": "int32", "e": "string"}},
        {
            "dtype": {
                "a": np.dtype("int32"),
                "b": np.dtype("int64"),
                "e": "string",
            }
        },
    ]
    for read_options in cases:
        from_pandas = pandas.read_csv(path, **read_options)
        arrow_frame = pd.read_csv(path, engine="arrow", **read_options)
        from_arrow = to_pandas(arrow_frame)
        df_equals(from_pandas, from_arrow)
def test_housing_csv(self):
    """Read the Boston housing CSV with both readers and check both succeed.

    The frames are not compared; the test only asserts that neither
    ``pandas.read_csv`` nor ``pd.read_csv(engine="arrow")`` returned ``None``.
    """
    path = os.path.join(self.root, "examples/data/boston_housing.csv")
    read_options = {
        "skiprows": 1,
        "names": self.boston_housing_names,
        "dtype": self.boston_housing_dtypes,
    }

    from_pandas = pandas.read_csv(path, **read_options)
    from_arrow = to_pandas(pd.read_csv(path, engine="arrow", **read_options))

    assert from_pandas is not None
    assert from_arrow is not None
def test_cat_codes(self):
    """Compare ``.cat.codes`` of a categorical column between ``pandas`` and ``pd``."""
    expected = pandas.DataFrame(self.data)
    expected["a"] = expected["a"].astype("category")

    # Build the ``pd`` frame while column "a" is still categorical, and take
    # its codes before the pandas frame is overwritten below.
    candidate = pd.DataFrame(expected)
    candidate["a"] = candidate["a"].cat.codes
    actual = to_pandas(candidate)

    expected["a"] = expected["a"].cat.codes
    df_equals(expected, actual)
def test_h2o_q5(self):
    """Merge two h2o frames on ``"id3"`` and compare the reference with ``pd``."""
    left = self._get_h2o_df(self.h2o_data)
    right = self._get_h2o_df(self.h2o_data_big)

    # Reference result, computed directly on the frames from the helper.
    ref = left.merge(right, on="id3")
    self._fix_category_cols(ref)

    # Same merge through the ``pd`` API, converted back for comparison.
    merged = pd.DataFrame(left).merge(pd.DataFrame(right), on="id3")
    exp = to_pandas(merged)
    self._fix_category_cols(exp)

    df_equals(ref, exp)
def test_h2o_q1(self):
    """Sum ``"v1"`` grouped by ``"id1"`` and compare the reference with ``pd``.

    The ``pd`` groupby runs between ``set_execution_mode(..., "lazy")`` and
    ``set_execution_mode(..., None)``.
    """
    source = self._get_h2o_df()

    ref_groups = source.groupby(["id1"], observed=True)
    ref = ref_groups.agg({"v1": "sum"})
    ref.reset_index(inplace=True)

    modin_df = pd.DataFrame(source)
    set_execution_mode(modin_df, "lazy")
    modin_groups = modin_df.groupby(["id1"], observed=True, as_index=False)
    modin_df = modin_groups.agg({"v1": "sum"})
    set_execution_mode(modin_df, None)

    exp = to_pandas(modin_df)
    # Cast the key column to category before the frames are compared.
    exp["id1"] = exp["id1"].astype("category")

    df_equals(ref, exp)
def test_h2o_q4(self):
    """Average ``v1``/``v2``/``v3`` grouped by ``"id4"`` and compare with ``pd``.

    The ``pd`` groupby runs between ``set_execution_mode(..., "lazy")`` and
    ``set_execution_mode(..., None)``.
    """
    source = self._get_h2o_df()

    ref_groups = source.groupby(["id4"], observed=True)
    ref = ref_groups.agg({"v1": "mean", "v2": "mean", "v3": "mean"})
    ref.reset_index(inplace=True)

    modin_df = pd.DataFrame(source)
    set_execution_mode(modin_df, "lazy")
    modin_groups = modin_df.groupby(["id4"], observed=True, as_index=False)
    modin_df = modin_groups.agg({"v1": "mean", "v2": "mean", "v3": "mean"})
    set_execution_mode(modin_df, None)

    exp = to_pandas(modin_df)
    df_equals(ref, exp)
def test_h2o_q10(self):
    """Group by ``id1``..``id6``, aggregate ``v3`` sum and ``v1`` count, compare."""
    source = self._get_h2o_df()

    ref_groups = source.groupby(
        ["id1", "id2", "id3", "id4", "id5", "id6"], observed=True
    )
    ref = ref_groups.agg({"v3": "sum", "v1": "count"})
    ref.reset_index(inplace=True)

    modin_df = pd.DataFrame(source)
    modin_groups = modin_df.groupby(
        ["id1", "id2", "id3", "id4", "id5", "id6"], observed=True
    )
    modin_df = modin_groups.agg({"v3": "sum", "v1": "count"})
    modin_df.reset_index(inplace=True)

    exp = to_pandas(modin_df)
    # Cast the first three key columns to category before the comparison.
    for key in ("id1", "id2", "id3"):
        exp[key] = exp[key].astype("category")

    df_equals(ref, exp)
def test_h2o_q7(self):
    """Compute ``max(v1) - min(v2)`` per ``"id3"`` group and compare with ``pd``.

    The ``pd`` pipeline runs between ``set_execution_mode(..., "lazy")`` and
    ``set_execution_mode(..., None)``.
    """
    source = self._get_h2o_df()

    # Reference: aggregate, derive the range column, keep only that column.
    ref_agg = source.groupby(["id3"], observed=True).agg(
        {"v1": "max", "v2": "min"}
    )
    ref_with_range = ref_agg.assign(
        range_v1_v2=lambda x: x["v1"] - x["v2"]
    )
    ref = ref_with_range[["range_v1_v2"]]
    ref.reset_index(inplace=True)

    # Same steps through the ``pd`` API.
    modin_df = pd.DataFrame(source)
    set_execution_mode(modin_df, "lazy")
    modin_df = modin_df.groupby(["id3"], observed=True).agg(
        {"v1": "max", "v2": "min"}
    )
    modin_df["range_v1_v2"] = modin_df["v1"] - modin_df["v2"]
    modin_df = modin_df[["range_v1_v2"]]
    modin_df.reset_index(inplace=True)
    set_execution_mode(modin_df, None)

    exp = to_pandas(modin_df)
    # Cast the key column to category before the frames are compared.
    exp["id3"] = exp["id3"].astype("category")

    df_equals(ref, exp)
def test_time_parsing(self):
    """Parse the ``timestamp`` column with both readers and compare the years.

    Only ``timestamp.dt.year`` is compared between the
    ``pd.read_csv(engine="arrow")`` result and the ``pandas.read_csv`` result.
    """
    path = os.path.join(
        self.root, "modin/pandas/test/data", "test_time_parsing.csv"
    )
    column_names = [
        "timestamp",
        "symbol",
        "high",
        "low",
        "open",
        "close",
        "spread",
        "volume",
    ]
    read_options = {
        "skiprows": 1,
        "names": column_names,
        "parse_dates": ["timestamp"],
        "dtype": {"symbol": "string"},
    }

    from_pandas = pandas.read_csv(path, **read_options)
    from_arrow = to_pandas(pd.read_csv(path, engine="arrow", **read_options))

    df_equals(from_arrow["timestamp"].dt.year, from_pandas["timestamp"].dt.year)