コード例 #1
0
    def test_null_col(self, null_dtype):
        """Compare pandas and ``pd`` CSV reads with column "c" typed as ``null_dtype``.

        Both frames are loaded from ``test_null_col.csv`` with identical
        arguments, column "a" is replaced by ``a + b`` in each, and the two
        results are checked with ``df_equals``.
        """
        data_path = os.path.join(
            self.root, "modin/pandas/test/data", "test_null_col.csv"
        )

        def load(read_csv):
            # The same arguments are used for both readers: explicit column
            # names, fixed integer dtypes for "a"/"b", and the first row skipped.
            frame = read_csv(
                data_path,
                names=["a", "b", "c"],
                dtype={"a": "int64", "b": "int64", "c": null_dtype},
                skiprows=1,
            )
            frame["a"] = frame["a"] + frame["b"]
            return frame

        ref = load(pandas.read_csv)
        exp = load(pd.read_csv)

        # df_equals cannot compare empty categories, so "c" is compared as
        # strings; the ``pd`` frame is converted with to_pandas first.
        if null_dtype == "category":
            ref["c"] = ref["c"].astype("string")
            exp = to_pandas(exp)
            exp["c"] = exp["c"].astype("string")

        df_equals(ref, exp)
コード例 #2
0
 def std(df, **kwargs):
     """Group ``df`` by "a" and return the skew of "b" and "c", rounded to 10 places.

     Despite its name, this helper aggregates with "skew". A result that is
     not a pandas DataFrame is converted with ``to_pandas`` before rounding.
     Extra keyword arguments are accepted and ignored.
     """
     skewed = df.groupby("a").agg({"b": "skew", "c": "skew"})
     if not isinstance(skewed, pandas.DataFrame):
         skewed = to_pandas(skewed)
     # Round element-wise with the builtin round(), one column at a time.
     for column in ("b", "c"):
         skewed[column] = skewed[column].apply(lambda x: round(x, 10))
     return skewed
コード例 #3
0
    def test_usecols_csv(self):
        """Check read_csv with the arguments names, dtype, skiprows and delimiter.

        Each keyword-argument set is passed to ``pandas.read_csv`` and to
        ``pd.read_csv(..., engine="arrow")``; the latter is converted with
        ``to_pandas`` and the two frames are compared with ``df_equals``.
        """
        data_path = os.path.join(
            self.root, "modin/pandas/test/data", "test_usecols.csv"
        )

        option_sets = [
            {"delimiter": ","},
            {"sep": None},
            {"skiprows": 1, "names": ["A", "B", "C", "D", "E"]},
            # dtype given as plain strings ...
            {"dtype": {"a": "int32", "e": "string"}},
            # ... and as numpy dtype objects mixed with a string.
            {
                "dtype": {
                    "a": np.dtype("int32"),
                    "b": np.dtype("int64"),
                    "e": "string",
                }
            },
        ]

        for options in option_sets:
            expected = pandas.read_csv(data_path, **options)
            actual = to_pandas(pd.read_csv(data_path, engine="arrow", **options))
            df_equals(expected, actual)
コード例 #4
0
 def test_housing_csv(self):
     """Read the Boston housing CSV with pandas and with the arrow engine of ``pd``.

     Only checks that both reads produce a non-None result; the frames are
     not compared with each other.
     """
     data_path = os.path.join(self.root, "examples/data/boston_housing.csv")
     read_options = {
         "skiprows": 1,
         "names": self.boston_housing_names,
         "dtype": self.boston_housing_dtypes,
     }

     pandas_frame = pandas.read_csv(data_path, **read_options)
     arrow_frame = to_pandas(
         pd.read_csv(data_path, engine="arrow", **read_options)
     )

     assert pandas_frame is not None
     assert arrow_frame is not None
コード例 #5
0
    def test_cat_codes(self):
        """Check that ``.cat.codes`` on a categorical column matches pandas.

        Column "a" of ``self.data`` is cast to category, then replaced by its
        category codes in both a ``pd`` frame and the pandas frame, and the
        results are compared with ``df_equals``.
        """
        expected = pandas.DataFrame(self.data)
        expected["a"] = expected["a"].astype("category")

        # Build the frame under test while "a" is still categorical in the
        # pandas frame; the pandas side is converted to codes only afterwards.
        frame_under_test = pd.DataFrame(expected)
        codes_under_test = frame_under_test["a"].cat.codes
        frame_under_test["a"] = codes_under_test
        actual = to_pandas(frame_under_test)

        expected_codes = expected["a"].cat.codes
        expected["a"] = expected_codes

        df_equals(expected, actual)
コード例 #6
0
    def test_h2o_q5(self):
        """Merge two H2O frames on "id3" and compare pandas with ``pd``.

        Both merge results are passed through ``self._fix_category_cols``
        before being compared with ``df_equals``.
        """
        left = self._get_h2o_df(self.h2o_data)
        right = self._get_h2o_df(self.h2o_data_big)

        # Reference result computed directly on the frames from _get_h2o_df.
        expected = left.merge(right, on="id3")
        self._fix_category_cols(expected)

        # Same merge through ``pd``, converted back with to_pandas.
        merged = pd.DataFrame(left).merge(pd.DataFrame(right), on="id3")
        actual = to_pandas(merged)
        self._fix_category_cols(actual)

        df_equals(expected, actual)
コード例 #7
0
    def test_h2o_q1(self):
        """Sum "v1" grouped by "id1" and compare pandas with ``pd`` in lazy mode.

        The reference groups with the default index and then resets it; the
        ``pd`` side groups with ``as_index=False`` between two
        ``set_execution_mode`` calls ("lazy", then None).
        """
        source = self._get_h2o_df()
        group_keys = ["id1"]
        agg_spec = {"v1": "sum"}

        expected = source.groupby(group_keys, observed=True).agg(agg_spec)
        expected.reset_index(inplace=True)

        lazy_df = pd.DataFrame(source)
        set_execution_mode(lazy_df, "lazy")
        grouped = lazy_df.groupby(group_keys, observed=True, as_index=False)
        summed_df = grouped.agg(agg_spec)
        set_execution_mode(summed_df, None)

        actual = to_pandas(summed_df)
        # The key column is cast to category before comparison; presumably the
        # converted frame does not carry the reference's dtype -- TODO confirm.
        actual["id1"] = actual["id1"].astype("category")

        df_equals(expected, actual)
コード例 #8
0
    def test_h2o_q4(self):
        """Average "v1", "v2", "v3" grouped by "id4"; compare pandas with ``pd``.

        The reference groups with the default index and then resets it; the
        ``pd`` side groups with ``as_index=False`` between two
        ``set_execution_mode`` calls ("lazy", then None).
        """
        source = self._get_h2o_df()
        group_keys = ["id4"]
        agg_spec = {"v1": "mean", "v2": "mean", "v3": "mean"}

        expected = source.groupby(group_keys, observed=True).agg(agg_spec)
        expected.reset_index(inplace=True)

        lazy_df = pd.DataFrame(source)
        set_execution_mode(lazy_df, "lazy")
        grouped = lazy_df.groupby(group_keys, observed=True, as_index=False)
        averaged_df = grouped.agg(agg_spec)
        set_execution_mode(averaged_df, None)

        actual = to_pandas(averaged_df)

        df_equals(expected, actual)
コード例 #9
0
    def test_h2o_q10(self):
        """Group by "id1".."id6", sum "v3" and count "v1"; compare pandas with ``pd``.

        Both sides reset the index after aggregating. On the ``pd`` side
        "id1", "id2" and "id3" are cast to category before ``df_equals``.
        """
        source = self._get_h2o_df()
        group_keys = ["id1", "id2", "id3", "id4", "id5", "id6"]
        agg_spec = {"v3": "sum", "v1": "count"}

        expected = source.groupby(group_keys, observed=True).agg(agg_spec)
        expected.reset_index(inplace=True)

        grouped = pd.DataFrame(source).groupby(group_keys, observed=True)
        aggregated_df = grouped.agg(agg_spec)
        aggregated_df.reset_index(inplace=True)

        actual = to_pandas(aggregated_df)
        # Only the first three key columns are cast; presumably these are the
        # categorical ones in the reference frame -- TODO confirm.
        for key in ("id1", "id2", "id3"):
            actual[key] = actual[key].astype("category")

        df_equals(expected, actual)
コード例 #10
0
    def test_h2o_q7(self):
        """Compute max("v1") - min("v2") per "id3"; compare pandas with ``pd``.

        The reference derives ``range_v1_v2`` with ``assign``; the ``pd`` side
        builds the same column by item assignment between two
        ``set_execution_mode`` calls ("lazy", then None).
        """
        source = self._get_h2o_df()
        agg_spec = {"v1": "max", "v2": "min"}

        extremes = source.groupby(["id3"], observed=True).agg(agg_spec)
        with_range = extremes.assign(range_v1_v2=lambda x: x["v1"] - x["v2"])
        expected = with_range[["range_v1_v2"]]
        expected.reset_index(inplace=True)

        lazy_df = pd.DataFrame(source)
        set_execution_mode(lazy_df, "lazy")
        extremes_df = lazy_df.groupby(["id3"], observed=True).agg(agg_spec)
        extremes_df["range_v1_v2"] = extremes_df["v1"] - extremes_df["v2"]
        range_df = extremes_df[["range_v1_v2"]]
        range_df.reset_index(inplace=True)
        set_execution_mode(range_df, None)

        actual = to_pandas(range_df)
        # The key column is cast to category before comparison; presumably the
        # converted frame does not carry the reference's dtype -- TODO confirm.
        actual["id3"] = actual["id3"].astype("category")

        df_equals(expected, actual)
コード例 #11
0
 def test_time_parsing(self):
     """Check that "timestamp" parsed via the arrow engine yields the same years.

     The CSV is read with pandas and with ``pd.read_csv(engine="arrow")``
     using ``parse_dates=["timestamp"]``; only ``.dt.year`` of that column is
     compared.
     """
     data_path = os.path.join(
         self.root, "modin/pandas/test/data", "test_time_parsing.csv"
     )
     column_names = [
         "timestamp",
         "symbol",
         "high",
         "low",
         "open",
         "close",
         "spread",
         "volume",
     ]
     read_options = {
         "skiprows": 1,
         "names": column_names,
         "parse_dates": ["timestamp"],
         "dtype": {"symbol": "string"},
     }

     pandas_frame = pandas.read_csv(data_path, **read_options)
     arrow_frame = to_pandas(
         pd.read_csv(data_path, engine="arrow", **read_options)
     )

     df_equals(
         arrow_frame["timestamp"].dt.year,
         pandas_frame["timestamp"].dt.year,
     )