def test_safe_group_by_apply():
    """Exercise ``PD_UTILS.safe_groupby_apply`` on null keys and empty key lists.

    Three scenarios are pinned down by the assertions below:

    * grouping on a string column places the ``None`` key in its own group;
    * an empty key list applies the function to the whole frame at once;
    * grouping on a double column puts all null keys into one shared group.
    """

    def _add_group_size(group):
        # Every group handed to the callback must itself pass the check.
        PD_UTILS.ensure_compatible(group)
        group["ct"] = group.shape[0]
        return group

    frame = DF([["a", 1], ["a", 2], [None, 3]], "b:str,c:long", True)

    # Grouped by "b": two rows share key "a", the None key stands alone.
    grouped = PD_UTILS.safe_groupby_apply(frame.native, ["b"], _add_group_size)
    PD_UTILS.ensure_compatible(grouped)
    assert grouped.shape[0] == 3
    assert grouped.shape[1] == 3
    assert grouped.values.tolist() == [["a", 1, 2], ["a", 2, 2], [None, 3, 1]]

    # No keys: every row reports the full row count.
    ungrouped = PD_UTILS.safe_groupby_apply(frame.native, [], _add_group_size)
    PD_UTILS.ensure_compatible(ungrouped)
    assert ungrouped.shape[0] == 3
    assert ungrouped.shape[1] == 3
    assert ungrouped.values.tolist() == [["a", 1, 3], ["a", 2, 3], [None, 3, 3]]

    frame = DF(
        [[1.0, "a"], [1.0, "b"], [None, "c"], [None, "d"]],
        "b:double,c:str",
        True,
    )
    by_double = PD_UTILS.safe_groupby_apply(frame.native, ["b"], _add_group_size)
    expected_rows = [
        [1.0, "a", 2],
        [1.0, "b", 2],
        [float("nan"), "c", 2],
        [float("nan"), "d", 2],
    ]
    # NaN never compares equal to itself, so compare textual forms instead.
    assert repr(expected_rows) == repr(by_double.values.tolist())
def _apply_schema(
    self, pdf: pd.DataFrame, schema: Optional[Schema]
) -> Tuple[pd.DataFrame, Schema]:
    """Reconcile a pandas frame with an optional schema and enforce types.

    The frame is first checked with ``PD_UTILS.ensure_compatible``. What
    happens next depends on whether its column index has ``object`` dtype
    (treated here as "columns are named"):

    * Named columns: a schema is derived from the frame via
      ``_input_schema``. If ``schema`` is ``None`` or equal to the derived
      one, the frame is returned untouched together with the derived
      schema. Otherwise the frame is narrowed/reordered to the names in
      ``schema`` before type enforcement.
    * Unnamed columns: ``schema`` is passed through ``_input_schema`` and
      must be non-empty, and its length must equal the frame's column
      count. The schema's names are then assigned to ``pdf.columns`` --
      note this assignment happens on the frame object that was passed in.

    :param pdf: the pandas frame to process
    :param schema: the requested schema, or ``None``
    :return: a tuple of the resulting frame and the schema that applies
    :raises ValueError: when the column index is not ``object`` dtype and
        the column count differs from the schema length (raised through
        ``assert_or_throw``)
    """
    PD_UTILS.ensure_compatible(pdf)
    has_named_columns = pdf.columns.dtype == "object"

    if not has_named_columns:
        # pdf has no named schema: names must come from the given schema
        schema = _input_schema(schema).assert_not_empty()
        column_count_matches = pdf.shape[1] == len(schema)
        mismatch_error = ValueError(
            f"Pandas datafame column count doesn't match {schema}"
        )
        assert_or_throw(column_count_matches, mismatch_error)
        pdf.columns = schema.names
        return _enforce_type(pdf, schema), schema

    # pdf has named schema: derive it and short-circuit when nothing changes
    derived_schema = _input_schema(pdf)
    if schema is None or derived_schema == schema:
        return pdf, derived_schema.assert_not_empty()

    wanted_names = schema.assert_not_empty().names
    selected = pdf[wanted_names]
    return _enforce_type(selected, schema), schema
def test_safe_group_by_apply_special_types():
    """Check ``PD_UTILS.safe_groupby_apply`` with null-bearing two-column keys.

    Each case groups on both ``a`` and ``b`` and expects a ``ct`` column
    holding the size of the row's group. The value column is a double, a
    datetime, and a date in turn; the last case uses non-null ``a`` values
    with a single null date.
    """

    def _add_group_size(group):
        PD_UTILS.ensure_compatible(group)
        group["ct"] = group.shape[0]
        return group

    def _check(rows, schema, expected_rows):
        # Build the input, run the grouped apply, then validate the shape
        # and the content of the result against the expected frame.
        source = DF(rows, schema, True)
        result = PD_UTILS.safe_groupby_apply(
            source.native, ["a", "b"], _add_group_size
        )
        PD_UTILS.ensure_compatible(result)
        assert result.shape[0] == 4
        assert result.shape[1] == 3
        DF(expected_rows, schema + ",ct:int", True).assert_eq(result)

    # double values with nulls in both key columns
    _check(
        [["a", 1.0], [None, 3.0], [None, 3.0], [None, None]],
        "a:str,b:double",
        [["a", 1.0, 1], [None, 3.0, 2], [None, 3.0, 2], [None, None, 1]],
    )

    # datetime values
    moment = datetime.now()
    _check(
        [["a", moment], [None, moment], [None, moment], [None, None]],
        "a:str,b:datetime",
        [["a", moment, 1], [None, moment, 2], [None, moment, 2], [None, None, 1]],
    )

    # date values with null string keys
    day = date(2020, 1, 1)
    _check(
        [["a", day], [None, day], [None, day], [None, None]],
        "a:str,b:date",
        [["a", day, 1], [None, day, 2], [None, day, 2], [None, None, 1]],
    )

    # date values where only the date key is ever null
    day = date(2020, 1, 1)
    _check(
        [["a", day], ["b", day], ["b", day], ["b", None]],
        "a:str,b:date",
        [["a", day, 1], ["b", day, 2], ["b", day, 2], ["b", None, 1]],
    )
def _m1(df):
    """Validate ``df`` and tag every row with the frame's row count.

    The frame is checked with ``PD_UTILS.ensure_compatible``, then a ``ct``
    column is set on it (in place) to ``df.shape[0]`` and the same object
    is returned.
    """
    PD_UTILS.ensure_compatible(df)
    row_count = df.shape[0]
    df["ct"] = row_count
    return df