def anti_join(self, y, on=None, on_x=None, on_y=None, sort=True, suffix_y="_y"):
    """Delegate to ``tidyframe.anti_join`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False``; ``y`` is
    wrapped with ``copy=False, check=True``. The remaining arguments are
    forwarded unchanged by keyword.
    """
    left = tidyframe(self._obj, copy=False, check=False)
    right = tidyframe(y, copy=False, check=True)
    joined = left.anti_join(
        right,
        on=on,
        on_x=on_x,
        on_y=on_y,
        sort=sort,
        suffix_y=suffix_y,
    )
    return joined.to_pandas(copy=False)
def test_summarise(penguins_data):
    """Check ``tidyframe.summarise`` in dictionary mode and in across mode."""
    tidy = tidyframe(penguins_data, copy=False)
    bill_length = penguins_data["bill_length_mm"]
    bill_depth = penguins_data["bill_depth_mm"]

    # One string expression producing two output columns at once.
    want = pd.DataFrame({
        "A": [np.mean(bill_length + bill_depth)],
        "B": [np.mean(bill_length - bill_depth)],
    })
    spec = {
        ('A', 'B'): ("[np.mean(x + y), np.mean(x - y)]",
                     ["bill_length_mm", "bill_depth_mm"]),
    }
    got = tidy.summarise(spec).to_pandas()
    assert_frame_equal_v2(want, got)

    # Across mode: the same function applied to each listed column.
    want = pd.DataFrame({
        "bill_length_mm": [np.mean(bill_length)],
        "bill_depth_mm": [np.mean(bill_depth)],
    })
    summarised = tidy.summarise(
        func=np.mean,
        column_names=['bill_length_mm', 'bill_depth_mm'],
    )
    got = summarised.to_pandas()
    assert_frame_equal_v2(want, got)
def test_mutate_across(penguins_data):
    """Check across-mode ``mutate`` (ungrouped and grouped by species)."""
    tidy = tidyframe(penguins_data, copy=False)

    def demeaned(column):
        # Build the callable handed to ``DataFrame.assign`` for one column.
        return lambda frame: frame[column] - np.mean(frame[column])

    def with_demeaned_columns(frame):
        # Append the two ``demean_*`` columns, one ``assign`` call per column.
        step = frame.assign(demean_bill_length_mm=demeaned("bill_length_mm"))
        return step.assign(demean_body_mass_g=demeaned("body_mass_g"))

    # across mode with column names
    expected = with_demeaned_columns(
        penguins_data[['bill_length_mm', 'body_mass_g']])
    selected = tidy.select(['bill_length_mm', 'body_mass_g'])
    mutated = selected.mutate(
        column_names=['bill_length_mm', 'body_mass_g'],
        func=lambda x: x - np.mean(x),
        prefix="demean_",
    )
    result = mutated.to_pandas()
    assert_frame_equal_v2(expected, result)

    # grouped across with column names
    subset = penguins_data[['bill_length_mm', 'body_mass_g', 'species']]
    expected = subset.groupby("species").apply(
        lambda x: with_demeaned_columns(x))
    selected = tidy.select(['bill_length_mm', 'body_mass_g', 'species'])
    mutated = selected.mutate(
        column_names=['bill_length_mm', 'body_mass_g'],
        func=lambda x: x - np.mean(x),
        prefix="demean_",
        by='species',
    )
    result = mutated.to_pandas()
    assert_frame_equal_v2(expected, result)
def test_mutate_groupby(penguins_data):
    """Check grouped ``mutate`` with a string expression, grouped by ``sex``."""
    tidy = tidyframe(penguins_data, copy=False)

    def add_year_mod(group):
        # Per-group reference computation done directly in pandas.
        return group.assign(
            year_mod=lambda y: y["year"] + np.mean(y["year"]) - 4015)

    by_sex = penguins_data.groupby("sex", dropna=False)
    expected = by_sex.apply(lambda x: add_year_mod(x))

    expressions = {'year_mod': "x['year'] + np.mean(x['year']) - 4015"}
    result = tidy.mutate(expressions, by='sex').to_pandas()

    assert_frame_equal_v2(expected, result)
def separate(self, column_name, into, sep='[^[:alnum:]]+', strict=True, keep=False):
    """Delegate to ``tidyframe.separate`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False`` and every
    argument is forwarded by keyword.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    separated = wrapped.separate(
        column_name=column_name,
        into=into,
        sep=sep,
        strict=strict,
        keep=keep,
    )
    return separated.to_pandas(copy=False)
def fill_na(self, column_direction_dict, order_by=None, ascending=True, na_position="last", by=None):
    """Delegate to ``tidyframe.fill_na`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False`` and every
    argument is forwarded by keyword.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    filled = wrapped.fill_na(
        column_direction_dict=column_direction_dict,
        order_by=order_by,
        ascending=ascending,
        na_position=na_position,
        by=by,
    )
    return filled.to_pandas(copy=False)
def pivot_longer(self, cols, names_to="name", values_to="value", include=True, values_drop_na=False):
    """Delegate to ``tidyframe.pivot_longer`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False`` and every
    argument is forwarded by keyword.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    longer = wrapped.pivot_longer(
        cols=cols,
        names_to=names_to,
        values_to=values_to,
        include=include,
        values_drop_na=values_drop_na,
    )
    return longer.to_pandas(copy=False)
def test_summarise_groupby(penguins_data):
    """Check grouped ``summarise``: dict mode, kwargs pass-through, across mode."""
    tidy = tidyframe(penguins_data, copy=False)
    group_keys = ["species", "sex"]

    def ordered(frame):
        # Row order is normalised on both sides before comparing.
        return frame.sort_values(["species", "sex"]).reset_index(drop=True)

    def expected_a_mean(per_group):
        # pandas reference: one scalar per group, exposed as column "a_mean".
        applied = penguins_data.groupby(group_keys, dropna=False).apply(per_group)
        flat = applied.reset_index().rename(columns={0: "a_mean"})
        return ordered(flat).convert_dtypes()

    # plain grouped summarise
    expected = expected_a_mean(lambda x: np.mean(x["year"]))
    summarised = tidy.summarise({"a_mean": (np.mean, 'year')},
                                by=['species', 'sex'])
    result = ordered(summarised.to_pandas())
    assert_frame_equal_v2(expected, result)

    # groupby summarise with kwargs
    expected = expected_a_mean(lambda x: np.mean(x["year"]) + 4)
    summarised = tidy.summarise(
        {"a_mean": lambda x, **kwargs: x['year'].mean() + kwargs['leap']},
        by=['species', 'sex'],
        leap=4,
    )
    result = ordered(summarised.to_pandas())
    assert_frame_equal_v2(expected, result)

    # groupby summarise across
    def numeric_means(group):
        is_numeric = group.apply(types.is_numeric_dtype, axis=0)
        return group.loc[:, group.columns[is_numeric]].mean().add_prefix("avg_")

    applied = penguins_data.groupby(group_keys, dropna=False).apply(
        lambda x: numeric_means(x))
    expected = ordered(applied.reset_index()).convert_dtypes()
    summarised = tidy.summarise(
        func=np.mean,
        predicate=types.is_numeric_dtype,
        prefix="avg_",
        by=['species', 'sex'],
    )
    result = ordered(summarised.to_pandas())
    assert_frame_equal_v2(expected, result)
def slice_max(self, n=None, prop=None, order_by_column=None, with_ties=True, rounding_type="round", by=None):
    """Delegate to ``tidyframe.slice_max`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False`` and every
    argument is forwarded by keyword.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    sliced = wrapped.slice_max(
        n=n,
        prop=prop,
        order_by_column=order_by_column,
        with_ties=with_ties,
        rounding_type=rounding_type,
        by=by,
    )
    return sliced.to_pandas(copy=False)
def slice_sample(self, n=None, prop=None, replace=False, weights=None, random_state=None, by=None):
    """Delegate to ``tidyframe.slice_sample`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False`` and every
    argument is forwarded by keyword.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    sampled = wrapped.slice_sample(
        n=n,
        prop=prop,
        replace=replace,
        weights=weights,
        random_state=random_state,
        by=by,
    )
    return sampled.to_pandas(copy=False)
def pivot_wider(self, names_from, values_from, values_fill=None, values_fn=None, id_cols=None, sep="__"):
    """Delegate to ``tidyframe.pivot_wider`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False`` and every
    argument is forwarded by keyword.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    wider = wrapped.pivot_wider(
        names_from=names_from,
        values_from=values_from,
        values_fill=values_fill,
        values_fn=values_fn,
        id_cols=id_cols,
        sep=sep,
    )
    return wider.to_pandas(copy=False)
def group_modify(self, func, by, preserve_row_order=False, row_order_column_name="rowid_temp", is_pandas_udf=False, **kwargs):
    """Delegate to ``tidyframe.group_modify`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False``. Named
    arguments are forwarded by keyword and any extra ``**kwargs`` are
    passed through as-is.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    modified = wrapped.group_modify(
        func=func,
        by=by,
        preserve_row_order=preserve_row_order,
        row_order_column_name=row_order_column_name,
        is_pandas_udf=is_pandas_udf,
        **kwargs,
    )
    return modified.to_pandas(copy=False)
def test_mutate_orderby(penguins_data):
    """Check ``mutate`` with ``order_by``: cumulative sum of ``year`` taken in
    ``bill_length_mm`` order, compared against a pandas reference that is
    put back into index order."""
    tidy = tidyframe(penguins_data, copy=False)

    # pandas reference: sort, accumulate, then undo the sort via the index.
    subset = penguins_data[['year', 'species', 'bill_length_mm']]
    by_bill_length = subset.sort_values("bill_length_mm")
    accumulated = by_bill_length.assign(
        year_cumsum=lambda x: np.cumsum(x["year"]))
    expected = accumulated.sort_index()

    selected = tidy.select(['year', 'species', 'bill_length_mm'])
    mutated = selected.mutate({'year_cumsum': (np.cumsum, 'year')},
                              order_by='bill_length_mm')
    result = mutated.to_pandas()

    assert_frame_equal_v2(expected, result)
def summarise(self, dictionary=None, func=None, column_names=None, predicate=None, prefix="", by=None, **kwargs):
    """Delegate to ``tidyframe.summarise`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False``. Named
    arguments are forwarded by keyword and any extra ``**kwargs`` are
    passed through as-is.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    summary = wrapped.summarise(
        dictionary=dictionary,
        func=func,
        column_names=column_names,
        predicate=predicate,
        prefix=prefix,
        by=by,
        **kwargs,
    )
    return summary.to_pandas(copy=False)
def test_select(penguins_data):
    """Check ``tidyframe.select`` by predicate and by exclusion list."""
    penguins_tidy = tidyframe(penguins_data, copy=False)

    # Predicate mode: keep every column whose converted dtype is not "string".
    # ``convert_dtypes`` is computed once and reused for columns and mask.
    converted = penguins_data.convert_dtypes()
    not_string = converted.apply(lambda x: x.dtype != "string", axis=0)
    sel_cols = converted.columns[not_string]
    expected = penguins_data.loc[:, sel_cols]
    result = penguins_tidy.select(
        predicate=lambda x: x.dtype != "string").to_pandas()
    assert_frame_equal_v2(expected, result)

    # Exclusion mode: build the expected column list by filtering in the
    # frame's own column order. A set difference would yield an order that
    # depends on string hash randomisation and so varies between runs.
    excluded = {'sex', 'species'}
    sel_cols = [name for name in penguins_data.columns if name not in excluded]
    expected = penguins_data.loc[:, sel_cols]
    result = penguins_tidy.select(['sex', 'species'],
                                  include=False).to_pandas()
    assert_frame_equal_v2(expected, result)
def separate_rows(self, column_name, sep=';'):
    """Delegate to ``tidyframe.separate_rows`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False``.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    expanded = wrapped.separate_rows(column_name=column_name, sep=sep)
    return expanded.to_pandas(copy=False)
def add_group_number(self, by=None, name='group_number'):
    """Delegate to ``tidyframe.add_group_number`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False``.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    numbered = wrapped.add_group_number(by=by, name=name)
    return numbered.to_pandas(copy=False)
def add_row_number(self, name='row_number', by=None):
    """Delegate to ``tidyframe.add_row_number`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False``.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    numbered = wrapped.add_row_number(name=name, by=by)
    return numbered.to_pandas(copy=False)
def colnames(self):
    """Return the ``colnames`` attribute of a ``tidyframe`` built from
    ``self._obj`` (``copy=False, check=False``)."""
    return tidyframe(self._obj, copy=False, check=False).colnames
def dim(self):
    """Return the ``dim`` attribute of a ``tidyframe`` built from
    ``self._obj`` (``copy=False, check=False``)."""
    return tidyframe(self._obj, copy=False, check=False).dim
def shape(self):
    """Return the ``shape`` attribute of a ``tidyframe`` built from
    ``self._obj`` (``copy=False, check=False``)."""
    return tidyframe(self._obj, copy=False, check=False).shape
def split(self, by):
    """Delegate to ``tidyframe.split`` and return ``to_pandas(copy=False)``
    called on whatever it returns.

    ``self._obj`` is wrapped with ``copy=False, check=False``.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    # NOTE(review): this relies on the value returned by ``tidyframe.split``
    # exposing ``to_pandas`` — confirm against the tidyframe implementation.
    pieces = wrapped.split(by=by)
    return pieces.to_pandas(copy=False)
def unnest(self, nest_column_name='data'):
    """Delegate to ``tidyframe.unnest`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False``.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    unnested = wrapped.unnest(nest_column_name=nest_column_name)
    return unnested.to_pandas(copy=False)
def ncol(self):
    """Return the ``ncol`` attribute of a ``tidyframe`` built from
    ``self._obj`` (``copy=False, check=False``)."""
    return tidyframe(self._obj, copy=False, check=False).ncol
def nest(self, column_names=None, nest_column_name='data', include=True):
    """Delegate to ``tidyframe.nest`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False`` and every
    argument is forwarded by keyword.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    nested = wrapped.nest(
        column_names=column_names,
        nest_column_name=nest_column_name,
        include=include,
    )
    return nested.to_pandas(copy=False)
def nest_by(self, by=None, nest_column_name='data', drop_by=True):
    """Delegate to ``tidyframe.nest_by`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False`` and every
    argument is forwarded by keyword.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    nested = wrapped.nest_by(
        by=by,
        nest_column_name=nest_column_name,
        drop_by=drop_by,
    )
    return nested.to_pandas(copy=False)
def select(self, column_names=None, predicate=None, include=True):
    """Delegate to ``tidyframe.select`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False`` and every
    argument is forwarded by keyword.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    chosen = wrapped.select(
        column_names=column_names,
        predicate=predicate,
        include=include,
    )
    return chosen.to_pandas(copy=False)
def drop_na(self, column_names=None):
    """Delegate to ``tidyframe.drop_na`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False``.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    cleaned = wrapped.drop_na(column_names=column_names)
    return cleaned.to_pandas(copy=False)
def relocate(self, column_names, before=None, after=None):
    """Delegate to ``tidyframe.relocate`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False`` and every
    argument is forwarded by keyword.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    moved = wrapped.relocate(
        column_names=column_names,
        before=before,
        after=after,
    )
    return moved.to_pandas(copy=False)
def unite(self, column_names, into, sep="_", keep=False):
    """Delegate to ``tidyframe.unite`` and return its ``to_pandas`` result.

    ``self._obj`` is wrapped with ``copy=False, check=False`` and every
    argument is forwarded by keyword.
    """
    wrapped = tidyframe(self._obj, copy=False, check=False)
    united = wrapped.unite(
        column_names=column_names,
        into=into,
        sep=sep,
        keep=keep,
    )
    return united.to_pandas(copy=False)