Esempio n. 1
0
 def anti_join(
         self, y, on=None, on_x=None, on_y=None, sort=True, suffix_y="_y"):
     """Delegate to ``tidyframe.anti_join`` and return its pandas form.

     ``self._obj`` is wrapped in a ``tidyframe`` without copying or
     checking; ``y`` is wrapped without copying but with ``check=True``.
     All join options are forwarded unchanged and the outcome is converted
     with ``to_pandas(copy=False)``.
     """
     left = tidyframe(self._obj, copy=False, check=False)
     right = tidyframe(y, copy=False, check=True)
     joined = left.anti_join(
         right, on=on, on_x=on_x, on_y=on_y, sort=sort, suffix_y=suffix_y)
     return joined.to_pandas(copy=False)
Esempio n. 2
0
def test_summarise(penguins_data):
    """Check ungrouped ``summarise`` in dictionary mode and in across mode.

    Each result is compared with a one-row frame built directly with
    numpy/pandas from the same fixture.
    """
    tidy = tidyframe(penguins_data, copy=False)
    bill_length = penguins_data["bill_length_mm"]
    bill_depth = penguins_data["bill_depth_mm"]

    # dictionary mode: one expression string producing two output columns
    expected = pd.DataFrame({
        "A": [np.mean(bill_length + bill_depth)],
        "B": [np.mean(bill_length - bill_depth)],
    })
    spec = {
        ('A', 'B'): ("[np.mean(x + y), np.mean(x - y)]",
                     ["bill_length_mm", "bill_depth_mm"]),
    }
    result = tidy.summarise(spec).to_pandas()
    assert_frame_equal_v2(expected, result)

    # summarise across: the same function applied to each named column
    expected = pd.DataFrame({
        "bill_length_mm": [np.mean(bill_length)],
        "bill_depth_mm": [np.mean(bill_depth)],
    })
    summarised = tidy.summarise(
        func=np.mean, column_names=['bill_length_mm', 'bill_depth_mm'])
    result = summarised.to_pandas()
    assert_frame_equal_v2(expected, result)
Esempio n. 3
0
def test_mutate_across(penguins_data):
    """Check across-style ``mutate`` (``column_names`` + ``func`` + ``prefix``).

    Covers the ungrouped call and the ``by='species'`` call; each result is
    compared with a frame assembled directly in pandas.
    """
    tidy = tidyframe(penguins_data, copy=False)

    def with_demeaned(frame):
        # Append demean_<col> = col - mean(col) for both measured columns.
        length = frame["bill_length_mm"]
        mass = frame["body_mass_g"]
        return frame.assign(
            demean_bill_length_mm=length - np.mean(length),
            demean_body_mass_g=mass - np.mean(mass))

    # across mode with column names
    expected = with_demeaned(penguins_data[['bill_length_mm', 'body_mass_g']])
    selected = tidy.select(['bill_length_mm', 'body_mass_g'])
    mutated = selected.mutate(
        column_names=['bill_length_mm', 'body_mass_g'],
        func=lambda x: x - np.mean(x),
        prefix="demean_")
    result = mutated.to_pandas()
    assert_frame_equal_v2(expected, result)

    # grouped across with column names
    subset = penguins_data[['bill_length_mm', 'body_mass_g', 'species']]
    expected = subset.groupby("species").apply(with_demeaned)

    selected = tidy.select(['bill_length_mm', 'body_mass_g', 'species'])
    mutated = selected.mutate(
        column_names=['bill_length_mm', 'body_mass_g'],
        func=lambda x: x - np.mean(x),
        prefix="demean_",
        by='species')
    result = mutated.to_pandas()
    assert_frame_equal_v2(expected, result)
Esempio n. 4
0
def test_mutate_groupby(penguins_data):
    """Check ``mutate`` with a string expression evaluated per ``sex`` group."""
    tidy = tidyframe(penguins_data, copy=False)

    def add_year_mod(group):
        # year_mod = year + mean(year) - 4015, computed within one group.
        years = group["year"]
        return group.assign(year_mod=years + np.mean(years) - 4015)

    # dropna=False keeps rows whose ``sex`` is missing as their own group.
    grouped = penguins_data.groupby("sex", dropna=False)
    expected = grouped.apply(add_year_mod)

    spec = {'year_mod': "x['year'] + np.mean(x['year']) - 4015"}
    result = tidy.mutate(spec, by='sex').to_pandas()
    assert_frame_equal_v2(expected, result)
Esempio n. 5
0
 def separate(
         self, column_name, into, sep='[^[:alnum:]]+', strict=True,
         keep=False):
     """Delegate to ``tidyframe.separate`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking, every argument
     is forwarded by keyword, and the outcome is converted with
     ``to_pandas(copy=False)``.

     NOTE(review): the default ``sep`` uses a POSIX ``[:alnum:]`` class,
     which Python's ``re`` module does not treat as a character class --
     confirm how ``tidyframe.separate`` interprets it.
     """
     options = dict(
         column_name=column_name, into=into, sep=sep, strict=strict, keep=keep)
     wrapped = tidyframe(self._obj, copy=False, check=False)
     return wrapped.separate(**options).to_pandas(copy=False)
Esempio n. 6
0
 def fill_na(
         self, column_direction_dict, order_by=None, ascending=True,
         na_position="last", by=None):
     """Delegate to ``tidyframe.fill_na`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; all arguments
     are forwarded by keyword and the outcome is converted with
     ``to_pandas(copy=False)``.
     """
     wrapped = tidyframe(self._obj, copy=False, check=False)
     filled = wrapped.fill_na(
         column_direction_dict=column_direction_dict,
         order_by=order_by,
         ascending=ascending,
         na_position=na_position,
         by=by,
     )
     return filled.to_pandas(copy=False)
Esempio n. 7
0
 def pivot_longer(
         self, cols, names_to="name", values_to="value", include=True,
         values_drop_na=False):
     """Delegate to ``tidyframe.pivot_longer`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; all arguments
     are forwarded by keyword and the outcome is converted with
     ``to_pandas(copy=False)``.
     """
     options = dict(
         cols=cols,
         names_to=names_to,
         values_to=values_to,
         include=include,
         values_drop_na=values_drop_na,
     )
     wrapped = tidyframe(self._obj, copy=False, check=False)
     return wrapped.pivot_longer(**options).to_pandas(copy=False)
Esempio n. 8
0
def test_summarise_groupby(penguins_data):
    """Check grouped ``summarise`` against hand-built pandas aggregations.

    Three variants are exercised, all grouped by ``species`` and ``sex``:
    a ``(function, column)`` spec, a callable spec receiving extra keyword
    arguments, and across mode driven by a dtype predicate. Both sides are
    sorted by the grouping keys and re-indexed before comparison.
    """
    tidy = tidyframe(penguins_data, copy=False)

    def canonical(frame):
        # Sort by the grouping keys and renumber the rows.
        return frame.sort_values(["species", "sex"]).reset_index(drop=True)

    def reference(per_group):
        # Apply ``per_group`` to every (species, sex) group; dropna=False
        # keeps groups whose key is missing.
        grouped = penguins_data.groupby(["species", "sex"], dropna=False)
        return grouped.apply(per_group).reset_index()

    # groupby summarise with a (function, column) spec
    expected = reference(lambda x: np.mean(x["year"]))
    expected = canonical(expected.rename(columns={0: "a_mean"}))
    expected = expected.convert_dtypes()

    summarised = tidy.summarise({"a_mean": (np.mean, 'year')},
                                by=['species', 'sex'])
    result = canonical(summarised.to_pandas())
    assert_frame_equal_v2(expected, result)

    # groupby summarise with kwargs
    expected = reference(lambda x: np.mean(x["year"]) + 4)
    expected = canonical(expected.rename(columns={0: "a_mean"}))
    expected = expected.convert_dtypes()

    spec = {"a_mean": lambda x, **kwargs: x['year'].mean() + kwargs['leap']}
    summarised = tidy.summarise(spec, by=['species', 'sex'], leap=4)
    result = canonical(summarised.to_pandas())
    assert_frame_equal_v2(expected, result)

    # groupby summarise across
    expected = reference(
        lambda x: x.loc[:, x.columns[x.apply(types.is_numeric_dtype, axis=0)]]
        .mean()
        .add_prefix("avg_"))
    expected = canonical(expected)
    expected = expected.convert_dtypes()

    summarised = tidy.summarise(func=np.mean,
                                predicate=types.is_numeric_dtype,
                                prefix="avg_",
                                by=['species', 'sex'])
    result = canonical(summarised.to_pandas())
    assert_frame_equal_v2(expected, result)
Esempio n. 9
0
 def slice_max(
         self, n=None, prop=None, order_by_column=None, with_ties=True,
         rounding_type="round", by=None):
     """Delegate to ``tidyframe.slice_max`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; all arguments
     are forwarded by keyword and the outcome is converted with
     ``to_pandas(copy=False)``.
     """
     wrapped = tidyframe(self._obj, copy=False, check=False)
     sliced = wrapped.slice_max(
         n=n,
         prop=prop,
         order_by_column=order_by_column,
         with_ties=with_ties,
         rounding_type=rounding_type,
         by=by,
     )
     return sliced.to_pandas(copy=False)
Esempio n. 10
0
 def slice_sample(
         self, n=None, prop=None, replace=False, weights=None,
         random_state=None, by=None):
     """Delegate to ``tidyframe.slice_sample`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; all arguments
     are forwarded by keyword and the outcome is converted with
     ``to_pandas(copy=False)``.
     """
     options = dict(
         n=n,
         prop=prop,
         replace=replace,
         weights=weights,
         random_state=random_state,
         by=by,
     )
     wrapped = tidyframe(self._obj, copy=False, check=False)
     return wrapped.slice_sample(**options).to_pandas(copy=False)
Esempio n. 11
0
 def pivot_wider(
         self, names_from, values_from, values_fill=None, values_fn=None,
         id_cols=None, sep="__"):
     """Delegate to ``tidyframe.pivot_wider`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; all arguments
     are forwarded by keyword and the outcome is converted with
     ``to_pandas(copy=False)``.
     """
     wrapped = tidyframe(self._obj, copy=False, check=False)
     wider = wrapped.pivot_wider(
         names_from=names_from,
         values_from=values_from,
         values_fill=values_fill,
         values_fn=values_fn,
         id_cols=id_cols,
         sep=sep,
     )
     return wider.to_pandas(copy=False)
Esempio n. 12
0
 def group_modify(
         self, func, by, preserve_row_order=False,
         row_order_column_name="rowid_temp", is_pandas_udf=False, **kwargs):
     """Delegate to ``tidyframe.group_modify`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking. The named
     arguments are forwarded by keyword, any extra ``**kwargs`` are passed
     along as-is, and the outcome is converted with
     ``to_pandas(copy=False)``.
     """
     wrapped = tidyframe(self._obj, copy=False, check=False)
     modified = wrapped.group_modify(
         func=func,
         by=by,
         preserve_row_order=preserve_row_order,
         row_order_column_name=row_order_column_name,
         is_pandas_udf=is_pandas_udf,
         **kwargs,
     )
     return modified.to_pandas(copy=False)
Esempio n. 13
0
def test_mutate_orderby(penguins_data):
    """Check ``mutate`` with ``order_by``: cumulative sum in sorted order.

    The expected frame sorts by ``bill_length_mm``, accumulates ``year``,
    then restores the original row order via ``sort_index``.
    """
    tidy = tidyframe(penguins_data, copy=False)

    subset = penguins_data[['year', 'species', 'bill_length_mm']]
    ordered = subset.sort_values("bill_length_mm")
    expected = ordered.assign(year_cumsum=np.cumsum(ordered["year"]))
    expected = expected.sort_index()

    selected = tidy.select(['year', 'species', 'bill_length_mm'])
    mutated = selected.mutate({'year_cumsum': (np.cumsum, 'year')},
                              order_by='bill_length_mm')
    result = mutated.to_pandas()
    assert_frame_equal_v2(expected, result)
Esempio n. 14
0
 def summarise(
         self, dictionary=None, func=None, column_names=None, predicate=None,
         prefix="", by=None, **kwargs):
     """Delegate to ``tidyframe.summarise`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking. The named
     arguments are forwarded by keyword, any extra ``**kwargs`` are passed
     along as-is, and the outcome is converted with
     ``to_pandas(copy=False)``.
     """
     wrapped = tidyframe(self._obj, copy=False, check=False)
     summary = wrapped.summarise(
         dictionary=dictionary,
         func=func,
         column_names=column_names,
         predicate=predicate,
         prefix=prefix,
         by=by,
         **kwargs,
     )
     return summary.to_pandas(copy=False)
Esempio n. 15
0
def test_select(penguins_data):
    """Check ``select`` by predicate and by excluded column names.

    The predicate case keeps every column whose converted dtype is not
    ``"string"``; the exclusion case drops ``sex`` and ``species``.
    """
    penguins_tidy = tidyframe(penguins_data, copy=False)

    # Convert once and reuse for both the column index and the dtype mask.
    converted = penguins_data.convert_dtypes()
    sel_cols = converted.columns[
        converted.apply(lambda x: x.dtype != "string", axis=0)]
    expected = penguins_data.loc[:, sel_cols]
    result = penguins_tidy.select(
        predicate=lambda x: x.dtype != "string").to_pandas()
    assert_frame_equal_v2(expected, result)

    # Build the complement in the frame's own column order. Going through
    # ``set.difference`` would make the expected column order depend on
    # string hashing and therefore vary between interpreter runs.
    excluded = {'sex', 'species'}
    sel_cols = [col for col in penguins_data.columns if col not in excluded]
    expected = penguins_data.loc[:, sel_cols]
    result = penguins_tidy.select(['sex', 'species'],
                                  include=False).to_pandas()
    assert_frame_equal_v2(expected, result)
Esempio n. 16
0
 def separate_rows(self, column_name, sep=';'):
     """Delegate to ``tidyframe.separate_rows`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; the result is
     converted with ``to_pandas(copy=False)``.
     """
     wrapped = tidyframe(self._obj, copy=False, check=False)
     expanded = wrapped.separate_rows(column_name=column_name, sep=sep)
     return expanded.to_pandas(copy=False)
Esempio n. 17
0
 def add_group_number(self, by=None, name='group_number'):
     """Delegate to ``tidyframe.add_group_number`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; the result is
     converted with ``to_pandas(copy=False)``.
     """
     wrapped = tidyframe(self._obj, copy=False, check=False)
     numbered = wrapped.add_group_number(by=by, name=name)
     return numbered.to_pandas(copy=False)
Esempio n. 18
0
 def add_row_number(self, name='row_number', by=None):
     """Delegate to ``tidyframe.add_row_number`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; the result is
     converted with ``to_pandas(copy=False)``.
     """
     wrapped = tidyframe(self._obj, copy=False, check=False)
     numbered = wrapped.add_row_number(name=name, by=by)
     return numbered.to_pandas(copy=False)
Esempio n. 19
0
 def colnames(self):
     """Return the ``colnames`` attribute of a ``tidyframe`` over ``self._obj``.

     The wrapper is built without copying or checking the underlying data.
     """
     return tidyframe(self._obj, copy=False, check=False).colnames
Esempio n. 20
0
 def dim(self):
     """Return the ``dim`` attribute of a ``tidyframe`` over ``self._obj``.

     The wrapper is built without copying or checking the underlying data.
     """
     return tidyframe(self._obj, copy=False, check=False).dim
Esempio n. 21
0
 def shape(self):
     """Return the ``shape`` attribute of a ``tidyframe`` over ``self._obj``.

     The wrapper is built without copying or checking the underlying data.
     """
     return tidyframe(self._obj, copy=False, check=False).shape
Esempio n. 22
0
 def split(self, by):
     """Delegate to ``tidyframe.split`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; the result is
     converted with ``to_pandas(copy=False)``.

     NOTE(review): this assumes ``tidyframe.split`` returns an object that
     exposes ``to_pandas`` -- confirm it does not return a plain list.
     """
     wrapped = tidyframe(self._obj, copy=False, check=False)
     pieces = wrapped.split(by=by)
     return pieces.to_pandas(copy=False)
Esempio n. 23
0
 def unnest(self, nest_column_name='data'):
     """Delegate to ``tidyframe.unnest`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; the result is
     converted with ``to_pandas(copy=False)``.
     """
     wrapped = tidyframe(self._obj, copy=False, check=False)
     flattened = wrapped.unnest(nest_column_name=nest_column_name)
     return flattened.to_pandas(copy=False)
Esempio n. 24
0
 def ncol(self):
     """Return the ``ncol`` attribute of a ``tidyframe`` over ``self._obj``.

     The wrapper is built without copying or checking the underlying data.
     """
     return tidyframe(self._obj, copy=False, check=False).ncol
Esempio n. 25
0
 def nest(self, column_names=None, nest_column_name='data', include=True):
     """Delegate to ``tidyframe.nest`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; all arguments
     are forwarded by keyword and the outcome is converted with
     ``to_pandas(copy=False)``.
     """
     options = dict(
         column_names=column_names,
         nest_column_name=nest_column_name,
         include=include,
     )
     wrapped = tidyframe(self._obj, copy=False, check=False)
     return wrapped.nest(**options).to_pandas(copy=False)
Esempio n. 26
0
 def nest_by(self, by=None, nest_column_name='data', drop_by=True):
     """Delegate to ``tidyframe.nest_by`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; all arguments
     are forwarded by keyword and the outcome is converted with
     ``to_pandas(copy=False)``.
     """
     wrapped = tidyframe(self._obj, copy=False, check=False)
     nested = wrapped.nest_by(
         by=by, nest_column_name=nest_column_name, drop_by=drop_by)
     return nested.to_pandas(copy=False)
Esempio n. 27
0
 def select(self, column_names=None, predicate=None, include=True):
     """Delegate to ``tidyframe.select`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; all arguments
     are forwarded by keyword and the outcome is converted with
     ``to_pandas(copy=False)``.
     """
     wrapped = tidyframe(self._obj, copy=False, check=False)
     chosen = wrapped.select(
         column_names=column_names, predicate=predicate, include=include)
     return chosen.to_pandas(copy=False)
Esempio n. 28
0
 def drop_na(self, column_names=None):
     """Delegate to ``tidyframe.drop_na`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; the result is
     converted with ``to_pandas(copy=False)``.
     """
     wrapped = tidyframe(self._obj, copy=False, check=False)
     cleaned = wrapped.drop_na(column_names=column_names)
     return cleaned.to_pandas(copy=False)
Esempio n. 29
0
 def relocate(self, column_names, before=None, after=None):
     """Delegate to ``tidyframe.relocate`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; all arguments
     are forwarded by keyword and the outcome is converted with
     ``to_pandas(copy=False)``.
     """
     wrapped = tidyframe(self._obj, copy=False, check=False)
     moved = wrapped.relocate(
         column_names=column_names, before=before, after=after)
     return moved.to_pandas(copy=False)
Esempio n. 30
0
 def unite(self, column_names, into, sep="_", keep=False):
     """Delegate to ``tidyframe.unite`` and return its pandas form.

     ``self._obj`` is wrapped without copying or checking; all arguments
     are forwarded by keyword and the outcome is converted with
     ``to_pandas(copy=False)``.
     """
     options = dict(column_names=column_names, into=into, sep=sep, keep=keep)
     wrapped = tidyframe(self._obj, copy=False, check=False)
     return wrapped.unite(**options).to_pandas(copy=False)