def test_datamation_groupby_multiple():
    """Multi-column groupby (mean/sum/prod) records operations and captures states.

    Verifies that a DatamationFrame grouped on two (salary data) or three
    (penguins data) columns tracks the groupby + aggregation operation names,
    keeps exactly two states (original frame, aggregated frame), and produces
    the expected aggregate values.
    """
    df = small_salary().df
    df = DatamationFrame(df)

    # Group by Degree, Work — mean
    mean = df.groupby(['Degree', 'Work']).mean()
    assert "groupby" in mean.operations
    assert "mean" in mean.operations
    assert len(mean.states) == 2
    assert df.equals(mean.states[0])
    assert mean.Salary.Masters.Academia == 84.0298831968801
    assert mean.Salary.Masters.Industry == 91.22576155606282
    assert mean.Salary.PhD.Academia == 85.55796571969728
    assert mean.Salary.PhD.Industry == 93.08335885824636

    # sum — named `summed` so the builtin `sum` is not shadowed
    summed = df.groupby(['Degree', 'Work']).sum()
    assert "groupby" in summed.operations
    assert "sum" in summed.operations
    assert len(summed.states) == 2
    assert df.equals(summed.states[0])
    assert summed.Salary.Masters.Academia == 840.2988319688011
    assert summed.Salary.Masters.Industry == 5655.997216475895
    assert summed.Salary.PhD.Academia == 1540.043382954551
    assert summed.Salary.PhD.Industry == 930.8335885824636

    # product
    product = df.groupby(['Degree', 'Work']).prod()
    assert "groupby" in product.operations
    assert "product" in product.operations
    assert len(product.states) == 2
    assert df.equals(product.states[0])
    assert product.Salary.Masters.Academia == 1.753532557780977e+19
    assert product.Salary.Masters.Industry == 3.3602152421057308e+121
    assert product.Salary.PhD.Academia == 6.027761935702164e+34
    assert product.Salary.PhD.Industry == 4.8818435443657834e+19

    # Group by species, island, sex
    df = DatamationFrame(load_penguins())
    mean = df.groupby(['species', 'island', 'sex']).mean()
    assert "groupby" in mean.operations
    assert "mean" in mean.operations
    assert len(mean.states) == 2
    assert df.equals(mean.states[0])
    assert mean.bill_length_mm.Adelie.Biscoe.male == approx(40.5909090909091)
    assert mean.bill_length_mm.Adelie.Biscoe.female == approx(
        37.35909090909092)
def _load_penguins():
    """Load the penguins dataset and return its (features, target) split,
    printing a short preview of the feature frame along the way."""
    print("🐧 loading penguins...")
    from palmerpenguins import load_penguins
    features, target = load_penguins(return_X_y=True)
    print(features.head())
    return features, target
def train():
    """Fit two shallow baseline classifiers on the penguins data and persist
    each fitted model to model/<name>.pkl via joblib."""
    data, target = load_penguins(return_X_y=True, drop_na=True)
    # Keyed by the short name used for the on-disk filename.
    models = {
        "rf": RandomForestClassifier(max_depth=3, random_state=0),
        "dt": DecisionTreeClassifier(max_depth=3, random_state=0),
    }
    for model_name, estimator in models.items():
        estimator.fit(data, target)
        joblib.dump(estimator, f"model/{model_name}.pkl")
        print(f"save model: {model_name}")
def data_sourcing():
    """Import the project's data.

    This template function uses the Palmer Penguins dataset as a
    placeholder; replace it with your own data-import code. Returns only
    the measurement, sex, and species columns.
    """
    selected_columns = [
        "bill_length_mm",
        "bill_depth_mm",
        "flipper_length_mm",
        "body_mass_g",
        "sex",
        "species",
    ]
    return palmerpenguins.load_penguins()[selected_columns]
def input_dataframe():
    """Return the penguins dataset as a DataFrame."""
    return load_penguins()
def penguins_data():
    """Load the penguins dataset and return its simplified form."""
    raw = load_penguins()
    return simplify(raw)