def test_discover(sample_data, expensive_stats):
    """Exercise ``discover`` with data, with an input filter, and without data.

    Three scenarios are checked:

    1. With data: the result has exactly the expected columns and one row
       per feature.
    2. With an ``input`` filter: only features whose input equals or
       contains the filter value are reported.
    3. Without data (all of X_df, y_df, y are None): the shape and columns
       are unchanged, and the ``mean`` value of the first row is NaN.
    """
    features = [
        Feature(
            'size',
            NullFiller(0),
            source='foo.features.contrib.user_a.feature_1',
        ),
        Feature(
            'strength',
            NullFiller(100),
            source='foo.features.contrib.user_b.feature_1',
        ),
    ]
    X_df, y_df = sample_data.X, sample_data.y
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype is the documented replacement and gives the same result here.
    y = np.asarray(y_df, dtype=np.float64)

    df = discover(features, X_df, y_df, y, expensive_stats=expensive_stats)

    expected_cols = {
        'name', 'description', 'input', 'transformer', 'primitives',
        'output', 'author', 'source',
        'mutual_information', 'conditional_mutual_information',
        'ninputs', 'nvalues', 'ncontinuous', 'ndiscrete',
        'mean', 'std', 'variance', 'min', 'median', 'max', 'nunique',
    }
    # set equality (rather than an empty symmetric difference) so that a
    # failure reports which columns are missing or unexpected
    assert set(df.columns) == expected_cols
    assert df.shape[0] == len(features)

    # test filter
    input_filter = 'size'  # named to avoid shadowing the ``input`` builtin
    discovery_df = discover(features, X_df, y_df, y, input=input_filter)
    n_matching = sum(
        1
        for feature in features
        if feature.input == input_filter or input_filter in feature.input
    )
    assert discovery_df.shape[0] == n_matching

    # test no data available
    # have to clear cache, as values on data already known
    ballet.discovery._summarize_feature.memory.clear()
    discovery_df = discover(features, None, None, None)
    assert discovery_df.shape[0] == len(features)
    assert set(discovery_df.columns) == expected_cols
    assert np.isnan(discovery_df['mean'].at[0])
def test_discover_target_nans(sample_data):
    """Check that a NaN in the target does not turn target statistics into NaN.

    A single NaN is written into the first target value before calling
    ``discover``; the ``mutual_information`` column must still contain no
    NaN entries.
    """
    features = [
        Feature('size', NullFiller(0)),
    ]
    X_df, y_df = sample_data.X, sample_data.y
    # np.asfarray was removed in NumPy 2.0; asarray with an explicit float
    # dtype is the documented replacement and gives the same result here.
    y = np.asarray(y_df, dtype=np.float64)

    # introduce nan to target
    y[0] = np.nan

    discovery_df = discover(features, X_df, y_df, y)

    # stats with target should still be computed
    assert not np.isnan(discovery_df['mutual_information']).any()
from ballet import Feature
from ballet.eng import NullFiller, SimpleFunctionTransformer

# Feature "Total Area": row-wise sum of three square-footage columns.

# Columns that are summed for each row.
input = ["Total Bsmt SF", "1st Flr SF", "2nd Flr SF"]

# Step 1 adds the input columns across each row (axis=1);
# step 2 runs NullFiller with its default settings on the result.
transformer = [
    SimpleFunctionTransformer(lambda frame: frame.sum(axis=1)),
    NullFiller(),
]

name = "Total Area"

feature = Feature(input=input, transformer=transformer, name=name)
from ballet import Feature
from ballet.eng import NullFiller
from sklearn.preprocessing import OneHotEncoder

# Feature "Garage finish fill": one-hot encoding of the garage finish
# column, with nulls first replaced by an explicit "Missing" category.

input = ["Garage Finish"]

transformer = [
    # replace nulls with a placeholder label so they get their own category
    NullFiller(replacement="Missing"),
    OneHotEncoder(),
]

name = "Garage finish fill"

feature = Feature(input=input, transformer=transformer, name=name)
# Imports needed by this feature live here, inside the same code cell.
from ballet import Feature
from ballet.eng import NullFiller

# Input columns for this feature (child hunger items, wave 3).
input = [
    "hv3d3",
    "hv3d10",
    "hv3d11",
    "hv3d12",
    "hv3d13",
]

# Transformations applied to the input columns, in order:
#   1. for column hv3d3 only, flag values equal to 1 or 2
#      (tuple form pairs a column with its function; presumably ballet's
#      (input, transformer) shorthand -- confirm against the ballet docs)
#   2. replace nulls with 0
#   3. add the columns up across each row
transformer = [
    ("hv3d3", lambda ser: (ser == 1) | (ser == 2)),
    NullFiller(0),
    lambda df: df.sum(axis=1),
]

# Short name of the feature.
name = "Children hungry wave 3"

# Longer human-readable description with background on the calculation.
description = "Number of ways in which child may be measured as hungry in wave 3"

# Assemble the feature from the pieces above.
feature = Feature(input, transformer, name, description)
from ballet import Feature
from ballet.eng import NullFiller
from sklearn.preprocessing import OneHotEncoder

# Feature "Basement condition type": one-hot encoding of the basement
# condition column, with nulls first replaced by the string "None".

input = ["Bsmt Cond"]

transformer = [
    # the replacement is the literal string "None", not Python's None
    NullFiller(replacement="None"),
    OneHotEncoder(),
]

name = "Basement condition type"

feature = Feature(input=input, transformer=transformer, name=name)
from ballet import Feature
from ballet.eng import NullFiller

# Feature "Total Area": row-wise sum of three square-footage columns,
# followed by NullFiller with its default settings.

input = ["Total Bsmt SF", "1st Flr SF", "2nd Flr SF"]

transformer = [
    lambda frame: frame.sum(axis=1),  # add the input columns across each row
    NullFiller(),
]

name = "Total Area"

feature = Feature(input=input, transformer=transformer, name=name)
from ballet import Feature
from ballet.eng import NullFiller

# Feature "Cleaned Masonry Veneer Area": the masonry veneer area column
# passed through NullFiller with its default settings.

input = "Mas Vnr Area"

transformer = NullFiller()

name = "Cleaned Masonry Veneer Area"

feature = Feature(input=input, transformer=transformer, name=name)
from ballet import Feature
from ballet.eng import NullFiller, SimpleFunctionTransformer

# Feature "Age": difference between the sale year and the remodel year.

input = ["Yr Sold", "Year Remod/Add"]


def calc_age(df):
    """Return the "Yr Sold" column minus the "Year Remod/Add" column."""
    year_sold = df["Yr Sold"]
    year_remodeled = df["Year Remod/Add"]
    return year_sold - year_remodeled


transformer = [
    SimpleFunctionTransformer(calc_age),
    NullFiller(),
]

name = "Age"

feature = Feature(input=input, transformer=transformer, name=name)
# Feature, BaseTransformer and NullFiller are used below but were not
# imported in this snippet; re-importing is harmless if they already are.
from ballet import Feature
from ballet.eng import BaseTransformer, NullFiller
from ballet.eng.external import SimpleImputer, StandardScaler
import pandas as pd
import numpy as np


class MedianIncomeForGroup(BaseTransformer):
    """Replace each value of a series with the median target of its group.

    During ``fit`` the target is grouped by the values of ``X`` and the
    per-group median is stored in ``income_map_``. During ``transform``
    each value of ``X`` is looked up in that mapping.

    Parameters
    ----------
    targetcol : str
        Name of the target column. It is used as the name of the target
        series when ``y`` is not already a pandas object, and as the column
        selected after grouping.
    """

    def __init__(self, targetcol="PINCP"):
        self.targetcol = targetcol

    def fit(self, X, y=None):
        """Learn the mapping from group value to median target.

        ``X`` must be a named ``pd.Series`` (``to_frame`` and ``name`` are
        used). ``y`` may be a pandas Series/DataFrame or any array-like.

        Raises
        ------
        ValueError
            If ``y`` is None; the mapping cannot be learned without targets.
        """
        if y is None:
            raise ValueError(
                "MedianIncomeForGroup requires target values (y) to fit"
            )
        if not isinstance(y, (pd.Series, pd.DataFrame)):
            # Reuse X's index: the join below aligns on index, so a fresh
            # RangeIndex would pair targets with the wrong rows (or with
            # none at all) whenever X does not have a default index.
            y = pd.Series(np.ravel(y), index=X.index, name=self.targetcol)
        self.income_map_ = (
            X.to_frame()
            .join(y)
            .groupby(by=X.name)[self.targetcol]
            .median()
            .to_dict()
        )
        return self

    def transform(self, X):
        """Look up each value of ``X`` in the mapping learned by ``fit``."""
        return X.map(self.income_map_)


input = "ANC1P"

transformer = [
    NullFiller(replacement=-1),  # don't appear to be any nans
    MedianIncomeForGroup(),
    # presumably fills values whose group was not seen during fit, since
    # Series.map yields NaN for keys missing from the mapping
    SimpleImputer(),
    np.log1p,
]

name = "log ancestry income"

description = "replace ancestry with log median income for that ancestry in training data"

feature = Feature(input, transformer, name=name, description=description)
elif rac1p_value == 9: bin_ = 2 elif rac1p_value == 6: bin_ = 3 elif rac1p_value == 2: bin_ = 4 elif fhisp_value == 1: bin_ = 5 elif rac1p_value == 1: bin_ = 6 return bin_ def bin_education(df): df["SCHL"].astype("int") df["EducationCategorized"] = df["SCHL"].apply(education_to_bin) # df["RAC1P"].astype("string") df['RaceCategorized'] = df[['RAC1P','FHISP']].apply(race_to_bin, axis=1) df['EducationRaceBinned'] = df['RaceCategorized'] + df['EducationCategorized'] return df transformer = [ NullFiller(replacement=0), bin_education, lambda df: df[["EducationRaceBinned"]], ] name = "Education and Race Binned" description = "Education was categorized with higher values corresponding to more attainment, Race was categorized by more representation having higher values. Values were then summed." feature = Feature(input, transformer, name)
from ballet import Feature
from ballet.eng import NullFiller

# Feature: property value (VALP) divided by the number of persons (NP).

input = ["VALP", "NP"]

transformer = [
    # Dividing by NP == 0 yields +/-inf, which NullFiller does not replace
    # because inf is not null. Masking those rows to NaN lets the NullFiller
    # step below handle them like any other missing value.
    lambda x: (x["VALP"] / x["NP"]).where(x["NP"] != 0),
    NullFiller(),
]

name = "Property value per household member"

description = "Property value divided by number of person in households"

feature = Feature(input, transformer, name=name, description=description)
from ballet import Feature
from ballet.eng import NullFiller

# Feature: education level (SCHL) divided by one plus the number of
# persons (NP).

input = ["NP", "SCHL"]

transformer = [
    lambda frame: frame["SCHL"] / (1 + frame["NP"]),
    NullFiller(),
]

name = "Education Household Average"

description = "Ratio between level of education and number of person in household."

feature = Feature(input, transformer, name=name, description=description)
from ballet import Feature
from ballet.eng import NullFiller, SimpleFunctionTransformer

# Feature "Total Area Calculation": row-wise sum of three square-footage
# columns, followed by NullFiller with its default settings.

input = ["Total Bsmt SF", "1st Flr SF", "2nd Flr SF"]

transformer = [
    # add the input columns across each row
    SimpleFunctionTransformer(lambda frame: frame.sum(axis=1)),
    NullFiller(),
]

name = "Total Area Calculation"

feature = Feature(input=input, transformer=transformer, name=name)
from ballet import Feature
from ballet.eng import NullFiller

# Feature "JWAP - JWDP": arrival-time column minus departure-time column.

# Input columns.
input = ["JWAP", "JWDP"]

# Subtract JWDP from JWAP, then run NullFiller with its default settings.
transformer = [
    lambda frame: frame["JWAP"] - frame["JWDP"],
    NullFiller(),
]

# Short name of the feature.
name = "JWAP - JWDP"

# Longer human-readable description.
description = "Time of arrival for work minus Time of departure for work"

feature = Feature(input, transformer, name=name, description=description)