from ballet import Feature
import ballet.eng
import sklearn.preprocessing

# Feature built from the single "Mas Vnr Area" column.
name = "Masonry veneer area in square feet type"
input = ["Mas Vnr Area"]

# Two-step pipeline: a default-configured NullFiller, then one-hot encoding
# of the filled values.
transformer = [
    ballet.eng.missing.NullFiller(),
    sklearn.preprocessing.OneHotEncoder(),
]

feature = Feature(
    input=input,
    transformer=transformer,
    name=name,
)
# taken from https://www.kaggle.com/tannercarbonati/detailed-data-analysis-ensemble-modeling
# Adapted into the ballet framework
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn

features = []

# 'Pool QC': nulls (as detected by pd.isnull) are replaced with the string
# "None", then the column is one-hot encoded.
input = ['Pool QC']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(input=input, transformer=transformer, name='PoolQC Misc Fill')
features.append(misc_fill)

input = ['Year Built', 'Garage Yr Blt']


def calc_age(df):
    """Return 'Year Built' with missing entries taken from 'Garage Yr Blt'.

    Despite its name, this function does not compute an age; it returns the
    build year, back-filled from the garage build year where it is null.

    Args:
        df: DataFrame holding the 'Year Built' and 'Garage Yr Blt' columns.

    Returns:
        A new Series named 'Year Built'. Rows where both columns are null
        stay null.
    """
    # fillna with a Series aligns on the index and returns a new object, so
    # the caller's frame is left untouched (the previous chained assignment
    # wrote into ``df`` and is a no-op under pandas copy-on-write).
    return df['Year Built'].fillna(df['Garage Yr Blt'])


transformer = ballet.eng.SimpleFunctionTransformer(func=calc_age)
age = Feature(input=input, transformer=transformer)
features.append(age)

# 'Garage Type': nulls are replaced with the string "missing", then the
# column is one-hot encoded.
input = ['Garage Type']
transformer = [
    ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(input=input, transformer=transformer)
features.append(misc_fill)

input = ['Garage Finish']
# taken from https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn
# Adapted into the ballet framework
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn

# Bare attribute access; it has no effect on the features built below.
ballet.__version__

features = []

# 'Alley': nulls (per pd.isnull) become the string "NOACCESS", then the
# column is one-hot encoded.
input = ['Alley']
transformer = [
    ballet.eng.missing.NullFiller(replacement="NOACCESS", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(
    input=input,
    transformer=transformer,
    name='Alley Misc Fill',
)
features.append(misc_fill)

input = ['MS Zoning']


def mode_filler(df):
    """Return a copy of ``df`` with nulls replaced by the 'MS Zoning' mode.

    The mode is computed from the frame passed in; when several values tie,
    the first row of ``df.mode()`` is used.
    """
    most_frequent = df.mode()['MS Zoning'][0]
    return df.copy().fillna(most_frequent)


transformer = [
    ballet.eng.SimpleFunctionTransformer(func=mode_filler),
    sklearn.preprocessing.OneHotEncoder(),
]
misc_fill = Feature(input=input, transformer=transformer)
features.append(misc_fill)

# 'MS SubClass': nulls become the string "None", then the column is one-hot
# encoded.
input = ['MS SubClass']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder(),
]
ms_fill = Feature(
    input=input,
    transformer=transformer,
    name='MS Fill None',
)
features.append(ms_fill)
from ballet import Feature


def fill_missing_with_zero(ser):
    """Return ``ser`` with its missing values replaced by 0.

    Args:
        ser: object exposing ``fillna`` (the parameter name suggests a pandas
            Series -- confirm against how ballet passes a scalar ``input``).

    Returns:
        The result of ``ser.fillna(0)``.
    """
    return ser.fillna(0)


input = "Lot Frontage"
# A named function is used instead of a lambda bound to a name, so the
# transformer has a meaningful name in tracebacks and reprs.
transformer = fill_missing_with_zero
feature = Feature(input, transformer)
import pandas as pd

from ballet import Feature
from ballet.eng.sklearn import MinMaxScaler


def fillna(df):
    """Return a copy of ``df`` with missing SCHG/SCHL/COW values filled.

    SCHG and SCHL nulls become 0. In COW, the literal "b" and nulls both
    become 9.

    Args:
        df: DataFrame with the columns "SCHG", "SCHL" and "COW".

    Returns:
        A new DataFrame; the argument is not modified.
    """
    df = df.copy()  # never write into the caller's frame
    df[["SCHG", "SCHL"]] = df[["SCHG", "SCHL"]].fillna(0)
    df["COW"] = df["COW"].replace("b", 9).fillna(9)
    return df


def filter_func(df):
    """Return a copy of ``df`` with SCHG, SCHL and COW turned into 0/1 flags.

    * SCHG -> 1 when the value is greater than 16 (graduate), else 0.
    * SCHL -> 1 when the value is greater than 21 (bachelor), else 0.
    * EDU  -> new column, the sum of the two flags above (0, 1 or 2).
    * COW  -> 1 when 1 <= value < 7, else 0.

    Args:
        df: DataFrame with the columns "SCHG", "SCHL" and "COW".

    Returns:
        A new DataFrame with the three columns replaced and "EDU" added; the
        argument is not modified.
    """
    df = df.copy()  # never write into the caller's frame

    # The earlier ``astype("int")`` calls discarded their result, so no
    # conversion ever happened. Convert explicitly; to_numeric keeps any
    # remaining NaN, which then compares as False exactly as before.
    schg = pd.to_numeric(df["SCHG"])
    schl = pd.to_numeric(df["SCHL"])
    cow = pd.to_numeric(df["COW"])

    df["SCHG"] = (schg > 16).astype("int")  # graduate
    df["SCHL"] = (schl > 21).astype("int")  # bachelor
    df["EDU"] = df["SCHL"] + df["SCHG"]
    df["COW"] = ((cow >= 1) & (cow < 7)).astype("int")
    return df


input = ["SCHG", "SCHL", "COW"]
# Steps run in order: fill missing values, derive the flags, then keep only
# the two output columns.
transformer = [
    fillna,
    filter_func,
    lambda df: df[["EDU", "COW"]],
]
name = "Education and Worker Class"
description = "1st dim: whether attended college; 2st dim: whether working with payment"
feature = Feature(input, transformer, name=name, description=description)
from ballet import Feature

# Unfilled feature template: every placeholder below is still None.
input = None  # TODO - str or list of str
transformer = None  # TODO - function, transformer-like, or list thereof
name = None  # TODO - str

feature = Feature(
    input,
    transformer,
    name,
)
# Heavily based on his words and not his code.
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn


def _fill_then_one_hot(replacement):
    """Build a fresh [NullFiller, OneHotEncoder] pair using ``replacement``."""
    return [
        ballet.eng.missing.NullFiller(replacement=replacement, isnull=pd.isnull),
        sklearn.preprocessing.OneHotEncoder(),
    ]


features = []

# 'Bsmt Cond': nulls become the string "None" before one-hot encoding.
input = ['Bsmt Cond']
transformer = _fill_then_one_hot("None")
misc_fill = Feature(input=input, transformer=transformer)
features.append(misc_fill)

# 'Bsmt Qual': same treatment as 'Bsmt Cond'.
input = ['Bsmt Qual']
transformer = _fill_then_one_hot("None")
misc_fill = Feature(input=input, transformer=transformer)
features.append(misc_fill)

# 'BsmtFin Type 1': nulls become the string "missing" before one-hot encoding.
input = ['BsmtFin Type 1']
transformer = _fill_then_one_hot("missing")
# taken from https://www.kaggle.com/erikbruin/house-prices-lasso-xgboost-and-a-detailed-eda
# Adapted into the ballet framework
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn

features = []

# Sale year, one-hot encoded.
input = ['Yr Sold']
year = Feature(
    input=input,
    transformer=sklearn.preprocessing.OneHotEncoder(),
    name='Year Categorical',
)
features.append(year)

# Sale month, one-hot encoded.
input = ['Mo Sold']
month = Feature(
    input=input,
    transformer=sklearn.preprocessing.OneHotEncoder(),
    name='Month Categorical',
)
features.append(month)

input = ['Garage Yr Blt', 'Year Built']


def fill_garage(df):
    """Return 'Garage Yr Blt' with nulls taken from 'Year Built'.

    The result is a new Series; ``df`` itself is not modified.
    """
    garage_year = df['Garage Yr Blt']
    house_year = df['Year Built']
    # fillna with a Series fills each null from the same-index entry.
    return garage_year.fillna(house_year)
from ballet import Feature
from sklearn import preprocessing

# Feature built from the single "Pool Area" column, passed through a
# default-configured Binarizer.
name = "PA"
input = ["Pool Area"]

feature = Feature(
    input=input,
    transformer=preprocessing.Binarizer(),
    name=name,
)
import sklearn
import sklearn_pandas
from sklearn.model_selection import train_test_split

# Bare attribute access; it has no effect on the features built below.
ballet.__version__

features = []

# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath :
# missing values are likely zero for having no basement
# NOTE(review): the filler below uses the column mean, not zero -- confirm
# which is intended.
input = ['BsmtFin SF 1']


def mean_filler(df):
    """Return a copy of ``df`` with nulls replaced by the 'BsmtFin SF 1' mean.

    The mean is computed from the frame passed in.
    """
    column_mean = df['BsmtFin SF 1'].mean()
    return df.copy().fillna(column_mean)


transformer = ballet.eng.SimpleFunctionTransformer(func=mean_filler)
mean_fill = Feature(input=input, transformer=transformer)
features.append(mean_fill)

# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath :
# missing values are likely zero for having no basement
# NOTE(review): the filler below uses the column mean, not zero -- confirm
# which is intended.
input = ['BsmtFin SF 2']


# Rebinds the name ``mean_filler``; the transformer created above keeps its
# reference to the earlier function.
def mean_filler(df):
    """Return a copy of ``df`` with nulls replaced by the 'BsmtFin SF 2' mean.

    The mean is taken from ``df.mean()``, computed on the frame passed in.
    """
    column_mean = df.mean()['BsmtFin SF 2']
    return df.copy().fillna(column_mean)


transformer = ballet.eng.SimpleFunctionTransformer(func=mean_filler)
mean_fill = Feature(input=input, transformer=transformer)
features.append(mean_fill)

# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath :
# missing values are likely zero for having no basement