Ejemplo n.º 1
0
from ballet import Feature
import ballet.eng
import sklearn.preprocessing

# Feature definition built from the single column "Mas Vnr Area".
# NOTE(review): `input` shadows the Python builtin; presumably kept because
# the feature-module convention uses this name — confirm before renaming.
input = ["Mas Vnr Area"]
# Transformer steps, in listed order: a NullFiller constructed with its
# default arguments, followed by a OneHotEncoder with default arguments.
transformer = [
    ballet.eng.missing.NullFiller(),
    sklearn.preprocessing.OneHotEncoder()
]
name = "Masonry veneer area in square feet type"
feature = Feature(input=input, transformer=transformer, name=name)
# taken from https://www.kaggle.com/tannercarbonati/detailed-data-analysis-ensemble-modeling
# Adapted into the ballet framework
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn

# Accumulator for every Feature object defined in this snippet.
features = []

# 'Pool QC': NullFiller configured with replacement "None" (missingness
# detected by pd.isnull), followed by a OneHotEncoder.
input = ['Pool QC']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer, name='PoolQC Misc Fill')
features.append(misc_fill)

# Input columns for the next feature, which is built from calc_age.
input = ['Year Built', 'Garage Yr Blt']
def calc_age(df):
    """Return the 'Year Built' column with gaps filled from 'Garage Yr Blt'.

    Rows whose 'Year Built' is null take the value of 'Garage Yr Blt' from
    the same row (matched on index). Rows where both are null stay null.

    The input frame is not modified: the previous implementation filled the
    gaps through chained indexing (``df[col][mask] = ...``), which either
    mutates the caller's data or silently does nothing, depending on the
    pandas copy-on-write setting.

    Args:
        df: DataFrame containing the columns 'Year Built' and
            'Garage Yr Blt'.

    Returns:
        A new Series named 'Year Built'.
    """
    return df['Year Built'].fillna(df['Garage Yr Blt'])
# Wrap calc_age as a transformer and register the resulting feature
# (no explicit name is given to this Feature).
transformer = ballet.eng.SimpleFunctionTransformer(func=calc_age)
age = Feature(input=input, transformer=transformer)
features.append(age)

# 'Garage Type': NullFiller configured with replacement "missing", followed
# by a OneHotEncoder.
input = ['Garage Type']
transformer = [ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
features.append(misc_fill)

# NOTE(review): the visible snippet ends right after this assignment; no
# Feature is built from 'Garage Finish' in the lines shown here.
input = ['Garage Finish']
Ejemplo n.º 3
0
# taken from https://www.kaggle.com/neviadomski/how-to-get-to-top-25-with-simple-model-sklearn
# Adapted into the ballet framework
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn

# Bare expression statement: the value is evaluated and discarded
# (looks like a leftover notebook cell output).
ballet.__version__

# Accumulator for every Feature object defined in this snippet.
features = []

# 'Alley': NullFiller configured with replacement "NOACCESS", followed by a
# OneHotEncoder.
input = ['Alley']
transformer = [ballet.eng.missing.NullFiller(replacement="NOACCESS", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer, name='Alley Misc Fill')
features.append(misc_fill)

# Input column for the next feature, which is built from mode_filler.
input = ['MS Zoning']
def mode_filler(df):
    """Fill missing values in ``df`` with the most frequent 'MS Zoning' value.

    The fill value is the first mode of the 'MS Zoning' column and is applied
    to every missing cell of the frame. The input frame is left untouched; a
    new DataFrame is returned.
    """
    most_frequent = df['MS Zoning'].mode()[0]
    return df.fillna(value=most_frequent)
# Mode-fill via mode_filler, then one-hot encode; registered without an
# explicit feature name.
transformer = [ballet.eng.SimpleFunctionTransformer(func=mode_filler), sklearn.preprocessing.OneHotEncoder()]
misc_fill = Feature(input=input, transformer=transformer)
features.append(misc_fill)

# 'MS SubClass': NullFiller configured with replacement "None", followed by
# a OneHotEncoder.
input = ['MS SubClass']
transformer = [ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull), sklearn.preprocessing.OneHotEncoder()]
ms_fill = Feature(input=input, transformer=transformer, name='MS Fill None')
features.append(ms_fill)
Ejemplo n.º 4
0
from ballet import Feature

# Feature built from the single column "Lot Frontage" (given as a plain
# string rather than a list).
input = "Lot Frontage"
# Replace missing values with 0; the parameter name suggests a Series is
# expected — TODO confirm what the framework passes in.
transformer = lambda ser: ser.fillna(0)
feature = Feature(input, transformer)
Ejemplo n.º 5
0
from ballet.eng.sklearn import MinMaxScaler


def fillna(df):
    """Return a copy of ``df`` with missing SCHG/SCHL/COW values filled in.

    * Missing "SCHG" and "SCHL" values become 0.
    * In "COW", the placeholder "b" and any missing value become 9.

    The work is done on a copy so the caller's frame is never modified; the
    previous version assigned into ``df`` directly.

    Args:
        df: DataFrame containing the columns "SCHG", "SCHL" and "COW".

    Returns:
        A new DataFrame with the same columns as ``df``.
    """
    filled = df.copy()
    filled[["SCHG", "SCHL"]] = filled[["SCHG", "SCHL"]].fillna(0)
    # Replace the "b" placeholder first, then fill genuine gaps with the
    # same code.
    filled["COW"] = filled["COW"].replace("b", 9).fillna(9)
    return filled


def filter_func(df):
    """Turn raw SCHG/SCHL/COW codes into 0/1 indicators plus an EDU score.

    Each of the three columns is cast to integer, then:

    * "SCHG" becomes 1 where the code is greater than 16, else 0
      (labelled "graduate" in the original code).
    * "SCHL" becomes 1 where the code is greater than 21, else 0
      (labelled "bachelor" in the original code).
    * "EDU" is added as the sum of the two indicators above (0, 1 or 2).
    * "COW" becomes 1 where the code lies in the range 1..6 inclusive,
      else 0.

    The previous version called ``astype("int")`` but threw the result away,
    so the comparisons ran against the uncast values and failed with a
    ``TypeError`` when a column held string codes. The cast values are now
    the ones being compared. The input frame is not modified.

    Args:
        df: DataFrame containing "SCHG", "SCHL" and "COW", with no missing
            values in those columns (the integer cast raises otherwise).

    Returns:
        A new DataFrame with the three columns replaced by indicators and an
        additional "EDU" column.
    """
    out = df.copy()
    schg = out["SCHG"].astype("int")
    schl = out["SCHL"].astype("int")
    cow = out["COW"].astype("int")

    out["SCHG"] = (schg > 16).astype("int")  # graduate
    out["SCHL"] = (schl > 21).astype("int")  # bachelor
    out["EDU"] = out["SCHL"] + out["SCHG"]
    out["COW"] = ((cow >= 1) & (cow < 7)).astype("int")
    return out


# Feature definition: derive the "EDU" and "COW" indicator columns from the
# raw "SCHG", "SCHL" and "COW" columns.
input = ["SCHG", "SCHL", "COW"]
# Steps run in listed order: fill gaps, compute indicators, then keep only
# the two derived columns.
transformer = [
    fillna,
    filter_func,
    lambda df: df[["EDU", "COW"]],
]
name = "Education and Worker Class"
# Typo fixed in the description text ("2st" -> "2nd").
description = "1st dim: whether attended college; 2nd dim: whether working with payment"
feature = Feature(input, transformer, name=name, description=description)
from ballet import Feature

# Unfilled feature template: every field is still a None placeholder and
# must be replaced before this feature is usable.
input = None  # TODO - str or list of str
transformer = None  # TODO - function, transformer-like, or list thereof
name = None  # TODO - str
# Here `name` is passed positionally as the third argument.
feature = Feature(input, transformer, name)
# Heavily based on his words and not his code.
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn

# Accumulator for every Feature object defined in this snippet.
features = []

# 'Bsmt Cond': NullFiller configured with replacement "None", followed by a
# OneHotEncoder.
input = ['Bsmt Cond']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder()
]
misc_fill = Feature(input=input, transformer=transformer)
features.append(misc_fill)

# 'Bsmt Qual': same two-step recipe as above.
input = ['Bsmt Qual']
transformer = [
    ballet.eng.missing.NullFiller(replacement="None", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder()
]
misc_fill = Feature(input=input, transformer=transformer)
features.append(misc_fill)

# 'BsmtFin Type 1': replacement is "missing" rather than "None".
# NOTE(review): the visible snippet ends after this list; no Feature is
# created from it in the lines shown here.
input = ['BsmtFin Type 1']
transformer = [
    ballet.eng.missing.NullFiller(replacement="missing", isnull=pd.isnull),
    sklearn.preprocessing.OneHotEncoder()
]
# taken from https://www.kaggle.com/erikbruin/house-prices-lasso-xgboost-and-a-detailed-eda
# Adapted into the ballet framework
import ballet
import ballet.eng
from ballet import Feature
import numpy as np
import pandas as pd
import sklearn

# Accumulator for every Feature object defined in this snippet.
features = []

# 'Yr Sold' is one-hot encoded, i.e. treated as a categorical value.
input = ['Yr Sold']
year = Feature(input=input,
               transformer=sklearn.preprocessing.OneHotEncoder(),
               name='Year Categorical')
features.append(year)

# 'Mo Sold' is one-hot encoded in the same way.
input = ['Mo Sold']
month = Feature(input=input,
                transformer=sklearn.preprocessing.OneHotEncoder(),
                name='Month Categorical')
features.append(month)

# Input columns for the next feature, which is built from fill_garage.
input = ['Garage Yr Blt', 'Year Built']


def fill_garage(df):
    """Return 'Garage Yr Blt' with missing entries taken from 'Year Built'.

    Values are matched row by row on the index. The input frame is left
    unchanged and a new Series named 'Garage Yr Blt' is returned.
    """
    garage_year = df['Garage Yr Blt']
    house_year = df['Year Built']
    return garage_year.fillna(house_year)
from ballet import Feature
from sklearn import preprocessing

# Feature built from "Pool Area" using a Binarizer constructed with its
# default arguments.
input = ["Pool Area"]
name = "PA"
feature = Feature(input=input, transformer=preprocessing.Binarizer(), name=name)
import sklearn
import sklearn_pandas
from sklearn.model_selection import train_test_split

# Bare expression statement: the value is evaluated and discarded
# (looks like a leftover notebook cell output).
ballet.__version__

# Accumulator for every Feature object defined in this snippet.
features = []

# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath :
# missing values are likely zero for having no basement
# NOTE(review): despite the remark above, the transformer defined for this
# column fills gaps with the column mean, not zero — confirm which is intended.
input = ['BsmtFin SF 1']
def mean_filler(df):
    """Fill missing values in ``df`` with the mean of 'BsmtFin SF 1'.

    The mean is computed from the 'BsmtFin SF 1' column and applied to every
    missing cell of the frame. A new DataFrame is returned; the input is not
    modified.
    """
    column_mean = df['BsmtFin SF 1'].mean()
    return df.fillna(value=column_mean)
# Wrap mean_filler as a transformer and register the resulting feature.
transformer = ballet.eng.SimpleFunctionTransformer(func=mean_filler)
mean_fill = Feature(input=input, transformer=transformer)
features.append(mean_fill)

# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath :
# missing values are likely zero for having no basement
# NOTE(review): despite the remark above, the transformer defined for this
# column fills gaps with the column mean, not zero — confirm which is intended.

input = ['BsmtFin SF 2']
def mean_filler(df):
    """Fill missing values in ``df`` with the mean of 'BsmtFin SF 2'.

    Only the 'BsmtFin SF 2' column is averaged. The previous version called
    ``df.mean()`` on the whole frame and then picked one entry, which
    computes unneeded means and raises on recent pandas versions when the
    frame also holds a non-numeric column.

    Args:
        df: DataFrame containing the column 'BsmtFin SF 2'.

    Returns:
        A new DataFrame in which every missing cell is replaced by that
        mean. The input frame is not modified.
    """
    column_mean = df['BsmtFin SF 2'].mean()
    return df.fillna(column_mean)
# Wrap the most recently defined mean_filler as a transformer and register
# the resulting feature.
transformer = ballet.eng.SimpleFunctionTransformer(func=mean_filler)
mean_fill = Feature(input=input, transformer=transformer)
features.append(mean_fill)

# BsmtFinSF1, BsmtFinSF2, BsmtUnfSF, TotalBsmtSF, BsmtFullBath and BsmtHalfBath :
# missing values are likely zero for having no basement