예제 #1
0
def test_discover(sample_data, expensive_stats):
    """Exercise ``discover``: columns, row count, input filter, and no data.

    Args:
        sample_data: object exposing the feature frame as ``.X`` and the
            target frame as ``.y`` (presumably a pytest fixture).
        expensive_stats: passed straight through to ``discover``.
    """
    features = [
        Feature('size',
                NullFiller(0),
                source='foo.features.contrib.user_a.feature_1'),
        Feature('strength',
                NullFiller(100),
                source='foo.features.contrib.user_b.feature_1')
    ]
    X_df, y_df = sample_data.X, sample_data.y
    # np.asfarray no longer exists in NumPy 2.0; np.asarray with a float
    # dtype is the documented replacement and yields the same array.
    y = np.asarray(y_df, dtype=float)

    df = discover(features, X_df, y_df, y, expensive_stats=expensive_stats)

    expected_cols = {
        'name',
        'description',
        'input',
        'transformer',
        'primitives',
        'output',
        'author',
        'source',
        'mutual_information',
        'conditional_mutual_information',
        'ninputs',
        'nvalues',
        'ncontinuous',
        'ndiscrete',
        'mean',
        'std',
        'variance',
        'min',
        'median',
        'max',
        'nunique',
    }
    actual_cols = df.columns
    assert not expected_cols.symmetric_difference(actual_cols)

    assert df.shape[0] == len(features)

    # test filter (the local is not named ``input`` so the builtin is not
    # shadowed inside this test)
    input_filter = 'size'
    discovery_df = discover(features, X_df, y_df, y, input=input_filter)
    assert discovery_df.shape[0] == len([
        feature for feature in features
        if feature.input == input_filter or input_filter in feature.input
    ])

    # test no data available
    # have to clear cache, as values on data already known
    ballet.discovery._summarize_feature.memory.clear()
    discovery_df = discover(features, None, None, None)
    assert discovery_df.shape[0] == len(features)
    actual_cols = discovery_df.columns
    assert not expected_cols.symmetric_difference(actual_cols)
    assert np.isnan(discovery_df['mean'].at[0])
예제 #2
0
def test_discover_target_nans(sample_data):
    """Check that ``discover`` still reports mutual information with a NaN target.

    Args:
        sample_data: object exposing the feature frame as ``.X`` and the
            target frame as ``.y`` (presumably a pytest fixture).
    """
    features = [
        Feature('size', NullFiller(0)),
    ]
    X_df, y_df = sample_data.X, sample_data.y
    # np.asfarray no longer exists in NumPy 2.0; np.asarray with a float
    # dtype is the documented replacement and yields the same array.
    y = np.asarray(y_df, dtype=float)

    # introduce nan to target
    y[0] = np.nan

    discovery_df = discover(features, X_df, y_df, y)

    # stats with target should still be computed
    assert not np.isnan(discovery_df['mutual_information']).any()
from ballet import Feature
from ballet.eng import NullFiller, SimpleFunctionTransformer

# Columns handed to the transformer steps below.
input = [
    "Total Bsmt SF",
    "1st Flr SF",
    "2nd Flr SF",
]

# Step 1 sums the inputs along axis 1; step 2 is NullFiller with its defaults.
transformer = [
    SimpleFunctionTransformer(lambda df: df.sum(axis=1)),
    NullFiller(),
]

name = "Total Area"

feature = Feature(
    input=input,
    transformer=transformer,
    name=name,
)
예제 #4
0
from ballet import Feature
from ballet.eng import NullFiller
from sklearn.preprocessing import OneHotEncoder

# Single input column.
input = [
    'Garage Finish',
]

# Fill nulls with the literal 'Missing', then one-hot encode.
transformer = [NullFiller(replacement='Missing'), OneHotEncoder()]

name = 'Garage finish fill'

feature = Feature(
    input=input,
    transformer=transformer,
    name=name,
)
# include any imports used in this feature right here (within this code cell)
from ballet import Feature
from ballet.eng import NullFiller

# Input columns for this feature.
input = ["hv3d3", "hv3d10", "hv3d11", "hv3d12", "hv3d13"]  # child hunger

# Transformer steps, in list order:
#   1. a ("hv3d3", callable) pair whose callable flags values equal to 1 or 2
#      (presumably ballet scopes it to that one column; confirm),
#   2. NullFiller(0),
#   3. a sum along axis 1.
transformer = [
    (
        "hv3d3",
        lambda ser: (ser == 1) | (ser == 2),
    ),
    NullFiller(0),
    lambda df: df.sum(axis=1),
]

# Short display name.
name = "Children hungry wave 3"

# Longer human-readable description of the calculation.
description = (
    "Number of ways in which child may be measured as hungry in wave 3"
)

# Assemble the feature from the pieces defined above.
feature = Feature(input, transformer, name, description)
예제 #6
0
from ballet import Feature
from ballet.eng import NullFiller
from sklearn.preprocessing import OneHotEncoder

# Single input column.
input = [
    'Bsmt Cond',
]

# Fill nulls with the literal string 'None', then one-hot encode.
transformer = [NullFiller(replacement='None'), OneHotEncoder()]

name = 'Basement condition type'

feature = Feature(
    input=input,
    transformer=transformer,
    name=name,
)
예제 #7
0
from ballet import Feature
from ballet.eng import NullFiller

# Columns handed to the transformer steps below.
input = [
    "Total Bsmt SF",
    "1st Flr SF",
    "2nd Flr SF",
]

# Sum the inputs along axis 1, then apply NullFiller with its defaults.
transformer = [
    lambda df: df.sum(axis=1),
    NullFiller(),
]

name = "Total Area"

feature = Feature(
    input=input,
    transformer=transformer,
    name=name,
)
예제 #8
0
from ballet import Feature
from ballet.eng import NullFiller

# One input column, passed through NullFiller with its default arguments.
input = "Mas Vnr Area"
transformer = NullFiller()

name = "Cleaned Masonry Veneer Area"

feature = Feature(
    input=input,
    transformer=transformer,
    name=name,
)
예제 #9
0
from ballet import Feature
from ballet.eng import NullFiller, SimpleFunctionTransformer

# The two columns that calc_age subtracts.
input = [
    "Yr Sold",
    "Year Remod/Add",
]


def calc_age(df):
    """Return ``df["Yr Sold"]`` minus ``df["Year Remod/Add"]``.

    ``df`` only needs to support lookup by those two labels; the result is
    whatever subtracting the two looked-up values produces.
    """
    sold = df["Yr Sold"]
    remodeled = df["Year Remod/Add"]
    return sold - remodeled


# Compute the age via calc_age, then apply NullFiller with its defaults.
transformer = [
    SimpleFunctionTransformer(calc_age),
    NullFiller(),
]

name = "Age"

feature = Feature(
    input=input,
    transformer=transformer,
    name=name,
)
예제 #10
0
from ballet.eng.external import SimpleImputer, StandardScaler
import pandas as pd
import numpy as np


class MedianIncomeForGroup(BaseTransformer):
    """Map each value of a column to the median target seen for it in ``fit``.

    Args:
        targetcol: column name used for the target when grouping; a target
            given as a plain array is wrapped in a Series with this name.
    """

    def __init__(self, targetcol="PINCP"):
        self.targetcol = targetcol

    def fit(self, X, y=None):
        """Learn ``income_map_``: value of ``X`` -> median of the target.

        Args:
            X: named pandas Series of group labels (``X.name`` is used as the
                group key).
            y: target values. A Series/DataFrame is joined to ``X`` on its
                own index and must provide a ``targetcol`` column; anything
                else must offer ``.ravel()`` and is paired with ``X`` row by
                row.

        Returns:
            self
        """
        if not isinstance(y, (pd.Series, pd.DataFrame)):
            # Reuse X's index: the join below aligns on index labels, so a
            # fresh RangeIndex would misalign (or drop) targets whenever X
            # does not carry the default 0..n-1 index.
            y = pd.Series(y.ravel(), index=X.index, name=self.targetcol)
        joined = X.to_frame().join(y)
        medians = joined.groupby(by=X.name)[self.targetcol].median()
        self.income_map_ = medians.to_dict()
        return self

    def transform(self, X):
        """Replace each value of ``X`` via ``income_map_``.

        Values not seen during ``fit`` have no entry in the map, so
        ``Series.map`` leaves them as NaN.
        """
        return X.map(self.income_map_)


input = "ANC1P"

# Steps run in list order.
transformer = [
    NullFiller(replacement=-1),  # author's note: no NaNs appeared to be present
    MedianIncomeForGroup(),
    SimpleImputer(),
    np.log1p,
]

name = "log ancestry income"
description = (
    "replace ancestry with log median income for that ancestry in training data"
)

feature = Feature(
    input,
    transformer,
    name=name,
    description=description,
)
	elif rac1p_value == 9: 
		bin_ = 2
	elif rac1p_value == 6:
		bin_ = 3
	elif rac1p_value == 2:
		bin_ = 4
	elif fhisp_value == 1:
		bin_ = 5
	elif rac1p_value == 1:
		bin_ = 6
	return bin_


def bin_education(df):
    """Return a copy of ``df`` with education/race bin columns and their sum.

    Adds three columns:
      * ``EducationCategorized``: ``education_to_bin`` applied to ``SCHL``
      * ``RaceCategorized``: ``race_to_bin`` applied row-wise to
        ``RAC1P`` and ``FHISP``
      * ``EducationRaceBinned``: the sum of the two columns above

    Args:
        df: DataFrame with ``SCHL``, ``RAC1P`` and ``FHISP`` columns.

    Returns:
        A new DataFrame; the caller's frame is left untouched.
    """
    # Work on a copy so the caller's frame does not gain columns as a side
    # effect of running the transformer.
    df = df.copy()
    # astype returns a new Series; the result has to be assigned back for
    # the integer cast to take effect.
    df["SCHL"] = df["SCHL"].astype("int")
    df["EducationCategorized"] = df["SCHL"].apply(education_to_bin)
    df["RaceCategorized"] = df[["RAC1P", "FHISP"]].apply(race_to_bin, axis=1)
    df["EducationRaceBinned"] = (
        df["RaceCategorized"] + df["EducationCategorized"]
    )
    return df

# Fill nulls with 0, add the binned columns, then keep only the combined one.
transformer = [
    NullFiller(replacement=0),
    bin_education,
    lambda df: df[["EducationRaceBinned"]],
]

name = "Education and Race Binned"
description = "Education was categorized with higher values corresponding to more attainment, Race was categorized by more representation having higher values. Values were then summed."
# The description was previously assigned but never handed to Feature, so the
# feature was created without one.
feature = Feature(input, transformer, name=name, description=description)
from ballet import Feature
from ballet.eng import NullFiller

input = [
    "VALP",
    "NP",
]

# Divide VALP by NP, then apply NullFiller with its defaults.
transformer = [
    lambda x: x["VALP"] / x["NP"],
    NullFiller(),
]

name = "Property value per household member"
description = "Property value divided by number of person in households"

feature = Feature(
    input,
    transformer,
    name=name,
    description=description,
)
예제 #13
0
from ballet import Feature
from ballet.eng import NullFiller

input = [
    "NP",
    "SCHL",
]

# SCHL divided by (1 + NP), then NullFiller with its defaults.
transformer = [
    lambda df: df["SCHL"] / (1 + df["NP"]),
    NullFiller(),
]

name = "Education Household Average"
description = (
    "Ratio between level of education and number of person in household."
)

feature = Feature(
    input,
    transformer,
    name=name,
    description=description,
)
예제 #14
0
from ballet import Feature
from ballet.eng import NullFiller, SimpleFunctionTransformer

# Columns handed to the transformer steps below.
input = [
    "Total Bsmt SF",
    "1st Flr SF",
    "2nd Flr SF",
]

# Sum the inputs along axis 1, then apply NullFiller with its defaults.
transformer = [
    SimpleFunctionTransformer(lambda df: df.sum(axis=1)),
    NullFiller(),
]

name = "Total Area Calculation"

feature = Feature(
    input=input,
    transformer=transformer,
    name=name,
)
예제 #15
0
from ballet import Feature
from ballet.eng import NullFiller

input = [
    "JWAP",
    "JWDP",
]

# JWAP minus JWDP, then NullFiller with its defaults.
transformer = [
    lambda df: df["JWAP"] - df["JWDP"],
    NullFiller(),
]

name = "JWAP - JWDP"
description = "Time of arrival for work minus Time of departure for work"

feature = Feature(
    input,
    transformer,
    name=name,
    description=description,
)