def test_make_step():
    """Check that ``make_step`` yields a class that is both a ``Step`` and the wrapped estimator."""

    def some_method(self):
        pass

    wrapped_class = sklearn.linear_model.LogisticRegression
    LogisticRegression = make_step(
        wrapped_class, attr_dict={"some_method": some_method}
    )

    # The generated class must inherit from the step base and the wrapped class.
    for parent in (Step, wrapped_class):
        assert issubclass(LogisticRegression, parent)

    # The estimator API and the attribute injected via attr_dict must be present.
    for attribute in ("get_params", "set_params", "fit", "predict", "some_method"):
        assert hasattr(LogisticRegression, attribute)

    assert LogisticRegression.__name__ == "LogisticRegression"
def test_make_step(class_name, expected, warns):
    """Check the class produced by ``make_step`` for a given ``class_name``.

    ``warns`` is used as a context manager around the factory call and
    ``expected`` is the ``__name__`` the generated class must carry.
    """

    def some_method(self):
        pass

    wrapped_class = sklearn.linear_model.LogisticRegression
    extra_attributes = {"some_method": some_method}

    # Only the factory call itself runs inside the `warns` context.
    with warns:
        LogisticRegression = make_step(wrapped_class, extra_attributes, class_name)

    # The generated class must inherit from the step base and the wrapped class.
    for parent in (Step, wrapped_class):
        assert issubclass(LogisticRegression, parent)

    # The estimator API and the injected attribute must be present.
    for attribute in ("get_params", "set_params", "fit", "predict", "some_method"):
        assert hasattr(LogisticRegression, attribute)

    assert LogisticRegression.__name__ == expected
import numpy as np
import random
import sklearn.linear_model
from sklearn.datasets import fetch_openml
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model
from baikal.steps import ColumnStack, Split, Lambda

# --- Steps used by the model
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)

# --- Multi-label dataset (https://www.openml.org/d/40597)
X, Y = fetch_openml("yeast", version=4, return_X_y=True)
# Element-wise comparison against the string "TRUE" turns the labels into booleans.
Y = (Y == "TRUE")
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

# --- Shuffled target order, seeded so the shuffle is repeatable
n_targets = Y.shape[1]
random.seed(87)
order = list(range(n_targets))
random.shuffle(order)

# --- Model inputs
x = Input()
import sklearn.decomposition
import sklearn.ensemble
import sklearn.linear_model
import sklearn.preprocessing
import sklearn.svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model
from baikal.steps import Stack

# Step 1: wrap the scikit-learn estimators/transformers as baikal steps.
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)
PCA = make_step(sklearn.decomposition.PCA)
SVC = make_step(sklearn.svm.SVC)
PowerTransformer = make_step(sklearn.preprocessing.PowerTransformer)

# Step 2: wire the graph. Two feature inputs and one target input.
x1 = Input(name="x1")
x2 = Input(name="x2")
y_t = Input(name="y_t")

# Tree ensembles, one per feature input.
y1 = ExtraTreesClassifier()(x1, y_t)
y2 = RandomForestClassifier()(x2, y_t)

# x2 -> PowerTransformer -> PCA -> LogisticRegression.
z = PowerTransformer()(x2)
z = PCA()(z)
y3 = LogisticRegression()(z, y_t)
import sklearn.svm
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model

# Step 1: wrap the scikit-learn SVC as a baikal step.
SVC = make_step(sklearn.svm.SVC)

# Step 2: build the graph (features in, targets in, predictions out) and plot it.
x = Input()
y_t = Input()
y_p = SVC(C=1.0, kernel="rbf", gamma=0.5)(x, y_t)
model = Model(x, y_p, y_t)
plot_model(model, filename="readme_quick_example.png")

# Step 3: fit on the training split of the breast-cancer dataset.
dataset = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    dataset.data,
    dataset.target,
    random_state=0,
)
model.fit(X_train, y_train)

# Step 4: predict on the held-out split.
y_test_pred = model.predict(X_test)
import sklearn.decomposition
import sklearn.ensemble
import sklearn.linear_model
import sklearn.preprocessing

from baikal import make_step

# Each name below is the step class built by make_step from the scikit-learn
# class of the same name.

# Linear models
LinearRegression = make_step(sklearn.linear_model.LinearRegression)
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)

# Tree ensembles
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)

# Decomposition and preprocessing
PCA = make_step(sklearn.decomposition.PCA)
LabelEncoder = make_step(sklearn.preprocessing.LabelEncoder)
StandardScaler = make_step(sklearn.preprocessing.StandardScaler)
import sklearn.datasets
import sklearn.ensemble
import sklearn.linear_model
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

from baikal import Input, Model, make_step
from baikal.plot import plot_model
from baikal.steps import Concatenate

# --- Steps used by the model
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)

# --- Dataset and train/test split
data = sklearn.datasets.load_breast_cancer()
X = data.data
y_p = data.target  # NOTE: y_p is rebound to the model output further down
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_p,
    test_size=0.2,
    random_state=0,
)

# --- Model graph
x = Input()
y_t = Input()

# First level: two classifiers configured with function="predict_proba".
y_p1 = LogisticRegression(function="predict_proba")(x, y_t)
y_p2 = RandomForestClassifier(function="predict_proba")(x, y_t)

# Second level: concatenate the first-level outputs and feed them to ExtraTrees.
ensemble_features = Concatenate()([y_p1, y_p2])
y_p = ExtraTreesClassifier()(ensemble_features, y_t)
import sklearn.preprocessing
from sklearn.model_selection import cross_val_predict

from baikal import make_step


def _fit_predict_proba(self, X, y):
    """Fit on (X, y), then return ``cross_val_predict`` output using ``predict_proba``."""
    self.fit(X, y)
    cross_validated = cross_val_predict(self, X, y, method="predict_proba")
    return cross_validated


def _fit_decision_function(self, X, y):
    """Fit on (X, y), then return ``cross_val_predict`` output using ``decision_function``."""
    self.fit(X, y)
    cross_validated = cross_val_predict(self, X, y, method="decision_function")
    return cross_validated


# Linear models
LinearRegression = make_step(sklearn.linear_model.LinearRegression)
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)

# Support vector machines; the OOF variant gets a custom fit_predict.
LinearSVC = make_step(sklearn.svm.LinearSVC)
LinearSVCOOF = make_step(
    sklearn.svm.LinearSVC,
    attr_dict={"fit_predict": _fit_decision_function},
)

# Tree ensembles; the OOF variant gets a custom fit_predict.
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
RandomForestClassifierOOF = make_step(
    sklearn.ensemble.RandomForestClassifier,
    attr_dict={"fit_predict": _fit_predict_proba},
)
ExtraTreesClassifier = make_step(sklearn.ensemble.ExtraTreesClassifier)

# Decomposition and preprocessing
PCA = make_step(sklearn.decomposition.PCA)
LabelEncoder = make_step(sklearn.preprocessing.LabelEncoder)
StandardScaler = make_step(sklearn.preprocessing.StandardScaler)
def train(self):
    """Fit the stacked house-price model and store it as a pickled artifact.

    Builds a baikal graph with three ``DataFrameMapperStep`` feature branches
    (base numeric features, one-hot/target-encoded categoricals, and imputed
    raw categoricals), ElasticNet / XGBoost / CatBoost regressors on top of
    them, and a final LinearRegression fitted on the stacked ElasticNet and
    CatBoost predictions.

    The model is fitted on ``self.trainDF`` with ``self.trainDF['SalePrice']``
    as the target, serialised with cloudpickle into ``self.artifact``, and
    control is then handed over with ``self.next(self.end)``.
    """
    import xgboost
    from baikal import make_step, Step, Input, Model
    from baikal.steps import Stack
    from sklearn_pandas import gen_features
    import custom_transformations as ct
    from custom_transformations import DataFrameMapperStep, ConcatDataFrame, CatBoostRegressorStep

    # These are the categorical columns in the dataset.
    CATEGORICAL_COLUMNS = [
        'KitchenQual', 'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape',
        'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood',
        'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle',
        'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual',
        'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
        'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
        'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
        'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType',
        'SaleCondition', 'OverallQual', 'OverallCond',
    ]

    # These columns will be treated as numerical columns.
    NUMERICAL_COLUMNS = [
        'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
        'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
        '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath',
        'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd',
        'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
        'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
        'MiscVal', 'MoSold', 'YrSold'
    ]

    # These columns have missing values; a missing-indicator variable is added
    # for each of them.
    MISSING_INDICATOR = [
        'LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
        'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical',
        'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageQual',
        'GarageCond', 'PoolQC', 'Fence', 'MiscFeature'
    ]

    # Categorical columns for which we want one-hot encoding.
    ONEHOT_COLUMNS = [
        'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
        'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType',
        'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual',
        'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
        'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir',
        'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
        'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC',
        'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'
    ]

    # Categorical columns for which we want target encoding.
    TARGET_COLUMNS = [
        'MSSubClass', 'Neighborhood', 'Exterior1st', 'Exterior2nd'
    ]

    # Columns that require a log transformation.
    LOG_COLUMNS = [
        'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
        'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
        'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch',
        '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal'
    ]

    # Numerical columns that are NOT log-transformed. Filtering the list keeps
    # NUMERICAL_COLUMNS order, so the generated feature order is the same on
    # every run (a set difference would depend on string hashing).
    log_column_set = set(LOG_COLUMNS)
    plain_numerical_columns = [
        column for column in NUMERICAL_COLUMNS if column not in log_column_set
    ]

    # Define steps
    ElasticNetStep = make_step(ElasticNet, class_name='ElasticNet')
    ConcatStep = make_step(ConcatDataFrame, class_name='Concat')
    XGBRegressorStep = make_step(xgboost.XGBRegressor, class_name='XGBRegressor')
    LinearRegressionStep = make_step(sklearn.linear_model.LinearRegression,
                                     class_name='LinearRegression')

    # Define sklearn-pandas transformations. The gen_features utility is used
    # to define transformations for individual columns.
    #
    # The builtin `float` / `object` are used as dtypes: the `np.float` and
    # `np.object` aliases were removed from NumPy and raise AttributeError on
    # current releases.
    baseProcessing = (
        gen_features(columns=[[x] for x in MISSING_INDICATOR],
                     classes=[{
                         'class': MissingIndicator,
                         'features': 'all',
                         'sparse': False,
                         'error_on_new': False
                     }],
                     prefix='na_') +
        gen_features(
            columns=LOG_COLUMNS,
            classes=[{
                'class': FunctionTransformer,
                'func': lambda x: x.astype(float).reshape((-1, 1))
            }, {
                'class': SimpleImputer,
                'strategy': 'mean'
            }, {
                'class': FunctionTransformer,
                'func': np.log1p
            }]) +
        gen_features(
            columns=plain_numerical_columns,
            classes=[{
                'class': FunctionTransformer,
                'func': lambda x: x.astype(float).reshape((-1, 1))
            }, {
                'class': SimpleImputer,
                'strategy': 'mean'
            }],
        ) + [
            # constructing new feature -- age of the house
            (['YrSold', 'YearBuilt'], [
                FunctionTransformer(
                    func=lambda x: np.clip(x[:, 0] - x[:, 1], 0, 1000)),
                FunctionTransformer(np.log1p)
            ], {
                'alias': 'age'
            }),
            # constructing new feature -- remodeling age
            (['YrSold', 'YearRemodAdd'], [
                FunctionTransformer(
                    func=lambda x: np.clip(x[:, 0] - x[:, 1], 0, 1000)),
                FunctionTransformer(np.log1p)
            ], {
                'alias': 'remodel_age'
            }),
            # new feature -- total surface area
            (['1stFlrSF', '2ndFlrSF', 'TotalBsmtSF'], [
                FunctionTransformer(lambda x: np.nansum(x, axis=1)),
                FunctionTransformer(np.log1p)
            ], {
                'alias': 'numerical_TotalArea'
            })
        ])

    # Since the CatBoost model can handle categorical data, we don't need to
    # encode categorical variables: we simply impute missing values and let
    # CatBoost handle the categorical data.
    catModelPreprocessing = gen_features(
        columns=CATEGORICAL_COLUMNS,
        classes=[{
            'class': FunctionTransformer,
            'func': lambda x: x.astype(object).reshape(-1, 1)
        }, {
            'class': SimpleImputer,
            'strategy': 'most_frequent'
        }],
    )

    # For regression and XGBoost, we need to encode categorical variables
    # ourselves: one-hot encoding for ONEHOT_COLUMNS and target encoding for
    # TARGET_COLUMNS.
    regressionModelProcessing = (
        gen_features(columns=[[x] for x in ONEHOT_COLUMNS],
                     classes=[{
                         'class': OneHotEncoder,
                         'handle_unknown': 'ignore',
                         'sparse': False
                     }]) +
        gen_features(columns=[[x] for x in TARGET_COLUMNS],
                     classes=[
                         {
                             'class': TargetEncoder
                         },
                         {
                             'class': SimpleImputer,
                             'strategy': 'mean'
                         },
                     ]))

    # Define DAG
    x = Input(name="x")
    y = Input(name='y')

    # Define feature transformations
    d0 = DataFrameMapperStep(baseProcessing, df_out=True,
                             name='BasePreprocess')(x, y)
    d1 = DataFrameMapperStep(regressionModelProcessing, df_out=True,
                             name='RegressionModelPreprocess')(x, y)
    d2 = DataFrameMapperStep(catModelPreprocessing, df_out=True,
                             name='CatModelPreprocess')(x, y)

    # Consolidate features for the regression models and for CatBoost
    regressionFeatures = ConcatStep(name='RegressionFeatures')([d0, d1])
    catFeatures = ConcatStep(name='CatBoostFeatures')([d0, d2])

    # Generate predictions using three different algorithms.
    m1 = ElasticNetStep(name='ElasticNet')(regressionFeatures, y)
    m2 = XGBRegressorStep(name='XGBoost')(regressionFeatures, y)
    m3 = CatBoostRegressorStep(name='CatBoost',
                               cat_features=CATEGORICAL_COLUMNS,
                               iterations=10)(catFeatures, y)

    # Combine the ElasticNet and CatBoost predictions.
    # NOTE(review): m2 (XGBoost) is built above but is not passed to the Stack,
    # so it does not feed the final regressor -- confirm whether it was meant
    # to be included here.
    combinedPredictions = Stack(name='CombinePredictions')([m1, m3])

    # Construct the ensemble model
    ensembleModel = LinearRegressionStep()(combinedPredictions, y)
    model = Model(x, ensembleModel, y)
    model.fit(self.trainDF, self.trainDF['SalePrice'])

    self.artifact = {
        'model.pkl': cloudpickle.dumps(model),
        'environment': {
            'pip': {}
        }
    }
    self.next(self.end)
class XGBRegressor(XGBStep, xgb.XGBRegressor):
    """Class combining ``XGBStep`` with ``xgb.XGBRegressor``."""

    def __init__(self, *args, name=None, **kwargs):
        # Forward everything, including the `name` keyword, up the MRO.
        super().__init__(*args, name=name, **kwargs)


class XGBClassifier(XGBStep, xgb.XGBClassifier):
    """Class combining ``XGBStep`` with ``xgb.XGBClassifier``."""

    def __init__(self, *args, name=None, **kwargs):
        # Forward everything, including the `name` keyword, up the MRO.
        super().__init__(*args, name=name, **kwargs)


class XGBRanker(XGBStep, xgb.XGBRanker):
    """Class combining ``XGBStep`` with ``xgb.XGBRanker``."""

    def __init__(self, *args, name=None, **kwargs):
        # Forward everything, including the `name` keyword, up the MRO.
        super().__init__(*args, name=name, **kwargs)


# Step classes generated from scikit-learn classes. class_name is passed
# explicitly and matches the name each result is bound to.
SimpleImputer = make_step(impute.SimpleImputer, class_name="SimpleImputer")
KNeighborsRegressor = make_step(
    neighbors.KNeighborsRegressor, class_name="KNeighborsRegressor"
)
KNeighborsClassifier = make_step(
    neighbors.KNeighborsClassifier, class_name="KNeighborsClassifier"
)
RandomForestRegressor = make_step(
    ensemble.RandomForestRegressor, class_name="RandomForestRegressor"
)
RandomForestClassifier = make_step(
    ensemble.RandomForestClassifier, class_name="RandomForestClassifier"
)
ExtraTreesRegressor = make_step(
    ensemble.ExtraTreesRegressor, class_name="ExtraTreesRegressor"
)
import sklearn.decomposition
import sklearn.ensemble
import sklearn.linear_model
from sklearn import datasets
from sklearn.model_selection import GridSearchCV, StratifiedKFold

from baikal import Input, Model, make_step
from baikal.sklearn import SKLearnWrapper

# Steps used by the model
LogisticRegression = make_step(sklearn.linear_model.LogisticRegression)
RandomForestClassifier = make_step(sklearn.ensemble.RandomForestClassifier)
PCA = make_step(sklearn.decomposition.PCA)


def build_fn():
    """Return a PCA -> LogisticRegression model.

    ``random_state`` is read from module scope when the function is called,
    so it must be assigned before the first call.
    """
    x = Input()
    y_t = Input()
    h = PCA(random_state=random_state, name="pca")(x)
    y_p = LogisticRegression(random_state=random_state, name="classifier")(h, y_t)
    return Model(x, y_p, y_t)


# Iris data used by the rest of the script
iris = datasets.load_iris()
x_data = iris.data
y_data = iris.target

# Shared settings
random_state = 123
verbose = 0
# Adapted from the scikit-learn example in:
# https://scikit-learn.org/stable/auto_examples/compose/plot_transformed_target.html#sphx-glr-auto-examples-compose-plot-transformed-target-py
import numpy as np
import sklearn.linear_model
import sklearn.preprocessing
from sklearn.datasets import load_boston
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.model_selection import train_test_split

from baikal import make_step, Input, Model
from baikal.plot import plot_model
from baikal.steps import Lambda

# --- Steps used by the model
RidgeCV = make_step(sklearn.linear_model.RidgeCV)
QuantileTransformer = make_step(sklearn.preprocessing.QuantileTransformer)

# --- Dataset
# NOTE(review): load_boston is not available in recent scikit-learn releases --
# confirm the pinned version still ships it.
dataset = load_boston()

# Boolean mask selecting the "DIS" column; it becomes the regression target
# and every other column is kept as a feature.
target = np.array(dataset.feature_names) == "DIS"
X = dataset.data[:, ~target]
y = dataset.data[:, target].squeeze()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
)

# --- Model graph
transformer = QuantileTransformer(n_quantiles=300, output_distribution="normal")
x = Input()
y_t = Input()
def predict(self, X):
    # Always fails with a KeyError.
    raise KeyError("some failure")


class _DummyEstimator(BaseEstimator):
    """Estimator stub: prediction methods echo ``X`` and fit calls are counted."""

    def __init__(self, x=123, y="abc"):
        self.x, self.y = x, y
        # Counters incremented by fit() and fit_predict() respectively.
        self.fit_calls = 0
        self.fit_predict_calls = 0

    def predict(self, X):
        """Return ``X`` unchanged."""
        return X

    def predict_proba(self, X):
        """Return ``X`` unchanged."""
        return X

    def fit(self, X, y):
        """Count the call and return ``self``."""
        self.fit_calls = self.fit_calls + 1
        return self

    def fit_predict(self, X, y):
        """Count the call and return ``X`` unchanged."""
        self.fit_predict_calls = self.fit_predict_calls + 1
        return X

    def fit_predict_proba(self, X, y):
        """Return ``X`` unchanged; no counter is touched."""
        return X


# Step class built from the stub, explicitly named "DummyEstimator".
DummyEstimator = make_step(_DummyEstimator, class_name="DummyEstimator")
def predict(self, X):
    # Always fails with a KeyError.
    raise KeyError("some failure")


class _DummyEstimator(BaseEstimator):
    """Estimator stub: prediction methods echo ``X`` and fit calls are counted."""

    def __init__(self, x=123, y="abc"):
        self.x, self.y = x, y
        # Counters incremented by fit() and fit_predict() respectively.
        self.fit_calls = 0
        self.fit_predict_calls = 0

    def predict(self, X):
        """Return ``X`` unchanged."""
        return X

    def predict_proba(self, X):
        """Return ``X`` unchanged."""
        return X

    def fit(self, X, y):
        """Count the call and return ``self``."""
        self.fit_calls = self.fit_calls + 1
        return self

    def fit_predict(self, X, y):
        """Count the call and return ``X`` unchanged."""
        self.fit_predict_calls = self.fit_predict_calls + 1
        return X

    def fit_predict_proba(self, X, y):
        """Return ``X`` unchanged; no counter is touched."""
        return X


# Step class built from the stub.
DummyEstimator = make_step(_DummyEstimator)