def test_function_transformer_future_warning(validate, expected_warning):
    """Check which warning ``FunctionTransformer.fit_transform`` emits.

    Parameters
    ----------
    validate
        Value forwarded to ``FunctionTransformer(validate=...)``.
    expected_warning
        Warning class that must be raised, or ``None`` when no warning at
        all may be emitted.

    Both arguments are presumably supplied by a ``pytest.mark.parametrize``
    decorator that is not visible here.
    """
    # FIXME: to be removed in 0.22
    import warnings

    X = np.random.randn(100, 10)
    transformer = FunctionTransformer(validate=validate)

    if expected_warning is None:
        # ``pytest.warns(None)`` is deprecated (pytest 7) and rejected
        # (pytest 8), so record warnings with the standard library instead.
        with warnings.catch_warnings(record=True) as results:
            warnings.simplefilter("always")
            transformer.fit_transform(X)
        assert len(results) == 0
    else:
        with pytest.warns(expected_warning):
            transformer.fit_transform(X)
def test_kw_arg():
    """``kw_args`` must be forwarded to the wrapped function on transform."""
    data = np.linspace(0, 1, num=10).reshape((5, 2))
    rounding_options = {"decimals": 3}

    rounder = FunctionTransformer(np.around, kw_args=rounding_options)
    transformed = rounder.transform(data)

    # The transformer output has to equal a direct call with the same keywords.
    expected = np.around(data, **rounding_options)
    assert_array_equal(transformed, expected)
def test_inverse_transform():
    """``inverse_transform`` must apply ``inverse_func`` with ``inv_kw_args``."""
    data = np.array([1, 4, 9, 16]).reshape((2, 2))
    inverse_options = {"decimals": 3}

    transformer = FunctionTransformer(
        func=np.sqrt,
        inverse_func=np.around,
        inv_kw_args=inverse_options,
    )

    # Forward pass (sqrt) followed by the "inverse" (rounding to 3 decimals).
    round_trip = transformer.inverse_transform(transformer.transform(data))
    expected = np.around(np.sqrt(data), **inverse_options)
    testing.assert_array_equal(round_trip, expected)
def test_functiontransformer_vs_sklearn():
    """Both transformer implementations must agree on the first trajectory.

    Compares msmbuilder.preprocessing.FunctionTransformer with
    sklearn.preprocessing.FunctionTransformer.  ``FunctionTransformerR`` is
    fitted on the concatenated trajectories and applied to a single
    trajectory, while ``FunctionTransformer`` is fitted on and applied to the
    whole list of trajectories.
    """
    reference = FunctionTransformerR()
    reference.fit(np.concatenate(trajs))

    candidate = FunctionTransformer()
    candidate.fit(trajs)

    expected_first = reference.transform(trajs[0])
    actual_first = candidate.transform(trajs)[0]

    np.testing.assert_array_almost_equal(expected_first, actual_first)
def test_check_inverse():
    """Exercise ``check_inverse=True`` on dense, CSR and CSC inputs."""
    X_dense = np.array([1, 4, 9, 16], dtype=np.float64).reshape((2, 2))
    inputs = [X_dense]
    inputs.extend(
        to_sparse(X_dense)
        for to_sparse in (sparse.csr_matrix, sparse.csc_matrix)
    )

    mismatch_message = ("The provided functions are not strictly"
                        " inverse of each other. If you are sure you"
                        " want to proceed regardless, set"
                        " 'check_inverse=False'.")

    for X in inputs:
        shared_options = dict(accept_sparse=sparse.issparse(X),
                              check_inverse=True,
                              validate=True)

        # sqrt / around are not inverses of each other: fitting must warn.
        mismatched = FunctionTransformer(func=np.sqrt,
                                         inverse_func=np.around,
                                         **shared_options)
        assert_warns_message(UserWarning, mismatch_message,
                             mismatched.fit, X)

        # expm1 / log1p are true inverses: no warning, and a round trip
        # must give back the input.
        matched = FunctionTransformer(func=np.expm1,
                                      inverse_func=np.log1p,
                                      **shared_options)
        Xt = assert_no_warnings(matched.fit_transform, X)
        assert_allclose_dense_sparse(X, matched.inverse_transform(Xt))

    # check that we don't check inverse when one of the func or inverse is not
    # provided.
    for forward, backward in ((np.expm1, None), (None, np.expm1)):
        one_sided = FunctionTransformer(func=forward, inverse_func=backward,
                                        check_inverse=True, validate=True)
        assert_no_warnings(one_sided.fit, X_dense)
def test_function_transformer_frame():
    """With ``validate=False`` a DataFrame input must come back frame-like."""
    pd = pytest.importorskip('pandas')

    frame = pd.DataFrame(np.random.randn(100, 10))
    passthrough = FunctionTransformer(validate=False)
    result = passthrough.fit_transform(frame)

    # ``loc`` is used as the marker that the output still behaves like a frame.
    assert hasattr(result, 'loc')
#return pd.concat([X, histopatological_diagnosis_encoded], axis=1).astype(float) #return X.join(histopatological_diagnosis_encoded) return X X_encoded = encode(X) X_encoded # %% [markdown] # We will examine models with # %% pipelines = dict( pipeline_logistic_regression=Pipeline([ ('encoder', FunctionTransformer(encode)), ('scaler', StandardScaler()), ('clf', LogisticRegression(max_iter=300)) ]), pipeline_pca_logistic_regression=Pipeline([ ('encoder', FunctionTransformer(encode)), ('scaler', StandardScaler()), ('pca', PCA(0.95)), ('clf', LogisticRegression(max_iter=300)) ]), pipeline_nn=Pipeline([('encoder', FunctionTransformer(encode)), ('scaler', StandardScaler()), ('clf', MLPClassifier((10, ), learning_rate='adaptive', early_stopping=True))]), pipeline_pca_nn=Pipeline([('encoder', FunctionTransformer(encode)), ('scaler', StandardScaler()), ('pca', PCA(0.95)), ('clf',
def __init__(self):
    """Create the two function-based transformers used by this object."""
    # Look the callable up first: the assignment below stores a transformer
    # under the very same attribute name and shadows it on the instance.
    # NOTE(review): presumably ``_drop_first_feature`` is a method defined on
    # the class -- confirm against the class body.
    drop_first_feature = self._drop_first_feature

    self._log_transformer = FunctionTransformer(np.log1p)
    self._drop_first_feature = FunctionTransformer(drop_first_feature)
from timeserio.preprocessing import (PandasColumnSelector, PandasDateTimeFeaturizer, PandasValueSelector, utils) @pytest.fixture def input_df(): df = mock.mock_raw_data(ids=[0, 1]) df['group'] = np.random.randint(2, size=len(df)) return df col_selector = ('select', PandasColumnSelector([ini.Columns.target])) val_selector = ('select', PandasValueSelector([ini.Columns.target])) identity = ('identity', utils.IdentityRegressor()) lagger = ('lag', FunctionTransformer(lambda x: x.shift(1), validate=False)) @pytest.mark.parametrize( 'pipeline, groupby, is_estimator', [ (Pipeline([val_selector]), 'id', False), (Pipeline([val_selector]), ['id'], False), (Pipeline([val_selector]), ['id', 'group'], False), (Pipeline_sk([val_selector]), ['id'], False), (Pipeline_sk([val_selector]), ['id', 'group'], False), (Pipeline([val_selector, identity]), ['id'], True), (Pipeline([val_selector, identity]), ['id', 'group'], True), (Pipeline_sk([val_selector, identity]), ['id'], True), (Pipeline_sk([val_selector, identity]), ['id', 'group'], True), ]
# Import CountVectorizer from sklearn.feature_extraction.text import CountVectorizer # Import other preprocessing modules from sklearn.preprocessing import Imputer from sklearn.feature_selection import chi2, SelectKBest # Select 300 best features chi_k = 300 # Import functional utilities from sklearn.preprocessing import FunctionTransformer, MaxAbsScaler from sklearn.pipeline import FeatureUnion # Perform preprocessing get_text_data = FunctionTransformer(combine_text_columns, validate=False) get_numeric_data = FunctionTransformer(lambda x: x[NUMERIC_COLUMNS], validate=False) # Create the token pattern: TOKENS_ALPHANUMERIC TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)' # Instantiate pipeline: pl pl = Pipeline([ ('union', FeatureUnion( transformer_list=[('numeric_features', Pipeline([('selector', get_numeric_data), ('imputer', Imputer())])), ('text_features',
# containing the claim amount (``ClaimAmount``) for the same policy ids
# (``IDpol``).
df = load_mtpl2(n_samples=60000)

# Note: filter out claims with zero amount, as the severity model
# requires strictly positive target values.
_zero_amount_claims = (df["ClaimAmount"] == 0) & (df["ClaimNb"] >= 1)
df.loc[_zero_amount_claims, "ClaimNb"] = 0

# Correct for unreasonable observations (that might be data error)
# and a few exceptionally large claim amounts
for _column, _upper in (("ClaimNb", 4),
                        ("Exposure", 1),
                        ("ClaimAmount", 200000)):
    df[_column] = df[_column].clip(upper=_upper)

# Log-transform, then standardize.
log_scale_transformer = make_pipeline(
    FunctionTransformer(func=np.log),
    StandardScaler(),
)

# Per-column preprocessing; columns not listed here are dropped.
_column_specs = [
    ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]),
    ("onehot_categorical", OneHotEncoder(),
     ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
    ("passthrough_numeric", "passthrough", ["BonusMalus"]),
    ("log_scaled_numeric", log_scale_transformer, ["Density"]),
]
column_trans = ColumnTransformer(_column_specs, remainder="drop")
X = column_trans.fit_transform(df)

# Insurance companies are interested in modeling the Pure Premium, that is
def lab_gaussian(X_train, y_train):
    """Fit and return a pipeline: ``rgb_to_lab`` conversion, then GaussianNB.

    ``X_train`` and ``y_train`` are passed unchanged to the pipeline's
    ``fit``; the conversion step is created with ``validate=True``.
    """
    to_lab = FunctionTransformer(rgb_to_lab, validate=True)
    classifier = make_pipeline(to_lab, GaussianNB())
    classifier.fit(X_train, y_train)
    return classifier
dow_trans = DayOfWeekTransformer()
month_trans = MonthTransformer()
tfidf_vec = TfidfVectorizer(ngram_range=(1, 2), max_features=2000)


def select_time_column(X):
    """Return column 0 of ``X`` (used below as the time column)."""
    return X[:, 0]


def select_text_column(X):
    """Return column 1 of ``X`` (used below as the text column)."""
    return X[:, 1]


# One branch per derived feature; each branch first picks its source column.
_day_of_week_branch = make_pipeline(
    FunctionTransformer(select_time_column, validate=False), dow_trans)
_month_branch = make_pipeline(
    FunctionTransformer(select_time_column, validate=False), month_trans)
_text_branch = make_pipeline(
    FunctionTransformer(select_text_column, validate=False), tfidf_vec)

_regressor = LassoCV(n_alphas=200, cv=5, max_iter=2000, verbose=True,
                     n_jobs=-1, random_state=None)

pipe = make_pipeline(
    make_union(_day_of_week_branch, _month_branch, _text_branch),
    _regressor,
)

# Fit on ``X`` with ``y`` reshaped to the shape of ``y_raw``, then persist.
pipe.fit(X, np.reshape(y, y_raw.shape))
joblib.dump(pipe, 'models/pipe.pkl')
PolynomialFeatures, PowerTransformer, StandardScaler, ) from feature_engine.selection import DropFeatures from feature_engine.wrappers import SklearnTransformerWrapper _transformers = [ Binarizer(threshold=2), KBinsDiscretizer(n_bins=3, encode="ordinal"), StandardScaler(), MinMaxScaler(), Normalizer(), PowerTransformer(), FunctionTransformer(np.log, validate=True), OrdinalEncoder(), ] _selectors = [ SelectFromModel(Lasso(random_state=1)), SelectKBest(f_regression, k=2), VarianceThreshold(), RFE(Lasso(random_state=1)), ] @pytest.mark.parametrize( "transformer", [ SimpleImputer(),
def testPreProc():
    """Walk through scikit-learn preprocessing and feature selection on iris.

    Every transformer is fitted and applied purely for demonstration: the
    results are discarded and the function returns ``None``.
    """
    iris = load_iris()

    # Nondimensionalization brings data of different scales onto a common
    # scale. Common methods are standardization and interval scaling.
    # Standardization works on the columns of the feature matrix: it converts
    # the feature values to a common scale by computing z-scores.
    # Normalization works on the rows of the feature matrix: its purpose is to
    # give the sample vectors a uniform standard when similarity is computed
    # with a dot product or another kernel, i.e. every sample becomes a
    # "unit vector".
    from sklearn.preprocessing import StandardScaler
    from sklearn.preprocessing import MinMaxScaler
    MinMaxScaler().fit_transform(iris.data)
    StandardScaler().fit_transform(iris.data)

    # Binarization with the threshold set to 3; returns the binarized data.
    from sklearn.preprocessing import Binarizer
    Binarizer(threshold=3).fit_transform(iris.data)

    # Dummy (one-hot) encoding of the iris target values; returns the encoded
    # data. Note that the input has to be 2-D.
    # OneHotEncoder(sparse = False).fit_transform( testdata[['age']] )
    from sklearn.preprocessing import OneHotEncoder
    OneHotEncoder().fit_transform(iris.target.reshape((-1, 1)))

    # String-valued categorical variables can first be converted to integers
    # with LabelEncoder and then encoded with OneHotEncoder.
    # Note that LabelEncoder takes 1-D input whereas OneHotEncoder takes 2-D.
    # The original indexed ``iris.data[""]``, which is not valid NumPy
    # indexing; the string class names are a 1-D string input that fits the
    # use case described above.
    from sklearn.preprocessing import LabelEncoder
    LabelEncoder().fit_transform(iris.target_names[iris.target])

    # Missing-value imputation; returns the data with missing values filled.
    # Parameter ``missing_values`` is the marker of a missing value
    # (NaN by default); parameter ``strategy`` is the fill strategy
    # (mean by default).
    from numpy import vstack, array, nan
    try:
        # ``Imputer`` was removed from ``sklearn.preprocessing``; prefer its
        # replacement and fall back for old scikit-learn installations.
        from sklearn.impute import SimpleImputer as Imputer
    except ImportError:
        from sklearn.preprocessing import Imputer
    Imputer().fit_transform(vstack((array([nan, nan, nan, nan]), iris.data)))

    # Data transformation
    # Polynomial transformation; parameter ``degree`` is the polynomial degree.
    from sklearn.preprocessing import PolynomialFeatures
    PolynomialFeatures().fit_transform(iris.data)

    # Data transformation with a custom function, here the logarithm.
    # The first argument is a function of a single variable.
    from numpy import log1p
    from sklearn.preprocessing import FunctionTransformer
    FunctionTransformer(log1p).fit_transform(iris.data)

    # Feature selection: filter methods
    # Variance threshold; returns the data after feature selection.
    # Parameter ``threshold`` is the variance threshold.
    from sklearn.feature_selection import VarianceThreshold
    VarianceThreshold(threshold=3).fit_transform(iris.data)

    # Select the K best features; returns the data restricted to them.
    # The first argument is the function that scores the features: it takes
    # the feature matrix and the target vector and returns (score, p-value)
    # pairs, the i-th entry belonging to the i-th feature. Here it computes
    # the correlation coefficient.
    # The second argument ``k`` is the number of features to select.
    from sklearn.feature_selection import SelectKBest
    from scipy.stats import pearsonr

    def pearson_scores(X, Y):
        """Return ``(scores, p_values)`` of Pearson's r per column of X."""
        # A list is built explicitly: on Python 3 ``map`` is lazy and
        # ``array(map(...))`` would wrap the iterator instead of the values.
        # A tuple is returned so that scores and p-values are told apart
        # unambiguously.
        scores, p_values = zip(*[pearsonr(x, Y) for x in X.T])
        return array(scores), array(p_values)

    # Scoring function: Pearson correlation coefficient
    SelectKBest(pearson_scores, k=2).fit_transform(iris.data, iris.target)

    # Scoring function: chi-squared test
    from sklearn.feature_selection import chi2
    SelectKBest(chi2, k=2).fit_transform(iris.data, iris.target)

    # Mutual information method
    # from minepy import MINE
    # # MINE is not designed in a functional style, so ``mic`` wraps it as a
    # # function returning a pair whose second item is a fixed p-value of 0.5
    # def mic(x, y):
    #     m = MINE()
    #     m.compute_score(x, y)
    #     return (m.mic(), 0.5)
    # SelectKBest(lambda X, Y: array(map(lambda x: mic(x, Y), X.T)).T, k=2).fit_transform(iris.data, iris.target)

    # Feature selection: wrapper methods
    # Recursive feature elimination; returns the data after feature selection.
    # Parameter ``estimator`` is the base model.
    # Parameter ``n_features_to_select`` is the number of features to keep.
    from sklearn.feature_selection import RFE
    from sklearn.linear_model import LogisticRegression
    RFE(estimator=LogisticRegression(),
        n_features_to_select=2).fit_transform(iris.data, iris.target)

    # Feature selection: embedded methods
    # A base model with a penalty term not only filters features but also
    # reduces dimensionality.
    # Use SelectFromModel from the feature_selection module together with an
    # L1-penalized logistic regression to select features:
    from sklearn.feature_selection import SelectFromModel
    from sklearn.linear_model import LogisticRegression
    # Feature selection with L1-penalized logistic regression as base model.
    # ``liblinear`` is named explicitly because it supports the L1 penalty.
    SelectFromModel(
        LogisticRegression(penalty="l1", C=0.1, solver="liblinear")
    ).fit_transform(iris.data, iris.target)

    # The L1 penalty reduces dimensionality by keeping only one of several
    # features that are equally correlated with the target, so a feature that
    # was not selected is not necessarily unimportant. It can therefore be
    # combined with an L2 penalty: if a feature has weight 1 under L1, the
    # features whose L2 weights are close to it and whose L1 weight is 0 form
    # a group, and the L1 weight is shared equally among that group.

    # Feature selection with GBDT as base model
    from sklearn.ensemble import GradientBoostingClassifier
    SelectFromModel(GradientBoostingClassifier()).fit_transform(
        iris.data, iris.target)
def __init__(self):
    """Wrap ``to_dense`` in a non-validating ``FunctionTransformer``."""
    transformer_options = {"validate": False}
    self.transformer_ = FunctionTransformer(to_dense, **transformer_options)
def __init__(self, impute_val=None):
    """Wrap ``impute_null`` in a non-validating ``FunctionTransformer``.

    ``impute_val`` is forwarded to ``impute_null`` as the keyword argument
    of the same name.
    """
    function_kwargs = dict(impute_val=impute_val)
    self.transformer_ = FunctionTransformer(
        impute_null,
        validate=False,
        kw_args=function_kwargs,
    )
#
# dataframe slicing
# selectionlist gets passed the parameter list from the json object
selectionlist = []
selectionlist.extend((args.list))

# read data
df1 = pd.read_table('penalties.csv', sep=';', header=0)

# all headers
colnames = list(df1.columns.values)

# slice data
# ``.ix`` was removed from pandas. The selection is by column label (the same
# list is compared against the header names and reused as column names
# below), so ``.loc`` is the matching replacement.
X = df1.loc[:, selectionlist]

# sqrt transform the heavily skewed data
transformer = FunctionTransformer(np.sqrt)
Xtran = transformer.transform(X)
X = pd.DataFrame(Xtran)
selectionheaders = selectionlist
oldnames = X.columns.values

# rename all columns with the original column headers
X.rename(columns=dict(zip(oldnames, selectionheaders)), inplace=True)

# remaining columns (everything that was not selected)
colnamesrest = [x for x in colnames if x not in selectionlist]
Rest = df1.loc[:, colnamesrest]

# delete the multiplier column
del Rest['multiplier']

# plot 3 by 3 scatter plot matrix
try:
    from pandas.plotting import scatter_matrix
except ImportError:
    # Older pandas releases only provide the legacy module path.
    from pandas.tools.plotting import scatter_matrix
scatter_matrix(X, alpha=0.2, figsize=(3, 3))
def on_field(f, *vec):
    """Build a pipeline that first extracts item ``f``, then applies ``vec``.

    The first step calls ``itemgetter(f)`` on its input (without input
    validation); the remaining positional arguments become the following
    pipeline steps, in order.
    """
    field_selector = FunctionTransformer(itemgetter(f), validate=False)
    steps = (field_selector,) + vec
    return make_pipeline(*steps)
def __init__(self, cast_type=None):
    """Wrap ``feature_cast`` in a non-validating ``FunctionTransformer``.

    ``cast_type`` is forwarded to ``feature_cast`` as the keyword argument
    of the same name.
    """
    function_kwargs = dict(cast_type=cast_type)
    self.transformer_ = FunctionTransformer(
        feature_cast,
        validate=False,
        kw_args=function_kwargs,
    )
# %% # The remaining columns can be used to predict the frequency of claim events. # Those columns are very heterogeneous with a mix of categorical and numeric # variables with different scales, possibly very unevenly distributed. # # In order to fit linear models with those predictors it is therefore # necessary to perform standard feature transformations as follows: from sklearn.pipeline import make_pipeline from sklearn.preprocessing import FunctionTransformer, OneHotEncoder from sklearn.preprocessing import StandardScaler, KBinsDiscretizer from sklearn.compose import ColumnTransformer log_scale_transformer = make_pipeline( FunctionTransformer(np.log, validate=False), StandardScaler() ) linear_model_preprocessor = ColumnTransformer( [ ("passthrough_numeric", "passthrough", ["BonusMalus"]), ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]), ("log_scaled_numeric", log_scale_transformer, ["Density"]), ("onehot_categorical", OneHotEncoder(), ["VehBrand", "VehPower", "VehGas", "Region", "Area"]), ], remainder="drop",
#4.5 Transforming Features
# Load libraries
import numpy as np
from sklearn.preprocessing import FunctionTransformer

# Create feature matrix
features = np.array([[2, 3], [2, 3], [2, 3]])


# Define a simple function
def add_ten(x):
    """Return ``x`` with 10 added."""
    return x + 10


# Create transformer
ten_transformer = FunctionTransformer(add_ten)

# Transform feature matrix
_transformed_features = ten_transformer.transform(features)
print("Transform features: \n", _transformed_features)

# Visual spacing between the two demonstrations.
for _ in range(3):
    print("\n")

# Load library
import pandas as pd

# Create DataFrame
df = pd.DataFrame(features, columns=["feature_1", "feature_2"])

# Apply function
_frame_result = df.apply(add_ten)
print(_frame_result)
print("\n")
# %%
# The remaining columns can be used to predict the frequency of claim events.
# Those columns are very heterogeneous with a mix of categorical and numeric
# variables with different scales, possibly very unevenly distributed.
#
# In order to fit linear models with those predictors it is therefore
# necessary to perform standard feature transformations as follows:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder
from sklearn.preprocessing import StandardScaler, KBinsDiscretizer
from sklearn.compose import ColumnTransformer

# Log-transform (no input validation), then standardize.
_log_step = FunctionTransformer(np.log, validate=False)
log_scale_transformer = make_pipeline(_log_step, StandardScaler())

# (name, transformer, columns) triples; unlisted columns are dropped.
_preprocessing_specs = [
    ("passthrough_numeric", "passthrough", ["BonusMalus"]),
    ("binned_numeric", KBinsDiscretizer(n_bins=10), ["VehAge", "DrivAge"]),
    ("log_scaled_numeric", log_scale_transformer, ["Density"]),
    ("onehot_categorical", OneHotEncoder(),
     ["VehBrand", "VehPower", "VehGas", "Region", "Area"]),
]
linear_model_preprocessor = ColumnTransformer(
    _preprocessing_specs,
    remainder="drop",
)
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: Make sure that the class is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR',
                        dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
(training_features, testing_features,
 training_target, testing_target) = train_test_split(
    features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.816716439759918
# Selected features are stacked side by side with an unmodified copy of the
# input before the two PCA steps and the classifier.
_feature_union = make_union(
    SelectPercentile(score_func=f_classif, percentile=46),
    FunctionTransformer(copy),
)
_classifier = LinearSVC(C=0.001, dual=False, loss="squared_hinge",
                        penalty="l2", tol=1e-05)
exported_pipeline = make_pipeline(
    _feature_union,
    PCA(iterated_power=8, svd_solver="randomized"),
    PCA(iterated_power=8, svd_solver="randomized"),
    _classifier,
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def train(self):
    """Build, fit and serialize the stacked house-price model.

    Reads ``self.trainDF`` (features plus the ``'SalePrice'`` target column),
    stores the cloudpickled fitted model in ``self.artifact`` and then moves
    on to ``self.end`` through ``self.next``.
    """
    import xgboost
    from baikal import make_step, Step, Input, Model
    from baikal.steps import Stack
    from sklearn_pandas import gen_features
    import custom_transformations as ct
    from custom_transformations import DataFrameMapperStep, ConcatDataFrame, CatBoostRegressorStep

    # these are the categorical columns in the dataset
    CATEGORICAL_COLUMNS = [
        'KitchenQual', 'MSSubClass', 'MSZoning', 'Street', 'Alley',
        'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
        'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
        'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
        'HeatingQC', 'CentralAir', 'Functional', 'FireplaceQu', 'GarageType',
        'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC',
        'Fence', 'MiscFeature', 'SaleType', 'SaleCondition', 'OverallQual',
        'OverallCond',
    ]

    # these columns will be treated as numerical columns
    NUMERICAL_COLUMNS = [
        'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
        'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
        '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
        'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
        'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
        'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
        'MoSold', 'YrSold'
    ]

    # These columns have missing values; a missing-indicator variable is
    # added for each of them
    MISSING_INDICATOR = [
        'LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
        'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
        'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
        'MiscFeature'
    ]

    ## Categorical columns for which we want one-hot encoding
    ONEHOT_COLUMNS = [
        'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
        'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2',
        'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType',
        'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
        'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
        'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
        'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
        'SaleType', 'SaleCondition'
    ]

    ## Categorical columns for which we want target encoding
    TARGET_COLUMNS = [
        'MSSubClass', 'Neighborhood', 'Exterior1st', 'Exterior2nd'
    ]

    ## Columns that require log transformations
    LOG_COLUMNS = [
        'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
        'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
        'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal'
    ]

    # Numerical columns that are not log-transformed. A list comprehension
    # keeps the declaration order; the previous set difference produced an
    # arbitrary column order that could differ from run to run.
    PLAIN_NUMERICAL_COLUMNS = [
        col for col in NUMERICAL_COLUMNS if col not in LOG_COLUMNS
    ]

    # Define Steps
    ElasticNetStep = make_step(ElasticNet, class_name='ElasticNet')
    ConcatStep = make_step(ConcatDataFrame, class_name='Concat')
    XGBRegressorStep = make_step(xgboost.XGBRegressor,
                                 class_name='XGBRegressor')
    LinearRegressionStep = make_step(sklearn.linear_model.LinearRegression,
                                     class_name='LinearRegression')

    # Define sklearn-pandas transformations. Here I am using gen_features
    # utility to define transformations for individual columns.
    # NOTE: the builtin ``float`` / ``object`` are used for ``astype``; the
    # ``np.float`` / ``np.object`` aliases they replace were removed from
    # NumPy and were plain aliases of these builtins.
    baseProcessing = (
        gen_features(columns=[[x] for x in MISSING_INDICATOR],
                     classes=[{
                         'class': MissingIndicator,
                         'features': 'all',
                         'sparse': False,
                         'error_on_new': False
                     }],
                     prefix='na_') +
        gen_features(
            columns=LOG_COLUMNS,
            classes=[{
                'class': FunctionTransformer,
                'func': lambda x: x.astype(float).reshape((-1, 1))
            }, {
                'class': SimpleImputer,
                'strategy': 'mean'
            }, {
                'class': FunctionTransformer,
                'func': np.log1p
            }]) +
        gen_features(
            columns=PLAIN_NUMERICAL_COLUMNS,
            classes=[{
                'class': FunctionTransformer,
                'func': lambda x: x.astype(float).reshape((-1, 1))
            }, {
                'class': SimpleImputer,
                'strategy': 'mean'
            }],
        ) + [
            # constructing new features -- age of the house
            (['YrSold', 'YearBuilt'], [
                FunctionTransformer(
                    func=lambda x: np.clip(x[:, 0] - x[:, 1], 0, 1000)),
                FunctionTransformer(np.log1p)
            ], {
                'alias': 'age'
            }),
            # constructing new feature -- remodeling age
            (['YrSold', 'YearRemodAdd'], [
                FunctionTransformer(
                    func=lambda x: np.clip(x[:, 0] - x[:, 1], 0, 1000)),
                FunctionTransformer(np.log1p)
            ], {
                'alias': 'remodel_age'
            }),
            # new feature -- total surface area
            (['1stFlrSF', '2ndFlrSF', 'TotalBsmtSF'], [
                FunctionTransformer(lambda x: np.nansum(x, axis=1)),
                FunctionTransformer(np.log1p)
            ], {
                'alias': 'numerical_TotalArea'
            })
        ])

    # Since the CatBoost model can handle categorical data, we don't need to
    # encode categorical variables; we simply impute missing values and let
    # the CatBoost model handle categorical data.
    catModelPreprocessing = gen_features(
        columns=CATEGORICAL_COLUMNS,
        classes=[{
            'class': FunctionTransformer,
            'func': lambda x: x.astype(object).reshape(-1, 1)
        }, {
            'class': SimpleImputer,
            'strategy': 'most_frequent'
        }],
    )

    # For regression and XGBoost, we need to encode categorical variables
    # ourselves. Depending on the cardinality of the variable, I am either
    # using one hot encoding or target encoding.
    regressionModelProcessing = (
        gen_features(columns=[[x] for x in ONEHOT_COLUMNS],
                     classes=[{
                         'class': OneHotEncoder,
                         'handle_unknown': 'ignore',
                         'sparse': False
                     }]) +
        gen_features(columns=[[x] for x in TARGET_COLUMNS],
                     classes=[
                         {
                             'class': TargetEncoder
                         },
                         {
                             'class': SimpleImputer,
                             'strategy': 'mean'
                         },
                     ]))

    # Define DAG
    x = Input(name="x")
    y = Input(name='y')

    # Define feature transformations
    d0 = DataFrameMapperStep(baseProcessing, df_out=True,
                             name='BasePreprocess')(x, y)
    d1 = DataFrameMapperStep(regressionModelProcessing, df_out=True,
                             name='RegressionModelPreprocess')(x, y)
    d2 = DataFrameMapperStep(catModelPreprocessing, df_out=True,
                             name='CatModelPreprocess')(x, y)

    # Consolidate features for catboost and elasticnet
    regressionFeatures = ConcatStep(name='RegressionFeatures')([d0, d1])
    catFeatures = ConcatStep(name='CatBoostFeatures')([d0, d2])

    # Generate predictions using three different algorithms.
    m1 = ElasticNetStep(name='ElasticNet')(regressionFeatures, y)
    m2 = XGBRegressorStep(name='XGBoost')(regressionFeatures, y)
    m3 = CatBoostRegressorStep(name='CatBoost',
                               cat_features=CATEGORICAL_COLUMNS,
                               iterations=10)(catFeatures, y)

    # combine predictions from the models
    # NOTE(review): only m1 and m3 are stacked; the XGBoost output ``m2`` is
    # built but not used here -- confirm whether that is intentional.
    combinedPredictions = Stack(name='CombinePredictions')([m1, m3])

    # construct an ensemble model
    ensembleModel = LinearRegressionStep()(combinedPredictions, y)
    model = Model(x, ensembleModel, y)
    model.fit(self.trainDF, self.trainDF['SalePrice'])

    self.artifact = {
        'model.pkl': cloudpickle.dumps(model),
        'environment': {
            'pip': {}
        }
    }
    self.next(self.end)
# Creating a learning pipeline # ---------------------------- # The encoders for both clean and dirty data are first imported: from sklearn.preprocessing import FunctionTransformer from sklearn.preprocessing import OneHotEncoder from dirty_cat import SimilarityEncoder, TargetEncoder, MinHashEncoder,\ GapEncoder encoders_dict = { 'one-hot': OneHotEncoder(handle_unknown='ignore', sparse=False), 'similarity': SimilarityEncoder(similarity='ngram'), 'target': TargetEncoder(handle_unknown='ignore'), 'minhash': MinHashEncoder(n_components=100), 'gap': GapEncoder(n_components=100), 'numerical': FunctionTransformer(None) } # We then create a function that takes one key of our ``encoders_dict``, # returns a pipeline object with the associated encoder, # as well as a Scaler and a RidgeCV regressor: from sklearn.compose import ColumnTransformer from sklearn.pipeline import Pipeline def make_pipeline(encoding_method): # static transformers from the other columns transformers = [(enc + '_' + col, encoders_dict[enc], [col]) for col, enc in clean_columns.items()] # adding the encoded column
def make_clf(*args, **kwargs):
    """Return a new pipeline: ``crossterm`` features, then LogisticRegressionCV.

    Positional and keyword arguments are accepted but not used.
    """
    return make_pipeline(FunctionTransformer(crossterm),
                         LogisticRegressionCV())
x, y, _, test_x, test_y, smiles, test_smiles = load_data(
    data_cfg, **repr_cfg[utils_section])

# change y in case of classification
if 'classification' == task_cfg[utils_section]['task']:
    # The targets are on a log scale exactly when the configured scale,
    # lower-cased and stripped, reads 'log'.
    log_scale = 'log' == data_cfg[csv_section]['scale'].lower().strip()
    y = task_cfg[utils_section]['cutoffs'](y, log_scale)
    test_y = task_cfg[utils_section]['cutoffs'](test_y, log_scale)

training_features, training_target = x, y
testing_features = test_x

# Average CV score on the training set was: 0.8708849509343505
# The union stacks two unmodified copies of the input side by side.
_duplicated_features = make_union(FunctionTransformer(copy),
                                  FunctionTransformer(copy))
_forest = ExtraTreesClassifier(bootstrap=True,
                               criterion="entropy",
                               max_depth=20,
                               max_features=0.9000000000000001,
                               max_samples=0.5,
                               min_samples_leaf=1,
                               min_samples_split=2,
                               n_estimators=500)
exported_pipeline = make_pipeline(_duplicated_features, _forest)

# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 666)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
print('Success.')
x_fit = x_pipeline.fit_transform(x)

# Encode the labels.
y_pipeline = Pipeline([("labeler", OrdinalEncoder())])
y_fit = y_pipeline.fit_transform(y)

# Fit the model on the preprocessed features and encoded labels
model = LogisticRegression(solver="lbfgs")
model.fit(x_fit, y_fit)

# Create a fully integrated pipeline: preprocessing, model call, and
# conversion of the predictions back to labels.
_predict_step = FunctionTransformer(func=call_model,
                                    kw_args={"model": model})
_relabel_step = FunctionTransformer(
    # Turn the 0-1 labels back into setosa / virginica labels
    func=reverse_labels,
    validate=False,
    kw_args={"pipe": y_pipeline})
prediction_pipeline = Pipeline([
    ("preprocessing", x_pipeline),
    ("pipeline", _predict_step),
    ("retransform labels", _relabel_step),
])

# Try the full pipeline
preds = prediction_pipeline.transform(x)
print(confusion_matrix(y, preds))

# Save models
from sklearn.preprocessing import FunctionTransformer


def func(df, book_maker, type_of_bet):
    """Return one count column of ``df`` as a list of single-item rows.

    The column is ``"<book_maker>_over_count"`` when ``type_of_bet`` is
    ``"total"`` and ``"<book_maker>_diff_home_count"`` otherwise.
    """
    if type_of_bet == "total":
        column = "{}_over_count".format(book_maker)
    else:
        column = "{}_diff_home_count".format(book_maker)

    # Selecting with a one-element list keeps the result two-dimensional.
    return df[[column]].values.tolist()


betting_count_encoder = FunctionTransformer(
    func=func,
    kw_args=dict(book_maker="oversea", type_of_bet="total"),
)

if __name__ == "__main__":
    from data import Data

    data = Data(alliance="NBA")
    df1 = data.get_train(book_maker="oversea", type_of_bet="total")
    # print(df1[["game_time", "away_team", "home_team", "away_score", "home_score",
    #            "is_back_to_back", ]])
    print(betting_count_encoder.fit_transform(df1))
    # print(game_count_encoder.steps[1][1].categories_)
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.utils import np_utils, generic_utils
from sklearn.preprocessing import FunctionTransformer

# read data
train_tour1 = pd.read_csv('numerai_training_data.csv')
# ``.ix`` was removed from pandas; ``.iloc`` selects the first 21 columns by
# position.
# NOTE(review): assumes the CSV header gives string column labels, for which
# ``.ix[:, 0:21]`` was positional as well -- confirm against the data file.
feature = pd.DataFrame(train_tour1.iloc[:, 0:21])
target = pd.DataFrame(train_tour1.target)

# log feature
transformer = FunctionTransformer(np.log1p)
feature_log = transformer.transform(feature)

# add all features: the raw columns followed by their log1p versions
feature_log = pd.DataFrame(feature_log)
feature_all = pd.concat([feature, feature_log], axis=1)

# separate target and features
feature_all = np.asarray(feature_all)
target = np.asarray(target)

# convert list of labels to binary class matrix
target = np_utils.to_categorical(target)

# pre-processing: divide by max and subtract mean
scale = np.max(feature_all)
def __init__(self,
             model,
             df_train,
             categorical_inputs,
             categorical_imputer,
             numeric_inputs,
             numeric_imputer,
             input_preproc,
             class_names=None,
             **kwargs):
    """Set up the LIME explainer and the preprocessing in front of ``model``.

    Args:
        model: The estimator to explain; it becomes the last step of
            ``self.full_model``.
        df_train: Training data; the categorical and numeric input columns
            are selected from it to fit the interpretation preprocessing and
            to seed the explainer.
        categorical_inputs: List of categorical column names.
        categorical_imputer: The imputer that is to be used for categorical
            columns. The imputer is not allowed to add new columns or change
            order of the existing ones.
        numeric_inputs: List of numeric column names.
        numeric_imputer: The imputer that is to be used for numeric columns.
            The imputer is not allowed to add new columns or change order of
            the existing ones.
        input_preproc: Stored as ``self.input_preproc``; not used further in
            this method.
        class_names: Optional iterable of class labels; each is converted to
            ``str``. ``None`` is passed on to the explainer unchanged.
        **kwargs: Forwarded to ``LimeTabularExplainer``.

    Raises:
        ValueError: If the fitted preprocessing yields a number of columns
            different from the number of input columns.
    """
    self.model = model
    self.categorical_inputs = categorical_inputs
    self.categorical_imputer = categorical_imputer
    self.numeric_inputs = numeric_inputs
    self.numeric_imputer = numeric_imputer
    self.input_preproc = input_preproc

    # The default ``None`` must not be iterated over.
    if class_names is not None:
        class_names = [str(c) for c in class_names]

    self.interpret_preproc = make_column_transformer(
        (
            make_pipeline(
                # wrap in a function transformer to prevent being refitted
                FunctionTransformer(categorical_imputer.transform,
                                    validate=False),
                OrdinalEncoder()),
            categorical_inputs),
        # wrap in a function transformer to prevent being refitted
        (FunctionTransformer(numeric_imputer.transform, validate=False),
         numeric_inputs))

    xx_train = self.interpret_preproc.fit_transform(
        df_train[self.categorical_inputs + self.numeric_inputs])
    if xx_train.shape[1] != len(categorical_inputs) + len(numeric_inputs):
        raise ValueError(
            "Imputers are not allowed to add new columns or to change their order."
        )

    # First transformer -> its pipeline -> second pipeline step, i.e. the
    # fitted OrdinalEncoder.
    self.ordenc = self.interpret_preproc.transformers_[0][1][1]
    try:
        cat_name_idx = dict(enumerate(self.ordenc.categories_))
        self.categorical_names = dict(
            zip(categorical_inputs, self.ordenc.categories_))
    except AttributeError:
        # The encoder exposes no ``categories_``: continue without category
        # names.
        cat_name_idx = {}
        self.categorical_names = {}

    self.explainer = LimeTabularExplainer(
        xx_train,
        feature_names=categorical_inputs + numeric_inputs,
        class_names=class_names,
        categorical_features=range(len(categorical_inputs)),
        categorical_names=cat_name_idx,
        mode="classification" if is_classifier(self.model) else "regression",
        **kwargs)

    self.full_model = make_pipeline(FunctionTransformer(self._preproc_fn),
                                    self.model)
def _generate_features(self, X, y=None, numeric_extra=None, categorical_extra=None): try: self.feature_pipeline_ except AttributeError: n_days = X['dayofweek'].nunique() n_hours = X['hour'].nunique() self.feature_pipeline_ = Pipeline([( 'features', FeatureUnion([ # time of week part of TOWT ('weeks', Pipeline([ ('split', FeatureUnion([ ('days', Pipeline([ ('select', ColumnSelector('dayofweek')), ('ordinal', OrdinalEncoder(cols=['dayofweek'], return_df=False)), ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent')) ])), ('hours', Pipeline([('select', ColumnSelector('hour')), ('ordinal', OrdinalEncoder(cols=['hour'], return_df=False)), ('unknown', SimpleImputer( missing_values=-1, strategy='most_frequent'))])) ])), ('to_pandas', FunctionTransformer(lambda x: pd.DataFrame( x, columns=['dayofweek', 'hour']))), ('term', PatsyTransformer('-1 + C(dayofweek):C(hour)')) ])) if (n_days > 1) and (n_hours > 1) else ('days', Pipeline([ ('select', ColumnSelector('dayofweek')), ('ordinal', OrdinalEncoder(cols=['dayofweek'], return_df=False)), ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent')), ('to_pandas', FunctionTransformer(lambda x: pd.DataFrame( x, columns=['dayofweek']))), ('one_hot', OneHotEncoder(cols=['dayofweek'], return_df=False)) ])) if n_days > 1 else ('hours', Pipeline( [('select', ColumnSelector('hour')), ('ordinal', OrdinalEncoder(cols=['hour'], return_df=False)), ('unknown', SimpleImputer(missing_values=-1, strategy='most_frequent')), ('to_pandas', FunctionTransformer( lambda x: pd.DataFrame(x, columns=['hour']))), ('one_hot', OneHotEncoder(cols=['hour'], return_df=False))])), # temperature part of TOWT ('temperature', ColumnTransformer([ ('encode_temperature', IntervalEncoder( n_chunks=10, span=0.1 * X[self.temperature_col].std(), method='normal'), [self.temperature_col]) ])), ('temperature_interact', 'drop' if n_hours == 1 else Pipeline( [('split', FeatureUnion([ ('temperature_part', Pipeline([ ('select', 
ColumnSelector(self.temperature_col)), ( 'create_bins', KBinsDiscretizer( n_bins=self.n_bins_temperature, strategy='quantile', encode='ordinal'), ) ])), ('hour_part', Pipeline([('select', ColumnSelector('hour')), ('ordinal', OrdinalEncoder(cols=['hour'], return_df=False)), ('unknown', SimpleImputer( missing_values=-1, strategy='most_frequent'))])) ])), ('to_pandas', FunctionTransformer(lambda x: pd.DataFrame( x, columns=[self.temperature_col, 'hour']))), ('term', PatsyTransformer( f'-1 + C({self.temperature_col}):C(hour)'))])), # deal with extra numerical regressors ('numerical_regressors', 'drop' if not numeric_extra else ColumnTransformer( [(f'encode_{col}', IntervalEncoder(n_chunks=4, span=0.1 * X[col].std(), method='normal'), [col]) for col in numeric_extra])), # deal with extra categorical regressors ('categorical_regressors', 'drop' if not categorical_extra else TargetEncoder(cols=categorical_extra, return_df=False, handle_missing='value', handle_unknown='value')) ]))]) # Fit the pipeline self.feature_pipeline_.fit(X, y) finally: return self.feature_pipeline_.transform(X)
# Discretize each feature into ordinal bins (3, 2 and 2 bins respectively).
# The estimator method is ``fit``; the previous ``.fix(X)`` raised
# AttributeError.
est = preprocessing.KBinsDiscretizer(n_bins=[3, 2, 2], encode='ordinal').fit(X)
est.transform(X)

## encoding categorical features
genders = ['female', 'male']
locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']
X = [['male', 'from US', 'uses Safari'],
     ['female', 'from Europe', 'uses Firefox']]
# Explicit category lists are passed to the encoder, one list per feature.
enc = preprocessing.OneHotEncoder(categories=[genders, locations, browsers])
print(enc.fit(X))
print(enc.transform([['female', 'from Europe', 'uses Firefox']]).toarray())

## generating polynomial features
from sklearn.preprocessing import PolynomialFeatures
X = np.arange(9).reshape(3, 3)
poly0 = PolynomialFeatures(2)
poly1 = PolynomialFeatures(degree=3, interaction_only=True)
print(X)
print(poly0.fit_transform(X))
print(poly1.fit_transform(X))

## custom transformers
from sklearn.preprocessing import FunctionTransformer
# Wrap a plain function (log1p) as a transformer.
transformer = FunctionTransformer(np.log1p, validate=True)
X = np.array([[0, 1], [2, 3]])
print(transformer.transform(X))
def knn_lab(X_train, y_train):
    """Fit and return a pipeline: ``rgb_to_lab`` then k-nearest neighbours.

    The inputs are first passed through ``rgb_to_lab`` (wrapped in a
    validating ``FunctionTransformer``) and then classified with a default
    ``KNeighborsClassifier``.

    Args:
        X_train: Training features handed to the pipeline's ``fit``.
        y_train: Training labels handed to the pipeline's ``fit``.

    Returns:
        The fitted pipeline.
    """
    to_lab = FunctionTransformer(rgb_to_lab, validate=True)
    classifier = neighbors.KNeighborsClassifier()
    pipeline = make_pipeline(to_lab, classifier)
    pipeline.fit(X_train, y_train)
    return pipeline
from sklearn.preprocessing import FunctionTransformer


def add_extra_features(X, add_bedrooms_per_room=True):
    """Append ratio columns to the 2-D array ``X``.

    Adds rooms-per-household and population-per-household, and optionally
    bedrooms-per-room, as new trailing columns. Column positions come from
    the module-level ``rooms_ix``, ``household_ix``, ``population_ix`` and
    ``bedrooms_ix`` indices.

    Args:
        X: Array indexed as ``X[:, column_index]``.
        add_bedrooms_per_room: When true, also append bedrooms-per-room.

    Returns:
        ``X`` with the extra columns concatenated on the right via ``np.c_``.
    """
    households = X[:, household_ix]
    rooms = X[:, rooms_ix]
    extra_columns = [rooms / households, X[:, population_ix] / households]
    if add_bedrooms_per_room:
        extra_columns.append(X[:, bedrooms_ix] / rooms)
    # ``np.c_[a, b, c]`` is indexing with the tuple ``(a, b, c)``.
    return np.c_[(X, *extra_columns)]


# Wrap the function as a transformer; bedrooms-per-room is switched off here.
attr_adder = FunctionTransformer(
    add_extra_features,
    validate=False,
    kw_args={"add_bedrooms_per_room": False},
)
housing_extra_attribs = attr_adder.fit_transform(housing.values)

#%%
# Rebuild a DataFrame so the two appended columns get names.
extra_column_names = ["rooms_per_household", "population_per_household"]
housing_extra_attribs = pd.DataFrame(
    housing_extra_attribs,
    columns=list(housing.columns) + extra_column_names,
    index=housing.index)
housing_extra_attribs.head()

#%% [markdown]
# Now let's build a pipeline for preprocessing the numerical attributes (note that
# we could use `CombinedAttributesAdder()` instead of `FunctionTransformer(...)` if
# we preferred):
# training data
train = load_file()
X = train.drop(['target'], axis=1)
y = train.target

# bump all values up 1, so missing is now zero
cat_columns = get_cat_features_idx(X)
X = make_missing_zero(X, cat_columns)

# make a pipeline
# NOTE(review): ``categorical_features`` was removed from OneHotEncoder in
# later scikit-learn releases; migrating to ColumnTransformer is left for a
# separate change.
pipe = Pipeline([
    ('encode', OneHotEncoder(categorical_features=cat_columns,
                             handle_unknown='ignore')),
    # ``toarray()`` yields a plain ndarray; ``todense()`` returned np.matrix,
    # which current scikit-learn estimators reject.
    ('to_dense', FunctionTransformer(lambda x: x.toarray(),
                                     accept_sparse=True)),
    ('model', LogisticRegression()),
])

# The grid swaps the final 'model' step between the two candidate estimators.
param_grid = {'model': [GaussianNB(), LogisticRegression()]}

model = GridSearchCV(pipe, param_grid, scoring='roc_auc')
# ``DataFrame.as_matrix()`` was removed in pandas 1.0; ``.values`` returns
# the same ndarray and exists in every pandas version.
model.fit(X.values, y)
logger.info("Best Params: {}".format(model.best_params_))

# Keep the second column of the cross-validated probabilities.
results = cross_val_predict(model, X, y, method='predict_proba')[:, 1]
score = gini_normalized(y, results)
logger.info(
    "Cross-val normalized gini score on training set is {}".format(score))

# predict
test = make_missing_zero(load_file("test"), cat_columns)
test['target'] = model.predict_proba(test.values)[:, 1]
def myTrans(X):
    """Return ``log10(X) + 2``; any custom function could be used instead.

    Alternative transforms kept for reference:

    * Decimal-scaling normalization, rescaling into the range [-1, 1]:
      ``y = X / 10**k`` with ``k`` chosen so that ``max|y| < 1``, i.e.
      ``X / 10**np.ceil(np.log10(np.abs(X).max()))``.
    * Logistic mode: ``y = 1 / (1 + e^(-X))``, i.e. ``1 / (1 + np.exp(-X))``.
    """
    return np.log10(X) + 2


# Wrap the custom function as a transformer and apply it to two columns.
trans = FunctionTransformer(myTrans)
cols = ['年龄', '收入']
X_ = trans.fit_transform(df[cols])
print(X_)

######################################################################
######## Part 2. Standardization, regularization, etc.
######################################################################

#####=======StandardScaler============
### z-score standardization: y = (X - mean) / std
# Standardization: rescale data of different scales and units to a common
# interval and range, to reduce the effect that differences in scale,
# features and distribution have on the model.
# Method: convert the data to a standard normal distribution (mean = 0,
# standard deviation = 1) so that properties of the normal distribution can
# be used. It is a centering method that changes the original data
# distribution and is not suitable for sparse data.
from sklearn.preprocessing import StandardScaler
Any step in the pipeline must be an object that implements the fit and transform methods. The FunctionTransformer creates an object with these methods out of any Python function that you pass to it. We'll use it to help select subsets of data in a way that plays nicely with pipelines. You are working with numeric data that needs imputation, and text data that needs to be converted into a bag-of-words. You'll create functions that separate the text from the numeric variables and see how the .fit() and .transform() methods work. INSTRUCTIONS 100XP Compute the selector get_text_data by using a lambda function and FunctionTransformer() to obtain all 'text' columns. Compute the selector get_numeric_data by using a lambda function and FunctionTransformer() to obtain all the numeric columns (including missing data). These are 'numeric' and 'with_missing'. Fit and transform get_text_data using the .fit_transform() method with sample_df as the argument. Fit and transform get_numeric_data using the same approach as above. ''' # Import FunctionTransformer from sklearn.preprocessing import FunctionTransformer # Obtain the text data: get_text_data get_text_data = FunctionTransformer(lambda x: x['text'], validate=False) # Obtain the numeric data: get_numeric_data get_numeric_data = FunctionTransformer(lambda x: x[['numeric', 'with_missing']], validate=False) # Fit and transform the text data: just_text_data just_text_data = get_text_data.fit_transform(sample_df) # Fit and transform the numeric data: just_numeric_data just_numeric_data = get_numeric_data.fit_transform(sample_df) # Print head to check results print('Text Data') print(just_text_data.head()) print('\nNumeric Data') print(just_numeric_data.head())