def scale_onehot(df, target):
    """Scale numeric columns and one-hot encode categorical columns.

    Builds a DataFrameMapper that constant-imputes and one-hot encodes every
    object column, and imputes + standard-scales every numeric column, then
    fits it on *df*.  The target column is encoded as ordered integer codes.

    Returns a tuple ``(X, y, target_names)``: the transformed feature frame,
    the integer-coded labels, and the sorted unique label values.
    """
    predictors = df.drop(target, axis=1)

    cat_defs = gen_features(
        columns=[[name] for name in predictors.select_dtypes('object').columns],
        classes=[
            {'class': SimpleImputer, 'strategy': 'constant', 'fill_value': 'Na'},
            OneHotEncoder,
        ],
    )
    num_defs = gen_features(
        columns=[[name] for name in predictors.select_dtypes('number').columns],
        classes=[SimpleImputer, StandardScaler],
    )

    mapper = DataFrameMapper(cat_defs + num_defs, df_out=True)
    X = mapper.fit_transform(df)

    labels = df[target]
    target_names = sorted(labels.unique())
    # Encode labels as integer codes in a stable, sorted category order.
    y = pd.Categorical(labels, categories=target_names, ordered=True).codes
    return X, y, target_names
def make_pipeline_model(numeric_feature, category_feature, estimator, X=None, y=None):
    '''Build a PMML pipeline from the given feature lists and estimator.

    Categorical features are domain-decorated, imputed and label-binarized;
    numeric features are domain-decorated, mean-imputed and standard-scaled.
    If both X and y are provided, the pipeline is fitted before returning.

    numeric_feature: list of numeric feature names
    category_feature: list of categorical feature names
    estimator: sklearn-compatible estimator used as the final classifier
    X: feature data as a pandas.DataFrame (optional)
    y: target data as a pandas.Series (optional)
    return: pipeline_model (PMMLPipeline; fitted when X and y were given)
    '''
    feature_def = gen_features(
        columns=category_feature,
        classes=[CategoricalDomain, CategoricalImputer, LabelBinarizer])
    mapper_numerical = DataFrameMapper([(numeric_feature, [
        ContinuousDomain(),
        SimpleImputer(strategy='mean'),
        StandardScaler()
    ])])
    mapper_category = DataFrameMapper(feature_def)
    # Concatenate numeric and categorical outputs into one feature matrix.
    mapper = FeatureUnion([('mapper_numerical', mapper_numerical),
                           ('mapper_category', mapper_category)])
    pipeline_model = PMMLPipeline([('mapper', mapper),
                                   ('classifier', estimator)])
    if X is not None and y is not None:
        pipeline_model.fit(X, y)
    return pipeline_model
def clean_df(df, id_cols=None):
    """Impute missing values in the numeric columns of *df*.

    Numeric columns are imputed via CategoricalImputer (most-frequent value)
    and merged back with the untouched non-numeric columns on the index.

    df: input DataFrame.  NOTE: mutated in place when *id_cols* is given,
        because set_index(inplace=True) is applied to it.
    id_cols: optional list of column names to promote to the index first.
        Defaults to no index change (replaces the previous mutable-default
        ``id_cols=[]`` signature; behavior is unchanged for all callers).
    Returns a new DataFrame with imputed numeric columns plus the original
    non-numeric columns.
    """
    from sklearn_pandas import gen_features, DataFrameMapper, CategoricalImputer
    print("Imputation of numeric Columns")
    # Idiomatic truthiness instead of `id_cols.__len__() > 0`; also avoids
    # the mutable-default-argument pitfall of the old signature.
    if id_cols:
        df.set_index(keys=id_cols, inplace=True)
    df_numeric = df.select_dtypes(include=[int, float])
    df_non_num = df.select_dtypes(exclude=[int, float])
    num_imp_train = gen_features(columns=df_numeric.columns,
                                 classes=[CategoricalImputer])
    num_map_train = DataFrameMapper(num_imp_train, df_out=True, input_df=True)
    print("Train Dataset numeric Impute")
    df_new = num_map_train.fit_transform(df_numeric)
    # Re-attach the untouched non-numeric columns via an index join.
    df_new = df_new.merge(df_non_num, left_index=True, right_index=True)
    print("Imputation with Mode Complete")
    return df_new
def create_mapper_sklearn_pandas_contrib(categorical_features, numeric_features):
    """Build a DataFrameMapper: one-hot encode categoricals, scale numerics.

    Columns listed in neither group are passed through unchanged
    (``default=None``).  Returns the unfitted mapper.
    """
    from sklearn_pandas import DataFrameMapper, gen_features

    one_hot_spec = {
        'class': OneHotEncoder,
        'dtype': np.float32,
        'sparse': False,
        'handle_unknown': 'ignore',
    }
    cat_defs = gen_features(columns=[[name] for name in categorical_features],
                            classes=[one_hot_spec])
    num_defs = gen_features(columns=[[name] for name in numeric_features],
                            classes=[StandardScaler])
    return DataFrameMapper(cat_defs + num_defs, default=None)
def scale_data(data):
    """Standard-scale every column of *data*.

    Returns ``(mapper, transformed_data)`` so the fitted mapper can later be
    applied to new data with the same scaling parameters.
    """
    # One single-column selector per DataFrame column.
    column_selectors = [[name] for name in data.columns]
    feature_def = gen_features(columns=column_selectors,
                               classes=[sklearn.preprocessing.StandardScaler])
    mapper = DataFrameMapper(feature_def, df_out=True)
    transformed = mapper.fit_transform(data)
    return mapper, transformed
def transform_data(self, df, runtime_label): df_features, df_labels = df, df.pop(runtime_label) # Define which features are going to be transformed to a range of 0 to 1 (continuous) nfeats = gen_features( columns=[[i] for i in list(df_features.select_dtypes(include=[float]))], classes=[sklearn.preprocessing.MinMaxScaler] ) # Define which features are going to be binarized (categorical) sfeats = gen_features( columns=list(df.select_dtypes(include=[object])), classes=[sklearn.preprocessing.LabelBinarizer] ) # Do the transformations defined above mapper = DataFrameMapper(nfeats+sfeats,df_out=True) df_features = mapper.fit_transform(df_features) return df_features, df_labels
def feature_union(category_feature, numeric_feature):
    """Build a FeatureUnion combining categorical and numeric column mappers.

    category_feature: categorical column names; each gets domain decoration,
        imputation and label encoding (PMML-friendly).
    numeric_feature: numeric column names, mean-imputed and standard-scaled
        together as a single group.
    Returns the unfitted FeatureUnion transformer.
    """
    mapper_category = DataFrameMapper(gen_features(
        columns=category_feature,
        # LabelEncoder
        classes=[CategoricalDomain, CategoricalImputer, LabelEncoder]
    ))
    mapper_numerical = DataFrameMapper([
        (numeric_feature,
         [ContinuousDomain(), SimpleImputer(strategy='mean'), StandardScaler()])
    ])
    pipeline_transformer = FeatureUnion([('mapper_category', mapper_category),
                                         ('mapper_numerical', mapper_numerical)])
    return pipeline_transformer
def build_converter(self):
    """ Prepares a mapper between Pandas Dataframe and sklearn matrix """
    # Label column(s): binarized for use as the model target.
    label_encoding = gen_features(columns=self.label_fields,
                                  classes=[LabelBinarizer])
    # Categorical columns: impute most-frequent value, then one-hot encode;
    # output columns get a "_cat" suffix.
    categorical = gen_features(columns=[[f] for f in self.categorical_fields],
                               classes=[{
                                   'class': SimpleImputer,
                                   'strategy': "most_frequent"
                               }, {
                                   'class': OneHotEncoder,
                                   'sparse': False
                               }],
                               suffix="_cat")
    # Numeric columns (first element of each thresholds tuple): median impute.
    numeric = gen_features(columns=[[t[0]] for t in self.thresholds],
                           classes=[{
                               'class': SimpleImputer,
                               'strategy': 'median'
                           }])
    # if any boolean
    boolean = []
    for f in self.df.columns:
        if self.df[f].dtype == bool and f not in self.label_fields:
            # Cast bools to numeric in place on self.df; pass them through
            # the mapper untransformed (None transformer).
            self.df[f] = pd.to_numeric(self.df[f])
            boolean.append(([f], None))
    # Numeric-only mode skips categorical and boolean features entirely.
    mapper_X = DataFrameMapper(
        numeric) if self.only_numeric else DataFrameMapper(categorical +
                                                           boolean + numeric)
    mapper_y = DataFrameMapper(label_encoding)
    return mapper_X, mapper_y
def fit(self, X, y=None):
    """Fit the internal mapper: MinMaxScaler for float columns, a binarizer
    for everything else.  *y* is accepted for sklearn API compatibility only.
    Returns self."""
    self.ncols = []  # single-column selectors for numeric (float) columns
    self.scols = []  # single-column selectors for all other columns
    # print("mapping features")
    for col in X:
        if X[col].dtype == float:
            # print("numerical col: %s" % col)
            self.ncols.append([col])
        else:
            # print("categorical col: %s" % col)
            self.scols.append([col])
    nfeats = gen_features(
        columns=self.ncols,
        classes=[{'class': sklearn.preprocessing.MinMaxScaler, }]
    )
    sfeats = gen_features(
        columns=self.scols,
        classes=[{'class': LabelBinarizer2}]
    )
    self.mapper = DataFrameMapper(nfeats + sfeats, df_out=True)
    self.mapper.fit(X)
    # print("features mapped")
    return self
def _create_apply_transformers(df):
    """Ordinal-encode every object-dtype column of *df*.

    Unknown and missing categories are mapped to NaN; all remaining columns
    pass through untouched (``default=None``).  Returns the transformed
    DataFrame together with the fitted mapper.
    """
    from sklearn_pandas import DataFrameMapper, gen_features
    import category_encoders as ce

    object_columns = list(df.select_dtypes("object").columns)
    encoder_spec = {
        "class": ce.OrdinalEncoder,
        "handle_unknown": "return_nan",
        "handle_missing": "return_nan",
    }
    mapper = DataFrameMapper(
        gen_features(columns=object_columns, classes=[encoder_spec]),
        default=None,
        df_out=True,
    )
    transformed = mapper.fit_transform(df)
    return transformed, mapper
# Separate the binary purchase label from the training features.
TARGET = df_train_flt['isPurchase'].copy()
TRAIN = df_train_flt.drop('isPurchase', axis=1).copy()
# 1. Drop some columns
use_cols = [
    'channelGrouping', 'visitNumber', 'device.deviceCategory',
    'device.isMobile', 'totals.hits', 'totals.newVisits', 'totals.pageviews',
    'trafficSource.isTrueDirect'
]
TRAIN = TRAIN[use_cols]
# Preprocessing
# Pipeline
# Categorical columns: label-encode then one-hot encode.
feature_cat = gen_features(
    columns=['channelGrouping', 'device.deviceCategory'],
    classes=[ModifiedLabelEncoder, OneHotEncoder])
# Numeric columns: standard-scale each column individually.
feature_num = gen_features(columns=[['visitNumber'], ['device.isMobile'],
                                    ['totals.hits'], ['totals.newVisits'],
                                    ['totals.pageviews'],
                                    ['trafficSource.isTrueDirect']],
                           classes=[StandardScaler])
mapper = DataFrameMapper(feature_cat + feature_num, input_df=True, df_out=True)
TRAIN_preprocessed = mapper.fit_transform(TRAIN.copy())
# Split train/test sets
X_train, X_test, y_train, y_test = train_test_split(TRAIN_preprocessed,
                                                    TARGET,
                                                    random_state=26)
# Auto-sklearn
# Load the raw train/test tables via the project loader.
df_train = loader.read_original_data(table_code='train')
df_test = loader.read_original_data(table_code='test')
# Consider only a subset of columns
df_train.set_index('PassengerId', inplace=True)
df_test.set_index('PassengerId', inplace=True)
USE_COLS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
TARGET = ['Survived']
X_train = df_train[USE_COLS].copy()
# Flatten the single-column target frame into a 1-D array.
y_train = df_train[TARGET].copy().values.reshape(-1, )
X_test = df_test[USE_COLS].copy()
# Preprocessing
# Categorical columns: impute, label-encode, then one-hot encode.
feature_cat = gen_features(
    columns=['Pclass', 'Sex', 'Embarked'],
    classes=[CategoricalImputer, ModifiedLabelEncoder, OneHotEncoder])
# Numeric columns: impute missing values, then standard-scale.
feature_num = gen_features(columns=[['Age'], ['SibSp'], ['Parch'], ['Fare']],
                           classes=[Imputer, StandardScaler])
mapper = DataFrameMapper(feature_cat + feature_num, input_df=True, df_out=True)
X_train_fit = mapper.fit_transform(X_train.copy())
# Training
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
data.columns = [str(c) for c in data.columns] # Shuffle, Shuffle and Shuffle! data = data.sample(frac=1) num_cols = len(data.columns) X = data.iloc[:,1:num_cols-1] y = data.iloc[:,num_cols-1] X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=42) #Preprocess X_train feature_def = gen_features( columns=[[c] for c in X_train.columns[:7]], classes=[MinMaxScaler] ) feature_def += ((pos_col, [LabelBinarizer()]),) svc_preprocessor = DataFrameMapper(feature_def) X_train = svc_preprocessor.fit_transform(X_train) svc_preprocessor_fn = os.path.join('../model/tmp/svc_preprocessor.%s.pkl' % (nrows,)) joblib.dump(svc_preprocessor, open(svc_preprocessor_fn, 'wb')) X_test = svc_preprocessor.transform(X_test) ##### #Didn't help!! #X_train, y_train = downsample_negatives(X_train, y_train) for cv in [1,10,20]:
# Consider only a subset of columns df_train.set_index('PassengerId', inplace=True) df_test.set_index('PassengerId', inplace=True) #print(df_train.head()) USE_COLS = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'] TARGET = ['Survived'] X_train = df_train[USE_COLS].copy() y_train = df_train[TARGET].copy().values.reshape(-1,) X_test = df_test[USE_COLS].copy() # Preprocessing # 1. 1-hot encode categorical columns feature_cat = gen_features(columns=['Pclass', 'Sex', 'Embarked'], classes=[CategoricalImputer, {'class': FunctionTransformer, 'func': pd.get_dummies, 'validate':False}] ) feature_num = gen_features(columns=[['Age'], ['SibSp'], ['Parch'], ['Fare']], classes=[Imputer, StandardScaler]) ''' mapper = DataFrameMapper([ ('Sex', [CategoricalImputer(), FunctionTransformer(pd.get_dummies, validate=False)]), ('Embarked', [CategoricalImputer(), FunctionTransformer(pd.get_dummies, validate=False)]), (['Age', 'SibSp', 'Parch', 'Fare'], [Imputer(), StandardScaler()]) ], df_out=True) ''' mapper = DataFrameMapper( feature_cat + feature_num, input_df=True, df_out=True)
df = df.sort_values([ 'DayOfYear', 'Carrier', 'Origin', 'Dest', 'FlightNum', 'CRSDepTime', 'CRSArrTime' ], inplace=False).reset_index(drop=True) # Select subset training = df[[ 'FlightNum', 'DayOfWeek', 'DayOfMonth', 'Carrier', 'Origin', 'Dest', 'Distance', 'DepDelay', 'CRSDepTime', 'CRSArrTime', 'DayOfYear' ]] training.loc[:, ['CRSDepTime', 'CRSArrTime' ]] = training.loc[:, ['CRSDepTime', 'CRSArrTime']].astype(int) # Convert categoricals to indicators. feature_def = gen_features(columns=['Carrier', 'Origin', 'Dest'], classes=[LabelBinarizer]) mapper = DataFrameMapper(feature_def, default=None) training_vectors = mapper.fit_transform(training) results_vector = df.ArrDelay.values df_training = pd.DataFrame(columns=mapper.transformed_names_, data=training_vectors) # Generate train/test sets. X_train, X_test, y_train, y_test = train_test_split(training_vectors, results_vector, test_size=0.1, random_state=43) # Do the regression. regressor = LinearRegression() regressor.fit(X_train, y_train) predicted = regressor.predict(X_test) # Show results
data.columns = [str(c) for c in data.columns] # Shuffle, Shuffle and Shuffle! data = data.sample(frac=1) num_cols = len(data.columns) X = data.iloc[:, 1:num_cols - 1] y = data.iloc[:, num_cols - 1] X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42) #Preprocess X_train feature_def = gen_features(columns=[[c] for c in X_train.columns[:7]], classes=[MinMaxScaler]) feature_def += ((pos_col, [LabelBinarizer()]), ) svc_preprocessor = DataFrameMapper(feature_def) X_train = svc_preprocessor.fit_transform(X_train) svc_preprocessor_fn = os.path.join('../model/tmp/svc_preprocessor.%s.pkl' % (nrows, )) joblib.dump(svc_preprocessor, open(svc_preprocessor_fn, 'wb')) X_test = svc_preprocessor.transform(X_test) ##### #Didn't help!! #X_train, y_train = downsample_negatives(X_train, y_train) for cv in [1, 10, 20]:
def create_preprocessing_pipeline() -> Pipeline:
    """Build the house-price preprocessing pipeline.

    Steps, in order: drop the Id and high-missing-rate columns, drop rows
    with missing Electrical, constant-impute garage/basement/masonry
    columns, backfill GarageYrBlt from YearBuilt, remap coded categorical
    values to readable labels (and quality scales to ordinal integers),
    and finally drop low-information columns.
    """

    def impute_garage_yr_blt(X, y):
        # A missing garage year is replaced by the house's build year.
        X.loc[X['GarageYrBlt'].isnull(),
              ['GarageYrBlt']] = X.loc[X['GarageYrBlt'].isnull(), 'YearBuilt']
        return X

    pipeline = Pipeline(steps=[
        ('drop_id', tr.DropColumns(columns_to_drop=['Id'])),
        ('drop_cols_with_missing_values',
         tr.DropColumns(columns_to_drop=[
             'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
             'LotFrontage'
         ])),
        ('drop_electrical_rows',
         tr.DataFrameFunctionTransformer(
             lambda X, y: X.drop(X.loc[X['Electrical'].isnull()].index))),
        ('impute_garage_columns',
         DataFrameMapper(gen_features(columns=[['GarageType'], [
             'GarageFinish'
         ], ['GarageQual'], ['GarageCond']],
                                      classes=[{
                                          'class': SimpleImputer,
                                          'strategy': 'constant',
                                          'fill_value': 'No'
                                      }]),
                         input_df=True,
                         df_out=True,
                         default=None)),
        ('impute_garage_yr_blt',
         tr.DataFrameFunctionTransformer(impute_garage_yr_blt)),
        ('impute_bsmt_columns',
         DataFrameMapper(gen_features(
             columns=[['BsmtCond'], ['BsmtQual'], ['BsmtExposure'],
                      ['BsmtFinType1'], ['BsmtFinType2']],
             classes=[{
                 'class': SimpleImputer,
                 'strategy': 'constant',
                 'fill_value': 'No'
             }]),
                         input_df=True,
                         df_out=True,
                         default=None)),
        ('impute_mas_vnr_type',
         DataFrameMapper([(
             ['MasVnrType'],
             SimpleImputer(strategy='constant', fill_value='None'))],
                         input_df=True,
                         df_out=True,
                         default=None)),
        ('impute_mas_vnr_area',
         DataFrameMapper([(['MasVnrArea'],
                           SimpleImputer(strategy='constant', fill_value=0))],
                         input_df=True,
                         df_out=True,
                         default=None)),
        # partial == (lambda y: (lambda x: y * x))(i)
        # partial binds `value` eagerly, avoiding the late-binding-closure
        # pitfall inside the comprehension below.
        ('replace_values',
         DataFrameMapper([(
             key,
             tr.SeriesFunctionTransformer(
                 partial(lambda value, col: col.replace(value), value)))
                          for (key, value) in {
                              "MSSubClass": {
                                  20: "SC20", 30: "SC30", 40: "SC40",
                                  45: "SC45", 50: "SC50", 60: "SC60",
                                  70: "SC70", 75: "SC75", 80: "SC80",
                                  85: "SC85", 90: "SC90", 120: "SC120",
                                  150: "SC150", 160: "SC160", 180: "SC180",
                                  190: "SC190"
                              },
                              "MoSold": {
                                  1: "Jan", 2: "Feb", 3: "Mar", 4: "Apr",
                                  5: "May", 6: "Jun", 7: "Jul", 8: "Aug",
                                  9: "Sep", 10: "Oct", 11: "Nov", 12: "Dec"
                              },
                              "BsmtCond": {
                                  "No": 0, "Po": 1, "Fa": 2, "TA": 3,
                                  "Gd": 4, "Ex": 5
                              },
                              "BsmtExposure": {
                                  "No": 0, "Mn": 1, "Av": 2, "Gd": 3
                              },
                              "BsmtFinType1": {
                                  "No": 0, "Unf": 1, "LwQ": 2, "Rec": 3,
                                  "BLQ": 4, "ALQ": 5, "GLQ": 6
                              },
                              "BsmtFinType2": {
                                  "No": 0, "Unf": 1, "LwQ": 2, "Rec": 3,
                                  "BLQ": 4, "ALQ": 5, "GLQ": 6
                              },
                              "BsmtQual": {
                                  "No": 0, "Po": 1, "Fa": 2, "TA": 3,
                                  "Gd": 4, "Ex": 5
                              },
                              "ExterCond": {
                                  "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5
                              },
                              "ExterQual": {
                                  "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5
                              },
                              "Functional": {
                                  "Sal": 1, "Sev": 2, "Maj2": 3, "Maj1": 4,
                                  "Mod": 5, "Min2": 6, "Min1": 7, "Typ": 8
                              },
                              "GarageCond": {
                                  "No": 0, "Po": 1, "Fa": 2, "TA": 3,
                                  "Gd": 4, "Ex": 5
                              },
                              "GarageQual": {
                                  "No": 0, "Po": 1, "Fa": 2, "TA": 3,
                                  "Gd": 4, "Ex": 5
                              },
                              "HeatingQC": {
                                  "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5
                              },
                              "KitchenQual": {
                                  "Po": 1, "Fa": 2, "TA": 3, "Gd": 4, "Ex": 5
                              },
                              "LandSlope": {
                                  "Sev": 1, "Mod": 2, "Gtl": 3
                              },
                              "LotShape": {
                                  "IR3": 1, "IR2": 2, "IR1": 3, "Reg": 4
                              },
                              "PavedDrive": {
                                  "N": 0, "P": 1, "Y": 2
                              },
                              "Street": {
                                  "Grvl": 1, "Pave": 2
                              },
                              "Utilities": {
                                  "ELO": 1, "NoSeWa": 2, "NoSewr": 3,
                                  "AllPub": 4
                              }
                          }.items()],
                         input_df=True,
                         df_out=True,
                         default=None)),
        ('drop_suspicious_columns',
         tr.DropColumns(columns_to_drop=[
             'Utilities', 'Street', 'Condition2', 'RoofMatl', 'Heating',
             'KitchenAbvGr', 'PoolArea'
         ]))
    ])
    return pipeline
def main():
    """Train and compare several classifiers on SPL9 matchup data.

    Builds windowed matchup instances, maps attributes (scaling + label
    binarization), then runs randomized hyper-parameter searches for seven
    model families, printing CV and held-out (final-week) accuracy for each.
    """
    # set parameters and retrieve data
    begin_week = 1
    end_week = 11
    window_size = 5
    records = retrieve_spl9_overall_records()
    records = add_week(records, 10, semis_matchups)
    records = add_week(records, end_week, finals_matchups)
    matchups = get_matchups(records)
    info = get_matchups_info(records, matchups, weeks(begin_week, end_week))
    instances = get_instances(info, end_week, window_size)
    # Train on all weeks before the final one; predict the final week.
    train_instances = instances.loc[[*weeks(begin_week, end_week - 1)]]
    predict_instances = instances.loc[to_week_label(end_week)]
    # Create label transformations and attribute normalizations
    attribute_mapper = DataFrameMapper([
        *gen_features(columns=[['cost_x'], ['cost_y']],
                      classes=[{
                          'class': StandardScaler
                      }]),
        *gen_features(columns=append_player_subscripts([
            *('result-{}'.format(w + 1) for w in range(window_size)),
            *('tier-{}'.format(w + 1) for w in range(window_size))
        ]),
                      classes=[LabelBinarizer]),
        *gen_features(columns=append_player_subscripts(tier_list),
                      classes=[LabelBinarizer]),
        ('tier', LabelBinarizer())
    ])
    label_mapper = DataFrameMapper([('result', LabelBinarizer())])
    X = attribute_mapper.fit_transform(train_instances.copy())
    y = label_mapper.fit_transform(train_instances.copy()).ravel()
    X_act = attribute_mapper.transform(predict_instances.copy())
    y_act = label_mapper.transform(predict_instances.copy()).ravel()
    print(X.shape)
    print(X_act.shape)
    seed = 2718281828
    validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
    with_prob = 'accuracy'
    without_prob = 'accuracy'
    # Extra Trees
    trees_clf = RandomizedSearchCV(ExtraTreesClassifier(random_state=seed),
                                   cv=validation,
                                   n_iter=500,
                                   random_state=seed,
                                   scoring=with_prob,
                                   param_distributions={
                                       'n_estimators':
                                       stats.randint(low=40, high=200),
                                       'max_features':
                                       stats.uniform(loc=0.01, scale=0.99),
                                       'max_depth':
                                       stats.randint(low=1, high=10)
                                   })
    trees_clf.fit(X, y)
    print("Extra Trees:")
    print(trees_clf.best_params_)
    print(trees_clf.best_score_)
    print(accuracy_score(y_act, trees_clf.predict(X_act)))
    # Gradient Boosting
    grad_clf = RandomizedSearchCV(
        GradientBoostingClassifier(random_state=seed),
        cv=validation,
        n_iter=500,
        random_state=seed,
        scoring=with_prob,
        param_distributions={
            'loss': ['exponential'],
            'learning_rate': stats.uniform(loc=1, scale=3),
            'n_estimators': stats.randint(low=40, high=200),
            'max_depth': stats.randint(low=1, high=10),
            'min_samples_split': stats.randint(low=2, high=13),
            'max_features': stats.uniform(loc=0.01, scale=0.99)
        })
    grad_clf.fit(X, y)
    print("Gradient Boosting:")
    print(grad_clf.best_params_)
    print(grad_clf.best_score_)
    print(accuracy_score(y_act, grad_clf.predict(X_act)))
    # Random Forest
    forest_clf = RandomizedSearchCV(RandomForestClassifier(random_state=seed),
                                    cv=validation,
                                    n_iter=500,
                                    random_state=seed,
                                    scoring=with_prob,
                                    param_distributions={
                                        'n_estimators':
                                        stats.randint(low=10, high=100),
                                        'max_features':
                                        stats.uniform(loc=0.01, scale=0.99),
                                        'max_depth':
                                        stats.randint(low=1, high=10)
                                    })
    forest_clf.fit(X, y)
    print("Random Forest:")
    print(forest_clf.best_params_)
    print(forest_clf.best_score_)
    print(accuracy_score(y_act, forest_clf.predict(X_act)))
    # AdaBoost over shallow decision trees
    ada_clf = RandomizedSearchCV(AdaBoostClassifier(DecisionTreeClassifier(),
                                                    random_state=seed),
                                 cv=validation,
                                 n_iter=500,
                                 random_state=seed,
                                 scoring=with_prob,
                                 param_distributions={
                                     'base_estimator__max_depth':
                                     stats.randint(low=1, high=10),
                                     'n_estimators':
                                     stats.randint(low=10, high=100),
                                     'learning_rate':
                                     stats.uniform(loc=1.29, scale=0.06)
                                 })
    ada_clf.fit(X, y)
    print("AdaBoost:")
    print(ada_clf.best_params_)
    print(ada_clf.best_score_)
    print(accuracy_score(y_act, ada_clf.predict(X_act)))
    # Bagging
    bagging_clf = RandomizedSearchCV(BaggingClassifier(random_state=seed),
                                     cv=validation,
                                     n_iter=500,
                                     random_state=seed,
                                     scoring=with_prob,
                                     param_distributions={
                                         'n_estimators':
                                         stats.randint(low=57, high=75),
                                         'max_samples':
                                         stats.randint(low=6, high=9)
                                     })
    bagging_clf.fit(X, y)
    print("Bagging:")
    print(bagging_clf.best_params_)
    print(bagging_clf.best_score_)
    print(accuracy_score(y_act, bagging_clf.predict(X_act)))
    # Polynomial-kernel SVC
    svc_clf = RandomizedSearchCV(SVC(random_state=seed),
                                 cv=validation,
                                 n_iter=500,
                                 random_state=seed,
                                 scoring=with_prob,
                                 param_distributions={
                                     'kernel': ['poly'],
                                     'degree': stats.randint(low=2, high=4),
                                     'C': stats.uniform(loc=1, scale=13),
                                     'coef0': stats.uniform(loc=-7, scale=8)
                                 })
    svc_clf.fit(X, y)
    print("SVC:")
    print(svc_clf.best_params_)
    print(svc_clf.best_score_)
    print(accuracy_score(y_act, svc_clf.predict(X_act)))
    # Small MLP
    nn_clf = RandomizedSearchCV(MLPClassifier(random_state=seed),
                                cv=validation,
                                n_iter=500,
                                random_state=seed,
                                scoring=with_prob,
                                param_distributions={
                                    'activation': ['relu'],
                                    'solver': ['lbfgs'],
                                    'hidden_layer_sizes': [(7, )],
                                    'alpha':
                                    stats.uniform(loc=0.1e-4, scale=1.5e-4)
                                })
    nn_clf.fit(X, y)
    print("Neural Network:")
    print(nn_clf.best_params_)
    print(nn_clf.best_score_)
    print(accuracy_score(y_act, nn_clf.predict(X_act)))
    # Voting ensemble over AdaBoost / SVC / Gradient Boosting
    base_voting_clf = VotingClassifier([
        ('ada', AdaBoostClassifier(DecisionTreeClassifier(random_state=seed))),
        ('svc', SVC(random_state=seed)),
        ('grad', GradientBoostingClassifier(random_state=seed))
    ])
    voting_clf = RandomizedSearchCV(base_voting_clf,
                                    cv=validation,
                                    n_iter=500,
                                    scoring=with_prob,
                                    param_distributions={
                                        'voting': ['hard', 'soft'],
                                        'svc__probability': [True],
                                        'svc__kernel': ['sigmoid'],
                                        'svc__gamma':
                                        stats.uniform(loc=5.1e-2, scale=0.1e-2),
                                        'svc__coef0':
                                        stats.uniform(loc=8.5e-2, scale=0.2e-2),
                                        'svc__C':
                                        stats.uniform(loc=5.4, scale=0.02),
                                        'ada__base_estimator__max_depth': [1],
                                        'ada__n_estimators':
                                        stats.randint(low=38, high=40),
                                        'ada__learning_rate':
                                        stats.uniform(loc=1.29, scale=0.06)
                                    })
    voting_clf.fit(X, y)
    print("Voting:")
    print(voting_clf.best_params_)
    print(voting_clf.best_score_)
    print(accuracy_score(y_act, voting_clf.predict(X_act)))
def train(self):
    """Fit a stacked house-price model as a baikal DAG.

    Feature engineering (missing indicators, log transforms, derived age
    features, one-hot / target encoding) feeds three regressors — ElasticNet,
    XGBoost and CatBoost — whose predictions are ensembled by a linear
    regression.  The fitted model is pickled into ``self.artifact`` and the
    flow advances to ``self.end``.
    """
    import xgboost
    from baikal import make_step, Step, Input, Model
    from baikal.steps import Stack
    from sklearn_pandas import gen_features
    import custom_transformations as ct
    from custom_transformations import DataFrameMapperStep, ConcatDataFrame, CatBoostRegressorStep
    # these are the categorical columns in the dataset
    CATEGORICAL_COLUMNS = [
        'KitchenQual', 'MSSubClass', 'MSZoning', 'Street', 'Alley',
        'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
        'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
        'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
        'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
        'HeatingQC', 'CentralAir', 'Functional', 'FireplaceQu', 'GarageType',
        'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC',
        'Fence', 'MiscFeature', 'SaleType', 'SaleCondition', 'OverallQual',
        'OverallCond',
    ]
    # these columns will be terated as a numerical columns
    NUMERICAL_COLUMNS = [
        'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
        'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF',
        '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath',
        'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
        'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt',
        'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
        'MoSold', 'YrSold'
    ]
    # These columns have missing values and the one for which we will add missing indicator variable
    MISSING_INDICATOR = [
        'LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
        'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
        'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
        'MiscFeature'
    ]
    ## Categorical Columns for which we want One Hot Encoding
    ONEHOT_COLUMNS = [
        'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
        'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2',
        'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType',
        'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
        'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
        'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional',
        'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
        'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
        'SaleType', 'SaleCondition'
    ]
    ## Categorical Columns for which we want to have target encoding
    TARGET_COLUMNS = [
        'MSSubClass', 'Neighborhood', 'Exterior1st', 'Exterior2nd'
    ]
    ## Columns for that require log transformations
    LOG_COLUMNS = [
        'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
        'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
        'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal'
    ]
    # Define Steps
    ElasticNetStep = make_step(ElasticNet, class_name='ElasticNet')
    ConcatStep = make_step(ConcatDataFrame, class_name='Concat')
    XGBRegressorStep = make_step(xgboost.XGBRegressor,
                                 class_name='XGBRegressor')
    LinearRegressionStep = make_step(sklearn.linear_model.LinearRegression,
                                     class_name='LinearRegression')
    # Define sklearn-pandas transformations. Here I am using gen_features utility to
    # define transformations for individual columns.
    baseProcessing = (
        gen_features(columns=[[x] for x in MISSING_INDICATOR],
                     classes=[{
                         'class': MissingIndicator,
                         'features': 'all',
                         'sparse': False,
                         'error_on_new': False
                     }],
                     prefix='na_') +
        gen_features(
            columns=LOG_COLUMNS,
            classes=[{
                'class': FunctionTransformer,
                'func': lambda x: x.astype(np.float).reshape((-1, 1))
            }, {
                'class': SimpleImputer,
                'strategy': 'mean'
            }, {
                'class': FunctionTransformer,
                'func': np.log1p
            }]) +
        gen_features(
            columns=list(set(NUMERICAL_COLUMNS) - set(LOG_COLUMNS)),
            classes=[{
                'class': FunctionTransformer,
                'func': lambda x: x.astype(np.float).reshape((-1, 1))
            }, {
                'class': SimpleImputer,
                'strategy': 'mean'
            }],
        ) + [
            # constructing new features -- age of the house
            (['YrSold', 'YearBuilt'], [
                FunctionTransformer(
                    func=lambda x: np.clip(x[:, 0] - x[:, 1], 0, 1000)),
                FunctionTransformer(np.log1p)
            ], {
                'alias': 'age'
            }),
            # constructing new feature -- remodeling age
            (['YrSold', 'YearRemodAdd'], [
                FunctionTransformer(
                    func=lambda x: np.clip(x[:, 0] - x[:, 1], 0, 1000)),
                FunctionTransformer(np.log1p)
            ], {
                'alias': 'remodel_age'
            }),
            # new feature -- total surface area
            (['1stFlrSF', '2ndFlrSF', 'TotalBsmtSF'], [
                FunctionTransformer(lambda x: np.nansum(x, axis=1)),
                FunctionTransformer(np.log1p)
            ], {
                'alias': 'numerical_TotalArea'
            })
        ])
    # Since CatBoost model can handle categorical data, we don't need to encode categorical variables
    # we will simply impute missing values and let CatBoost model handle categorical data.
    catModelPreprocessing = gen_features(
        columns=CATEGORICAL_COLUMNS,
        classes=[{
            'class': FunctionTransformer,
            'func': lambda x: x.astype(np.object).reshape(-1, 1)
        }, {
            'class': SimpleImputer,
            'strategy': 'most_frequent'
        }],
    )
    # for regression and XGBoost, we will need to encode categorical variables ourselfs.
    # Depending on the cardinality of the variable, I am either using one hot encoding or target encoding.
    regressionModelProcessing = (
        gen_features(columns=[[x] for x in ONEHOT_COLUMNS],
                     classes=[{
                         'class': OneHotEncoder,
                         'handle_unknown': 'ignore',
                         'sparse': False
                     }]) +
        gen_features(columns=[[x] for x in TARGET_COLUMNS],
                     classes=[
                         {
                             'class': TargetEncoder
                         },
                         {
                             'class': SimpleImputer,
                             'strategy': 'mean'
                         },
                     ]))
    # Define DAG
    x = Input(name="x")
    y = Input(name='y')
    # Define feature transformations
    d0 = DataFrameMapperStep(baseProcessing, df_out=True,
                             name='BasePreprocess')(x, y)
    d1 = DataFrameMapperStep(regressionModelProcessing,
                             df_out=True,
                             name='RegressionModelPreprocess')(x, y)
    d2 = DataFrameMapperStep(catModelPreprocessing,
                             df_out=True,
                             name='CatModelPreprocess')(x, y)
    # Consolidate features for catboost and elasticnet
    regressionFeatures = ConcatStep(name='RegressionFeatures')([d0, d1])
    catFeatures = ConcatStep(name='CatBoostFeatures')([d0, d2])
    # Generate predictions using three different algorithms.
    m1 = ElasticNetStep(name='ElasticNet')(regressionFeatures, y)
    m2 = XGBRegressorStep(name='XGBoost')(regressionFeatures, y)
    m3 = CatBoostRegressorStep(name='CatBoost',
                               cat_features=CATEGORICAL_COLUMNS,
                               iterations=10)(catFeatures, y)
    # combine predictions from the three models
    # NOTE(review): only m1 and m3 are stacked; m2 (XGBoost) is fitted but
    # excluded from the ensemble — confirm this is intentional.
    combinedPredictions = Stack(name='CombinePredictions')([m1, m3])
    # construct an ensemble model
    ensembleModel = LinearRegressionStep()(combinedPredictions, y)
    model = Model(x, ensembleModel, y)
    model.fit(self.trainDF, self.trainDF['SalePrice'])
    self.artifact = {
        'model.pkl': cloudpickle.dumps(model),
        'environment': {
            'pip': {}
        }
    }
    self.next(self.end)
# NOTE(review): this chunk starts inside the body of an enclosing function
# (read_csv_gbk — its `def` line is outside this view); the `with`/`return`
# below belong to that function.
with open(df_name.replace('.csv', '') + '_gbk.csv', 'w') as f:
    f.write(data)
    f.close()  # redundant — the with-block already closes f
df = pd.read_csv(df_name.replace('.csv', '') + '_gbk.csv', encoding='gbk')
return df
data_4 = read_csv_gbk('算话测试报告-恒普-20200826.part01//算话变量_个人资料置信度_8w.csv')
#data_4 = pd.read_csv(r'C:\Users\徐钦华\Desktop\数据分析项目\算话测试\算话测试报告-恒普-20200826.part01\算话变量_个人资料置信度_8w.csv',
#                     engine='python')
data_5 = pd.read_csv('算话测试报告-恒普-20200826.part01//算话变量_团伙风险识别_8w.csv', engine='python')
label_data = label_data[label_data['数据编号'].notna()]
# Binarize one column per frame; default=None passes other columns through.
mapper_2 = DataFrameMapper([('pdls041', LabelBinarizer())], default=None, df_out=True)
data_2 = mapper_2.fit_transform(data_2.copy())
mapper_3 = DataFrameMapper([('z_risk_rate', LabelBinarizer())], default=None, df_out=True)
data_3 = mapper_3.fit_transform(data_3.copy())
# Min-max then standard-scale the selected xx-columns, rounded to 2 decimals.
feature_def_4 = gen_features(columns=[[m] for m in ['xx'+str(i) for i in range(598, 619)]+['xx1247']+['xx'+str(i) for i in range(2531, 2538)]], classes=[MinMaxScaler, StandardScaler])
mapper_4 = DataFrameMapper(feature_def_4, default=None, input_df=True, df_out=True)
data_4 = np.round(mapper_4.fit_transform(data_4.copy()), 2)
feature_def_5 = gen_features(columns=[['xx1174'], ['xx1175'], ['xx1176'], ['xx1183'], ['xx1184'], ['xx1185'], ['xx631'], ['xx632'], ['xx633'], ['xx2495'], ['xx2492'], ['xx2538']], classes=[MinMaxScaler, StandardScaler])
mapper_5 = DataFrameMapper(feature_def_5, default=None, input_df=True, df_out=True)
data_5 = np.round(mapper_5.fit_transform(data_5.copy()), 2)
# One indicator column per risk-reason code.
for i in ['R01', 'R02', 'R07']:
    data_3[i] = 0
    # NOTE(review): chained-indexing assignment — pandas may not write
    # through to data_3; prefer .loc[mask, col] = 1.
    data_3[i][(data_3['z_risk_reason'].notna()) & (data_3['z_risk_reason'].str.contains(i))] = 1
for i in ['R03', 'R04', 'R05', 'R06']:
    # NOTE(review): resetting 'R_risk' to 0 on every iteration erases the
    # flags set by earlier codes — only R06 matches survive the loop.
    # The reset probably belongs *before* the loop; confirm intent.
    data_3['R_risk'] = 0
    data_3['R_risk'][(data_3['z_risk_reason'].notna()) & (data_3['z_risk_reason'].str.contains(i))] = 1
for j in ['S100', 'S200', 'S210', 'S220', 'S221', 'S222', 'S223', 'S224', 'S225']:
    data_3[j] = 0
    data_3[j][(data_3['z_business_source'].notna()) & (data_3['z_business_source'].str.contains(j))] = 1
# Load held-out labels and split the training frame into X / y.
y_test_df = pd.read_csv(y_test_path)
y_test = y_test_df.get('Survived')
y = train_df.get('Survived')
X = train_df.drop('Survived', axis=1)
categorical_features = ['Sex', 'Embarked']
numerical_features = ['Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
features_def = []
# Categorical columns: most-frequent imputation, then one-hot encoding.
if categorical_features and len(categorical_features) > 0:
    for feature in categorical_features:
        categorical_feature_def = gen_features(
            columns=[[feature]],
            classes=[
                {'class': SimpleImputer, 'strategy': 'most_frequent'},
                {'class': OneHotEncoder, 'handle_unknown': 'ignore'}
            ]
        )
        features_def = features_def + categorical_feature_def
# Numeric columns: mean imputation, then standard scaling.
if numerical_features and len(numerical_features) > 0:
    for feature in numerical_features:
        numerical_feature_def = gen_features(
            columns=[[feature]],
            classes=[
                {'class': SimpleImputer, 'strategy': 'mean'},
                {'class': StandardScaler},
            ]
        )
        features_def = features_def + numerical_feature_def
ordinal_features = ['level.mito', 'level.ribo']
# Numeric features = all columns minus categorical/boolean/ordinal ones.
numeric_features = X.columns.values.tolist()
for i in boolean_features:
    X[i] = X[i].astype(int)
for i in categorical_features + boolean_features + ordinal_features:
    numeric_features.remove(i)
# Uses extracts from https://github.com/kinir/catboost-with-pipelines/blob/master/sklearn-pandas-catboost.ipynb.
# One-hot-encoded variant of the categorical/boolean columns.
gen_category = gen_features(
    columns=[[i] for i in categorical_features + boolean_features],
    classes=[{
        "class": SimpleImputer,
        "strategy": "most_frequent"
    }, {
        "class": OneHotEncoder
    }])
# Ordinal-encoded variant of the same columns (compact int8 codes).
gen_category_ord_enc = gen_features(
    columns=[[i] for i in categorical_features + boolean_features],
    classes=[{
        "class": SimpleImputer,
        "strategy": "most_frequent"
    }, {
        "class": OrdinalEncoder,
        "dtype": np.int8
    }])
# NOTE(review): this call continues beyond the visible chunk.
gen_ordinal = gen_features(columns=[[i] for i in ordinal_features],
def map_features(features=None):
    """Build per-column StandardScaler feature definitions.

    features: optional iterable of column names; each gets its own scaler
        definition.  Defaults to no columns.  (Replaces the previous
        mutable-default ``features=[]`` signature — behavior is unchanged
        for every caller.)
    Returns the list of (column, transformer) definitions produced by
    ``gen_features``.
    """
    numerical_def = gen_features(columns=[[c] for c in (features or [])],
                                 classes=[{
                                     'class': StandardScaler
                                 }])
    return numerical_def