def test_StackingEstimator_2():
    """Check that, for a regressor, the first transformed column holds its predictions."""
    baseline = RandomForestRegressor(random_state=42)
    stacker = StackingEstimator(estimator=RandomForestRegressor(random_state=42))

    # Train the plain estimator and the stacking wrapper on identical data.
    baseline.fit(training_features_r, training_target_r)
    stacker.fit(training_features_r, training_target_r)

    # Column 0 of the transformed matrix is the synthetic feature.
    stacked_X = stacker.transform(training_features_r)
    synthetic_feature = stacked_X[:, 0]
    expected = baseline.predict(training_features_r)
    assert np.allclose(expected, synthetic_feature)
def test_StackingEstimator_1():
    """Check that, for a classifier, the transform exposes predictions and class probabilities."""
    baseline = RandomForestClassifier(random_state=42)
    stacker = StackingEstimator(estimator=RandomForestClassifier(random_state=42))

    # Train the plain estimator and the stacking wrapper on identical data.
    baseline.fit(training_features, training_target)
    stacker.fit(training_features, training_target)

    stacked_X = stacker.transform(training_features)
    n_classes = len(np.unique(training_target))

    # Column 0 must equal the predicted labels ...
    assert np.allclose(baseline.predict(training_features), stacked_X[:, 0])
    # ... and the next n_classes columns must equal predict_proba.
    proba_block = stacked_X[:, 1:1 + n_classes]
    assert np.allclose(baseline.predict_proba(training_features), proba_block)
def test_StackingEstimator_4():
    """Check StackingEstimator inside a scikit-learn pipeline for regression.

    The pipeline's score must equal the score obtained by fitting the same
    two steps by hand, and the mean 3-fold r2 must match a recorded value.
    """
    stacker = StackingEstimator(estimator=RandomForestRegressor(random_state=42))
    final_reg = Lasso(random_state=42)
    pipe = make_pipeline(stacker, final_reg)

    # Route 1: fit both steps through the pipeline.
    pipe.fit(training_features_r, training_target_r)

    # Route 2: fit the same step objects manually, one after the other.
    stacker.fit(training_features_r, training_target_r)
    stacked_X = stacker.transform(training_features_r)
    final_reg.fit(stacked_X, training_target_r)

    # Both routes must agree on the training score.
    manual_score = final_reg.score(stacked_X, training_target_r)
    pipe_score = pipe.score(training_features_r, training_target_r)
    assert np.allclose(manual_score, pipe_score)

    # Compare the cross-validated score with the recorded reference.
    fold_scores = cross_val_score(
        pipe, training_features_r, training_target_r, cv=3, scoring='r2'
    )
    cv_score = np.mean(fold_scores)
    known_cv_score = 0.795877470354
    assert np.allclose(known_cv_score, cv_score)
def test_StackingEstimator_3():
    """Check StackingEstimator inside a scikit-learn pipeline for classification.

    The pipeline's score must equal the score obtained by fitting the same
    two steps by hand, and the mean 3-fold accuracy must match a recorded
    value.
    """
    stacker = StackingEstimator(estimator=RandomForestClassifier(random_state=42))
    final_clf = LogisticRegression()
    pipe = make_pipeline(stacker, final_clf)

    # Route 1: fit both steps through the pipeline.
    pipe.fit(training_features, training_target)

    # Route 2: fit the same step objects manually, one after the other.
    stacker.fit(training_features, training_target)
    stacked_X = stacker.transform(training_features)
    final_clf.fit(stacked_X, training_target)

    # Both routes must agree on the training score.
    manual_score = final_clf.score(stacked_X, training_target)
    pipe_score = pipe.score(training_features, training_target)
    assert np.allclose(manual_score, pipe_score)

    # Compare the cross-validated score with the recorded reference.
    fold_scores = cross_val_score(
        pipe, training_features, training_target, cv=3, scoring='accuracy'
    )
    cv_score = np.mean(fold_scores)
    known_cv_score = 0.947282375315
    assert np.allclose(known_cv_score, cv_score)
from xgboost import XGBRegressor

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1).values
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:-15.707779240894274
exported_pipeline = make_pipeline(
    # Stage 1: gradient boosting (quantile loss) wrapped as a stacking step.
    StackingEstimator(
        estimator=GradientBoostingRegressor(
            alpha=0.75,
            learning_rate=0.001,
            loss="quantile",
            max_depth=2,
            max_features=0.2,
            min_samples_leaf=14,
            min_samples_split=12,
            n_estimators=100,
            subsample=0.5,
        )
    ),
    # Stage 2: final XGBoost regressor.
    XGBRegressor(
        learning_rate=0.01,
        max_depth=2,
        min_child_weight=9,
        n_estimators=100,
        nthread=1,
        subsample=0.1,
    ),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# Discard the first column of X.
X = X[:, 1:]

# Write both arrays to disk, then load the target back from its CSV.
pd.DataFrame(X).to_csv("./Datasets/X.csv")
pd.DataFrame(y).to_csv("./Datasets/y.csv")
Y = pd.read_csv('./Datasets/y.csv')
# The saved index comes back as an 'Unnamed: 0' column; remove it.
Y.drop('Unnamed: 0', axis=1, inplace=True)

tpot_data = pd.read_csv('./Datasets/X.csv', sep=',', dtype=np.float64)
# features = tpot_data.drop('target', axis=1)
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(X, Y, random_state=42)

# Average CV score on the training set was: -0.00021627142164234252
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=LinearSVR(
            C=0.5,
            dual=False,
            epsilon=0.1,
            loss="squared_epsilon_insensitive",
            tol=0.1,
        )
    ),
    StandardScaler(),
    StackingEstimator(
        estimator=RandomForestRegressor(
            bootstrap=False,
            max_features=0.6500000000000001,
            min_samples_leaf=1,
            min_samples_split=2,
            n_estimators=100,
        )
    ),
    RandomForestRegressor(
        bootstrap=False,
        max_features=0.8,
        min_samples_leaf=2,
        min_samples_split=10,
        n_estimators=410,
    ),
)
# Pin random_state to 42 on every step of the exported pipeline.
set_param_recursive(exported_pipeline.steps, 'random_state', 42)
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1).values
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:0.7648445932689355
exported_pipeline = make_pipeline(
    make_union(
        # Two copies of the input features.
        make_union(
            FunctionTransformer(copy),
            FunctionTransformer(copy),
        ),
        # Stacking step whose estimator is itself a small pipeline.
        StackingEstimator(
            estimator=make_pipeline(
                make_union(
                    FunctionTransformer(copy),
                    FunctionTransformer(copy),
                ),
                LGBMClassifier(
                    learning_rate=0.01193776641714437,
                    max_depth=4,
                    n_estimators=1122,
                    random_state=42,
                ),
            )
        ),
    ),
    # Final classifier.
    LGBMClassifier(
        learning_rate=0.026797460873256924,
        max_depth=3,
        n_estimators=216,
        random_state=42,
    ),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1).values
# random_state=None: the split is not seeded.
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:-2245.3301019020714
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=RandomForestRegressor(
            bootstrap=False,
            max_features=0.7500000000000001,
            min_samples_leaf=3,
            min_samples_split=3,
            n_estimators=100,
        )
    ),
    ExtraTreesRegressor(
        bootstrap=False,
        max_features=0.6000000000000001,
        min_samples_leaf=2,
        min_samples_split=4,
        n_estimators=100,
    ),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1).values
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'].values, random_state=2019)

# Average CV score on the training set was:0.9077543142597638
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=GradientBoostingClassifier(
            learning_rate=0.1,
            max_depth=3,
            max_features=0.15000000000000002,
            min_samples_leaf=5,
            min_samples_split=15,
            n_estimators=100,
            subsample=0.5,
        )
    ),
    LinearSVC(
        C=20.0,
        dual=False,
        loss="squared_hinge",
        penalty="l1",
        tol=0.01,
    ),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1).values training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'].values, random_state=42) # Score on the training set was:-17.58292391242326 exported_pipeline = make_pipeline( make_union( StackingEstimator(estimator=make_pipeline( StackingEstimator( estimator=ElasticNetCV(l1_ratio=0.55, tol=0.001)), StackingEstimator(estimator=GradientBoostingRegressor( alpha=0.85, learning_rate=1.0, loss="lad", max_depth=4, max_features=0.6000000000000001, min_samples_leaf=11, min_samples_split=20, n_estimators=100, subsample=0.1)), LassoLarsCV(normalize=True))), FunctionTransformer(copy)), LinearSVR(C=5.0, dual=True, epsilon=1.0, loss="epsilon_insensitive", tol=0.1)) exported_pipeline.fit(training_features, training_target) results = exported_pipeline.predict(testing_features)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1).values
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.9932996110655544
exported_pipeline = make_pipeline(
    # Recursive feature elimination driven by extra-trees.
    RFE(
        estimator=ExtraTreesClassifier(
            criterion="gini",
            max_features=0.55,
            n_estimators=100,
        ),
        step=0.15000000000000002,
    ),
    StackingEstimator(
        estimator=LogisticRegression(C=1.0, dual=False, penalty="l2")
    ),
    StackingEstimator(
        estimator=RandomForestClassifier(
            bootstrap=False,
            criterion="gini",
            max_features=0.8,
            min_samples_leaf=6,
            min_samples_split=14,
            n_estimators=100,
        )
    ),
    # Final classifier.
    KNeighborsClassifier(n_neighbors=2, p=2, weights="distance"),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from tpot.builtins import StackingEstimator, ZeroCount
from sklearn.compose import TransformedTargetRegressor
from sklearn.preprocessing import QuantileTransformer

# NOTE: the outcome column in the data must be named 'target'.
import competitions

# The frame comes from the project's `competitions` module rather than a CSV.
d = competitions.get_data()
tpot_data = d.data
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
# random_state=None: the split is not seeded.
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:-747046.8597394783
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=1.0, tol=0.001)),
    FastICA(tol=0.8),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    StackingEstimator(
        estimator=ExtraTreesRegressor(
            bootstrap=True,
            max_features=0.5,
            min_samples_leaf=14,
            min_samples_split=11,
            n_estimators=100,
        )
    ),
    ZeroCount(),
    MaxAbsScaler(),
    LassoLarsCV(normalize=False),
)
# exported_pipeline = TransformedTargetRegressor(regressor=exported_pipeline, transformer=QuantileTransformer(output_distribution='normal'))

exported_pipeline.fit(training_features, training_target)
# Predict on both the held-out and the training split.
results = exported_pipeline.predict(testing_features)
train_results = exported_pipeline.predict(training_features)

from pylab import *
def train_model(X_train, y_train):
    """Fit a hard-voting ensemble on the training data and return it.

    Args:
        X_train: Training features, passed straight to ``VotingClassifier.fit``.
        y_train: Training labels, passed straight to ``VotingClassifier.fit``.

    Returns:
        The fitted ``VotingClassifier``.
    """
    # (name, estimator) pairs that take part in the vote.
    members = [
        ('lr', LogisticRegression()),
        ('et', ExtraTreesClassifier(
            bootstrap=False,
            criterion="gini",
            max_features=0.9000000000000001,
            min_samples_leaf=4,
            min_samples_split=15,
        )),
        ('gb', GradientBoostingClassifier(
            max_depth=2,
            max_features=0.25,
            min_samples_leaf=13,
            min_samples_split=15,
            n_estimators=100,
            subsample=0.4,
        )),
        ('dt', DecisionTreeClassifier(
            max_depth=7,
            min_samples_leaf=16,
            min_samples_split=10,
        )),
        ('rf', RandomForestClassifier(
            bootstrap=False,
            max_features=0.6500000000000001,
            min_samples_leaf=9,
            min_samples_split=19,
        )),
        ('gb2', GradientBoostingClassifier(
            learning_rate=0.01,
            max_depth=2,
            max_features=0.8,
            min_samples_leaf=11,
            min_samples_split=10,
            subsample=0.7000000000000001,
        )),
        ('gb3', GradientBoostingClassifier(
            max_depth=7,
            max_features=0.15000000000000002,
            min_samples_leaf=5,
            min_samples_split=17,
            n_estimators=100,
            subsample=0.6500000000000001,
        )),
        # A stacked pipeline also votes as a single member.
        ('pip1', make_pipeline(
            StackingEstimator(
                estimator=LinearSVC(dual=False, loss="squared_hinge", tol=1e-05)
            ),
            StandardScaler(),
            DecisionTreeClassifier(
                criterion="entropy",
                max_depth=6,
                min_samples_leaf=7,
                min_samples_split=9,
            ),
        )),
        # Disabled member:
        # ('rf2', RandomForestClassifier(criterion="entropy", max_features=0.25, min_samples_split=8, n_estimators=100))
    ]

    ensemble = VotingClassifier(estimators=members, voting='hard')
    ensemble.fit(X_train, y_train)
    return ensemble
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1)
# random_state=None: the split is not seeded.
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 1.0
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=MLPClassifier(alpha=0.001, learning_rate_init=0.001)
    ),
    GradientBoostingClassifier(
        learning_rate=0.1,
        max_depth=7,
        max_features=1.0,
        min_samples_leaf=1,
        min_samples_split=9,
        n_estimators=100,
        subsample=0.9500000000000001,
    ),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# Average CV score on the training set was: -6.981145679172217
exported_pipeline = make_pipeline(
    make_union(
        make_union(
            # Branch A: the input features, copied unchanged.
            FunctionTransformer(copy),
            # Branch B: duplicate, agglomerate, select, then RBF-sample.
            make_pipeline(
                make_union(
                    FunctionTransformer(copy),
                    FunctionTransformer(copy),
                ),
                FeatureAgglomeration(affinity="euclidean", linkage="average"),
                SelectPercentile(score_func=f_regression, percentile=15),
                RBFSampler(gamma=0.8500000000000001),
            ),
        ),
        # Branch C: family-wise-error feature selection.
        SelectFwe(score_func=f_regression, alpha=0.005),
    ),
    StackingEstimator(
        estimator=ExtraTreesRegressor(
            bootstrap=False,
            max_features=0.7000000000000001,
            min_samples_leaf=7,
            min_samples_split=11,
            n_estimators=100,
        )
    ),
    ExtraTreesRegressor(
        bootstrap=False,
        max_features=0.9000000000000001,
        min_samples_leaf=2,
        min_samples_split=5,
        n_estimators=100,
    ),
)
# Pin random_state to 42 on every step of the exported pipeline.
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(test_data)
# Hand the predictions to the export helper and report how many there are.
export_test_to_csv(predictions=results)
print(len(results))
from xgboost import XGBClassifier

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1)
# random_state=None: the split is not seeded.
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.9104161420758785
exported_pipeline = make_pipeline(
    RobustScaler(),
    StackingEstimator(
        estimator=SGDClassifier(
            alpha=0.0,
            eta0=0.1,
            fit_intercept=False,
            l1_ratio=0.75,
            learning_rate="invscaling",
            loss="hinge",
            penalty="elasticnet",
            power_t=100.0,
        )
    ),
    XGBClassifier(
        learning_rate=0.1,
        max_depth=7,
        min_child_weight=8,
        n_estimators=100,
        nthread=1,
        subsample=0.6000000000000001,
    ),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1)
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'], random_state=7)

# Average CV score on the training set was: 0.8093557422969188
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=ExtraTreesClassifier(
            bootstrap=False,
            criterion="gini",
            max_features=0.05,
            min_samples_leaf=8,
            min_samples_split=12,
            n_estimators=100,
        )
    ),
    ExtraTreesClassifier(
        bootstrap=True,
        criterion="entropy",
        max_features=0.6000000000000001,
        min_samples_leaf=20,
        min_samples_split=13,
        n_estimators=100,
    ),
)
# Pin random_state to 7 on every step of the exported pipeline.
set_param_recursive(exported_pipeline.steps, 'random_state', 7)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
def main():
    """Run 5-fold training of the stacked blood-glucose regressor and report results.

    For every fold the pipeline is fitted on the training part and used to
    predict both the external test set and the held-out part. ``modif_value``
    (defined elsewhere) supplies replacement values for rows it flags as
    high; it appears to return ``(values, row_indices)`` — confirm against
    its definition. Flagged held-out rows are overwritten before the fold
    MSE is computed, and flagged test rows are collected and, after
    averaging the fold predictions, replaced by the largest value collected
    for that row.

    Prints per-fold and mean halved MSE, the collected outliers, and summary
    statistics of the final predictions. Returns nothing.
    """
    # Load the data.
    train = pd.read_csv("../data/processed/train.csv")
    test = pd.read_csv("../data/processed/test.csv")
    train.pop("id")
    test.pop("id")
    target = train.pop("血糖")

    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    train_x = train.values
    train_y = target.values
    test_x = test.values

    # High-value labels for the training set: 1 below 11.2, -1 otherwise.
    high_labels = np.where(train_y < 11.2, 1.0, -1.0)
    # Rows labelled as high; this does not change between folds.
    high_rows = np.where(high_labels == -1)[0]

    # The final prediction is the average over N folds.
    N = 5
    # No random_state: without shuffle=True it never had an effect, and
    # scikit-learn >= 0.24 raises ValueError for that combination.
    kf = KFold(n_splits=N)
    result_mean = 0.0
    test_preds = np.zeros((test_x.shape[0], N))
    # Collected outliers, stored as row -> list of values,
    # e.g. {938: [14, 14, 14], 314: [13]}.
    outlier = {}

    for i, (train_index, test_index) in enumerate(kf.split(train_x)):
        training_features = train_x[train_index]
        training_target = train_y[train_index]
        testing_features = train_x[test_index]
        testing_target = train_y[test_index]

        # Build the model that predicts the blood-glucose value.
        exported_pipeline = Pipeline([
            ("scaler", MaxAbsScaler()),
            ("SVR", StackingEstimator(
                estimator=LinearSVR(C=0.01,
                                    dual=False,
                                    epsilon=1.0,
                                    loss="squared_epsilon_insensitive",
                                    tol=0.001))),
            ("RidgeCV", StackingEstimator(estimator=RidgeCV())),
            # ("LGB", StackingEstimator(estimator=lgb.LGBMRegressor(objective='regression',
            #                                                       boosting_type="GBDT",
            #                                                       num_leaves=31,
            #                                                       learning_rate=0.01,
            #                                                       feature_fraction=0.5,
            #                                                       bagging_fraction=0.5,
            #                                                       bagging_freq=5,
            #                                                       n_estimators=400))),
            ("XGB", XGBRegressor(max_depth=8,
                                 n_estimators=200,
                                 colsample_bytree=0.8,
                                 subsample=0.8,
                                 tweedie_variance_power=1.4,
                                 eta=0.01,
                                 booster="gbtree",
                                 random_state=1015,
                                 gamma=1,
                                 silent=1,
                                 min_child_weight=5,
                                 objective="reg:tweedie",
                                 n_jobs=-1)),
        ])
        exported_pipeline.fit(training_features, training_target)
        test_pred = exported_pipeline.predict(test_x)

        # Predict outliers on the external test set.
        high_results, pred_high_list = modif_value(
            training_features, high_labels[train_index], test_x,
            train_x[high_rows], train_y[high_rows])
        # Store (and echo) the outliers found in this fold.
        if len(high_results) != 0 and len(pred_high_list) != 0:
            for value, row in zip(high_results, pred_high_list):
                outlier.setdefault(row, []).append(value)
            for value, row in zip(high_results, pred_high_list):
                print(value, row)

        # Offline CV on the held-out part of this fold.
        testing_results = exported_pipeline.predict(testing_features)
        # Overwrite the flagged held-out rows with the replacement values.
        cv_high_results, cv_pred_high_list = modif_value(
            training_features, high_labels[train_index], testing_features,
            train_x[high_rows], train_y[high_rows])
        if len(cv_high_results) != 0 and len(cv_pred_high_list) != 0:
            for value, row in zip(cv_high_results, cv_pred_high_list):
                testing_results[row] = value

        # Compute the fold MSE once; the reported figures are halved.
        fold_mse = np.round(
            mean_squared_error(testing_target, testing_results), 5)
        result_mean += fold_mse
        print('CV_ROUND (', i, ') mse -> ', fold_mse / 2)
        test_preds[:, i] = test_pred

    results = test_preds.mean(axis=1)
    # Replace each collected outlier row with its largest recorded value.
    for index in outlier:
        print(index, outlier[index])
        results[index] = max(outlier[index])

    # Offline CV summary.
    result_mean /= N
    print("offline CV Mean squared error: %.5f" % (result_mean / 2))

    output = pd.DataFrame()
    output[0] = results
    # output.to_csv("../result/1.25-WQX-PolyFeatures.csv", header=None, index=False, encoding="utf-8")
    # output.to_csv(r'../result/test{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')),
    #               header=None, index=False, float_format='%.4f')
    # save(output, 'xgb_class')
    print(output.describe())
    print(output.loc[output[0] > 8])
from sklearn.preprocessing import Binarizer, StandardScaler
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator
from sklearn.preprocessing import FunctionTransformer
from copy import copy

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1).values
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8291955789781877
exported_pipeline = make_pipeline(
    make_union(
        # Branch A: duplicate, stack a logistic regression, keep top 78 %.
        make_pipeline(
            make_union(
                FunctionTransformer(copy),
                FunctionTransformer(copy),
            ),
            StackingEstimator(
                estimator=LogisticRegression(C=0.1, dual=True, penalty="l2")
            ),
            SelectPercentile(score_func=f_classif, percentile=78),
        ),
        # Branch B: duplicate, keep top 46 %, then binarize.
        make_pipeline(
            make_union(
                FunctionTransformer(copy),
                FunctionTransformer(copy),
            ),
            SelectPercentile(score_func=f_classif, percentile=46),
            Binarizer(threshold=0.1),
        ),
    ),
    StandardScaler(),
    LinearSVC(C=0.001, dual=True, loss="hinge", penalty="l2", tol=0.01),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn.preprocessing import Normalizer
from tpot.builtins import StackingEstimator, ZeroCount
from xgboost import XGBClassifier

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1).values
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.8666666666666668
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=XGBClassifier(
            learning_rate=0.1,
            max_depth=3,
            min_child_weight=4,
            n_estimators=100,
            nthread=1,
            subsample=0.1,
        )
    ),
    Normalizer(norm="l1"),
    ZeroCount(),
    RandomForestClassifier(
        bootstrap=False,
        criterion="entropy",
        max_features=0.1,
        min_samples_leaf=11,
        min_samples_split=20,
        n_estimators=100,
    ),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1).values
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'].values, random_state=1234)

# Average CV score on the training set was:0.7821728129678194
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=False)),
    VarianceThreshold(threshold=0.0005),
    GradientBoostingRegressor(
        alpha=0.8,
        learning_rate=0.1,
        loss="lad",
        max_depth=4,
        max_features=0.25,
        min_samples_leaf=12,
        min_samples_split=10,
        n_estimators=100,
        subsample=0.9000000000000001,
    ),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
# NOTE: the outcome column is exposed as 'target'; it is copied from 'G3' below.
# The CSV is looked up next to this file; joining with pathlib avoids
# hand-building the path string.
data = pd.read_csv(pathlib.Path(__file__).parent.absolute() / 'student-mat.csv', sep=';')
data['target'] = data['G3']
data.drop(columns='G3', inplace=True)

# Only numeric columns are used as model inputs.
features = data.drop('target', axis=1).select_dtypes([np.number])
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, data['target'], random_state=25)
# From here on `features` is a {column name: dtype} mapping, not the frame.
features = features.dtypes.to_dict()

# Instantiate model
model = make_pipeline(
    make_union(
        FunctionTransformer(copy),
        StackingEstimator(estimator=RidgeCV())
    ),
    XGBRegressor(learning_rate=0.1, max_depth=2, min_child_weight=9,
                 n_estimators=1000, nthread=1, objective="reg:squarederror",
                 subsample=0.35000000000000003)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(model.steps, 'random_state', 25)

Model = model.fit(training_features, training_target)

# Building Metrics
testing_pred = Model.predict(testing_features)
score = model.score(testing_features, testing_target)
mse = MSE(testing_target, testing_pred)
rmse = mse**(1/2)
# Stored under a separate name so the `max_error` metric function is not
# overwritten by its own result and stays callable.
max_error_value = max_error(testing_target, testing_pred)
eval_metrics_dict = {'r2': score, 'mse': mse, 'rmse': rmse, 'max_error': max_error_value}
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator
from tpot.export_utils import set_param_recursive

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1)
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'], random_state=42)

# Average CV score on the training set was: 0.9619047619047618
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=MultinomialNB(alpha=0.01, fit_prior=False)),
    PCA(iterated_power=10, svd_solver="randomized"),
    GaussianNB(),
)
# Pin random_state to 42 on every step of the exported pipeline.
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.linear_model import RidgeCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1)
# random_state=None: the split is not seeded.
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: -4.3217232470775455
exported_pipeline = make_pipeline(
    MinMaxScaler(),
    StackingEstimator(
        estimator=LinearSVR(
            C=0.01,
            dual=True,
            epsilon=0.1,
            loss="squared_epsilon_insensitive",
            tol=0.0001,
        )
    ),
    StackingEstimator(
        estimator=LinearSVR(
            C=20.0,
            dual=True,
            epsilon=0.01,
            loss="epsilon_insensitive",
            tol=0.1,
        )
    ),
    RidgeCV(),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeRegressor
from tpot.builtins import StackingEstimator

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1).values
# random_state=None: the split is not seeded.
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'].values, random_state=None)

# Average CV score on the training set was:-24608.83594657029
exported_pipeline = make_pipeline(
    # A shallow tree feeds its prediction to a deeper final tree.
    StackingEstimator(
        estimator=DecisionTreeRegressor(
            max_depth=3,
            min_samples_leaf=20,
            min_samples_split=3,
        )
    ),
    DecisionTreeRegressor(
        max_depth=10,
        min_samples_leaf=9,
        min_samples_split=4,
    ),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1).values
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'].values, random_state=42)

# Average CV score on the training set was:-0.002941894277857136
exported_pipeline = make_pipeline(
    StackingEstimator(
        estimator=GradientBoostingRegressor(
            alpha=0.85,
            learning_rate=0.01,
            loss="lad",
            max_depth=2,
            max_features=0.15000000000000002,
            min_samples_leaf=7,
            min_samples_split=7,
            n_estimators=100,
            subsample=0.4,
        )
    ),
    MinMaxScaler(),
    StackingEstimator(
        estimator=LinearSVR(
            C=1.0,
            dual=True,
            epsilon=0.001,
            loss="epsilon_insensitive",
            tol=1e-05,
        )
    ),
    PolynomialFeatures(degree=2, include_bias=False, interaction_only=False),
    RandomForestRegressor(
        bootstrap=False,
        max_features=0.45,
        min_samples_leaf=6,
        min_samples_split=3,
        n_estimators=100,
    ),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv(
    'PATH/TO/DATA/FILE',
    sep='COLUMN_SEPARATOR',
    dtype=np.float64,
)
features = tpot_data.drop('target', axis=1).values
(
    training_features,
    testing_features,
    training_target,
    testing_target,
) = train_test_split(features, tpot_data['target'].values, random_state=42)

# Score on the training set was:0.991096262627164
exported_pipeline = make_pipeline(
    # Recursive feature elimination driven by extra-trees.
    RFE(
        estimator=ExtraTreesClassifier(
            criterion="gini",
            max_features=0.6000000000000001,
            n_estimators=100,
        ),
        step=0.15000000000000002,
    ),
    StackingEstimator(
        estimator=DecisionTreeClassifier(
            criterion="entropy",
            max_depth=8,
            min_samples_leaf=4,
            min_samples_split=8,
        )
    ),
    # Final classifier.
    KNeighborsClassifier(n_neighbors=1, p=2, weights="uniform"),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
##merge the 2 datasets on sample data = pd.merge(meta, shared, on=['sample']) ##remove adenoma samples data = data[data.dx.str.contains("adenoma") == False] data.rename(columns={'dx': 'class'}, inplace=True) x = data.drop(["sample", "class", "numOtus", "label"], axis=1) diagnosis = {"cancer": 1, "normal": 0} y = data["class"].replace(diagnosis) y.dropna() x.dropna() # Score on the training set was:0.8492612704601008 exported_pipeline = make_pipeline( StackingEstimator(estimator=RandomForestClassifier(bootstrap=True, criterion="gini", max_features=0.8, min_samples_leaf=2, min_samples_split=2, n_estimators=100)), StackingEstimator( estimator=RandomForestClassifier(bootstrap=True, criterion="entropy", max_features=0.6000000000000001, min_samples_leaf=8, min_samples_split=7, n_estimators=100)), StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=3, min_samples_leaf=13, min_samples_split=10)), RFE(estimator=ExtraTreesClassifier(criterion="entropy", max_features=0.6500000000000001,
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import PolynomialFeatures
from tpot.builtins import StackingEstimator

# NOTE: the label column in the data file must be named 'target'.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
labels = tpot_data['target'].values

# random_state=None: the train/test split differs from run to run.
training_features, testing_features, training_target, testing_target = train_test_split(
    features,
    labels,
    random_state=None,
)

# Average CV score on the training set was: 0.9800000000000001
# Step 1: expand the inputs with all degree-2 polynomial terms (no bias column).
polynomial_expansion = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
# Step 2: Gaussian naive Bayes wrapped as a stacking transformer.
stacked_gaussian_nb = StackingEstimator(estimator=GaussianNB())
# Step 3: final multinomial naive Bayes classifier.
final_classifier = MultinomialNB(alpha=10.0, fit_prior=True)

exported_pipeline = make_pipeline(
    polynomial_expansion,
    stacked_gaussian_nb,
    final_classifier,
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn.svm import LinearSVR from tpot.builtins import StackingEstimator, ZeroCount from xgboost import XGBRegressor # NOTE: Make sure that the outcome column is labeled 'target' in the data file tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64) features = tpot_data.drop('target', axis=1) training_features, testing_features, training_target, testing_target = \ train_test_split(features, tpot_data['target'], random_state=None) # Average CV score on the training set was: -3.2532849505281343 exported_pipeline = make_pipeline( SelectPercentile(score_func=f_regression, percentile=89), StackingEstimator( estimator=KNeighborsRegressor(n_neighbors=48, p=1, weights="uniform")), StackingEstimator(estimator=XGBRegressor(learning_rate=0.001, max_depth=1, min_child_weight=3, n_estimators=50, n_jobs=1, objective="reg:squarederror", subsample=0.9500000000000001, verbosity=0)), MinMaxScaler(), StackingEstimator(estimator=SGDRegressor(alpha=0.01, eta0=0.01, fit_intercept=False, l1_ratio=0.0, learning_rate="constant", loss="huber", penalty="elasticnet",
def main():
    """Cross-validate a stacked regression ensemble and print its error.

    Reads ``../data/processed/train.csv``, discards the ``id`` column and uses
    the ``血糖`` column as the regression target.  For each of the 5 folds the
    features are scaled with a ``MaxAbsScaler`` fitted on the training fold
    only, a set of base regressors plus three stacking pipelines is built,
    and they are combined by ``Ensemble`` with a ``LinearRegression`` stacker.

    The per-fold mean squared error (rounded to 5 decimals) is averaged over
    the folds, and half of that average is printed.
    """
    train = pd.read_csv("../data/processed/train.csv")
    train.pop("id")
    target = train.pop("血糖")
    # `as_matrix()` was removed in pandas 1.0; `.values` yields the same
    # ndarray and matches the other scripts in this file.
    train_x = train.values
    train_y = target.values

    N = 5
    # No `random_state`: without shuffle=True it has no effect on the folds,
    # and recent scikit-learn versions raise ValueError for that combination.
    # The resulting (unshuffled, contiguous) folds are unchanged.
    kf = KFold(n_splits=N)

    result_mean = 0.0
    for train_index, test_index in kf.split(train_x):
        training_features, training_target = train_x[train_index], train_y[train_index]
        testing_features, testing_target = train_x[test_index], train_y[test_index]

        # Fit the scaler on the training fold only so nothing leaks from the
        # held-out fold into the scaling.
        scaler = MaxAbsScaler()
        scaler.fit(training_features)
        training_features = scaler.transform(training_features)
        testing_features = scaler.transform(testing_features)

        # --- Stand-alone base models -------------------------------------
        knn = KNeighborsRegressor(n_neighbors=9, p=1, weights="distance")
        linear_svr = LinearSVR(C=0.01, dual=False, epsilon=1.0,
                               loss="squared_epsilon_insensitive", tol=0.001)
        ridge = RidgeCV()
        gbm = lgb.LGBMRegressor(objective='regression',
                                boosting_type="GBDT",
                                num_leaves=17,
                                learning_rate=0.01,
                                feature_fraction=0.5,
                                bagging_fraction=0.5,
                                bagging_freq=5,
                                reg_alpha=0.1,
                                reg_lambda=0.5,
                                n_estimators=400)
        lr = LinearRegression()
        en = ElasticNetCV(l1_ratio=0.1, tol=0.01)
        xgb = XGBRegressor(learning_rate=0.01, max_depth=8,
                           min_child_weight=8, n_estimators=100, nthread=1,
                           subsample=0.15000000000000002)
        et = ExtraTreesRegressor(bootstrap=True,
                                 max_features=0.35000000000000003,
                                 min_samples_leaf=3, min_samples_split=12,
                                 n_estimators=100)
        rf = RandomForestRegressor(bootstrap=True,
                                   max_features=0.9500000000000001,
                                   min_samples_leaf=15, min_samples_split=6,
                                   n_estimators=100)

        # --- Stacking pipelines used as additional base models -----------
        # Each pipeline gets its own estimator instances, separate from the
        # stand-alone models above.
        exported_pipeline0 = make_pipeline(
            StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.1, tol=0.01)),
            ExtraTreesRegressor(bootstrap=False, max_features=0.2,
                                min_samples_leaf=3, min_samples_split=16,
                                n_estimators=100))
        exported_pipeline1 = Pipeline([
            ("SVR", StackingEstimator(
                estimator=LinearSVR(C=0.01, dual=False, epsilon=1.0,
                                    loss="squared_epsilon_insensitive",
                                    tol=0.001))),
            ("RidgeCV", StackingEstimator(estimator=RidgeCV())),
            ("LGB", lgb.LGBMRegressor(objective='regression',
                                      boosting_type="GBDT",
                                      num_leaves=17,
                                      learning_rate=0.01,
                                      feature_fraction=0.5,
                                      bagging_fraction=0.5,
                                      bagging_freq=5,
                                      reg_alpha=0.1,
                                      reg_lambda=0.5,
                                      n_estimators=400))
        ])
        exported_pipeline2 = make_pipeline(
            StackingEstimator(estimator=RidgeCV()),
            StackingEstimator(
                estimator=XGBRegressor(learning_rate=0.01, max_depth=8,
                                       min_child_weight=8, n_estimators=100,
                                       nthread=1,
                                       subsample=0.15000000000000002)),
            ExtraTreesRegressor(bootstrap=True,
                                max_features=0.35000000000000003,
                                min_samples_leaf=3, min_samples_split=12,
                                n_estimators=100))

        # NOTE(review): `Ensemble` is defined elsewhere; presumably
        # `fit_predict` trains on (X, y) and returns predictions for T --
        # confirm against its definition.
        stack = Ensemble(n_splits=10,
                         stacker=LinearRegression(),
                         base_models=(rf, knn, lr, linear_svr, ridge, en, gbm,
                                      xgb, et, exported_pipeline0,
                                      exported_pipeline1, exported_pipeline2))
        results = stack.fit_predict(X=training_features,
                                    y=training_target,
                                    T=testing_features)

        result_mean += np.round(mean_squared_error(testing_target, results), 5)

    result_mean /= N
    # The printed figure is half of the mean MSE.
    # NOTE(review): presumably this matches the target metric's definition --
    # confirm before changing the label or the divisor.
    print("Mean squared error: %.5f" % (result_mean / 2))
import numpy as np
from sklearn.linear_model import LassoLarsCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline, make_union
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the class is labeled 'class' in the data file
# `np.recfromcsv` is no longer available in NumPy 2.0+, so the file is read
# with `np.genfromtxt` using the settings recfromcsv applied implicitly
# (header row as field names, lower-cased).  The result is a structured array
# with one float64 field per column, which is all the code below relies on.
tpot_data = np.genfromtxt(
    'PATH/TO/DATA/FILE',
    delimiter='COLUMN_SEPARATOR',
    dtype=np.float64,
    names=True,
    case_sensitive='lower',
)

# View the structured array as a plain 2-D float matrix (one row per record)
# and drop the column holding the 'class' field to obtain the features.
features = np.delete(
    tpot_data.view(np.float64).reshape(tpot_data.size, -1),
    tpot_data.dtype.names.index('class'),
    axis=1,
)
training_features, testing_features, training_target, testing_target = \
    train_test_split(features, tpot_data['class'], random_state=42)

# NOTE(review): the `normalize` argument of LassoLarsCV has been deprecated /
# removed in recent scikit-learn releases -- confirm against the pinned
# version before upgrading.
exported_pipeline = make_pipeline(
    StackingEstimator(estimator=LassoLarsCV(normalize=True)),
    StandardScaler(),
    MinMaxScaler(),
    KNeighborsRegressor(n_neighbors=52, p=1, weights="distance"),
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
from sklearn.svm import LinearSVR
from tpot.builtins import StackingEstimator
from xgboost import XGBRegressor

# NOTE(review): `pd`, `np`, `train_test_split`, `make_pipeline` and
# `MinMaxScaler` are used below but are not imported in this excerpt --
# confirm they are imported earlier in the script.

# NOTE: the outcome column in the data file must be named 'target'.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
outcome = tpot_data['target']

# random_state=None: the train/test split differs from run to run.
training_features, testing_features, training_target, testing_target = train_test_split(
    features,
    outcome,
    random_state=None,
)

# Average CV score on the training set was: -3.982083588281725
# Step 1: gradient-boosted regressor wrapped as a stacking transformer.
stacked_xgb = StackingEstimator(
    estimator=XGBRegressor(
        learning_rate=0.1,
        max_depth=2,
        min_child_weight=8,
        n_estimators=300,
        n_jobs=1,
        objective="reg:squarederror",
        subsample=0.1,
        verbosity=0,
    ),
)
# Step 2: min-max scaling ahead of the final model.
rescaler = MinMaxScaler()
# Step 3: final linear support-vector regressor.
final_regressor = LinearSVR(
    C=20.0,
    dual=True,
    epsilon=1.0,
    loss="epsilon_insensitive",
    tol=0.1,
)

exported_pipeline = make_pipeline(stacked_xgb, rescaler, final_regressor)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import make_pipeline, make_union
from sklearn.svm import LinearSVC
from tpot.builtins import StackingEstimator

# NOTE: the label column in the data file must be named 'target'.
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1).values
labels = tpot_data['target'].values

# Hold out a test split; the fixed seed keeps the split reproducible.
training_features, testing_features, training_target, testing_target = train_test_split(
    features,
    labels,
    random_state=42,
)

# Score on the training set was: 0.6111975314359399
# Step 1: Bernoulli naive Bayes wrapped as a stacking transformer.
stacked_bernoulli_nb = StackingEstimator(
    estimator=BernoulliNB(alpha=0.01, fit_prior=True),
)
# Step 2: final L1-penalised linear SVM classifier.
final_classifier = LinearSVC(
    C=20.0,
    dual=False,
    loss="squared_hinge",
    penalty="l1",
    tol=0.0001,
)

exported_pipeline = make_pipeline(stacked_bernoulli_nb, final_classifier)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)