def test_oof_pred_mode_no_get_params(self):
    """'oof_pred' mode with a minimal estimator lacking get_params.

    The expected OOF and test predictions are all-ones columns; compare
    the stacking() return values against the arrays persisted in save_dir.
    """
    S_train_1 = np.ones(X_train.shape[0]).reshape(-1, 1)
    S_test_1 = np.ones(X_test.shape[0]).reshape(-1, 1)

    models = [MinimalEstimator()]
    S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test,
                                   regression=True, n_folds=n_folds,
                                   shuffle=False, save_dir=temp_dir,
                                   mode='oof_pred', random_state=0,
                                   verbose=0)

    # Load OOF from file.
    # Normally if cleaning is performed there is only one .npy file at a
    # given moment, but with no cleaning there may be more than one file,
    # so we take the latest.
    file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
    # Fix: the train and test parts have different row counts, so the
    # saved container is an object array and must be loaded with
    # allow_pickle=True (NumPy >= 1.16.3 defaults to False).
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)
    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
def test_oof_mode_xtest_is_none(self):
    """'oof' mode with X_test=None: only OOF predictions are produced;
    S_test must be None both in the return value and in the saved file.
    """
    model = LinearRegression()
    S_train_1 = cross_val_predict(model, X_train, y=y_train, cv=n_folds,
                                  n_jobs=1, verbose=0,
                                  method='predict').reshape(-1, 1)
    S_test_1 = None

    models = [LinearRegression()]
    S_train_2, S_test_2 = stacking(models, X_train, y_train, None,
                                   regression=True, n_folds=n_folds,
                                   shuffle=False, save_dir=temp_dir,
                                   mode='oof', random_state=0, verbose=0)

    # Load OOF from file.
    # Normally if cleaning is performed there is only one .npy file at a
    # given moment, but with no cleaning there may be more than one file,
    # so we take the latest.
    file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
    # Fix: S_test is None, so the saved container is an object array and
    # requires allow_pickle=True (NumPy >= 1.16.3 defaults to False).
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)
    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
def test_oof_mode_proba(self):
    """'oof' mode with needs_proba=True: stacking() must reproduce
    cross_val_predict's out-of-fold class probabilities, and S_test must
    be None (no test-set predictions in this mode).
    """
    reference = LogisticRegression(random_state=0, solver='liblinear',
                                   multi_class='ovr')
    expected_train = cross_val_predict(reference, X_train, y=y_train,
                                       cv=n_folds, n_jobs=1, verbose=0,
                                       method='predict_proba')
    expected_test = None

    first_level = [LogisticRegression(random_state=0, solver='liblinear',
                                      multi_class='ovr')]
    got_train, got_test = stacking(first_level, X_train, y_train, X_test,
                                   regression=False, n_folds=n_folds,
                                   shuffle=False, stratified=True,
                                   mode='oof', random_state=0, verbose=0,
                                   needs_proba=True, save_dir=temp_dir)

    # Pick the newest .npy dump: with cleaning disabled several files may
    # coexist in temp_dir.
    latest = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
    saved = np.load(latest, allow_pickle=True)
    saved_train, saved_test = saved[0], saved[1]

    assert_array_equal(expected_train, got_train)
    assert_array_equal(expected_test, got_test)
    assert_array_equal(expected_train, saved_train)
    assert_array_equal(expected_test, saved_test)
def test_pred_mode(self):
    """'pred' mode: fit on the full train set and predict the test set
    once; S_train must be None both in the return value and on disk.
    """
    model = LogisticRegression()
    S_train_1 = None
    _ = model.fit(X_train, y_train)
    S_test_1 = model.predict(X_test).reshape(-1, 1)

    models = [LogisticRegression()]
    S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test,
                                   regression=False, n_folds=n_folds,
                                   shuffle=False, save_dir=temp_dir,
                                   mode='pred', random_state=0, verbose=0,
                                   stratified=True)

    # Load OOF from file.
    # Normally if cleaning is performed there is only one .npy file at a
    # given moment, but with no cleaning there may be more than one file,
    # so we take the latest.
    file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
    # Fix: S_train is None, so the saved container is an object array and
    # requires allow_pickle=True (NumPy >= 1.16.3 defaults to False).
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)
    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
def test_oof_mode_metric(self):
    """The mean/std of MAE reported in the stacking log file must match
    cross_val_score computed with an equivalent setup.
    """
    scorer = make_scorer(mean_absolute_error)
    scores = cross_val_score(LinearRegression(), X_train, y=y_train,
                             cv=n_folds, scoring=scorer, n_jobs=1,
                             verbose=0)
    expected_mean = '%.8f' % np.mean(scores)
    expected_std = '%.8f' % np.std(scores)

    S_train, S_test = stacking([LinearRegression()], X_train, y_train,
                               X_test, regression=True, n_folds=n_folds,
                               save_dir=temp_dir, mode='oof',
                               random_state=0, verbose=0)

    # Load mean score and std from the newest .log.txt file: with cleaning
    # disabled several log files may coexist in temp_dir.
    log_path = sorted(glob.glob(os.path.join(temp_dir, '*.log.txt')))[-1]
    with open(log_path) as log:
        for line in log:
            if 'MEAN' in line:
                tokens = line.strip().split()
                break
    # Tokens 1 and 3 hold the bracket-wrapped mean and std; strip the
    # surrounding brackets to compare the raw numeric strings.
    logged_mean = tokens[1][1:-1]
    logged_std = tokens[3][1:-1]

    assert_equal(expected_mean, logged_mean)
    assert_equal(expected_std, logged_std)
def test_oof_pred_mode_sample_weight_random(self):
    """'oof_pred' mode with random per-sample weights must match
    cross_val_predict with fit_params={'sample_weight': sw} plus a
    full-train weighted fit/predict.
    """
    np.random.seed(0)
    sw = np.random.rand(len(y_train))

    model = LinearRegression()
    S_train_1 = cross_val_predict(model, X_train, y=y_train, cv=n_folds,
                                  n_jobs=1, verbose=0, method='predict',
                                  fit_params={'sample_weight': sw}
                                  ).reshape(-1, 1)
    _ = model.fit(X_train, y_train, sample_weight=sw)
    S_test_1 = model.predict(X_test).reshape(-1, 1)

    models = [LinearRegression()]
    S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test,
                                   regression=True, n_folds=n_folds,
                                   shuffle=False, save_dir=temp_dir,
                                   mode='oof_pred', random_state=0,
                                   verbose=0, sample_weight=sw)

    # Load OOF from file: take the latest .npy (several may exist if
    # cleaning is disabled).
    file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
    # Fix: the train and test parts have different row counts, so the
    # saved container is an object array and must be loaded with
    # allow_pickle=True (NumPy >= 1.16.3 defaults to False).
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)
    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
def test_pred_bag_mode(self):
    """'pred_bag' mode: S_test is the per-fold test predictions averaged,
    S_train is None (no OOF part in this mode).
    """
    S_test_temp = np.zeros((X_test.shape[0], n_folds))
    # Fix: random_state is omitted — with shuffle=False it was ignored,
    # and scikit-learn >= 1.0 raises ValueError if it is set.
    kf = KFold(n_splits=n_folds, shuffle=False)
    for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)):
        # Split data and target
        X_tr = X_train[tr_index]
        y_tr = y_train[tr_index]
        X_te = X_train[te_index]
        y_te = y_train[te_index]
        model = LinearRegression()
        _ = model.fit(X_tr, y_tr)
        S_test_temp[:, fold_counter] = model.predict(X_test)
    S_test_1 = np.mean(S_test_temp, axis=1).reshape(-1, 1)
    S_train_1 = None

    models = [LinearRegression()]
    S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test,
                                   regression=True, n_folds=n_folds,
                                   shuffle=False, save_dir=temp_dir,
                                   mode='pred_bag', random_state=0,
                                   verbose=0)

    # Load OOF from file: take the latest .npy (several may exist if
    # cleaning is disabled).
    file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
    # Fix: S_train is None, so the saved container is an object array and
    # requires allow_pickle=True (NumPy >= 1.16.3 defaults to False).
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)
    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
def Level_1(self, append_model):
    """Initialize the level-1 classifier pool (optionally extended with
    `append_model`) and compute OOF/test stacking probabilities.

    Results are stored on self.S_train / self.S_test; returns None.
    """
    self.models = [
        LogisticRegression(random_state=0),
        LinearDiscriminantAnalysis(),
        KNeighborsClassifier(),
        GaussianNB(),
        DecisionTreeClassifier(random_state=0),
        BaggingClassifier(DecisionTreeClassifier(random_state=0),
                          bootstrap=True, oob_score=True, n_jobs=-1,
                          random_state=0),
        RandomForestClassifier(n_jobs=-1, random_state=0),
        ExtraTreesClassifier(n_jobs=-1, random_state=0),
        AdaBoostClassifier(DecisionTreeClassifier(random_state=0),
                           random_state=0),
        GradientBoostingClassifier(random_state=0),
        MLPClassifier(random_state=0),
    ]
    # Fix: compare with None using `is`, not `==` (PEP 8); append only
    # when a model was actually supplied.
    if append_model is not None:
        self.models.append(append_model)
    self.S_train, self.S_test = stacking(self.models,
                                         np.array(self.X_train),
                                         np.array(self.y_train),
                                         np.array(self.X_test),
                                         regression=False,
                                         mode='oof_pred',
                                         needs_proba=True,
                                         save_dir='.',
                                         metric=log_loss,
                                         n_folds=5,
                                         stratified=True,
                                         shuffle=True,
                                         random_state=0,
                                         verbose=2)
    return None
def test_pred_bag_mode_proba(self):
    """'pred_bag' + needs_proba: S_test holds the fold-averaged test
    class probabilities, S_train is None.
    """
    S_test_1 = np.zeros((X_test.shape[0], n_classes))
    S_test_temp = np.zeros((X_test.shape[0], n_folds * n_classes))
    # Using StratifiedKFold because by default cross_val_predict uses
    # StratifiedKFold. Fix: random_state is omitted — with shuffle=False
    # it was ignored, and scikit-learn >= 1.0 raises ValueError if set.
    kf = StratifiedKFold(n_splits=n_folds, shuffle=False)
    for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)):
        # Split data and target
        X_tr = X_train[tr_index]
        y_tr = y_train[tr_index]
        X_te = X_train[te_index]
        y_te = y_train[te_index]
        model = LogisticRegression(random_state=0, solver='liblinear',
                                   multi_class='ovr')
        _ = model.fit(X_tr, y_tr)
        col_slice_fold = slice(fold_counter * n_classes,
                               fold_counter * n_classes + n_classes)
        S_test_temp[:, col_slice_fold] = model.predict_proba(X_test)
    # Average each class's probability across folds.
    for class_id in range(n_classes):
        S_test_1[:, class_id] = np.mean(S_test_temp[:, class_id::n_classes],
                                        axis=1)
    S_train_1 = None

    models = [LogisticRegression(random_state=0, solver='liblinear',
                                 multi_class='ovr')]
    S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test,
                                   regression=False, n_folds=n_folds,
                                   shuffle=False, save_dir=temp_dir,
                                   mode='pred_bag', random_state=0,
                                   verbose=0, stratified=True,
                                   needs_proba=True)

    # Load OOF from file: take the latest .npy (several may exist if
    # cleaning is disabled).
    file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
    # Fix: S_train is None, so the saved container is an object array and
    # requires allow_pickle=True (NumPy >= 1.16.3 defaults to False).
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)
    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
def get_stacking_features(path=None):
    """Return (stacked_train, stacked_test, y_train).

    If `path` is None the first-level models are trained and their
    OOF/test predictions computed via vecstack (and saved to
    elo/data/stacking); otherwise the previously saved stacking arrays
    are loaded from `path`.
    """
    print(f"Training for {N_FOLDS} CV folds")
    if path is None:
        # TODO: Some refactoring.
        df = pd.read_csv("elo/data/augmented_train.csv")
        print(df.sample(5))
        # TODO: Find a better way to impute inf and missing values.
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(df.median())
        X_train = df.drop(FEATS_EXCLUDED, axis=1, errors='ignore').values
        y_train = df.loc[:, "target"].values

        test_df = pd.read_csv("elo/data/augmented_test.csv")
        print(test_df.sample(5))
        # TODO: Find a better way to impute inf and missing values.
        test_df = test_df.replace([np.inf, -np.inf], np.nan)
        test_df = test_df.fillna(test_df.median())
        X_test = test_df.drop(FEATS_EXCLUDED, axis=1, errors='ignore').values

        first_level_models = [
            XGBRegressor(**HYPEROPT_XGBOOST_OPTIMAL_HP),
            LGBMRegressor(**HYPEROPT_LIGHTGBM_OPTIMAL_HP),
            LGBMRegressor(**OPTUNA_LIGTHGBM_OPTIMAL_HP),
            XGBRegressor(seed=SEED),
            LGBMRegressor(seed=SEED),
            KNeighborsRegressor(),
            LinearRegression(),
            ExtraTreesRegressor(random_state=SEED),
            GradientBoostingRegressor(random_state=SEED),
            Lasso(random_state=SEED),
            # This didn't work at all without proper tuning!!!
            # SGDRegressor(random_state=SEED)
        ]

        # TODO: Should I add "shuffling"?
        stacked_train, stacked_test = stacking(first_level_models,
                                               X_train, y_train, X_test,
                                               regression=True,
                                               metric=rmse,
                                               n_folds=N_FOLDS,
                                               random_state=SEED,
                                               verbose=2,
                                               save_dir="elo/data/stacking",
                                               shuffle=True)
    else:
        # Fix: the saved container holds two differently-shaped arrays
        # (an object array), so NumPy >= 1.16.3 needs allow_pickle=True
        # to load it.
        stacked_train, stacked_test = np.load(path, allow_pickle=True)
        df = pd.read_csv("elo/data/augmented_train.csv")
        # TODO: Find a better way to impute inf and missing values.
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(df.median())
        y_train = df.loc[:, "target"].values

    # Observe the data
    print(stacked_train[:5])
    print(stacked_test[:5])
    print(y_train[:5])
    return stacked_train, stacked_test, y_train
def test_oof_pred_mode_proba_2_models(self):
    """'oof_pred' + needs_proba with two models: the stacked arrays are
    the column-wise concatenation of each model's probabilities.
    """
    # Model a
    model = LogisticRegression()
    S_train_1_a = cross_val_predict(model, X_train, y=y_train, cv=n_folds,
                                    n_jobs=1, verbose=0,
                                    method='predict_proba')
    _ = model.fit(X_train, y_train)
    S_test_1_a = model.predict_proba(X_test)

    # Model b
    model = GaussianNB()
    S_train_1_b = cross_val_predict(model, X_train, y=y_train, cv=n_folds,
                                    n_jobs=1, verbose=0,
                                    method='predict_proba')
    _ = model.fit(X_train, y_train)
    S_test_1_b = model.predict_proba(X_test)

    S_train_1 = np.c_[S_train_1_a, S_train_1_b]
    S_test_1 = np.c_[S_test_1_a, S_test_1_b]

    models = [LogisticRegression(), GaussianNB()]
    S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test,
                                   regression=False, n_folds=n_folds,
                                   shuffle=False, stratified=True,
                                   mode='oof_pred', random_state=0,
                                   verbose=0, needs_proba=True,
                                   save_dir=temp_dir)

    # Load OOF from file: take the latest .npy (several may exist if
    # cleaning is disabled).
    file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
    # Fix: the train and test parts have different row counts, so the
    # saved container is an object array and must be loaded with
    # allow_pickle=True (NumPy >= 1.16.3 defaults to False).
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)
    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
def model_sel(x_train, y_train, x_test, y_test):
    """Train a pool of base classifiers, report their individual test
    scores, then evaluate a hard-voting ensemble and a stacking ensemble.

    Returns (base classifiers, voting ensemble, stacking ensemble).
    """
    clf1 = LogisticRegression(random_state=0)
    clf2 = BernoulliNB()
    clf3 = SVC(C=1, kernel='linear')
    clf4 = RandomForestClassifier(n_estimators=800, min_samples_split=5,
                                  min_samples_leaf=1, max_features='sqrt',
                                  max_depth=100, bootstrap=False)
    clf5 = xgb.XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.3,
                             n_estimators=100, max_depth=4)
    clf6 = neighbors.KNeighborsClassifier(n_neighbors=9, p=1)
    clf7 = GradientBoostingClassifier(max_depth=5, min_samples_split=4,
                                      min_samples_leaf=1, subsample=1,
                                      max_features='sqrt', random_state=10,
                                      learning_rate=0.15, n_estimators=300)
    clf8 = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                              learning_rate=1, n_estimators=200,
                              random_state=1)
    input_dim = x_train.shape[1]
    n_classes = len(np.unique(y_train))
    clf9 = KerasClassifier(Sequential_model(input_dim, n_classes), epochs=8)

    clf = [clf1, clf2, clf3, clf4, clf5, clf6, clf7, clf8, clf9]

    # Fit and score each base model individually.
    for model in clf:
        print(f"Classifier: {model}")
        model.fit(x_train, y_train)
        score = model.score(x_test, y_test)
        print("score: {}".format(score))
        print(" ")
    print("Ensembles.................................")
    print(" ")

    # Hard-voting ensemble over all base models.
    MaxVoting_esemble = VotingClassifier(
        estimators=[('lr', clf1), ('bb', clf2), ('svc', clf3), ('rf', clf4),
                    ('xg', clf5), ('knn', clf6), ('grb', clf7), ('ab', clf8),
                    ('nn', clf9)],
        voting='hard')
    MaxVoting_esemble.fit(x_train, y_train)
    score = MaxVoting_esemble.score(x_test, y_test)
    print("MaxVoting: {}".format(score))
    print(" ")

    # Stacking: out-of-fold predictions feed a second-level SVC.
    S_train, S_test = stacking(clf, x_train, y_train, x_test,
                               regression=False, mode='oof_pred_bag',
                               needs_proba=False, save_dir=None,
                               metric=accuracy_score, n_folds=4,
                               stratified=True, shuffle=True,
                               random_state=0, verbose=2)
    stacking_ensemble = SVC(C=1, kernel='linear').fit(S_train, y_train)
    stacking_score = stacking_ensemble.score(S_test, y_test)
    print("Stacking: {}".format(stacking_score))

    return clf, MaxVoting_esemble, stacking_ensemble
def test_oof_pred_bag_mode_shuffle(self):
    """'oof_pred_bag' with shuffle=True: bagged test predictions are the
    per-fold majority vote; OOF must match cross_val_predict driven by
    the same shuffled CV generator.
    """
    S_test_temp = np.zeros((X_test.shape[0], n_folds))
    # Using StratifiedKFold because by default cross_val_predict uses
    # StratifiedKFold.
    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
    for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)):
        # Split data and target
        X_tr = X_train[tr_index]
        y_tr = y_train[tr_index]
        X_te = X_train[te_index]
        y_te = y_train[te_index]
        model = LogisticRegression()
        _ = model.fit(X_tr, y_tr)
        S_test_temp[:, fold_counter] = model.predict(X_test)
    # Majority vote across the per-fold predictions.
    S_test_1 = st.mode(S_test_temp, axis=1)[0]

    model = LogisticRegression()
    # !!! Important. Here we pass the CV generator, not a number of folds
    # <cv = kf>, so the shuffled split is identical.
    S_train_1 = cross_val_predict(model, X_train, y=y_train, cv=kf,
                                  n_jobs=1, verbose=0,
                                  method='predict').reshape(-1, 1)

    models = [LogisticRegression()]
    S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test,
                                   regression=False, n_folds=n_folds,
                                   shuffle=True, save_dir=temp_dir,
                                   mode='oof_pred_bag', random_state=0,
                                   verbose=0, stratified=True)

    # Load OOF from file: take the latest .npy (several may exist if
    # cleaning is disabled).
    file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
    # Fix: the train and test parts have different row counts, so the
    # saved container is an object array and must be loaded with
    # allow_pickle=True (NumPy >= 1.16.3 defaults to False).
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)
    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
def naiveStacking(self, degree, lmodels):
    """Expand features to the given polynomial degree, run vecstack
    stacking over `lmodels`, and return
    ((S_train, S_test), y_train, y_test).
    """
    print(explain.Stacking)
    train_feats = self.applyPolynomialFeatures(self.X_train, degree)
    test_feats = self.applyPolynomialFeatures(self.X_test, degree)
    train_feats, y_train, test_feats, y_test = self.pandas2numpy(
        train_feats, self.y_train, test_feats, self.y_test)
    stacked = stacking(lmodels, train_feats, y_train, test_feats,
                       regression=self.is_regression, metric=self.metric,
                       n_folds=3, shuffle=True, random_state=0, verbose=1)
    return stacked, y_train, y_test
def integrated_models(train_data, test_data):
    """Stack four pre-trained regressors, fit an XGBoost meta-model on the
    out-of-fold features, report its RMSE, and write submission.csv for
    the real test set.
    """
    print("train data shape:{}, test data shape:{}".format(
        train_data.shape, test_data.shape))
    y = np.ravel(np.array(train_data[['SalePrice']]))
    X = train_data.drop('SalePrice', axis=1)

    # Load the four persisted first-level models.
    random_forest = load('rf_model.joblib')
    lightgbm = load('lgmb_model.joblib')
    g_boost = load('gmb_model.joblib')
    xg_boost = load('xgb_model.joblib')

    # Model stacking (order fixed: it defines the level-1 feature columns).
    models = [g_boost, xg_boost, lightgbm, random_forest]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    S_train, S_test = stacking(models, X_train, y_train, X_test,
                               regression=True, mode='oof_pred_bag',
                               metric=rmse, n_folds=5, random_state=25,
                               verbose=2)
    print("S_train shape:{} \t S_test shape:{}".format(S_train.shape,
                                                       S_test.shape))

    # Initialize the second-level (meta) model.
    xgb_lev2 = XGBRegressor(learning_rate=0.1, n_estimators=500,
                            max_depth=3, n_jobs=-1, random_state=17)
    # Fit the 2nd level model on the output of level 1.
    xgb_lev2.fit(S_train, y_train)
    stacked_pred = xgb_lev2.predict(S_test)
    print("RMSE of Stacked Model: {}".format(rmse(y_test, stacked_pred)))

    # Build level-1 features for the real test set and predict.
    level1_preds = [m.predict(test_data) for m in models]
    S_test_L1 = np.c_[level1_preds[0], level1_preds[1],
                      level1_preds[2], level1_preds[3]]
    print("S_test_L1 shape: {}".format(S_test_L1.shape))
    test_stacked_pred = xgb_lev2.predict(S_test_L1)

    submission = pd.DataFrame()
    submission['Id'] = np.array(test_data.index)
    submission['SalePrice'] = test_stacked_pred
    submission.to_csv("submission.csv", index=False)
def prepare(data):
    """Turn a (X_train, X_test, y_train, y_test) tuple into stacked
    features using the module-level `models`.

    Returns (S_train, y_train, S_test, y_test).
    """
    X_train, X_test, y_train, y_test = data
    stacked_train, stacked_test = stacking(
        models, X_train, y_train, X_test,
        regression=False, mode='oof_pred_bag', needs_proba=False,
        save_dir=None, metric=accuracy_score, n_folds=7,
        stratified=True, shuffle=True, random_state=7, verbose=2)
    return (stacked_train, y_train, stacked_test, y_test)
def test_oof_pred_mode_verbose_1(self):
    """'oof_pred' mode must give identical results at every verbosity
    level (0, 1, 2), both with and without a save_dir.
    """
    model = LinearRegression()
    S_train_1 = cross_val_predict(model, X_train, y=y_train, cv=n_folds,
                                  n_jobs=1, verbose=0,
                                  method='predict').reshape(-1, 1)
    _ = model.fit(X_train, y_train)
    S_test_1 = model.predict(X_test).reshape(-1, 1)

    # Run stacking for each (save_dir, verbosity) combination, in the
    # same order as the original test: saving runs first, then non-saving.
    for save_dir in (temp_dir, None):
        for verbosity in (0, 1, 2):
            models = [LinearRegression()]
            S_train_i, S_test_i = stacking(models, X_train, y_train,
                                           X_test, regression=True,
                                           n_folds=n_folds, shuffle=False,
                                           save_dir=save_dir,
                                           mode='oof_pred',
                                           random_state=0,
                                           verbose=verbosity)
            assert_array_equal(S_train_1, S_train_i)
            assert_array_equal(S_test_1, S_test_i)
def base_reg_stack(x, y, x_test):
    """Stack three tree-ensemble regressors and return the second-level
    GradientBoosting predictions for `x_test`.
    """
    X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                        test_size=0.2,
                                                        random_state=0)
    # Caution! All models and parameter values are just demonstrational
    # and shouldn't be considered as recommended.
    first_level = [
        ExtraTreesRegressor(random_state=0, n_jobs=-1,
                            n_estimators=100, max_depth=3),
        RandomForestRegressor(random_state=0, n_jobs=-1,
                              n_estimators=100, max_depth=3),
        GradientBoostingRegressor(learning_rate=0.1,
                                  n_estimators=100, max_depth=3),
    ]
    # Compute stacking features.
    S_train, S_test = stacking(first_level, X_train, y_train, x_test,
                               regression=True, metric=r2_score,
                               n_folds=4, shuffle=True, random_state=0,
                               verbose=2)
    # Second-level model, fit on the stacked training features.
    meta = GradientBoostingRegressor(learning_rate=0.1, n_estimators=100,
                                     max_depth=3)
    print("S_train shape:", S_train.shape)
    meta = meta.fit(S_train, y_train)
    # Predict.
    y_pred = meta.predict(S_test)
    # print('Final prediction score: [%.8f]' % r2_score(y_test, y_pred))
    return y_pred
def getModel(self, _params, _x, _y, _x_eval):
    """Build stacking features from the top `_params['max_estimator']`
    candidate jobs, each rebuilt with its best hyper-parameters.

    NOTE(review): the computed `s_train`/`s_test` are never returned or
    stored on `self`, so this method currently returns None — confirm
    whether the stacking result should be returned to the caller.
    """
    estimator_list = []
    # Collect at most `max_estimator` estimators from the candidate jobs.
    for idx, job in enumerate(self.cantidate_job_list):
        if idx == _params['max_estimator']:
            break
        estimator_list.append(job.model.getModel(job.best_params))
    # NOTE(review): assumes `stacking` accepts an `n_jobs` keyword —
    # vecstack's functional `stacking` API does not document one; verify
    # this call against the actual stacking implementation used here.
    s_train, s_test = stacking(
        estimator_list,
        _x,
        _y,
        _x_eval,
        regression=False,
        metric=accuracy_score,
        stratified=_params['stratified'],
        shuffle=_params['shuffle'],
        random_state=0,
        n_jobs=definitions.getNumberOfCore(),
    )
def Level_1(self, append_model=None):
    """Initialize the level-1 classifier pool, optionally extended with
    the models in `append_model`, and compute OOF/test stacking
    probabilities.

    Results are stored on self.S_train / self.S_test; returns None.
    `append_model` defaults to None (fixes the mutable-default-argument
    pitfall of the original `append_model=[]`; passing an empty list
    still behaves the same).
    """
    self.models = [
        LogisticRegression(random_state=0),
        LinearDiscriminantAnalysis(),
        KNeighborsClassifier(),
        GaussianNB(),
        DecisionTreeClassifier(random_state=0),
        BaggingClassifier(DecisionTreeClassifier(random_state=0),
                          bootstrap=True, oob_score=True, n_jobs=-1,
                          random_state=0),
        RandomForestClassifier(n_jobs=-1, random_state=0),
        ExtraTreesClassifier(n_jobs=-1, random_state=0),
        AdaBoostClassifier(DecisionTreeClassifier(random_state=0),
                           random_state=0),
        GradientBoostingClassifier(random_state=0),
        MLPClassifier(random_state=0),
    ]
    # Bug fix: the original loop did `self.models.append(append_model)`,
    # appending the whole list once per element instead of each model.
    if append_model:
        self.models.extend(append_model)
    self.S_train, self.S_test = stacking(self.models,
                                         np.array(self.X_train),
                                         np.array(self.y_train),
                                         np.array(self.X_test),
                                         regression=False,
                                         mode='oof_pred',
                                         needs_proba=True,
                                         save_dir='.',
                                         metric=log_loss,
                                         n_folds=5,
                                         stratified=True,
                                         shuffle=True,
                                         random_state=0,
                                         verbose=2)
    return None
def test_N_dim_input(self):
    """This is `test_oof_pred_bag_mode` function with
    `LogisticRegressionUnrolled` estimator: 4-D input to stacking() must
    give the same result as the manually unrolled 2-D input.
    """
    S_test_temp = np.zeros((X_test_4d_unrolled.shape[0], n_folds))
    # Using StratifiedKFold because by default cross_val_predict uses
    # StratifiedKFold. Fix: random_state is omitted — with shuffle=False
    # it was ignored, and scikit-learn >= 1.0 raises ValueError if set.
    kf = StratifiedKFold(n_splits=n_folds, shuffle=False)
    for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train_4d_unrolled, y_train_4d)):
        # Split data and target
        X_tr = X_train_4d_unrolled[tr_index]
        y_tr = y_train_4d[tr_index]
        X_te = X_train_4d_unrolled[te_index]
        y_te = y_train_4d[te_index]
        model = LogisticRegression(random_state=0, solver='liblinear',
                                   multi_class='ovr')
        _ = model.fit(X_tr, y_tr)
        S_test_temp[:, fold_counter] = model.predict(X_test_4d_unrolled)
    # Majority vote across the per-fold predictions.
    S_test_1 = st.mode(S_test_temp, axis=1)[0]

    model = LogisticRegression(random_state=0, solver='liblinear',
                               multi_class='ovr')
    S_train_1 = cross_val_predict(model, X_train_4d_unrolled,
                                  y=y_train_4d, cv=n_folds, n_jobs=1,
                                  verbose=0,
                                  method='predict').reshape(-1, 1)

    models = [LogisticRegressionUnrolled(random_state=0,
                                         solver='liblinear',
                                         multi_class='ovr')]
    S_train_2, S_test_2 = stacking(models, X_train_4d, y_train_4d,
                                   X_test_4d, regression=False,
                                   n_folds=n_folds, shuffle=False,
                                   save_dir=temp_dir,
                                   mode='oof_pred_bag', random_state=0,
                                   verbose=0, stratified=True)

    # Load OOF from file: take the latest .npy (several may exist if
    # cleaning is disabled).
    file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
    S = np.load(file_name, allow_pickle=True)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)
    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
def main():
    """End-to-end pipeline: extract, engineer, encode, split, stack, and
    persist the stacked features.
    """
    # Data extraction.
    df = data_extract_e('e_20190609_15.pkl')

    # Data transformation and engineering.
    df = feature_eng(df)
    df = extract_queues(df)
    dept_encoder, queue_encoder = load_labels('dept_encoder.pkl',
                                              'queue_encoder.pkl', df=df)
    df = feature_transform(df, dept_encoder=dept_encoder,
                           queue_encoder=queue_encoder)

    # Training/test split.
    x, y = data_filter(df)
    # 2468 to use same shuffle as individual models
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=1357)

    # Load models from persistent files.
    models = load_models()
    print(models)

    # Stacking: produce a new feature set based on the predictions of the
    # base models.
    x_train_s, x_test_s = stacking(models, x_train, y_train, x_test,
                                   n_folds=10, shuffle=True, verbose=0,
                                   regression=True)

    save_data(x_train_s, 'x_train_s.pkl')
    save_data(y_train, 'y_train.pkl')
    save_data(x_test_s, 'x_test_s.pkl')
    save_data(y_test, 'y_test.pkl')
XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, n_estimators=100, max_depth=3), LGBMClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, n_estimators=100, max_depth=3), KerasClassifier(build_fn=build_keras_model_1, epochs=2, batch_size=32, verbose=0) ] S_train_1, S_test_1 = stacking(models_1, # list of models X_train, y_train, X_test, # data regression=False, # classification task (if you need # regression - set to True) mode='oof_pred', # mode: oof for train set, fit on full # train and predict test set once needs_proba=True, # predict probabilities (if you need # class labels - set to False) save_dir='.', # save result and log in current dir # (to disable saving - set to None) metric=log_loss, # metric: callable n_folds=5, # number of folds stratified=True, # stratified split for folds shuffle=True, # shuffle the data random_state=0, # ensure reproducibility verbose=2) # print all info print('We have %d classes and %d models so in resulting arrays \ we expect to see %d columns.' % (n_classes, len(models_1), n_classes * len(models_1))) print('S_train_1 shape:', S_train_1.shape) print('S_test_1 shape: ', S_test_1.shape) # Our arrays and log were saved in current dir names = sorted(glob('*.npy'))
def test_oof_pred_bag_mode_proba_2_models(self):
    """'oof_pred_bag' + needs_proba with two models: compare vecstack's output
    (both the returned arrays and the copy reloaded from the saved .npy)
    against a manually computed etalon."""

    def etalon_proba(make_estimator):
        # Bagged test probabilities: fit on each fold's train part, predict
        # the full test set, then average each class column across folds.
        per_fold = np.zeros((X_test.shape[0], n_folds * n_classes))
        # StratifiedKFold because cross_val_predict uses it by default.
        splitter = StratifiedKFold(n_splits=n_folds, shuffle=False, random_state=0)
        for fold_id, (fit_idx, _) in enumerate(splitter.split(X_train, y_train)):
            estimator = make_estimator()
            estimator.fit(X_train[fit_idx], y_train[fit_idx])
            cols = slice(fold_id * n_classes, (fold_id + 1) * n_classes)
            per_fold[:, cols] = estimator.predict_proba(X_test)
        bagged_test = np.zeros((X_test.shape[0], n_classes))
        for class_id in range(n_classes):
            bagged_test[:, class_id] = np.mean(per_fold[:, class_id::n_classes], axis=1)
        # OOF train probabilities straight from sklearn.
        oof_train = cross_val_predict(make_estimator(), X_train, y=y_train,
                                      cv=n_folds, n_jobs=1, verbose=0,
                                      method='predict_proba')
        return oof_train, bagged_test

    # Etalon for each base model, concatenated column-wise as vecstack does.
    S_train_1_a, S_test_1_a = etalon_proba(LogisticRegression)
    S_train_1_b, S_test_1_b = etalon_proba(GaussianNB)
    S_train_1 = np.c_[S_train_1_a, S_train_1_b]
    S_test_1 = np.c_[S_test_1_a, S_test_1_b]

    models = [LogisticRegression(), GaussianNB()]
    S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test,
                                   regression=False, n_folds=n_folds,
                                   shuffle=False, save_dir=temp_dir,
                                   mode='oof_pred_bag', random_state=0,
                                   verbose=0, stratified=True,
                                   needs_proba=True)

    # Reload what stacking() saved. When cleaning is disabled several .npy
    # files may accumulate in temp_dir, so take the most recent one.
    file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
    S = np.load(file_name)
    S_train_3 = S[0]
    S_test_3 = S[1]

    assert_array_equal(S_train_1, S_train_2)
    assert_array_equal(S_test_1, S_test_2)
    assert_array_equal(S_train_1, S_train_3)
    assert_array_equal(S_test_1, S_test_3)
# NOTE(review): fragment begins mid-call — something like
# `sclf = StackingClassifier(...` opens before this view; only the trailing
# keyword argument is visible here.
meta_classifier=logreg)

scores = model_selection.cross_val_score(sclf, data1_x_bin, data[Target],
                                         cv=3, scoring='f1')
# NOTE(review): the '%' argument line below is commented out in the source,
# which leaves this print() call unbalanced — likely a notebook-export
# artifact; confirm against the original notebook.
print("Accuracy: %0.2f (+/- %0.2f)"
# % (scores.mean(), scores.std()))

"""#### VecStack"""

# 1st level model: fresh split, then stack three pre-built classifiers.
X_train, X_test, y_train, y_test = train_test_split(data1_x_bin, data[Target],
                                                    test_size=0.2)
models = [lgbm_cl,rf_cl,gdb_cl]
S_train, S_test = stacking(models, X_train, y_train, X_test,
                           regression = False, metric = metrics.f1_score,
                           n_folds = 4 , shuffle = True, random_state = 0,
                           verbose = 2)

# 2nd level model
# Initialize 2nd level model
model = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1,
                      n_estimators=100, max_depth=3)
# Fit 2nd level model on the stacked features
model = model.fit(S_train, y_train)
# Predict
y_pred = model.predict(S_test)
# Final prediction score
print('Final prediction score: [%.8f]' % metrics.f1_score(y_test, y_pred))
# NOTE(review): fragment begins mid-list (`lmodels = [` opens before this view)
# and ends mid-call — the DecisionTreeClassifier construction is truncated.
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=1, n_neighbors=17, p=2,
                     weights='distance')
]

# build the stack level 1 — `regression` and `metric` come from earlier in
# the original script (not visible here).
S_train, S_test = stacking(lmodels, x_train, y_train, x_test,
                           regression=regression, metric=metric, n_folds=3,
                           shuffle=True, random_state=0, verbose=1)

# build model level 2 (truncated below)
model = DecisionTreeClassifier(class_weight=None, criterion='entropy',
                               max_depth=10, max_features=None,
                               max_leaf_nodes=None, min_impurity_split=1e-07,
                               min_samples_leaf=2, min_samples_split=5,
                               min_weight_fraction_leaf=0.0,
# NOTE(review): flat script interior — `df` and the helper functions are
# defined before this view, and `res` may be consumed after it; variable
# names must stay as-is.

# Data transformation and engineering
df = feature_eng(df)
df = extract_queues(df)
dept_encoder, queue_encoder, user_encoder = load_labels()
df = feature_transform(df, dept_encoder=dept_encoder,
                       queue_encoder=queue_encoder, user_encoder=user_encoder)
df = df[:100000]  # Take x number of rows; downscaling dataset due to time constraints

# Training/Test Split
x, y = data_filter(df)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                    random_state=1357)  # 2468 to use same shuffle as individual models

# Load models from persistent files
models = load_models()

# Stacking
# Produces a new set of features based on the predictions of base models
x_train_s, x_test_s = stacking(models, x_train, y_train, x_test, n_folds=10,
                               shuffle=True, verbose=0, regression=True)

# Stacked Second-Layer Model
# NOTE(review): objective 'reg:linear' is the legacy XGBoost alias for
# 'reg:squarederror' — confirm the pinned xgboost version still accepts it.
xgb_l2 = XGBRegressor(objective='reg:linear')
xgb_l2 = xgb_l2.fit(x_train_s, y_train)
print('Stacking XGBRegressor L2 R2 Training score: ', xgb_l2.score(x_train_s, y_train))
y_pred = xgb_l2.predict(x_train_s)
print('Stacking XGBRegressor L2 Training MSE: ', metrics.mean_squared_error(y_pred, y_train))
y_pred = xgb_l2.predict(x_test_s)
print('Stacking XGBRegressor L2 R2 Test score: ', xgb_l2.score(x_test_s, y_test))
print('Stacking XGBRegressor L2 Test MSE: ', metrics.mean_squared_error(y_pred, y_test))

# Bayesian Optimization for Hyperparameter Tuning; result persisted under the
# given filename by bayes_opt (helper defined elsewhere).
res = bayes_opt(objective_func, param_grid, 'xgb_l2_bo_res.z')
# split into input and output elements y = dataframe.SARSCov # Target variable X = dataframe.drop(['SARSCov'], axis=1) # Features X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) sm = SMOTE(k_neighbors=11) x_res, y_res = sm.fit_sample(X_train, y_train) S_train, S_test = stacking(models, x_res, y_res, X_test, regression=False, mode='oof_pred', needs_proba=False, save_dir=None, metric=metrics.accuracy_score, n_folds=10, stratified=True, shuffle=True, verbose=2) model = XGBClassifier(learning_rate=1.0, n_estimators=300, max_depth=17) # fit model model = model.fit(S_train, y_res) # evaluate model y_pred = model.predict(S_test) score = metrics.accuracy_score(y_test, y_pred)
lgb_pred = lightgbm.predict(test) # List of the models to be stacked models = [g_boost, xg_boost, lightgbm, random_forest] # Perform Stacking S_train, S_test = stacking(models, X_train, y_train, X_test, regression=True, mode='oof_pred_bag', metric=rmse, n_folds=5, random_state=25, verbose=2 ) # Initialize 2nd level model xgb_lev2 = XGBRegressor(learning_rate=0.1, n_estimators=500, max_depth=3, n_jobs=-1, random_state=17 ) # Fit the 2nd level model on the output of level 1
# NOTE(review): fragment begins mid-list — `models = [` and any earlier
# estimators are outside this view.
RandomForestClassifier(n_estimators=75, random_state=0, bootstrap=True,
                       oob_score=True, warm_start=True),
# NOTE(review): `presort` was removed from DecisionTreeClassifier in
# scikit-learn 0.24 — confirm the pinned version.
DecisionTreeClassifier(random_state=0, presort=True),
XGBClassifier(random_state=0)
]

# Level-1 stacking: bagged test predictions, class labels (not probabilities).
S_train, S_test = stacking(models, X_train, y_train, X_test, regression=False,
                           mode='oof_pred_bag', needs_proba=False,
                           save_dir=None, metric=accuracy_score, n_folds=4,
                           stratified=True, shuffle=True, random_state=0,
                           verbose=1)

# 2nd-level model on the stacked features.
model = GradientBoostingClassifier(random_state=0, warm_start=True,
                                   loss="deviance", n_estimators=400)
model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)

# NOTE(review): `metrics = []` shadows any module named `metrics` imported
# earlier — safe only if nothing after this fragment uses that module name.
metrics = []
metrics.append(['f1score', f1_score(y_pred, y_test)])
models = [ ExtraTreesRegressor(random_state=0, n_jobs=-1, n_estimators=300, max_depth=3), RandomForestRegressor(random_state=0, n_jobs=-1, n_estimators=300, max_depth=3), XGBRegressor(seed=0, learning_rate=0.05, n_estimators=300, max_depth=3), LGBMRegressor(num_leaves=8, learning_rate=0.05, n_estimators=300) ] # Compute stacking features S_train, S_test = stacking(models, X_train, y_train, X_test, regression=True, metric=mean_squared_error, n_folds=5, shuffle=True, random_state=0, verbose=2) # Fit 2-nd level model model = LGBMRegressor(num_leaves=8, learning_rate=0.05, n_estimators=300) model = model.fit(S_train, y_train) y_pred = model.predict(S_test) id_test = ts_user['uid'] stacking_sub = pd.DataFrame({'uid': id_test, 'stacking_loan_sum': y_pred}) print(stacking_sub.describe()) stacking_sub.loc[stacking_sub["stacking_loan_sum"] < 0, "stacking_loan_sum"] = 0 print('saving submission...') now_time = time.strftime("%m-%d %H_%M_%S", time.localtime()) stacking_sub[["uid", "stacking_loan_sum"]].to_csv("./submission/" + now_time + '_stacking.csv', index=False, header=False)
# NOTE(review): fragment begins mid-call — the models list opening and at
# least one earlier CatBoostClassifier(...) are outside this view; it also
# ends at a dangling comment (the level-2 fit is truncated).
calc_feature_importance=True),
CatBoostClassifier(depth=9, iterations=30, learning_rate=0.2,
                   eval_metric='AUC', verbose=True, random_seed=1,
                   calc_feature_importance=True)
]

# Level-1 stacking on raw numpy values of the selected feature columns.
S_train, S_test = stacking(models, trainX.values, trainY.values,
                           test[cols_to_use].values, regression=False,
                           metric=accuracy_score, n_folds=4, stratified=True,
                           shuffle=True, random_state=0, verbose=2)

# 2nd-level CatBoost model.
# NOTE(review): `calc_feature_importance` is a legacy CatBoost constructor
# argument — confirm the pinned catboost version still accepts it.
catclassifier = CatBoostClassifier(depth=10, iterations=20, learning_rate=0.2,
                                   eval_metric='AUC', verbose=True,
                                   random_seed=1,
                                   calc_feature_importance=True)
# Fit 2-nd level model