def test_oof_pred_mode_no_get_params(self):
        """An estimator without ``get_params`` must still work in 'oof_pred' mode.

        MinimalEstimator always predicts ones, so both the OOF train part and
        the test part are expected to be all-ones columns.
        """
        S_train_1 = np.ones(X_train.shape[0]).reshape(-1, 1)
        S_test_1 = np.ones(X_test.shape[0]).reshape(-1, 1)

        models = [MinimalEstimator()]
        S_train_2, S_test_2 = stacking(models,
                                       X_train,
                                       y_train,
                                       X_test,
                                       regression=True,
                                       n_folds=n_folds,
                                       shuffle=False,
                                       save_dir=temp_dir,
                                       mode='oof_pred',
                                       random_state=0,
                                       verbose=0)

        # Load OOF from file.
        # Normally if cleaning is performed there is only one .npy file at a
        # given moment, but without cleaning there may be more than one,
        # so we take the latest.
        file_name = sorted(glob.glob(os.path.join(
            temp_dir, '*.npy')))[-1]  # take the latest file
        # S_train and S_test are saved together; with differing row counts the
        # pair is stored as an object array, which NumPy >= 1.16.3 refuses to
        # load unless allow_pickle=True is passed explicitly.
        S = np.load(file_name, allow_pickle=True)
        S_train_3 = S[0]
        S_test_3 = S[1]

        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Exemple #2
0
    def test_oof_mode_xtest_is_none(self):
        """'oof' mode with X_test=None must return OOF train predictions and
        None for the test part, both in memory and in the saved .npy file."""
        model = LinearRegression()
        S_train_1 = cross_val_predict(model, X_train, y=y_train, cv=n_folds,
                                      n_jobs=1, verbose=0,
                                      method='predict').reshape(-1, 1)
        S_test_1 = None

        models = [LinearRegression()]
        S_train_2, S_test_2 = stacking(models, X_train, y_train, None,
                                       regression=True, n_folds=n_folds,
                                       shuffle=False, save_dir=temp_dir,
                                       mode='oof', random_state=0, verbose=0)

        # Load OOF from file.
        # Normally if cleaning is performed there is only one .npy file at a
        # given moment, but without cleaning there may be more, so take the latest.
        file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
        # S_test is None here, so the saved (S_train, S_test) pair is an object
        # array; NumPy >= 1.16.3 requires allow_pickle=True to load it.
        S = np.load(file_name, allow_pickle=True)
        S_train_3 = S[0]
        S_test_3 = S[1]

        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Exemple #3
0
    def test_oof_mode_proba(self):
        """'oof' mode with needs_proba=True must match cross_val_predict
        run with method='predict_proba'."""
        reference = LogisticRegression(random_state=0, solver='liblinear',
                                       multi_class='ovr')
        S_train_1 = cross_val_predict(reference, X_train, y=y_train,
                                      cv=n_folds, n_jobs=1, verbose=0,
                                      method='predict_proba')
        S_test_1 = None

        first_level = [LogisticRegression(random_state=0, solver='liblinear',
                                          multi_class='ovr')]
        S_train_2, S_test_2 = stacking(first_level, X_train, y_train, X_test,
                                       regression=False, n_folds=n_folds,
                                       shuffle=False, stratified=True,
                                       mode='oof', random_state=0, verbose=0,
                                       needs_proba=True, save_dir=temp_dir)

        # Read back the OOF arrays persisted by stacking(); without cleanup
        # several .npy files may exist, so pick the most recent one.
        latest = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
        saved = np.load(latest, allow_pickle=True)
        S_train_3, S_test_3 = saved[0], saved[1]

        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Exemple #4
0
    def test_pred_mode(self):
        """'pred' mode: train part is None, test part equals predictions of a
        model fitted on the whole training set."""
        model = LogisticRegression()
        S_train_1 = None
        _ = model.fit(X_train, y_train)
        S_test_1 = model.predict(X_test).reshape(-1, 1)

        models = [LogisticRegression()]
        S_train_2, S_test_2 = stacking(models,
                                       X_train,
                                       y_train,
                                       X_test,
                                       regression=False,
                                       n_folds=n_folds,
                                       shuffle=False,
                                       save_dir=temp_dir,
                                       mode='pred',
                                       random_state=0,
                                       verbose=0,
                                       stratified=True)

        # Load OOF from file.
        # Normally if cleaning is performed there is only one .npy file at a
        # given moment, but without cleaning there may be more, so take the latest.
        file_name = sorted(glob.glob(os.path.join(
            temp_dir, '*.npy')))[-1]  # take the latest file
        # S_train is None here, so the saved pair is an object array and
        # NumPy >= 1.16.3 needs allow_pickle=True to load it.
        S = np.load(file_name, allow_pickle=True)
        S_train_3 = S[0]
        S_test_3 = S[1]

        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Exemple #5
0
    def test_oof_mode_metric(self):
        """Mean/std of the CV metric written to the log file must match
        cross_val_score computed directly."""
        model = LinearRegression()
        scorer = make_scorer(mean_absolute_error)
        scores = cross_val_score(model, X_train, y=y_train, cv=n_folds,
                                 scoring=scorer, n_jobs=1, verbose=0)
        mean_str_1 = '%.8f' % np.mean(scores)
        std_str_1 = '%.8f' % np.std(scores)

        models = [LinearRegression()]
        S_train, S_test = stacking(models, X_train, y_train, X_test,
                                   regression=True, n_folds=n_folds,
                                   save_dir=temp_dir, mode='oof',
                                   random_state=0, verbose=0)

        # Load mean score and std from file.
        # Normally if cleaning is performed there is only one .log.txt file at
        # a given moment, but without cleaning there may be more, so take the latest.
        file_name = sorted(glob.glob(os.path.join(temp_dir, '*.log.txt')))[-1]
        split = None
        with open(file_name) as f:
            for line in f:
                if 'MEAN' in line:
                    split = line.strip().split()
                    break
        # Fail with a clear message (instead of an unrelated NameError) if the
        # log format ever changes and no MEAN line is present.
        assert split is not None, 'No MEAN line found in %s' % file_name

        # Tokens look like '[0.12345678]' -> strip the surrounding brackets.
        mean_str_2 = split[1][1:-1]
        std_str_2 = split[3][1:-1]

        assert_equal(mean_str_1, mean_str_2)
        assert_equal(std_str_1, std_str_2)
Exemple #6
0
    def test_oof_pred_mode_sample_weight_random(self):
        """'oof_pred' mode must honour per-sample weights, matching
        cross_val_predict with the same fit_params."""
        np.random.seed(0)
        sw = np.random.rand(len(y_train))

        model = LinearRegression()
        S_train_1 = cross_val_predict(model, X_train, y=y_train, cv=n_folds,
                                      n_jobs=1, verbose=0, method='predict',
                                      fit_params={'sample_weight': sw}).reshape(-1, 1)
        _ = model.fit(X_train, y_train, sample_weight=sw)
        S_test_1 = model.predict(X_test).reshape(-1, 1)

        models = [LinearRegression()]
        S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test,
                                       regression=True, n_folds=n_folds,
                                       shuffle=False, save_dir=temp_dir,
                                       mode='oof_pred', random_state=0,
                                       verbose=0, sample_weight=sw)

        # Load OOF from file.
        # Normally if cleaning is performed there is only one .npy file at a
        # given moment, but without cleaning there may be more, so take the latest.
        file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
        # The saved (S_train, S_test) pair has differing row counts and is
        # stored as an object array; NumPy >= 1.16.3 needs allow_pickle=True.
        S = np.load(file_name, allow_pickle=True)
        S_train_3 = S[0]
        S_test_3 = S[1]

        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Exemple #7
0
    def test_pred_bag_mode(self):
        """'pred_bag' mode: test predictions are the mean over per-fold
        models; the train part is None."""
        S_test_temp = np.zeros((X_test.shape[0], n_folds))
        # NOTE: random_state is omitted because shuffle=False. Passing a
        # random_state together with shuffle=False raises ValueError in
        # scikit-learn >= 0.24 (it had no effect before that anyway).
        kf = KFold(n_splits=n_folds, shuffle=False)
        for fold_counter, (tr_index, te_index) in enumerate(kf.split(X_train, y_train)):
            # Split data and target
            X_tr = X_train[tr_index]
            y_tr = y_train[tr_index]
            X_te = X_train[te_index]
            y_te = y_train[te_index]
            model = LinearRegression()
            _ = model.fit(X_tr, y_tr)
            S_test_temp[:, fold_counter] = model.predict(X_test)
        S_test_1 = np.mean(S_test_temp, axis=1).reshape(-1, 1)

        S_train_1 = None

        models = [LinearRegression()]
        S_train_2, S_test_2 = stacking(models, X_train, y_train, X_test,
                                       regression=True, n_folds=n_folds,
                                       shuffle=False, save_dir=temp_dir,
                                       mode='pred_bag', random_state=0, verbose=0)

        # Load OOF from file (take the latest .npy if no cleanup happened).
        file_name = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
        # S_train is None here, so the saved pair is an object array and
        # NumPy >= 1.16.3 needs allow_pickle=True to load it.
        S = np.load(file_name, allow_pickle=True)
        S_train_3 = S[0]
        S_test_3 = S[1]

        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Exemple #8
0
    def Level_1(self, append_model):
        """Fit the first-level model zoo and build stacked features.

        Populates ``self.models`` with a fixed set of classifiers, optionally
        appending ``append_model``, then runs out-of-fold stacking to fill
        ``self.S_train`` and ``self.S_test``.

        Parameters
        ----------
        append_model : estimator or None
            Extra first-level model to include; ignored when None.
        """
        self.models = [
            LogisticRegression(random_state=0),
            LinearDiscriminantAnalysis(),
            KNeighborsClassifier(),
            GaussianNB(),
            DecisionTreeClassifier(random_state=0),
            BaggingClassifier(DecisionTreeClassifier(random_state=0), bootstrap=True, oob_score=True, n_jobs=-1, random_state=0),
            RandomForestClassifier(n_jobs=-1, random_state=0),
            ExtraTreesClassifier(n_jobs=-1, random_state=0),
            AdaBoostClassifier(DecisionTreeClassifier(random_state=0), random_state=0),
            GradientBoostingClassifier(random_state=0),
            MLPClassifier(random_state=0)
        ]
        # Identity check: '== None' is unreliable for estimators that override
        # __eq__ and is non-idiomatic Python.
        if append_model is not None:
            self.models.append(append_model)

        self.S_train, self.S_test = stacking(self.models,
                                             np.array(self.X_train), np.array(self.y_train), np.array(self.X_test),
                                             regression=False,
                                             mode='oof_pred',
                                             needs_proba=True,
                                             save_dir='.',
                                             metric=log_loss,
                                             n_folds=5,
                                             stratified=True,
                                             shuffle=True,
                                             random_state=0,
                                             verbose=2
                                             )
        return None
    def test_pred_bag_mode_proba(self):
        """'pred_bag' mode with needs_proba=True: test part is the per-class
        mean of per-fold predicted probabilities; train part is None."""
        S_test_1 = np.zeros((X_test.shape[0], n_classes))
        S_test_temp = np.zeros((X_test.shape[0], n_folds * n_classes))
        # Using StratifiedKFold because by default cross_val_predict uses
        # StratifiedKFold for classification.
        # NOTE: random_state is omitted because shuffle=False; passing it with
        # shuffle=False raises ValueError in scikit-learn >= 0.24.
        kf = StratifiedKFold(n_splits=n_folds, shuffle=False)
        for fold_counter, (tr_index,
                           te_index) in enumerate(kf.split(X_train, y_train)):
            # Split data and target
            X_tr = X_train[tr_index]
            y_tr = y_train[tr_index]
            X_te = X_train[te_index]
            y_te = y_train[te_index]
            model = LogisticRegression(random_state=0,
                                       solver='liblinear',
                                       multi_class='ovr')
            _ = model.fit(X_tr, y_tr)
            # Each fold occupies a contiguous band of n_classes columns.
            col_slice_fold = slice(fold_counter * n_classes,
                                   fold_counter * n_classes + n_classes)
            S_test_temp[:, col_slice_fold] = model.predict_proba(X_test)
        for class_id in range(n_classes):
            S_test_1[:, class_id] = np.mean(S_test_temp[:,
                                                        class_id::n_classes],
                                            axis=1)

        S_train_1 = None

        models = [
            LogisticRegression(random_state=0,
                               solver='liblinear',
                               multi_class='ovr')
        ]
        S_train_2, S_test_2 = stacking(models,
                                       X_train,
                                       y_train,
                                       X_test,
                                       regression=False,
                                       n_folds=n_folds,
                                       shuffle=False,
                                       save_dir=temp_dir,
                                       mode='pred_bag',
                                       random_state=0,
                                       verbose=0,
                                       stratified=True,
                                       needs_proba=True)

        # Load OOF from file (take the latest .npy if no cleanup happened).
        file_name = sorted(glob.glob(os.path.join(
            temp_dir, '*.npy')))[-1]  # take the latest file
        # S_train is None here, so the saved pair is an object array and
        # NumPy >= 1.16.3 needs allow_pickle=True to load it.
        S = np.load(file_name, allow_pickle=True)
        S_train_3 = S[0]
        S_test_3 = S[1]

        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
def get_stacking_features(path=None):
    """Build (or load) first-level stacking features for the Elo dataset.

    Parameters
    ----------
    path : str or None
        When None, trains the first-level models and computes stacked
        features from scratch; otherwise loads a previously saved
        ``(stacked_train, stacked_test)`` pair from the given .npy file.

    Returns
    -------
    tuple
        ``(stacked_train, stacked_test, y_train)``.
    """
    print(f"Training for {N_FOLDS} CV folds")
    if path is None:
        # TODO: Some refactoring.
        df = pd.read_csv("elo/data/augmented_train.csv")
        print(df.sample(5))
        # TODO: Find a better way to impute inf and missing values.
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(df.median())
        X_train = df.drop(FEATS_EXCLUDED, axis=1, errors='ignore').values
        y_train = df.loc[:, "target"].values

        test_df = pd.read_csv("elo/data/augmented_test.csv")
        print(test_df.sample(5))
        # TODO: Find a better way to impute inf and missing values.
        test_df = test_df.replace([np.inf, -np.inf], np.nan)
        test_df = test_df.fillna(test_df.median())
        X_test = test_df.drop(FEATS_EXCLUDED, axis=1, errors='ignore').values
        first_level_models = [
            XGBRegressor(**HYPEROPT_XGBOOST_OPTIMAL_HP),
            LGBMRegressor(**HYPEROPT_LIGHTGBM_OPTIMAL_HP),
            LGBMRegressor(**OPTUNA_LIGTHGBM_OPTIMAL_HP),
            XGBRegressor(seed=SEED),
            LGBMRegressor(seed=SEED),
            KNeighborsRegressor(),
            LinearRegression(),
            ExtraTreesRegressor(random_state=SEED),
            GradientBoostingRegressor(random_state=SEED),
            Lasso(random_state=SEED)
        ]
        # This didn't work at all without proper tuning!!!
        # SGDRegressor(random_state=SEED)]

        # TODO: Should I add "shuffling"?

        stacked_train, stacked_test = stacking(first_level_models,
                                               X_train,
                                               y_train,
                                               X_test,
                                               regression=True,
                                               metric=rmse,
                                               n_folds=N_FOLDS,
                                               random_state=SEED,
                                               verbose=2,
                                               save_dir="elo/data/stacking",
                                               shuffle=True)

    else:
        # The train/test pair is stored together; differing row counts make it
        # an object array, which NumPy >= 1.16.3 refuses to load without
        # allow_pickle=True.
        stacked_train, stacked_test = np.load(path, allow_pickle=True)
        df = pd.read_csv("elo/data/augmented_train.csv")
        # TODO: Find a better way to impute inf and missing values.
        df = df.replace([np.inf, -np.inf], np.nan)
        df = df.fillna(df.median())
        y_train = df.loc[:, "target"].values
    # Observe the data
    print(stacked_train[:5])
    print(stacked_test[:5])
    print(y_train[:5])
    return stacked_train, stacked_test, y_train
Exemple #11
0
    def test_oof_pred_mode_proba_2_models(self):
        """'oof_pred' mode with two models and needs_proba=True: OOF/test
        probability columns are the horizontal concatenation of the two
        models' predictions."""
        # Model a
        model = LogisticRegression()
        S_train_1_a = cross_val_predict(model,
                                        X_train,
                                        y=y_train,
                                        cv=n_folds,
                                        n_jobs=1,
                                        verbose=0,
                                        method='predict_proba')
        _ = model.fit(X_train, y_train)
        S_test_1_a = model.predict_proba(X_test)

        # Model b
        model = GaussianNB()
        S_train_1_b = cross_val_predict(model,
                                        X_train,
                                        y=y_train,
                                        cv=n_folds,
                                        n_jobs=1,
                                        verbose=0,
                                        method='predict_proba')
        _ = model.fit(X_train, y_train)
        S_test_1_b = model.predict_proba(X_test)

        S_train_1 = np.c_[S_train_1_a, S_train_1_b]
        S_test_1 = np.c_[S_test_1_a, S_test_1_b]

        models = [LogisticRegression(), GaussianNB()]
        S_train_2, S_test_2 = stacking(models,
                                       X_train,
                                       y_train,
                                       X_test,
                                       regression=False,
                                       n_folds=n_folds,
                                       shuffle=False,
                                       stratified=True,
                                       mode='oof_pred',
                                       random_state=0,
                                       verbose=0,
                                       needs_proba=True,
                                       save_dir=temp_dir)

        # Load OOF from file (take the latest .npy if no cleanup happened).
        file_name = sorted(glob.glob(os.path.join(
            temp_dir, '*.npy')))[-1]  # take the latest file
        # The saved (S_train, S_test) pair has differing row counts and is
        # stored as an object array; NumPy >= 1.16.3 needs allow_pickle=True.
        S = np.load(file_name, allow_pickle=True)
        S_train_3 = S[0]
        S_test_3 = S[1]

        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Exemple #12
0
def model_sel(x_train, y_train, x_test, y_test):
    """Score a zoo of base classifiers, then compare two ensembling
    strategies (hard voting and stacking) on the same data.

    Returns the list of fitted base classifiers, the fitted voting
    ensemble, and the fitted second-level stacking model.
    """
    log_reg = LogisticRegression(random_state=0)
    bernoulli = BernoulliNB()
    linear_svc = SVC(C= 1, kernel= 'linear')
    forest = RandomForestClassifier(n_estimators=800, min_samples_split=5, min_samples_leaf=1, max_features='sqrt',max_depth=100, bootstrap=False)
    xgboost_clf = xgb.XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.3,
                      n_estimators=100, max_depth=4)
    knn = neighbors.KNeighborsClassifier(n_neighbors=9,p=1)
    grad_boost = GradientBoostingClassifier(max_depth=5, min_samples_split=4, min_samples_leaf=1, subsample=1,max_features='sqrt', random_state=10,
            learning_rate = 0.15, n_estimators = 300)
    ada_boost = AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1, n_estimators=200, random_state=1)

    input_dim = x_train.shape[1]
    n_classes = len(np.unique(y_train))
    neural_net = KerasClassifier(Sequential_model(input_dim, n_classes),epochs=8)

    clf = [log_reg, bernoulli, linear_svc, forest, xgboost_clf,
           knn, grad_boost, ada_boost, neural_net]

    # Fit and report each base classifier individually.
    for model in clf:
        print(f"Classifier: {model}")
        model.fit(x_train, y_train)
        score = model.score(x_test, y_test)
        print("score: {}".format(score))
        print(" ")


    print("Ensembles.................................")
    print(" ")

    # Hard-voting ensemble over all base classifiers.
    voting_ensemble = VotingClassifier(estimators=[('lr', log_reg), ('bb', bernoulli), ('svc', linear_svc), ('rf', forest), ('xg', xgboost_clf), ('knn', knn), ('grb', grad_boost), ('ab', ada_boost), ('nn', neural_net)], voting='hard')
    voting_ensemble.fit(x_train,y_train)
    score = voting_ensemble.score(x_test,y_test)
    print("MaxVoting: {}".format(score))
    print(" ")

    # Stacking: build out-of-fold features, then fit a linear SVC on top.
    S_train, S_test = stacking(clf,
                           x_train, y_train, x_test,
                           regression=False,
                           mode='oof_pred_bag',
                           needs_proba=False,
                           save_dir=None,
                           metric=accuracy_score,
                           n_folds=4,
                           stratified=True,
                           shuffle=True,
                           random_state=0,
                           verbose=2)

    stacking_ensemble = SVC(C= 1, kernel= 'linear')
    stacking_ensemble = stacking_ensemble.fit(S_train, y_train)
    stacking_score = stacking_ensemble.score(S_test, y_test)
    print("Stacking: {}".format(stacking_score))

    return clf, voting_ensemble, stacking_ensemble
Exemple #13
0
    def test_oof_pred_bag_mode_shuffle(self):
        """'oof_pred_bag' mode with shuffling: test part is the per-fold
        majority vote; OOF train part matches cross_val_predict with the
        same shuffled CV generator."""
        S_test_temp = np.zeros((X_test.shape[0], n_folds))
        # Using StratifiedKFold because by default cross_val_predict uses
        # StratifiedKFold for classification.
        kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
        for fold_counter, (tr_index,
                           te_index) in enumerate(kf.split(X_train, y_train)):
            # Split data and target
            X_tr = X_train[tr_index]
            y_tr = y_train[tr_index]
            X_te = X_train[te_index]
            y_te = y_train[te_index]
            model = LogisticRegression()
            _ = model.fit(X_tr, y_tr)
            S_test_temp[:, fold_counter] = model.predict(X_test)
        # Majority vote across folds (scipy.stats.mode).
        S_test_1 = st.mode(S_test_temp, axis=1)[0]

        model = LogisticRegression()
        # !!! Important. Here we pass CV-generator not number of folds <cv = kf>
        S_train_1 = cross_val_predict(model,
                                      X_train,
                                      y=y_train,
                                      cv=kf,
                                      n_jobs=1,
                                      verbose=0,
                                      method='predict').reshape(-1, 1)

        models = [LogisticRegression()]
        S_train_2, S_test_2 = stacking(models,
                                       X_train,
                                       y_train,
                                       X_test,
                                       regression=False,
                                       n_folds=n_folds,
                                       shuffle=True,
                                       save_dir=temp_dir,
                                       mode='oof_pred_bag',
                                       random_state=0,
                                       verbose=0,
                                       stratified=True)

        # Load OOF from file (take the latest .npy if no cleanup happened).
        file_name = sorted(glob.glob(os.path.join(
            temp_dir, '*.npy')))[-1]  # take the latest file
        # The saved (S_train, S_test) pair has differing row counts and is
        # stored as an object array; NumPy >= 1.16.3 needs allow_pickle=True.
        S = np.load(file_name, allow_pickle=True)
        S_train_3 = S[0]
        S_test_3 = S[1]

        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Exemple #14
0
    def naiveStacking(self, degree, lmodels):
        """Run plain stacking on polynomial-expanded features.

        Returns ((S_train, S_test), y_train, y_test) where the first element
        is the output of vecstack's ``stacking``.
        """
        print(explain.Stacking)
        poly_train = self.applyPolynomialFeatures(self.X_train, degree)
        poly_test = self.applyPolynomialFeatures(self.X_test, degree)
        X_train, y_train, X_test, y_test = self.pandas2numpy(
            poly_train, self.y_train, poly_test, self.y_test)

        stacked = stacking(
            lmodels, X_train, y_train, X_test,
            regression=self.is_regression, metric=self.metric,
            n_folds=3, shuffle=True, random_state=0, verbose=1)
        return stacked, y_train, y_test
Exemple #15
0
def integrated_models(train_data, test_data):
    """Stack four persisted regressors, fit an XGB meta-model, and write
    submission.csv with SalePrice predictions for ``test_data``."""
    print("train data shape:{}, test data shape:{}".format(
        train_data.shape, test_data.shape))

    y = np.ravel(np.array(train_data[['SalePrice']]))
    X = train_data.drop('SalePrice', axis=1)

    # Load the four pre-trained first-level models from disk.
    random_forest = load('rf_model.joblib')
    lightgbm = load('lgmb_model.joblib')
    g_boost = load('gmb_model.joblib')
    xg_boost = load('xgb_model.joblib')

    # Model stacking over a holdout split.
    models = [g_boost, xg_boost, lightgbm, random_forest]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    S_train, S_test = stacking(models, X_train, y_train, X_test,
                               regression=True, mode='oof_pred_bag',
                               metric=rmse, n_folds=5,
                               random_state=25, verbose=2)

    print("S_train shape:{} \t S_test shape:{}".format(S_train.shape,
                                                       S_test.shape))

    # Initialize the second-level model.
    xgb_lev2 = XGBRegressor(learning_rate=0.1, n_estimators=500,
                            max_depth=3, n_jobs=-1, random_state=17)
    # Fit the 2nd level model on the output of level 1.
    xgb_lev2.fit(S_train, y_train)
    stacked_pred = xgb_lev2.predict(S_test)
    print("RMSE of Stacked Model: {}".format(rmse(y_test, stacked_pred)))

    # Level-1 predictions on the real test set, column-stacked for level 2.
    S_test_L1 = np.column_stack([m.predict(test_data) for m in models])
    print("S_test_L1 shape: {}".format(S_test_L1.shape))
    test_stacked_pred = xgb_lev2.predict(S_test_L1)

    submission = pd.DataFrame()
    submission['Id'] = np.array(test_data.index)
    submission['SalePrice'] = test_stacked_pred
    submission.to_csv("submission.csv", index=False)
Exemple #16
0
def prepare(data):
    """Turn a raw (X_train, X_test, y_train, y_test) split into stacked
    first-level features, keeping the targets alongside."""
    X_train, X_test, y_train, y_test = data
    S_train, S_test = stacking(
        models,
        X_train, y_train, X_test,
        regression=False,
        mode='oof_pred_bag',
        needs_proba=False,
        save_dir=None,
        metric=accuracy_score,
        n_folds=7,
        stratified=True,
        shuffle=True,
        random_state=7,
        verbose=2,
    )
    return (S_train, y_train, S_test, y_test)
Exemple #17
0
    def test_oof_pred_mode_verbose_1(self):
        """'oof_pred' results must be identical regardless of the verbosity
        level and of whether a save_dir is given."""
        model = LinearRegression()
        S_train_1 = cross_val_predict(model, X_train, y=y_train, cv=n_folds,
                                      n_jobs=1, verbose=0,
                                      method='predict').reshape(-1, 1)
        _ = model.fit(X_train, y_train)
        S_test_1 = model.predict(X_test).reshape(-1, 1)

        # Run stacking for every (save_dir, verbose) combination, in the same
        # order as the original hand-unrolled calls: first with temp_dir for
        # verbose 0/1/2, then with the default save_dir for verbose 0/1/2.
        results = []
        for save_dir_kwargs in ({'save_dir': temp_dir}, {}):
            for verbose in (0, 1, 2):
                models = [LinearRegression()]
                results.append(stacking(models, X_train, y_train, X_test,
                                        regression=True, n_folds=n_folds,
                                        shuffle=False, mode='oof_pred',
                                        random_state=0, verbose=verbose,
                                        **save_dir_kwargs))

        for S_train_i, S_test_i in results:
            assert_array_equal(S_train_1, S_train_i)
            assert_array_equal(S_test_1, S_test_i)
Exemple #18
0
def base_reg_stack(x, y, x_test):
    """Build level-1 stacking features on a train split of (x, y) and
    return level-2 predictions for `x_test`.

    Note: the hold-out part of the internal split (and its targets) is
    not used here; the caller-supplied `x_test` is what gets predicted.
    """
    tr_x, ho_x, tr_y, ho_y = train_test_split(x,
                                              y,
                                              test_size=0.2,
                                              random_state=0)

    # Caution! All models and parameter values are just demonstrational
    # and shouldn't be considered as recommended.
    first_level = [
        ExtraTreesRegressor(random_state=0, n_jobs=-1,
                            n_estimators=100, max_depth=3),
        RandomForestRegressor(random_state=0, n_jobs=-1,
                              n_estimators=100, max_depth=3),
        GradientBoostingRegressor(learning_rate=0.1,
                                  n_estimators=100, max_depth=3),
    ]

    # Out-of-fold features for the train split plus features for x_test.
    S_train, S_test = stacking(first_level, tr_x, tr_y, x_test,
                               regression=True, metric=r2_score,
                               n_folds=4, shuffle=True,
                               random_state=0, verbose=2)

    # Second-level regressor fitted on the stacked features.
    second_level = GradientBoostingRegressor(learning_rate=0.1,
                                             n_estimators=100,
                                             max_depth=3)
    print("S_train shape:", S_train.shape)
    second_level = second_level.fit(S_train, tr_y)

    return second_level.predict(S_test)
Exemple #19
0
 def getModel(self, _params, _x, _y, _x_eval):
     """Stack the best models of the first `max_estimator` candidate
     jobs and compute level-1 features for (_x, _y) and _x_eval.

     NOTE(review): the stacked features are computed but neither
     returned nor stored on self -- confirm whether that is intentional.
     """
     base_estimators = []
     for i, candidate in enumerate(self.cantidate_job_list):
         # Stop once the configured number of estimators is reached.
         if i == _params['max_estimator']:
             break
         base_estimators.append(candidate.model.getModel(candidate.best_params))
     train_meta, eval_meta = stacking(
         base_estimators,
         _x,
         _y,
         _x_eval,
         regression=False,
         metric=accuracy_score,
         stratified=_params['stratified'],
         shuffle=_params['shuffle'],
         random_state=0,
         n_jobs=definitions.getNumberOfCore(),
     )
Exemple #20
0
    def Level_1(self, append_model=None):
        """Build the level-1 base classifiers and compute out-of-fold
        stacking features for the data stored on `self`.

        Parameters
        ----------
        append_model : list of estimators, optional
            Extra estimators appended to the default model list.
            Defaults to None (no extras).  The previous default of a
            mutable `[]` was replaced: mutable defaults are shared
            across calls.

        Returns
        -------
        None.  Results are stored in `self.S_train` / `self.S_test`.
        """
        if append_model is None:
            append_model = []
        self.models = [
            LogisticRegression(random_state=0),
            LinearDiscriminantAnalysis(),
            KNeighborsClassifier(),
            GaussianNB(),
            DecisionTreeClassifier(random_state=0),
            BaggingClassifier(DecisionTreeClassifier(random_state=0),
                              bootstrap=True,
                              oob_score=True,
                              n_jobs=-1,
                              random_state=0),
            RandomForestClassifier(n_jobs=-1, random_state=0),
            ExtraTreesClassifier(n_jobs=-1, random_state=0),
            AdaBoostClassifier(DecisionTreeClassifier(random_state=0),
                               random_state=0),
            GradientBoostingClassifier(random_state=0),
            MLPClassifier(random_state=0)
        ]
        # Bug fix: the original appended the whole `append_model` list
        # once per element (`self.models.append(append_model)`); append
        # each estimator instead.
        for m in append_model:
            self.models.append(m)

        self.S_train, self.S_test = stacking(self.models,
                                             np.array(self.X_train),
                                             np.array(self.y_train),
                                             np.array(self.X_test),
                                             regression=False,
                                             mode='oof_pred',
                                             needs_proba=True,
                                             save_dir='.',
                                             metric=log_loss,
                                             n_folds=5,
                                             stratified=True,
                                             shuffle=True,
                                             random_state=0,
                                             verbose=2)
        return None
Exemple #21
0
    def test_N_dim_input(self):
        """`test_oof_pred_bag_mode` scenario run through the
        `LogisticRegressionUnrolled` estimator on 4-D input.
        """
        # Reference test-set predictions: one column per fold, then the
        # per-row mode across folds.
        fold_preds = np.zeros((X_test_4d_unrolled.shape[0], n_folds))
        # StratifiedKFold because cross_val_predict defaults to it.
        splitter = StratifiedKFold(n_splits = n_folds, shuffle = False, random_state = 0)
        for k, (idx_tr, idx_te) in enumerate(splitter.split(X_train_4d_unrolled, y_train_4d)):
            X_tr, y_tr = X_train_4d_unrolled[idx_tr], y_train_4d[idx_tr]
            X_te, y_te = X_train_4d_unrolled[idx_te], y_train_4d[idx_te]
            clf = LogisticRegression(random_state=0, solver='liblinear', multi_class='ovr')
            clf.fit(X_tr, y_tr)
            fold_preds[:, k] = clf.predict(X_test_4d_unrolled)
        S_test_1 = st.mode(fold_preds, axis = 1)[0]

        # Reference out-of-fold predictions for the train set.
        clf = LogisticRegression(random_state=0, solver='liblinear', multi_class='ovr')
        S_train_1 = cross_val_predict(clf, X_train_4d_unrolled, y = y_train_4d, cv = n_folds,
            n_jobs = 1, verbose = 0, method = 'predict').reshape(-1, 1)

        # Run vecstack on the (not yet unrolled) 4-D arrays.
        models = [LogisticRegressionUnrolled(random_state=0, solver='liblinear', multi_class='ovr')]
        S_train_2, S_test_2 = stacking(models, X_train_4d, y_train_4d, X_test_4d,
            regression = False, n_folds = n_folds, shuffle = False, save_dir=temp_dir,
            mode = 'oof_pred_bag', random_state = 0, verbose = 0, stratified = True)

        # Reload what stacking() saved.  Without cleaning there may be
        # several .npy files, so take the newest one.
        latest = sorted(glob.glob(os.path.join(temp_dir, '*.npy')))[-1]
        saved = np.load(latest, allow_pickle=True)
        S_train_3, S_test_3 = saved[0], saved[1]

        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
def main():
    """Pipeline: extract data, engineer features, split, stack the base
    models' out-of-fold predictions, and persist the stacked sets."""
    # Data extraction
    frame = data_extract_e('e_20190609_15.pkl')

    # Transformation / feature engineering
    frame = feature_eng(frame)
    frame = extract_queues(frame)
    dept_encoder, queue_encoder = load_labels('dept_encoder.pkl',
                                              'queue_encoder.pkl',
                                              df=frame)
    frame = feature_transform(frame,
                              dept_encoder=dept_encoder,
                              queue_encoder=queue_encoder)

    # Train/test split (2468 would reuse the individual models' shuffle)
    x, y = data_filter(frame)
    x_tr, x_te, y_tr, y_te = train_test_split(x, y,
                                              test_size=0.2,
                                              random_state=1357)

    # Base models loaded from persistent files
    base_models = load_models()
    print(base_models)

    # Stacking: the new feature sets are the base models' predictions
    x_tr_stack, x_te_stack = stacking(base_models,
                                      x_tr,
                                      y_tr,
                                      x_te,
                                      n_folds=10,
                                      shuffle=True,
                                      verbose=0,
                                      regression=True)

    save_data(x_tr_stack, 'x_train_s.pkl')
    save_data(y_tr, 'y_train.pkl')
    save_data(x_te_stack, 'x_test_s.pkl')
    save_data(y_te, 'y_test.pkl')
Exemple #23
0
        
    XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, n_estimators=100, max_depth=3),
                  
    LGBMClassifier(random_state=0, n_jobs=-1, learning_rate=0.1, n_estimators=100, max_depth=3),
                  
    KerasClassifier(build_fn=build_keras_model_1, epochs=2, batch_size=32, verbose=0)
]

# Level-1 stacking for the classifier list `models_1` defined above.
S_train_1, S_test_1 = stacking(models_1,                   # list of models
                               X_train, y_train, X_test,   # data
                               regression=False,           # classification task (if you need
                                                           #     regression - set to True)
                               mode='oof_pred',            # mode: oof for train set, fit on full
                                                           #     train and predict test set once
                               needs_proba=True,           # predict probabilities (if you need
                                                           #     class labels - set to False)
                               save_dir='.',               # save result and log in current dir
                                                           #     (to disable saving - set to None)
                               metric=log_loss,            # metric: callable
                               n_folds=5,                  # number of folds
                               stratified=True,            # stratified split for folds
                               shuffle=True,               # shuffle the data
                               random_state=0,             # ensure reproducibility
                               verbose=2)                  # print all info

# With needs_proba=True each model contributes n_classes columns.
print('We have %d classes and %d models so in resulting arrays \
we expect to see %d columns.' % (n_classes, len(models_1), n_classes * len(models_1)))
print('S_train_1 shape:', S_train_1.shape)
print('S_test_1 shape: ', S_test_1.shape)

# Our arrays and log were saved in current dir
# (here `glob` is used as a bare name, i.e. `from glob import glob` style).
names = sorted(glob('*.npy'))
Exemple #24
0
    def test_oof_pred_bag_mode_proba_2_models(self):
        """oof_pred_bag + needs_proba with two models: vecstack's output
        must equal a hand-rolled per-model reference.

        Refactor: the original duplicated the reference computation
        verbatim for LogisticRegression and GaussianNB; it is extracted
        into a local helper, called once per model class.
        """
        def _reference_oof_pred_bag_proba(make_model):
            # Test-set probabilities: one n_classes-wide slab per fold,
            # then a per-class mean across folds.
            S_test_ref = np.zeros((X_test.shape[0], n_classes))
            S_test_temp = np.zeros((X_test.shape[0], n_folds * n_classes))
            # Using StratifiedKFold because by default cross_val_predict
            # uses StratifiedKFold.
            kf = StratifiedKFold(n_splits=n_folds, shuffle=False, random_state=0)
            for fold_counter, (tr_index,
                               te_index) in enumerate(kf.split(X_train, y_train)):
                model = make_model()
                _ = model.fit(X_train[tr_index], y_train[tr_index])
                col_slice_fold = slice(fold_counter * n_classes,
                                       fold_counter * n_classes + n_classes)
                S_test_temp[:, col_slice_fold] = model.predict_proba(X_test)
            for class_id in range(n_classes):
                S_test_ref[:, class_id] = np.mean(S_test_temp[:,
                                                              class_id::n_classes],
                                                  axis=1)
            # Train-set out-of-fold probabilities.
            S_train_ref = cross_val_predict(make_model(),
                                            X_train,
                                            y=y_train,
                                            cv=n_folds,
                                            n_jobs=1,
                                            verbose=0,
                                            method='predict_proba')
            return S_train_ref, S_test_ref

        S_train_1_a, S_test_1_a = _reference_oof_pred_bag_proba(LogisticRegression)
        S_train_1_b, S_test_1_b = _reference_oof_pred_bag_proba(GaussianNB)

        # Models are stacked column-wise in the order passed to stacking().
        S_train_1 = np.c_[S_train_1_a, S_train_1_b]
        S_test_1 = np.c_[S_test_1_a, S_test_1_b]

        models = [LogisticRegression(), GaussianNB()]
        S_train_2, S_test_2 = stacking(models,
                                       X_train,
                                       y_train,
                                       X_test,
                                       regression=False,
                                       n_folds=n_folds,
                                       shuffle=False,
                                       save_dir=temp_dir,
                                       mode='oof_pred_bag',
                                       random_state=0,
                                       verbose=0,
                                       stratified=True,
                                       needs_proba=True)

        # Load OOF from file.
        # Normally if cleaning is performed there is only one .npy file at
        # a given moment, but with no cleaning there may be more than one,
        # so we take the latest.
        file_name = sorted(glob.glob(os.path.join(
            temp_dir, '*.npy')))[-1]  # take the latest file
        S = np.load(file_name)
        S_train_3 = S[0]
        S_test_3 = S[1]

        assert_array_equal(S_train_1, S_train_2)
        assert_array_equal(S_test_1, S_test_2)

        assert_array_equal(S_train_1, S_train_3)
        assert_array_equal(S_test_1, S_test_3)
Exemple #25
0
                          meta_classifier=logreg)


# Cross-validate the ensemble classifier (3-fold, f1).
scores = model_selection.cross_val_score(sclf, data1_x_bin, data[Target],
                                         cv=3, scoring='f1')
# Bug fix: the original commented out the '%' argument line, leaving the
# print( call unclosed -- a SyntaxError.  NOTE(review): the label says
# "Accuracy" but scoring='f1'; label kept as-is pending confirmation.
print("Accuracy: %0.2f (+/- %0.2f)"
      % (scores.mean(), scores.std()))

"""#### VecStack"""

# 1st level: split the data and stack the base models' OOF predictions.
X_train, X_test, y_train, y_test = train_test_split(data1_x_bin, data[Target], test_size=0.2)

models = [lgbm_cl, rf_cl, gdb_cl]
S_train, S_test = stacking(models, X_train, y_train, X_test,
    regression=False, metric=metrics.f1_score, n_folds=4,
    shuffle=True, random_state=0, verbose=2)

# 2nd level model fitted on the stacked features.
model = XGBClassifier(random_state=0, n_jobs=-1, learning_rate=0.1,
                      n_estimators=100, max_depth=3)

# Fit 2nd level model
model = model.fit(S_train, y_train)

# Predict
y_pred = model.predict(S_test)

# Final prediction score
print('Final prediction score: [%.8f]' % metrics.f1_score(y_test, y_pred))
    KNeighborsClassifier(algorithm='auto',
                         leaf_size=30,
                         metric='minkowski',
                         metric_params=None,
                         n_jobs=1,
                         n_neighbors=17,
                         p=2,
                         weights='distance')
]

# Build stack level 1: the base models' out-of-fold predictions become
# the feature matrix for the level-2 model (task type and metric come
# from the `regression` / `metric` variables defined above).
S_train, S_test = stacking(lmodels,
                           x_train,
                           y_train,
                           x_test,
                           regression=regression,
                           metric=metric,
                           n_folds=3,
                           shuffle=True,
                           random_state=0,
                           verbose=1)

# build model lvel 2
model = DecisionTreeClassifier(class_weight=None,
                               criterion='entropy',
                               max_depth=10,
                               max_features=None,
                               max_leaf_nodes=None,
                               min_impurity_split=1e-07,
                               min_samples_leaf=2,
                               min_samples_split=5,
                               min_weight_fraction_leaf=0.0,
Exemple #27
0
    df = feature_eng(df)
    df = extract_queues(df)
    dept_encoder, queue_encoder, user_encoder = load_labels()
    df = feature_transform(df, dept_encoder=dept_encoder, queue_encoder=queue_encoder, user_encoder=user_encoder)
    df = df[:100000] # Take x number of rows; downscaling dataset due to time constraints

    # Training/Test Split
    x, y = data_filter(df)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1357) # 2468 to use same shuffle as individual models

    # Load models from persistent files
    models = load_models()

    # Stacking
    # Produces a new set of features based on the predictions of base models
    x_train_s, x_test_s = stacking(models, x_train, y_train, x_test, 
                                n_folds=10, shuffle=True, verbose=0, regression=True)

    # Stacked Second-Layer Model
    xgb_l2 = XGBRegressor(objective='reg:linear')
    xgb_l2 = xgb_l2.fit(x_train_s, y_train)
    print('Stacking XGBRegressor L2 R2 Training score: ', xgb_l2.score(x_train_s, y_train))

    y_pred = xgb_l2.predict(x_train_s)
    print('Stacking XGBRegressor L2 Training MSE: ', metrics.mean_squared_error(y_pred, y_train))

    y_pred = xgb_l2.predict(x_test_s)
    print('Stacking XGBRegressor L2 R2 Test score: ', xgb_l2.score(x_test_s, y_test))
    print('Stacking XGBRegressor L2 Test MSE: ', metrics.mean_squared_error(y_pred, y_test))

    # Bayesian Optimization for Hyperparameter Tuning
    res = bayes_opt(objective_func, param_grid, 'xgb_l2_bo_res.z')
Exemple #28
0
    # split into input and output elements
    y = dataframe.SARSCov  # Target variable
    X = dataframe.drop(['SARSCov'], axis=1)  # Features

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    sm = SMOTE(k_neighbors=11)
    x_res, y_res = sm.fit_sample(X_train, y_train)

    S_train, S_test = stacking(models,
                               x_res,
                               y_res,
                               X_test,
                               regression=False,
                               mode='oof_pred',
                               needs_proba=False,
                               save_dir=None,
                               metric=metrics.accuracy_score,
                               n_folds=10,
                               stratified=True,
                               shuffle=True,
                               verbose=2)

    model = XGBClassifier(learning_rate=1.0, n_estimators=300, max_depth=17)

    # fit model
    model = model.fit(S_train, y_res)

    # evaluate model
    y_pred = model.predict(S_test)
    score = metrics.accuracy_score(y_test, y_pred)
# Standalone LightGBM predictions on the raw test set
# (NOTE(review): presumably kept for comparison with the stack -- confirm).
lgb_pred = lightgbm.predict(test)





# List of the models to be stacked
models = [g_boost, xg_boost, lightgbm, random_forest]
# Perform Stacking: bagged out-of-fold mode, rmse as the reported metric
S_train, S_test = stacking(models,
                           X_train, y_train, X_test,
                           regression=True,
                           mode='oof_pred_bag',
                           metric=rmse,
                           n_folds=5,
                           random_state=25,
                           verbose=2
                          )


# Initialize 2nd level model (fitted on S_train below)
xgb_lev2 = XGBRegressor(learning_rate=0.1, 
                        n_estimators=500,
                        max_depth=3,
                        n_jobs=-1,
                        random_state=17
                       )

# Fit the 2nd level model on the output of level 1
Exemple #30
0
    RandomForestClassifier(n_estimators=75,
                           random_state=0,
                           bootstrap=True,
                           oob_score=True,
                           warm_start=True),
    DecisionTreeClassifier(random_state=0, presort=True),
    XGBClassifier(random_state=0)
]

# Level-1 stacking features for the classifier list defined above.
S_train, S_test = stacking(models,
                           X_train,
                           y_train,
                           X_test,
                           regression=False,
                           mode='oof_pred_bag',
                           needs_proba=False,
                           save_dir=None,
                           metric=accuracy_score,
                           n_folds=4,
                           stratified=True,
                           shuffle=True,
                           random_state=0,
                           verbose=1)

# Level-2 classifier fitted on the stacked features.
model = GradientBoostingClassifier(random_state=0,
                                   warm_start=True,
                                   loss="deviance",
                                   n_estimators=400)
model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)
# NOTE(review): this rebinds the name `metrics` to a plain list; if a
# `metrics` module was imported above, it is shadowed from here on.
metrics = []
metrics.append(['f1score', f1_score(y_pred, y_test)])
Exemple #31
0
# Level-1 regressors: tree ensembles plus gradient-boosting variants.
models = [
    ExtraTreesRegressor(random_state=0, n_jobs=-1,
                        n_estimators=300, max_depth=3),

    RandomForestRegressor(random_state=0, n_jobs=-1,
                          n_estimators=300, max_depth=3),

    XGBRegressor(seed=0, learning_rate=0.05,
                 n_estimators=300, max_depth=3),

    LGBMRegressor(num_leaves=8, learning_rate=0.05, n_estimators=300)
]

# Compute stacking features

S_train, S_test = stacking(models, X_train, y_train, X_test, regression=True, metric=mean_squared_error, n_folds=5,
                           shuffle=True, random_state=0, verbose=2)

# Fit 2-nd level model
model = LGBMRegressor(num_leaves=8, learning_rate=0.05, n_estimators=300)
model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)

# Build the submission; negative loan-sum predictions are clipped to 0.
id_test = ts_user['uid']
stacking_sub = pd.DataFrame({'uid': id_test, 'stacking_loan_sum': y_pred})
print(stacking_sub.describe())
stacking_sub.loc[stacking_sub["stacking_loan_sum"] < 0, "stacking_loan_sum"] = 0
print('saving submission...')
# Timestamped filename so repeated runs don't overwrite each other.
now_time = time.strftime("%m-%d %H_%M_%S", time.localtime())
stacking_sub[["uid", "stacking_loan_sum"]].to_csv("./submission/" + now_time + '_stacking.csv', index=False,
                                                  header=False)
Exemple #32
0
                       calc_feature_importance=True),
    CatBoostClassifier(depth=9,
                       iterations=30,
                       learning_rate=0.2,
                       eval_metric='AUC',
                       verbose=True,
                       random_seed=1,
                       calc_feature_importance=True)
]

# Level-1 stacking features from the CatBoost model list defined above.
S_train, S_test = stacking(models,
                           trainX.values,
                           trainY.values,
                           test[cols_to_use].values,
                           regression=False,
                           metric=accuracy_score,
                           n_folds=4,
                           stratified=True,
                           shuffle=True,
                           random_state=0,
                           verbose=2)

# Level-2 CatBoost classifier.
catclassifier = CatBoostClassifier(depth=10,
                                   iterations=20,
                                   learning_rate=0.2,
                                   eval_metric='AUC',
                                   verbose=True,
                                   random_seed=1,
                                   calc_feature_importance=True)

# Fit 2-nd level model