Example #1
from sklearn.ensemble import (AdaBoostRegressor as ABR,
                              ExtraTreesRegressor as ETR,
                              GradientBoostingRegressor as GBR,
                              RandomForestRegressor as RFR)


def __ensemble_test(model_type, X_train, X_test, y_train, y_test):
    # Fit the requested ensemble regressor; return the fitted model, its
    # R^2 score on the test split, and its feature importances.
    if model_type.lower() == 'gbr':
        reg = GBR(n_estimators=100, random_state=1)
    elif model_type.lower() == 'rfr':
        reg = RFR(n_estimators=100, random_state=1)
    elif model_type.lower() == 'abr':
        reg = ABR(n_estimators=100, random_state=1)
    elif model_type.lower() == 'etr':
        reg = ETR(n_estimators=100, random_state=1)
    else:
        raise ValueError('unknown model type: {}'.format(model_type))
    reg.fit(X_train, y_train)
    return reg, reg.score(X_test, y_test), reg.feature_importances_
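
A quick usage sketch (hypothetical: it assumes the function is called from inside its own module, and uses scikit-learn's make_regression for toy data):

from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=200, n_features=8, random_state=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

for name in ('gbr', 'rfr', 'abr', 'etr'):
    reg, score, importances = __ensemble_test(name, X_train, X_test,
                                              y_train, y_test)
    print('{}: R^2 = {:.3f}'.format(name, score))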
Example #2

def train(self,
          zone,
          num,
          hidden_layer_size=(4,),  # a one-element tuple, not a bare int
          n_jobs=1,
          kernel='rbf',
          n_components=15,
          n_estimators=50,
          loss='linear',
          learning_rate=1.0,
          host='127.0.0.1'):
    # Assumes: fd is a data-access helper, mlpr is sklearn's MLPRegressor,
    # and GBR/ABR are sklearn's GradientBoostingRegressor/AdaBoostRegressor.
    f = fd(host)
    input_set = f.getTrainData(zone)
    x_train, x_test, y_train, y_test, scaler, pca = self.read_dataset(
        input_set, n_components)
    if num == 1:
        # Linear Regression
        clf = LinearRegression(n_jobs=n_jobs)
        clf.fit(x_train, y_train)
        # storeObj(clf, zone, clf.score(x_test, y_test), 'Linear Regression')
        return clf, clf.score(x_test,
                              y_test), 'Linear Regression', scaler, pca
    elif num == 2:
        # Support Vector Regression with the chosen kernel
        clf = svm.SVR(kernel=kernel)
        clf.fit(x_train, y_train)
        # storeObj(clf, zone, clf.score(x_test, y_test), 'SVR,' + kernel)
        return clf, clf.score(x_test, y_test), 'SVR ' + kernel, scaler, pca
    elif num == 3:
        # Neural net; sklearn's keyword is hidden_layer_sizes (plural)
        clf = mlpr(hidden_layer_sizes=hidden_layer_size)
        clf.fit(x_train, y_train)
        arch = ''  # avoid shadowing the built-in `str`
        for i in hidden_layer_size:
            arch += '-> {}'.format(i)
        # storeObj(clf, zone, clf.score(x_test, y_test), 'NeuralNet hidden layer size' + arch)
        return clf, clf.score(
            x_test, y_test), 'NeuralNet hidden_size' + arch, scaler, pca
    elif num == 4:
        # Gradient Boosting Regressor
        clf = GBR(loss=loss,
                  n_estimators=n_estimators,
                  learning_rate=learning_rate)
        clf.fit(x_train, y_train)
        # storeObj(clf, zone, clf.score(x_test, y_test), 'Gradient Boosting Regressor')
        return clf, clf.score(
            x_test, y_test), 'Gradient Boosting Regressor', scaler, pca
    elif num == 5:
        # AdaBoost Regressor
        clf = ABR()
        clf.fit(x_train, y_train)
        # storeObj(clf, zone, clf.score(x_test, y_test), 'AdaBoost Regressor')
        return clf, clf.score(x_test,
                              y_test), 'AdaBoost Regressor', scaler, pca
    raise ValueError('num must be 1-5, got {}'.format(num))
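
A hypothetical call sketch; `trainer` stands in for an instance of the class this method belongs to, and the keyword names come from the signature above:

clf, score, name, scaler, pca = trainer.train(zone='sj',
                                              num=4,
                                              n_estimators=100,
                                              learning_rate=0.1)
print('{}: R^2 = {:.3f}'.format(name, score))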
Example #3

# Assumes a module-level `from sklearn.ensemble import AdaBoostRegressor as ABR`.
def __init__(self,
             base_estimator=None,
             n_estimators=50,
             learning_rate=1.0,
             # fit() reads self.max_depth; 3 matches the depth of
             # AdaBoostRegressor's default base tree
             max_depth=3,
             random_state=None):
    self.base_estimator = base_estimator
    self.learning_rate = learning_rate
    self.random_state = random_state
    self.n_estimators = n_estimators
    self.max_depth = max_depth
    self.model = ABR(n_estimators=self.n_estimators,
                     learning_rate=self.learning_rate,
                     base_estimator=self.base_estimator,
                     random_state=self.random_state)

def fit(self, X, Y, sample_weight=None):
    from sklearn.ensemble import AdaBoostRegressor as ABR
    from sklearn.tree import DecisionTreeRegressor
    # Hyperparameters may arrive as strings (e.g. from a config), so coerce
    self.n_estimators = int(self.n_estimators)
    self.learning_rate = float(self.learning_rate)
    self.max_depth = int(self.max_depth)
    base_estimator = DecisionTreeRegressor(max_depth=self.max_depth)

    estimator = ABR(base_estimator=base_estimator,
                    n_estimators=self.n_estimators,
                    learning_rate=self.learning_rate,
                    random_state=self.random_state)

    estimator.fit(X, Y, sample_weight=sample_weight)

    self.estimator = estimator
    return self
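
A minimal sketch exercising the wrapper, assuming the __init__/fit pair above sits in a single class, called `AdaBoostWrapper` here purely for illustration:

import numpy as np

rng = np.random.RandomState(0)
X = rng.rand(200, 5)
Y = X @ rng.rand(5) + 0.1 * rng.randn(200)

model = AdaBoostWrapper(n_estimators=100, learning_rate=0.5, max_depth=2)
model.fit(X, Y)                     # builds and fits the inner AdaBoostRegressor
preds = model.estimator.predict(X)  # the fitted estimator is stored on self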
Example #4

def main():

    ### Parsing and data pre-processing
    # load the provided data
    train_features_path = os.path.join(data_path, 'dengue_features_train.csv')
    train_labels_path = os.path.join(data_path, 'dengue_labels_train.csv')

    ### pre-processing data
    sj_train, iq_train = preprocess_data(train_features_path,
                                         labels_path=train_labels_path)
    #print(sj_train.describe())
    #print(iq_train.describe())

    ###Define the xgb parameters
    # Note: newer XGBoost versions rename 'reg:linear' to 'reg:squarederror'
    # and replace 'silent' with 'verbosity'; the names below target the
    # older API this script was written against.
    xgb_params = {
        'eta': 0.05,
        'max_depth': 5,
        'subsample': 0.7,
        'colsample_bytree': 0.7,
        'objective': 'reg:linear',
        'eval_metric': 'rmse',
        'silent': 1
    }
    num_boost_rounds = 1000
    ##Use K-fold to create cross validation data
    kf = KFold(n_splits=6)
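    # For reference, kf.split yields positional train/validation index arrays
    # (which is why .iloc indexing is used in the loops below), e.g.:
    #   for tr_idx, va_idx in KFold(n_splits=3).split(np.arange(6)):
    #       print(tr_idx, va_idx)   # -> [2 3 4 5] [0 1], [0 1 4 5] [2 3], ...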

    ##Do the stacking: add six columns ('negbi', 'gb', 'xgb', 'abr', 'etr', 'br') that hold the out-of-fold training predictions
    sj_train = sj_train.assign(negbi=0)
    sj_train = sj_train.assign(gb=0)
    sj_train = sj_train.assign(xgb=0)
    sj_train = sj_train.assign(abr=0)
    sj_train = sj_train.assign(etr=0)
    sj_train = sj_train.assign(br=0)

    loop = 1
    # kf.split yields positional train/validation index arrays
    for train_index, val_index in kf.split(sj_train):
        X_train, X_val = sj_train.iloc[train_index], sj_train.iloc[val_index]
        ###(1)neg_binomial method
        sj_neg_model = get_best_model(X_train, X_val, 'sj')
        predictions_neg = sj_neg_model.predict(X_val).astype(int)
        #Shift the predictions forward by four steps manually
        for i in range(predictions_neg.shape[0] - 1, 3, -1):
            predictions_neg.iloc[i] = predictions_neg.iloc[i - 4]

        ###(2)gradient boosting method
        sj_gb_model = gradient_boosting(
            X_train.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1),
            X_val.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1))
        predictions_gb = sj_gb_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1)).astype(int)

        ###(3)xgboost method
        dtrain = xgb.DMatrix(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        dval = xgb.DMatrix(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))
        sj_xgb_model = xgb.train(dict(xgb_params, silent=0),
                                 dtrain,
                                 num_boost_round=num_boost_rounds)
        predictions_xgb = sj_xgb_model.predict(dval).astype(int)

        ###(4)Adaboost regressor method
        sj_abr_model = ABR(n_estimators=800,
                           learning_rate=0.08,
                           loss='linear',
                           random_state=0)
        sj_abr_model.fit(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        predictions_abr = sj_abr_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))

        ###(5)Extra tree regressor method
        sj_etr_model = ETR(n_estimators=800,
                           max_depth=4,
                           random_state=0,
                           verbose=1)
        sj_etr_model.fit(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        predictions_etr = sj_etr_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))

        ###(6) Bagging Regressor method
        sj_br_model = BR(n_estimators=800,
                         oob_score=False,
                         n_jobs=5,
                         random_state=0,
                         verbose=1)
        sj_br_model.fit(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        predictions_br = sj_br_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))

        ###Store the result in sj_train: predictions_neg -> 'negbi', predictions_gb -> 'gb', etc.
        print("Adding the result of the predictions to sj training data ({}/{})".
              format(loop, 6))
        col = sj_train.columns.get_loc
        for idx, index in enumerate(val_index):
            sj_train.iloc[index, col('negbi')] = predictions_neg.iloc[idx]
            sj_train.iloc[index, col('gb')] = predictions_gb[idx]
            sj_train.iloc[index, col('xgb')] = predictions_xgb[idx]
            sj_train.iloc[index, col('abr')] = predictions_abr[idx]
            sj_train.iloc[index, col('etr')] = predictions_etr[idx]
            sj_train.iloc[index, col('br')] = predictions_br[idx]
        loop += 1

    iq_train = iq_train.assign(negbi=0)
    iq_train = iq_train.assign(gb=0)
    iq_train = iq_train.assign(xgb=0)
    iq_train = iq_train.assign(abr=0)
    iq_train = iq_train.assign(etr=0)
    iq_train = iq_train.assign(br=0)

    loop = 1
    for train_index, val_index in kf.split(iq_train):
        X_train, X_val = iq_train.iloc[train_index], iq_train.iloc[val_index]

        ###(1)neg_binomial method
        iq_neg_model = get_best_model(X_train, X_val, 'iq')
        predictions_neg = iq_neg_model.predict(X_val).astype(int)
        #Shift the predictions forward by one step manually
        for i in range(predictions_neg.shape[0] - 1, 0, -1):
            predictions_neg.iloc[i] = predictions_neg.iloc[i - 1]

        ###(2)gradient boosting method
        iq_gb_model = gradient_boosting(
            X_train.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1),
            X_val.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1))
        predictions_gb = iq_gb_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1)).astype(int)

        ###(3)xgb method
        dtrain = xgb.DMatrix(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        dval = xgb.DMatrix(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))
        iq_xgb_model = xgb.train(dict(xgb_params, silent=0),
                                 dtrain,
                                 num_boost_round=num_boost_rounds)
        predictions_xgb = iq_xgb_model.predict(dval).astype(int)

        ###(4)Adaboost regressor method
        iq_abr_model = ABR(n_estimators=800,
                           learning_rate=0.08,
                           loss='linear',
                           random_state=0)
        iq_abr_model.fit(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        predictions_abr = iq_abr_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))

        ###(5)Extra tree regressor method
        iq_etr_model = ETR(n_estimators=800,
                           max_depth=4,
                           random_state=0,
                           verbose=1)
        iq_etr_model.fit(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        predictions_etr = iq_etr_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))

        ###(6) Bagging Regressor method
        iq_br_model = BR(n_estimators=800,
                         oob_score=False,
                         n_jobs=5,
                         random_state=0,
                         verbose=1)
        iq_br_model.fit(
            X_train.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1), X_train['total_cases'])
        predictions_br = iq_br_model.predict(
            X_val.drop(
                ['total_cases', 'negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                axis=1))

        ###Store the result in iq_train: predictions_neg -> 'negbi', predictions_gb -> 'gb', etc.
        print("Adding the result of the predictions to iq training data ({}/{})".
              format(loop, 6))
        col = iq_train.columns.get_loc
        for idx, index in enumerate(val_index):
            iq_train.iloc[index, col('negbi')] = predictions_neg.iloc[idx]
            iq_train.iloc[index, col('gb')] = predictions_gb[idx]
            iq_train.iloc[index, col('xgb')] = predictions_xgb[idx]
            iq_train.iloc[index, col('abr')] = predictions_abr[idx]
            iq_train.iloc[index, col('etr')] = predictions_etr[idx]
            iq_train.iloc[index, col('br')] = predictions_br[idx]
        loop += 1

    ###Now the training data looks like [features, total_cases, negbi, gb, xgb, abr, etr, br]

    ##Accessing testing data
    test_features_path = os.path.join(data_path, 'dengue_features_test.csv')
    sj_test, iq_test = preprocess_data(test_features_path)
    ##Like training, add the six stacking columns ('negbi', 'gb', 'xgb', 'abr', 'etr', 'br') to the testing dataframe
    sj_test = sj_test.assign(negbi=0)
    sj_test = sj_test.assign(gb=0)
    sj_test = sj_test.assign(xgb=0)
    sj_test = sj_test.assign(abr=0)
    sj_test = sj_test.assign(etr=0)
    sj_test = sj_test.assign(br=0)

    ##(1)neg_binomial prediction
    sj_predictions_neg = sj_neg_model.predict(sj_test).astype(int)
    for i in range(sj_predictions_neg.shape[0] - 1, 3, -1):
        sj_predictions_neg.iloc[i] = sj_predictions_neg.iloc[i - 4]
    ##(2)gradient boosting prediction
    sj_predictions_gb = sj_gb_model.predict(
        sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)
    ##(3)xgb prediction
    dtest = xgb.DMatrix(
        sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1))
    sj_predictions_xgb = sj_xgb_model.predict(dtest).astype(int)
    ###(4)Adaboost regressor method (was mistakenly calling sj_br_model)
    sj_predictions_abr = sj_abr_model.predict(
        sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)
    ###(5)extra tree regressor method
    sj_predictions_etr = sj_etr_model.predict(
        sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)
    ###(6)bagging regressor method
    sj_predictions_br = sj_br_model.predict(
        sj_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)

    print("Adding predictions as features to sj testing data...")
    for i in range(len(sj_test['negbi'])
                   ):  #Add the prediction to the corresponding column
        sj_test['negbi'].ix[i] = sj_predictions_neg.ix[i]
        sj_test['gb'].ix[i] = sj_predictions_gb[i]
        sj_test['xgb'].ix[i] = sj_predictions_xgb[i]
        sj_test['abr'].ix[i] = sj_predictions_abr[i]
        sj_test['etr'].ix[i] = sj_predictions_etr[i]
        sj_test['br'].ix[i] = sj_predictions_br[i]

    ##Same process as city sj
    iq_test = iq_test.assign(negbi=0)
    iq_test = iq_test.assign(gb=0)
    iq_test = iq_test.assign(xgb=0)
    iq_test = iq_test.assign(abr=0)
    iq_test = iq_test.assign(etr=0)
    iq_test = iq_test.assign(br=0)

    ###(1)neg_binomial prediction
    iq_predictions_neg = iq_neg_model.predict(iq_test).astype(int)
    for i in range(iq_predictions_neg.shape[0] - 1, 0, -1):
        iq_predictions_neg.iloc[i] = iq_predictions_neg.iloc[i - 1]
    ##(2)gradient boosting prediction
    iq_predictions_gb = iq_gb_model.predict(
        iq_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)
    ##(3)xgb prediction
    dtest = xgb.DMatrix(
        iq_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'], axis=1))
    iq_predictions_xgb = iq_xgb_model.predict(dtest).astype(int)
    ###(4)Adaboost regressor method (predict on iq_test, not sj_test)
    iq_predictions_abr = iq_abr_model.predict(
        iq_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)
    ###(5)extra tree regressor method
    iq_predictions_etr = iq_etr_model.predict(
        iq_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)
    ###(6)bagging regressor method
    iq_predictions_br = iq_br_model.predict(
        iq_test.drop(['negbi', 'gb', 'xgb', 'abr', 'etr', 'br'],
                     axis=1)).astype(int)

    print("Adding predictions as features to iq testing data...")
    col = iq_test.columns.get_loc
    for i in range(len(iq_test)):
        iq_test.iloc[i, col('negbi')] = iq_predictions_neg.iloc[i]
        iq_test.iloc[i, col('gb')] = iq_predictions_gb[i]
        iq_test.iloc[i, col('xgb')] = iq_predictions_xgb[i]
        iq_test.iloc[i, col('abr')] = iq_predictions_abr[i]
        iq_test.iloc[i, col('etr')] = iq_predictions_etr[i]
        iq_test.iloc[i, col('br')] = iq_predictions_br[i]

    ##use new information to run a linear regression
    print("Building linear regression model...")
    #Now the linear regression model uses (X = [features, negbi, gb, xgb, abr, etr, br], y = total_cases) to train (fit)
    sj_lr = LR()
    sj_lr.fit(sj_train.drop('total_cases', axis=1), sj_train['total_cases'])
    iq_lr = LR()
    iq_lr.fit(iq_train.drop('total_cases', axis=1), iq_train['total_cases'])

    #Calculate the k-fold validation error (note: sj_lr/iq_lr were fit on the
    #full training set, so this is an optimistic, in-sample estimate)
    sj_score = []
    for train_index, val_index in kf.split(sj_train):
        X_train, X_val = sj_train.iloc[train_index], sj_train.iloc[val_index]
        train_predict = np.array(
            sj_lr.predict(X_val.drop('total_cases', axis=1))).astype(int)
        sj_score.append(eval_measures.meanabs(train_predict,
                                              X_val.total_cases))
    print("Mean of {} cross validation of sj_score is {} (+/- {})".format(
        kf.get_n_splits(sj_train), np.mean(sj_score), np.std(sj_score)))

    iq_score = []
    for train_index, val_index in kf.split(iq_train):
        X_train, X_val = iq_train.iloc[train_index], iq_train.iloc[val_index]
        train_predict = np.array(
            iq_lr.predict(X_val.drop('total_cases', axis=1))).astype(int)
        iq_score.append(eval_measures.meanabs(train_predict,
                                              X_val.total_cases))
    print("Mean of {} cross validation of iq_score is {} (+/- {})".format(
        kf.get_n_splits(iq_train), np.mean(iq_score), np.std(iq_score)))

    ##Use the model sj_lr and iq_lr trained before to predict the testing data
    print("Predicting testing data...")
    sj_predictions = sj_lr.predict(sj_test)
    iq_predictions = iq_lr.predict(iq_test)
    sj_predictions = np.array(sj_predictions).astype(int)
    iq_predictions = np.array(iq_predictions).astype(int)

    print("Creating submit file...")
    ##Use submission_format as template to write the answer
    sample_path = os.path.join(data_path, 'submission_format.csv')
    submission = pd.read_csv(sample_path, index_col=[0, 1, 2])
    submission.total_cases = np.concatenate([sj_predictions, iq_predictions])
    submission.to_csv("./data/stacking_6_less_feature.csv")
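
Finally, if the script is meant to be executed directly, a standard entry-point guard (not shown in the excerpt) would call main():

if __name__ == '__main__':
    main()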