columns = [
    'Age_categories_Missing', 'Age_categories_Infant', 'Age_categories_Child',
    'Age_categories_Young Adult', 'Age_categories_Adult',
    'Age_categories_Senior', 'Pclass_1', 'Pclass_3', 'Embarked_C',
    'Embarked_Q', 'Embarked_S', 'SibSp_scaled', 'Parch_scaled',
    'Fare_categories_0-12', 'Fare_categories_50-100', 'Fare_categories_100+',
    'Title_Miss', 'Title_Mr', 'Title_Mrs', 'Title_Officer', 'Title_Royalty',
    'Cabin_type_B', 'Cabin_type_C', 'Cabin_type_D', 'Cabin_type_E',
    'Cabin_type_F', 'Cabin_type_G', 'Cabin_type_T', 'Cabin_type_Unknown'
]

all_X = train[columns]
all_y = train["Survived"]
lr = LogisticRegression()
selector = RFECV(lr, cv=10)
selector.fit(all_X, all_y)
optimized_columns = all_X.columns[selector.support_]

## 10. Training A Model Using our Optimized Columns ##

all_X = train[optimized_columns]
all_y = train["Survived"]
lr = LogisticRegression()
scores = cross_val_score(lr, all_X, all_y, cv=10)
accuracy = scores.mean()

## 11. Submitting our Model to Kaggle ##

lr = LogisticRegression()
lr.fit(all_X, all_y)
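# A minimal sketch of the actual submission step, assuming a preprocessed
# `holdout` DataFrame with the same engineered columns plus `PassengerId`,
# and pandas imported as pd (these names are assumptions, not shown above):
holdout_predictions = lr.predict(holdout[optimized_columns])
submission = pd.DataFrame({"PassengerId": holdout["PassengerId"],
                           "Survived": holdout_predictions})
submission.to_csv("submission.csv", index=False)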
Example No. 2
def logistic_regression():

    # ----------------------- Partition Data ----------------------- #
    df = data_prep()
    rs = 0

    y = df['AtRisk']
    X = df.drop(['AtRisk'], axis=1)

    X_mat = X.values  # .as_matrix() was removed from pandas; .values is equivalent
    X_train, X_test, y_train, y_test = train_test_split(X_mat,
                                                        y,
                                                        test_size=0.3,
                                                        stratify=y,
                                                        random_state=rs)

    # ------------------ Scale and Build Model --------------------- #

    scaler = StandardScaler()

    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    model = LogisticRegression()
    model.fit(X_train, y_train)

    print("\nDefault Regression\n-----------------")
    print("Train accuracy:", model.score(X_train, y_train))
    print("Test accuracy", model.score(X_test, y_test))

    # --------------------- Default GridSearchCV ---------------------- #

    params = {'C': [pow(10, x) for x in range(-6, 4)]}

    cv = GridSearchCV(param_grid=params,
                      estimator=LogisticRegression(random_state=rs),
                      cv=5,
                      n_jobs=-1)
    cv.fit(X_train, y_train)

    print("\nRegression GridSearchCV\n-----------------")
    print("Train accuracy:", cv.score(X_train, y_train))
    print("Test accuracy:", cv.score(X_test, y_test))
    print("\nBest Parameters:")
    print(cv.best_params_)

    # y_pred = cv.predict(X_test)
    # print(classification_report(y_test, y_pred))
    # print hyperparameters of model

    # ---------------------- Log Transform ----------------------- #

    columns_to_transform = [
        'Age', 'NumYearsEducation', 'CapitalGain', 'CapitalAvg'
    ]

    X_log = X.copy()

    for col in columns_to_transform:
        X_log[col] = np.log1p(X_log[col])  # log(x + 1), same result as the two-step apply

    X_mat_log = X_log.values  # .as_matrix() was removed from pandas
    X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
        X_mat_log, y, test_size=0.3, stratify=y, random_state=rs)

    scaler_log = StandardScaler()
    X_train_log = scaler_log.fit_transform(X_train_log)
    X_test_log = scaler_log.transform(X_test_log)

    cv = GridSearchCV(param_grid=params,
                      estimator=LogisticRegression(random_state=rs),
                      cv=5,
                      n_jobs=-1)
    cv.fit(X_train_log, y_train_log)

    print("\nLog Regression GridSearchCV\n-----------------")
    print("Train accuracy:", cv.score(X_train_log, y_train_log))
    print("Test accuracy:", cv.score(X_test_log, y_test_log))
    print("\nBest Parameters:")
    print(cv.best_params_)

    # ---------------------- Feature Elimination ----------------------- #

    rfe = RFECV(estimator=LogisticRegression(random_state=rs), cv=5)
    rfe.fit(X_train_log, y_train_log)

    print("Original feature set", X_train_log.shape[1])
    print("Number of features after elimination", rfe.n_features_)

    X_train_sel = rfe.transform(X_train_log)
    X_test_sel = rfe.transform(X_test_log)

    cv = GridSearchCV(param_grid=params,
                      estimator=LogisticRegression(random_state=rs),
                      cv=5,
                      n_jobs=-1)
    cv.fit(X_train_sel, y_train_log)

    print(
        "\nLog Regression GridSearchCV with Feature Elimination\n-----------------"
    )
    print("Train accuracy: ", cv.score(X_train_sel, y_train_log))
    print("Test accuracy: ", cv.score(X_test_sel, y_test_log))
    print("\nBest Parameters:")
    print(cv.best_params_)
Example No. 3
    def go(self, all_data, cols, polynomialColumns):
        trainingData = all_data.loc[(all_data.SalePrice > 0),
                                    cols].reset_index(drop=True, inplace=False)
        y_train = all_data.SalePrice[all_data.SalePrice > 0].reset_index(
            drop=True, inplace=False)

        robustScaler = RobustScaler()
        robustScalerDataFrame = pd.DataFrame(robustScaler.fit_transform(
            trainingData[cols]),
                                             columns=cols)

        pValueColumns = cols.values
        pValueColumns = self.backwardElimination(robustScalerDataFrame,
                                                 y_train, pValueColumns)

        lasso = Lasso(alpha=0.0005, tol=0.002)
        recursiveFeatureEliminator = RFECV(estimator=lasso,
                                           n_jobs=-1,
                                           step=1,
                                           scoring='neg_mean_squared_error',
                                           cv=5)
        recursiveFeatureEliminator.fit(robustScalerDataFrame, y_train)

        recursivelySelectedFeatures = recursiveFeatureEliminator.get_support()
        recursiveFeatureSelectedColumns = cols[recursivelySelectedFeatures]

        r2Score = r2_score
        lasso = Lasso(alpha=0.0005, tol=0.002)
        sequentialFeatureSelection = SequentialFeatureSelection(
            lasso, k_features=1, scoring=r2Score)
        sequentialFeatureSelection.fit(robustScalerDataFrame, y_train)

        sequentialFeatureSelectionScoreLength = len(
            sequentialFeatureSelection.scores_)
        sequentialFeatureSelectionScoreCriteria = (
            sequentialFeatureSelection.scores_ == max(
                sequentialFeatureSelection.scores_))
        arrangedSequentialFeatures = np.arange(
            0, sequentialFeatureSelectionScoreLength
        )[sequentialFeatureSelectionScoreCriteria]
        maxSequentialFeatureScore = max(arrangedSequentialFeatures)
        sequentialFeatureSelectionSubsets = list(
            sequentialFeatureSelection.subsets_[maxSequentialFeatureScore])
        sequentialBackwardSelection = list(
            robustScalerDataFrame.columns[sequentialFeatureSelectionSubsets])

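        # NOTE: kBestValue (like randomStateValue further down) is assumed to be
        # defined elsewhere, e.g. as a module-level constant.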
        kBestSelection = SelectKBest(score_func=f_regression, k=kBestValue)
        kBestSelection.fit(robustScalerDataFrame, y_train)
        select_features_kbest = kBestSelection.get_support()
        kbestWithFRegressionScoringFunction = cols[select_features_kbest]

        kBestSelection = SelectKBest(score_func=mutual_info_regression,
                                     k=kBestValue)
        kBestSelection.fit(robustScalerDataFrame, y_train)
        select_features_kbest = kBestSelection.get_support()
        kbestWithMutualInfoRegressionScoringFunction = cols[
            select_features_kbest]

        X_train, X_test, y, y_test = train_test_split(
            robustScalerDataFrame,
            y_train,
            test_size=0.30,
            random_state=randomStateValue)
        model = XGBRegressor(base_score=0.5,
                             random_state=randomStateValue,
                             n_jobs=4,
                             silent=True)
        model.fit(X_train, y)

        bestValue = 1e36
        bestColumns = 31
        my_model = model
        threshold = 0

        for modelThreshold in np.sort(np.unique(model.feature_importances_)):
            selectionsFromModel = SelectFromModel(model,
                                                  threshold=modelThreshold,
                                                  prefit=True)
            X_trainSelectedFromModel = selectionsFromModel.transform(X_train)
            modelForSelection = XGBRegressor(base_score=0.5,
                                             random_state=randomStateValue,
                                             n_jobs=4,
                                             silent=True)
            modelForSelection.fit(X_trainSelectedFromModel, y)
            X_testSelectedFromModel = selectionsFromModel.transform(X_test)
            y_pred = modelForSelection.predict(X_testSelectedFromModel)
            roundedPredictions = [
                round(predictedValue) for predictedValue in y_pred
            ]
            meanSquaredErrorValue = mean_squared_error(y_test,
                                                       roundedPredictions)
            if (bestValue >= meanSquaredErrorValue):
                bestValue = meanSquaredErrorValue
                bestColumns = X_trainSelectedFromModel.shape[1]
                my_model = modelForSelection
                threshold = modelThreshold

        listOfFeatureImportance = [
            (score, feature)
            for score, feature in zip(model.feature_importances_, cols)
        ]
        XGBestValues = pd.DataFrame(sorted(
            sorted(listOfFeatureImportance, reverse=True)[:bestColumns]),
                                    columns=['Score', 'Feature'])
        XGBestColumns = XGBestValues.iloc[:, 1].tolist()

        unionSetOfBestColumns = set(pValueColumns)
        unionSetOfBestColumns = unionSetOfBestColumns.union(
            set(recursiveFeatureSelectedColumns))
        unionSetOfBestColumns = unionSetOfBestColumns.union(
            set(kbestWithFRegressionScoringFunction))
        unionSetOfBestColumns = unionSetOfBestColumns.union(
            set(kbestWithMutualInfoRegressionScoringFunction))
        unionSetOfBestColumns = unionSetOfBestColumns.union(set(XGBestColumns))
        unionSetOfBestColumns = unionSetOfBestColumns.union(
            set(sequentialBackwardSelection))
        unionSetOfBestColumns = unionSetOfBestColumns.union(
            set(polynomialColumns))
        unionSetOfBestColumnsList = list(unionSetOfBestColumns)

        return DataObject(
            self.trainingData, self.testingData, self.combinedData
        ), unionSetOfBestColumnsList, recursiveFeatureSelectedColumns, XGBestColumns
Example No. 4
### Step 6: Recursive Feature Elimination

### Collect features from RF and PC
df_pc_gini = pd.merge(
    df_pc, df_gini, on="Features", how="inner"
)  # Join by column while keeping only items that exist in both, select outer or left for other options
df_features = df_pc_gini["Features"]  # Save features from data frame
pc_gini_features = df_features.tolist()  # Convert to list
df_rfecv = df_step3[pc_gini_features]  # Add selected features to df

### Setup RFE model
X = df_rfecv  # Save features columns as predictor data frame
Y = df_step3["outcome"]  # Use outcome data frame
RFE = LinearRegression()  # Use regression coefficient as estimator
selector = RFECV(
    estimator=RFE, min_features_to_select=10
)  # define selection parameters; in this case all features are selected. See Readme for more info

### Fit RFE model
selected = selector.fit(X, Y)  # This will take time

### Collect features from RFE model
ar_rfe = selected.support_  # Save Boolean values as numpy array
l_rfe = list(zip(X, ar_rfe))  # Create list of variables alongside RFE value
df_rfe = pd.DataFrame(l_rfe, columns=[
    "Features", "RFE"
])  # Create data frame of importances with variables and gini column names
df_rfe = df_rfe[df_rfe.RFE == True]  # Select Variables that were True
df_rfe = df_rfe.reset_index()  # Reset Index
df_rfe = df_rfe.filter(["Features"])  # Keep only selected columns
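
### Hypothetical follow-up (a sketch, not part of the original snippet):
### rebuild the predictor frame using only the RFE-selected features
rfe_features = df_rfe["Features"].tolist()  # Convert selected features to list
X_selected = df_step3[rfe_features]  # Subset predictors to the selected columns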
Example No. 5
#                 y_train, params, 'Random Forests')
forest = RandomForestRegressor(n_estimators=10, random_state=0,
                               n_jobs=-1).fit(X_train, y_train)
y_pred = forest.predict(X_test)
error = mean_absolute_error(np.exp(y_test), np.exp(y_pred))
day_pred_error = np.exp(y_test[-1]) - np.exp(y_pred[-1])
print(
    f'Day prediction error {day_pred_error}, Percent Error {100*(day_pred_error/np.exp(y_test[-1]))}'
)
print(f'Random forest mean error: {error}')

# plot forests
plot_prediction(X_test, y_test, y_pred, 'Random Forest Regression - Snohomish')

# feature selection with LinearRegression and cross validation
selector = RFECV(LinearRegression(),
                 cv=TimeSeriesSplit(n_splits=5)).fit(X_test, y_test)
cols = selector.get_support(indices=True)
features = X_train.iloc[:, cols]

# cases-1 and cases-7 on snohomish
print(f'Chosen Features on Snohomish Dataset: {features.columns}')

# # repeat with Elastic Net as a sanity check
# selector = RFECV(ElasticNet(), cv=TimeSeriesSplit(
#     n_splits=5)).fit(X_test, y_test)
# cols = selector.get_support(indices=True)
# features = X_train.iloc[:, cols]

# # should show that cases-1 and cases-7 are the best
# print(f'Chosen Features on Snohomish Dataset: {features.columns}')
Example No. 6
forest = RandomForestRegressor(n_estimators = 1000, max_depth = 10) # Use default values except for the number of trees and max depth. For a further explanation see the readme included in the repository.
forest.fit(df_X, df_Y['quant']) # Fit Forest model, This will take time
rf = forest.feature_importances_ # Output importances of features
l_rf = list(zip(df_X, rf)) # Create list of variables alongside importance scores 
df_rf = pd.DataFrame(l_rf, columns = ['Feature', 'Gini']) # Create data frame of importances with variables and gini column names
df_rf = df_rf[(df_rf['Gini'] > df_rf['Gini'].mean())] # Subset by Gini values higher than mean
df_rf = df_rf.sort_values(by = ['Gini'], ascending = False) # Sort Columns by Value
df_rf.info() # Get class, memory, and column info: names, data types, obs.

### Fracture: Join RF and PCA 
df_fr = pd.merge(df_pca, df_rf, on = 'Feature', how = 'inner') # Join by column while keeping only items that exist in both, select outer or left for other options
fracture = df_fr['Feature'].tolist() # Save features from data frame
df_fr.info() # Get class, memory, and column info: names, data types, obs.

### Recursive Feature Elimination
recursive = RFECV(estimator = LinearRegression(), min_features_to_select = 5) # define selection parameters, in this case all features are selected. See Readme for more info
recursive.fit(df_X[fracture], df_Y['quant']) # This will take time
rfe = recursive.support_ # Save Boolean values as numpy array
l_rfe = list(zip(df_X[fracture], rfe)) # Create list of variables alongside RFE value 
df_rfe = pd.DataFrame(l_rfe, columns = ['Feature', 'RFE']) # Create data frame of importances with variables and gini column names
df_rfe = df_rfe.sort_values(by = ['RFE'], ascending = True) # Sort Columns by Value
df_rfe = df_rfe[df_rfe['RFE'] == True] # Select Variables that were True
df_rfe.info() # Get class, memory, and column info: names, data types, obs.

### FractureProof: Join RFE with Fracture
df_fp = pd.merge(df_fr, df_rfe, on = 'Feature', how = 'inner') # Join by column while keeping only items that exist in both, select outer or left for other options
fractureproof = df_fp['Feature'].tolist() # Save chosen features as list
df_fp.info() # Get class, memory, and column info: names, data types, obs.

### Get FractureProof feature labels
df_lfp = df_l1_l2[fractureproof] # Subset the labeled data to the chosen features
Example No. 7
# features = scaler.fit_transform(features)
# features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size = 0.3, random_state = 0)

features_train, features_test, labels_train, labels_test = train_test_split(
    features_resampled, labels_resampled, test_size=0.3, random_state=0)

from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
print "RandomForestClassifier "
# RFECV:  Select the algorithm to train with:
clf_Ranking = RFECV(GradientBoostingClassifier(random_state=0,
                                               learning_rate=0.05,
                                               max_depth=1),
                    scoring='accuracy',
                    n_jobs=-1)
# RFECV: Fit and transform the RFECV function
clf_Ranking.fit_transform(features_train, labels_train)

print(clf_Ranking.score(features_train, labels_train))
print(clf_Ranking.ranking_)
# result of feature selection : [ 1 13  4 14  1 12 11  8  1  9  5  6  1  2 10  7  3  1]
# [1 4 5 1 1 1 1 1 3 1 1 1 6 2 1 1 1 1]
# [14  5  1 11  1 10  4  1  1  1  6  3  2  9  8 12 13  7  1]
#print scores

# GBC : [13 12 11 10  3  1  1  9  1  1  1  8  1  7  6  2  4  5  1  1]

# =-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
Example No. 8
rfe = RFE(estimator=clf_rf_3, n_features_to_select=15, step=1)
rfe = rfe.fit(x_train, y_train)

# In[140]:

print('Chosen best 15 feature by rfe:', x_train.columns[rfe.support_])

#  Recursive feature elimination with cross validation and random forest classification

# In[141]:

from sklearn.feature_selection import RFECV

# The "accuracy" scoring is proportional to the number of correct classifications
clf_rf_4 = RandomForestClassifier()
rfecv = RFECV(estimator=clf_rf_4, step=1, cv=5,
              scoring='accuracy')  #5-fold cross-validation
rfecv = rfecv.fit(x_train, y_train)

print('Optimal number of features :', rfecv.n_features_)
print('Best features :', x_train.columns[rfecv.support_])

# In[142]:

# Plot number of features VS. cross-validation scores
import matplotlib.pyplot as plt

plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score of number of selected features")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
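
# Note: RFECV.grid_scores_ was deprecated in scikit-learn 1.0 and removed in
# 1.2. A sketch of the same plot on recent versions (assuming scikit-learn
# >= 1.0) reads the mean CV scores from cv_results_ instead:
mean_scores = rfecv.cv_results_["mean_test_score"]
plt.plot(range(1, len(mean_scores) + 1), mean_scores)
plt.show()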
def pipeline_feature_selection(fs_data, training_data, test_data,
                               calibration_data, verbose):
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import RandomizedSearchCV
    from scipy.stats import randint as sp_randint
    from sklearn.model_selection import GroupKFold
    from sklearn.feature_selection import RFECV
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt

    feature_to_drop = ['Discharge_Q', 'SOH_discharge_capacity', 'Group']
    feature_to_predict = 'Discharge_Q'

    X_train = fs_data.drop(feature_to_drop, axis=1)
    y_train = fs_data[feature_to_predict]

    no_of_features = 1  # minimum number of features to keep (also reused for max_features below)

    # Hyper-param for Random Forest
    # 10*len(list(X_train))
    # used to have bootstrap = 500
    rf_tuning = RandomForestRegressor(
        n_estimators=500, bootstrap=True,
        n_jobs=-1)  # full_dataset: 700 estimators, 500 bootstraps
    # stanford_dataset: 70 estimators
    # note: oxford uses: 250 est with 1- iter
    #       stanford uses: 250 est with 50 iter
    param = {
        "max_depth": sp_randint(15, 25),  #15-25, 5-10
        "max_features":
        [no_of_features],  #[no_of_features],  # sp_randint(2, 4),
        "min_samples_split": sp_randint(2, 5),
        "min_samples_leaf": sp_randint(5, 15),
        "criterion": ['mse']
    }

    # no_top_models = 5
    no_of_splits = len(np.unique(
        fs_data.Group))  # number of splits is equal to the number of groups
    groups = fs_data.Group
    group_kfold = GroupKFold(
        n_splits=no_of_splits)  # inner test and train using the group KFold

    model = RandomizedSearchCV(
        rf_tuning,
        param_distributions=param,
        cv=group_kfold,
        n_iter=100,  # full_dataset: 150
        iid=False,
        refit=True,
        verbose=verbose)
    model.fit(X_train, y_train, groups=groups)
    RF_f_selection_model = model.best_estimator_
    # RF_f_selection_model_param = model.best_params_

    # '''Recursive Feature Elimination'''
    names = list(
        fs_data.drop(['Discharge_Q', 'SOH_discharge_capacity', 'Group'],
                     axis=1))

    rf = RF_f_selection_model
    rfe = RFECV(estimator=rf,
                min_features_to_select=no_of_features,
                cv=group_kfold,
                step=1,
                scoring='neg_mean_squared_error',
                verbose=verbose)  # neg_mean_squared_error, r2

    # selector_RF = rfe.fit(X_train_scaled, y_train)
    selector_RF = rfe.fit(X_train, y_train, groups=groups)

    ranking_features = sorted(zip(
        map(lambda x: round(x, 4), selector_RF.ranking_), names),
                              reverse=False)
    optimum_no_feature = selector_RF.n_features_

    x = range(no_of_features, len(selector_RF.grid_scores_) + no_of_features)
    y = selector_RF.grid_scores_
    '''feature selection results'''
    print('Feature rank: \n {}'.format(ranking_features))
    # Plot number of features VS. cross-validation scores
    f = plt.figure(figsize=(7, 5))
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score")
    plt.plot(x, y, 'o--', color='tab:orange')
    plt.plot(x[np.argmax(y)], np.max(y), 'v', markersize=15, color='k')
    # plt.title('Optimum number of features based on RF-RFE using neg-mse is: {}'.format(optimum_no_feature))
    plt.xlabel('Selected no. of features', fontsize=15)
    plt.ylabel('Cross-validation score [Negative MSE]', fontsize=15)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.grid(False)
    plt.show()

    # transform dataset based on optimum features
    training_data_opt_fet_X = pd.DataFrame(
        selector_RF.transform(training_data.drop(
            feature_to_drop,
            axis=1)))  # input feature space training as DataFrame
    test_data_opt_fet_X = pd.DataFrame(
        selector_RF.transform(test_data.drop(
            feature_to_drop,
            axis=1)))  # input feature space testing as DataFrame
    calibration_data_opt_fet_X = pd.DataFrame(
        selector_RF.transform(calibration_data.drop(
            feature_to_drop,
            axis=1)))  # input feature space calibration as DataFrame
    '''Adding 'Group' feature to the dataset'''
    # add the group so that you can re-tune future models based on
    # training_data_opt_fet_X_new = pd.concat([training_data_opt_fet_X, training_data.Group], axis=1)
    training_data_opt_fet_X['Group'] = np.array(training_data.Group)
    # test_data_opt_fet_X_mew = pd.concat([test_data_opt_fet_X, test_data.Group], axis=1)
    test_data_opt_fet_X['Group'] = np.array(test_data.Group)

    # calibration_data_opt_fet_X_mew = pd.concat([calibration_data_opt_fet_X, calibration_data.Group], axis=1)
    calibration_data_opt_fet_X['Group'] = np.array(calibration_data.Group)

    train_y = training_data[feature_to_predict]
    test_y = test_data[feature_to_predict]
    calibration_y = calibration_data[feature_to_predict]

    return optimum_no_feature, ranking_features, training_data_opt_fet_X, test_data_opt_fet_X, calibration_data_opt_fet_X, train_y, test_y, calibration_y, f
            ignored_fields.append(c[0])

    X = X.drop(ignored_fields, axis=1)

    columns = list(X.columns.values)

    # Standard scaler
    sc_X = StandardScaler()
    X = sc_X.fit_transform(X)

    # Select top 20 features
    estimator = ltb.LGBMRegressor()
    #rfe = RFE(estimator=estimator, n_features_to_select=40, step=10, verbose=1)
    rfe = RFECV(estimator=estimator,
                min_features_to_select=30,
                step=10,
                cv=4,
                n_jobs=1,
                verbose=1)
    rfe.fit(X, y)
    features = rfe.get_support(indices=True)

    np.array(columns)[features]
    """

    ['NUM_SELLING_DAYS_0', 'NUM_DAYS_0', 'days_btw_order_0', 
    'num_orders_0', 'QUOTA_SELLIN_0', 'NUM_SELLING_DAYS_1', 
    'NUM_DAYS_1', 'days_btw_order_1', 'num_orders_1', 
    'QUOTA_SELLIN_1', 'NUM_SELLING_DAYS_2', 'NUM_DAYS_2', 
    'days_btw_order_2', 'num_orders_2', 'QUOTA_SELLIN_2', 
    'NUM_SELLING_DAYS_3', 'NUM_DAYS_3', 'days_btw_order_3', 
    'num_orders_3', 'QUOTA_SELLIN_3', 'NUM_SELLING_DAYS_4', 
Example No. 11
def RepeatCV(times, Clinical, NumOfFea, Criteria, Method, NumFold=10):
    '''
    k-fold CV. In each fold, the training set is further split into training and validation sets.
    The training set is used to build the model.
    The validation set is used to determine the CUTOFF that achieves the target precision.
    Once the cutoff is determined, it is applied to the testing set to obtain precision and PPR.
    :param times: Number of repetitions
    
    :param Clinical: Which variable to predict. Should be a column index.
    
    :param NumOfFea: Number of genes to be selected
    :param Criteria: the score used in building model, Possible options: FSCORE, PPR90, PPR95
    :param Method: RN_only, RN_SMOTE, no_RN_no_SMOTE
    :param NumFold: number of folds in k-fold CV
    :return: F05score_fold, PPR_fold, PRECISION_fold, F05PPR_fold, AUROC_fold, valid_Precision
    '''

    workDir = "/gpfs/home/dz16e/Reusability/NewExperiment/"
    if Criteria == 'FSCORE':
        myscorer = make_scorer(fbeta_score, beta=1.0)
        target_Precision = 0.9
    elif Criteria == 'FSCORE_95':
        myscorer = make_scorer(fbeta_score, beta=1.0)
        target_Precision = 0.95
    elif Criteria == 'PPR90':
        myscorer = make_scorer(PPR_percentile_score,
                               needs_proba=True,
                               Precision=0.9,
                               Return_cutoff=0)
        target_Precision = 0.9
    elif Criteria == 'PPR95':
        myscorer = make_scorer(PPR_percentile_score,
                               needs_proba=True,
                               Precision=0.95,
                               Return_cutoff=0)
        target_Precision = 0.95

    if Method in ['RN_only', 'RN_SMOTE']:
        X_data = np.loadtxt(workDir + 'TCGA_Data/Predictors/' +
                            'predictor_rank.txt')
    elif Method in ['no_RN_no_SMOTE']:
        X_data = np.loadtxt(workDir + 'TCGA_Data/Predictors/' +
                            'predictor.txt')
    Y = np.loadtxt(workDir + 'TCGA_Data/Responses/' + 'response.txt')

    CLI = Clinical
    Y_data = Y[:, CLI]
    X_data = X_data[Y_data < 2, :]
    Y_data = Y_data[Y_data < 2]

    F05score_fold = np.zeros([NumFold, 4])  #because we have 4 different models
    PPR_fold = np.zeros([NumFold, 4])
    AUROC_fold = np.zeros([NumFold, 4])
    PRECISION_fold = np.zeros([NumFold, 4])
    F05PPR_fold = np.zeros([NumFold, 4])

    valid_Precision = np.zeros([NumFold, 4])
    valid_PPR = np.zeros([NumFold, 4])
    Pred_Prob_test = np.zeros([X_data.shape[0], 6])
    CUTOFF = np.zeros([NumFold, 4])

    kf = StratifiedKFold(n_splits=NumFold,
                         shuffle=True,
                         random_state=times * 10 + 1)
    val = 0
    for train_idx, test_idx in kf.split(X_data, Y_data):

        train_X = X_data[train_idx, :]
        train_Y = Y_data[train_idx]
        test_X = X_data[test_idx, :]
        test_Y = Y_data[test_idx]
        Pred_Prob_test[test_idx, 0] = test_Y
        Pred_Prob_test[test_idx, 1] = val + 1

        # Filter out genes using T-test
        DEGs = Filter_Ttest(train_X, train_Y, significant_level=0.1)
        train_X = train_X[:, DEGs]
        test_X = test_X[:, DEGs]

        if Method == 'RN_SMOTE':
            smote = SMOTE(random_state=2020)
            smox, smoy = smote.fit_resample(train_X, train_Y)  # fit_sample was renamed fit_resample in imbalanced-learn
            isSMOTE = 1
        else:
            smox, smoy = train_X, train_Y
            isSMOTE = 0

        Clf_name = ['LASSO', 'RF', 'XGB', 'SVM']
        Classifiers = {
            'LASSO':
            LogisticRegression(C=1,
                               penalty='l1',
                               solver='liblinear',
                               random_state=2020),
            'RF':
            RandomForestClassifier(n_estimators=500,
                                   n_jobs=-1,
                                   random_state=2020),
            'XGB':
            XGBClassifier(learning_rate=0.05,
                          n_estimators=100,
                          max_depth=5,
                          seed=2020),
            'SVM':
            SVC(C=0.01,
                probability=True,
                kernel='linear',
                random_state=2020,
                max_iter=10000)
        }
        Parameters = {
            'LASSO': {
                'C': [.01, .05, .1, .5, 1.0, 5.0, 10.0],
                'fit_intercept': [True, False]
            },
            'RF': {
                'criterion': ['gini', 'entropy'],
                'max_depth': sp_randint(1, 11)
            },
            'XGB': {
                'learning_rate': [.01, .05, .1, .5],
                'max_depth': sp_randint(1, 11),
                'min_child_weight': [1, 2, 3]
            },
            'SVM': {
                'C':
                [.0001, .001, .005, .01, .05, .1, .5, 1.0, 3.0, 5.0, 10.0]
            }
        }
        # -----------------------------------------------------------
        # Feature Selection
        for idx_clf in range(4):
            os.system("echo 'Starting " + Clf_name[idx_clf] + "...'")
            estimator = Classifiers[Clf_name[idx_clf]]
            if NumOfFea == 0:
                selector = RFECV(estimator, step=0.2, cv=3, scoring=myscorer)
            else:
                selector = RFE(estimator, n_features_to_select=NumOfFea, step=0.2)

            selector = selector.fit(smox, smoy)
            smox_reduced = selector.transform(smox)
            test_X_reduced = selector.transform(test_X)

            cv_parameter = Parameters[Clf_name[idx_clf]]
            searcher = RandomizedSearchCV(estimator,
                                          cv_parameter,
                                          scoring=myscorer,
                                          random_state=2020).fit(
                                              smox_reduced, smoy)
            Pred_test = searcher.predict(test_X_reduced)
            Prob_test = searcher.predict_proba(test_X_reduced)[:, 1]
            F05score_fold[val, idx_clf] = fbeta_score(test_Y, Pred_test, 1.0)
            percentile_CUTOFF, valid_Precision[val, idx_clf], valid_PPR[val, idx_clf] = \
                PPR_percentile_findCutoff(smox_reduced, smoy, searcher.best_estimator_, target_Precision, 10, isSMOTE)
            PPR_fold[val, idx_clf] = recall_score(
                test_Y,
                Prob_test >= np.percentile(Prob_test, percentile_CUTOFF))
            PRECISION_fold[val, idx_clf] = \
                precision_score(test_Y, Prob_test >= np.percentile(Prob_test, percentile_CUTOFF))
            F05PPR_fold[val, idx_clf] = \
                fbeta_score(test_Y, Prob_test >= np.percentile(Prob_test, percentile_CUTOFF), beta=1.0)
            AUROC_fold[val, idx_clf] = roc_auc_score(test_Y, Prob_test)
            Pred_Prob_test[test_idx, idx_clf + 2] = Prob_test
            CUTOFF[val, idx_clf] = percentile_CUTOFF
        val += 1
    return F05score_fold, PPR_fold, PRECISION_fold, F05PPR_fold, AUROC_fold, valid_Precision, valid_PPR, \
           Pred_Prob_test, CUTOFF
excel_name = "Scores/%i_SVC_Score (%i sec).csv" % (time_Start, elapsed)
scores.to_csv(excel_name)

# In[Feature selections]
from sklearn.feature_selection import RFECV
from sklearn.svm import LinearSVC
time_Start = time.time()

days = [1]
#stocks = tickers
stocks = ['OMXIPI']
features_to = petur.get_tickers()
features_to.extend(stocks)  # extend, not append: appending the list would nest it and break column selection

clf = LinearSVC()
clf = RFECV(clf, step=1, cv=10, n_jobs=-1)

feature_dic = {}

for stock in stocks:
    acc = []
    for day in days:
        name = "%s-%s" % (stock, day)
        training = df_train.copy()
        training = training[features_to]

        x_train, y_train = petur.create_better_labels(stock, training, day)

        clf = clf.fit(x_train, y_train)
        output = clf.support_
        feature_dic[name] = output
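
# A possible follow-up (a sketch, assuming x_train is a DataFrame whose
# columns line up with each stored support mask):
for name, mask in feature_dic.items():
    kept = [col for col, keep in zip(x_train.columns, mask) if keep]
    print(name, kept)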
print(classification_report(y_test, y_prediction))
print(confusion_matrix(y_test, y_prediction))


#SVM Model 
X = np.array(df.drop(['region_id'], axis=1))
y = np.array(df['region_id'])
# sklearn.cross_validation was removed; train_test_split now lives in sklearn.model_selection
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = svm.SVC()
clf.fit(X_train,y_train)
svm_accuracy = clf.score(X_test,y_test)
#SVM evaluate: 
print('The accuracy of the SVM test was :') 
print(svm_accuracy)
y_prediction = clf.predict(X_test)
print(classification_report(y_test, y_prediction))
print(confusion_matrix(y_test, y_prediction))


#Random Forest 
#Use Recursive Feature Elimination 
X = np.array(df.drop(['region_id'], axis=1))
y = np.array(df['region_id'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
m = RFECV(RandomForestClassifier(n_jobs=-1), scoring='accuracy',verbose=1)
m.fit(X, y)
k = m.score(X, y)
print("Using recursive feature elimination of a random forest, best model produces the folloiwng accuracy: ")
print(k)

model.fit(xt_train, y_train)

# Make predictions on test data and look at the results.
preds = model.predict(xt_test)
pprint.pprint(pd.DataFrame({'Actual': y_test, 'Predicted': preds}))

print('MSE, MAE, R^2, EVS (Top 3 Model): ' + \
          str([mean_squared_error(y_test, preds), \
          median_absolute_error(y_test, preds), \
          r2_score(y_test, preds), \
          explained_variance_score(y_test, preds)]))

# ---------------- Part 3: Use Recursive Feature Elimination with Cross Validation -

# Use RFECV to arrive at the approximate best set of predictors. RFECV is a greedy method.
selector_f = RFECV(estimator=linear_model.LinearRegression(), \
                   cv=5, scoring=make_scorer(r2_score))
selector_f.fit(x_train, y_train)

# Keep only the columns that RFECV selected.
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)

# Create a least squares linear regression model.
model = linear_model.LinearRegression()

# Fit the model.
model.fit(xt_train, y_train)

# Make predictions on test data and look at the results.
preds = model.predict(xt_test)
pprint.pprint(pd.DataFrame({'Actual': y_test, 'Predicted': preds}))
Example No. 15
### Extracting features and labels from dataset for local testing
data = featureFormat(my_dataset, features_list_original, sort_keys = True)
labels, features = targetFeatureSplit(data)


#####################
# FEATURE SELECTION #
#####################
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold  # sklearn.cross_validation was removed
from sklearn.feature_selection import RFECV
lr = LogisticRegression()

### Optimal number of features:
rfecv = RFECV(estimator=lr, step=1, cv=StratifiedKFold(n_splits=3),
          scoring='precision')
rfecv.fit(features, labels)
print("Optimal number of features : %d" % rfecv.n_features_) #Answer: 5


'''
### Plotting number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
'''


### Choosing the 5 most important features
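# A minimal sketch of this step, assuming features_list_original follows the
# usual layout where the first entry is the label and the rest align with
# rfecv.support_ (an assumption, not shown above):
selected_features = [f for f, keep in
                     zip(features_list_original[1:], rfecv.support_) if keep]
print(selected_features)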
Example No. 16
print(rfe.ranking_)

# Plot pixel ranking
#plt.matshow(ranking, cmap=plt.cm.Blues)
#plt.colorbar()
#plt.title("Ranking of pixels with RFE")
#plt.show()

# Create the RFE object and compute a cross-validated score.
svc = SVR(kernel="linear")

min_features_to_select = 1  # Minimum number of features to consider
rfecv = RFECV(
    estimator=svc,
    step=1,
    cv=KFold(n_splits=5),
    scoring="r2",
    min_features_to_select=min_features_to_select,
)
rfecv.fit(X, y)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (r2)")
plt.plot(
    range(min_features_to_select,
          len(rfecv.grid_scores_) + min_features_to_select),
    rfecv.grid_scores_,
)
plt.show()
##############################################################
### Feature selection: RFE with the logistic regression estimator
features = [
    'FamilyCateg', 'Age', 'Embarked_S', 'Embarked_Q', 'Embarked_C', 'Other',
    'Mr', 'Mrs', 'Miss', 'Master', 'Fare', 'SibSp', 'Parch', 'Sex', 'Pclass'
]
X = train_data[features].values
Y = train_data['Survived'].values

from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

estimator = LogisticRegression()
selector = RFECV(estimator, step=1, cv=StratifiedKFold(2), scoring='accuracy')
selector = selector.fit(X, Y)
print("Optimal number of features : %d" % selector.n_features_)
selected_features = []
for i in range(len(features)):
    if selector.support_[i] == True:
        selected_features.append(features[i])
print(selected_features)
###########################################################

#Variable to display final scores of the following models
models = []
final_scores = []

#Logistic Regression
Example No. 18
                               (target_left - target_viewport))).reshape(
                                   1, dataset.shape[0])

X = np.array([dataset[:, index] for index in features_index], dtype=float).T
X = np.concatenate((X, diff_out_viewport_left.T, diff_out_viewport_right.T),
                   axis=1)
X_cross = np.array([dataset[:, index] for index in crosscheck_index],
                   dtype=float).T
y = np.array(dataset[:, class_index], dtype=int)
urls = dataset[:, url_index]

random.seed(42)
model = tree.DecisionTreeClassifier(criterion='entropy', random_state=42)
#model = ensemble.RandomForestClassifier(criterion='entropy', random_state=42)
X_new = X
rfecv = RFECV(model, cv=GroupKFold(n_splits=5), scoring='f1_macro')
rfecv.fit(X, y, groups=urls)
X_new = rfecv.transform(X)
print(X.shape)
print(X_new.shape)
headers.append('diff_out_viewport_left')
headers.append('diff_out_viewport_right')
features_index.append(headers.index('diff_out_viewport_left'))
features_index.append(headers.index('diff_out_viewport_right'))
print([
    headers[features_index[i]] for i in range(0, len(rfecv.ranking_))
    if rfecv.ranking_[i] == 1
])

params = {
    #    'n_estimators': [1, 5, 10, 20, 50],
    def predict_features(self, df_features, df_target, idx=0, **kwargs):
        estimator = SVR(kernel='linear')
        selector = RFECV(estimator, step=1)
        selector = selector.fit(df_features.values, df_target.values[:, 0])

        return selector.grid_scores_
Example No. 20
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
    print()
#------------------------------------------------------------------------------
# Used to select features

knn = RandomForestClassifier(criterion='entropy',
                             min_samples_leaf=1,
                             min_samples_split=12,
                             n_estimators=134,
                             n_jobs=-1)

rfecv = RFECV(estimator=knn, step=3, cv=StratifiedKFold(2), scoring='roc_auc')
rfecv.fit(X_LS, y_LS)

print("Optimal number of features : %d" % rfecv.n_features_)

# Plot number of features VS. cross-validation scores
plt.figure()
plt.xlabel("Number of features selected")
plt.ylabel("Cross validation score (nb of correct classifications)")
plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
plt.show()
from sklearn.feature_selection import RFECV
from xgboost import XGBClassifier
from numpy import loadtxt
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

dataset = loadtxt('../Dataset/heart.data', delimiter=",")
# split data into X and y
X = dataset[:, 0:np.array(dataset).shape[1] - 1]
Y = dataset[:, np.array(dataset).shape[1] - 1]
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.23,
                                                    random_state=22)
#use XGBoost as the model
xg = XGBClassifier()
#rank all features, i.e continue the elimination until the last one
rfe = RFECV(xg, cv=5, step=1)
rfe.fit(X_train, y_train)
y_important_pred = rfe.predict(X_test)

print("Features sorted by their rank:")
print(sorted(zip(rfe.ranking_)))
print(sorted(zip(rfe.support_)))
print(accuracy_score(y_test, y_important_pred.round()) * 100)
Example No. 22
clf = RandomForestClassifier(2000, n_jobs=4)
cv = StratifiedKFold(n_splits=20, shuffle=True)

y_pred = cross_val_predict(clf,
                           data_x,
                           data_y,
                           cv=cv,
                           method='predict_proba',
                           n_jobs=4,
                           verbose=10)

misc.save_results(data_y, y_pred, name, ss=ss, clf=clf)

stop  # presumably an intentional NameError to halt the script here
# %% feat select
selector = RFECV(clf, cv=5, n_jobs=-1, verbose=100, step=1, scoring='f1')
selector.fit(data_x, data_y)
selector.grid_scores_

print(
    f'These were the {selector.n_features_} features that were deemed important'
)
print([feature_names[i] for i in np.nonzero(selector.support_)[0]])

#%% feature importances
clf = RandomForestClassifier(2000, n_jobs=-1)
# create feature importances
clf.fit(data_x, data_y)
ranking = clf.feature_importances_

type_importances = np.zeros(len(feature_types))
Example No. 23
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.feature_selection import RFECV

from sklearn.model_selection import StratifiedKFold
import numpy as np
from common.import_data import ImportData

if __name__ == "__main__":
    data_set = ImportData()
    x: np.ndarray = data_set.import_all_data()
    y: np.ndarray = data_set.import_columns(np.array(['Class']))
    svc = SVC(kernel="linear")
    rfecv = RFECV(estimator=svc, step=1, cv=StratifiedKFold(2), scoring='accuracy')
    rfecv.fit(x, y.ravel())

    print("Optimal number of features: %d" % rfecv.n_features_)
    plt.figure()
    plt.xlabel("Number of selected features")
    plt.ylabel("Cross-validation score")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
Example No. 24
poly_reg = PolynomialFeatures(degree=2)
X_temp_train_q = poly_reg.fit_transform(X_temp_train[feature_names_quant])
poly_feature_names_quant = poly_reg.get_feature_names(feature_names_quant)

# Selecting by univariate measures, keep the top 50th percentile:
# removes any highly correlated variables and
# uses greedy search to choose which variables to keep by F-score and p-value

selector_f = SelectPercentile(f_regression, percentile=50)
selector_f.fit(X_temp_train_q, y_temp_train)

for n, s, p in zip(poly_feature_names_quant, selector_f.scores_, selector_f.pvalues_):
    print("F-score: %3.2f, p-value: %3.2f for feature %s " % (s, p, n))

regression = LinearRegression()
greedy_selector = RFECV(estimator=regression, cv=10,
                        scoring='neg_mean_squared_error')
greedy_selector.fit(X_temp_train_q, y_temp_train)
print('Optimal number of features: %d' % greedy_selector.n_features_)

# create dataframe with the optimal polynomial features
X_temp_train_df = pd.DataFrame(X_temp_train_q)
X_temp_train_df.columns = poly_feature_names_quant
X_temp_train_df = X_temp_train_df.loc[:, greedy_selector.support_]

age_regressor = LinearRegression()
age_regressor = age_regressor.fit(X_temp_train_df, y_temp_train)

# prep X test data: perform polynomial feature expansion, then keep only the
# features already chosen by greedy selection

X_temp_test_q = pd.DataFrame(poly_reg.fit_transform(X_temp_test[feature_names_quant]))
Example No. 25
standardize_X(X)

# list to hold features selected by the algorithm
selections = []

# 100 times do...
for _ in range(100):
	# shuffle X and y
	X,y = shuffle_X_and_y(X,y)
	# choose the underlying model. to use log. reg. change which line is commented out
	model = SVC(kernel="linear", C = 0.5)
	#model = LogisticRegression(C = 0.5)
	# Create the RFECV object and compute a cross-validated score.
	# The "accuracy" scoring is proportional to the number of correct
	# classifications
	rfecv = RFECV(estimator=model, step=1, cv=StratifiedKFold(4),
	              scoring='accuracy')
	rfecv.fit(X, y)

	# create list of selected features for this iteration
	mask = rfecv.support_
	chosen = [i for i in range(len(mask)) if mask[i]]
	# add this to selections
	selections += chosen

# once complete, create a dict to hold occurrence counts for each feature and populate
fcount = {}
for f in selections:
	if f not in fcount:
		fcount[f] = 1
	else:
		fcount[f] += 1
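
# A possible follow-up (a sketch, not in the original snippet): report the
# features most frequently kept across the 100 shuffled RFECV runs.
for f, count in sorted(fcount.items(), key=lambda kv: kv[1], reverse=True):
	print("feature %d selected %d times" % (f, count))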
Example No. 26
# Test accuracy
acc = accuracy_score(y_test, est.predict(X_test))
print('Test Accuracy {}'.format(acc))

# Plot confusion matrix
cm = confusion_matrix(y_test, est.predict(X_test))
sns.heatmap(cm, fmt='d', cmap='GnBu', cbar=False, annot=True)
         
#==============================================================================
# Recursive Feature Selection
#==============================================================================
from sklearn.feature_selection import RFECV

# RFE
rfe = RFECV(estimator=LogisticRegression(), cv=4, scoring='accuracy')
rfe = rfe.fit(X_train, y_train)

# Select variables and calculate test accuracy
cols = X_train.columns[rfe.support_]
acc = accuracy_score(y_test, rfe.estimator_.predict(X_test[cols]))
print('Number of features selected: {}'.format(rfe.n_features_))
print('Test Accuracy {}'.format(acc))

# Plot number of features vs CV scores
plt.figure()
plt.xlabel('k')
plt.ylabel('CV accuracy')
plt.plot(np.arange(1, rfe.grid_scores_.size+1), rfe.grid_scores_)
plt.show()
Example No. 27
         'estimator': [AdaBoostClassifier(random_state=1986)],
         'estimator__n_estimators': [3, 10],
    },
    {
         'estimator': [GradientBoostingClassifier(random_state=1986)],
         'estimator__criterion': ['friedman_mse', 'mse', 'mae'], 
         'estimator__n_estimators': [3, 10],
         #'estimator__max_depth': [None, 3, 5],
         #'estimator__loss': ['deviance', 'exponential'],
    },
    {
        'estimator': [SVC(kernel="linear", C=0.025, random_state=1986)],
    }]

# Feature selection under the same classifier and CV-fold conditions
rfecv = RFECV(estimator=None, step=1, cv=kfold, scoring=score)  # estimator=None is a placeholder; GridSearchCV fills it from paramGrid

# Run training with hyperparameter tuning and feature selection
gridSearch = GridSearchCV(rfecv, paramGrid, scoring=score, n_jobs=3, verbose=25)
gridSearch.fit(xTreino, yTreino)

classifier = gridSearch.best_estimator_
indFeatures = np.where(classifier.support_ == True)[0]

print('\nBest estimator: %s' % gridSearch.best_estimator_)
print('Best parameters: %s' % gridSearch.best_params_)
print('Best score: %.2f' % gridSearch.best_score_)
print('Number of selected features: ', len(indFeatures))

#K-fold
print('========== K-FOLD VALIDATION ==========')
Example No. 28
def main():

    input_file = "tcd ml 2019-20 income prediction training (with labels).csv"
    # Uncomment to find the current path directory, for debugging purposes
    # print(os.path.isfile(input_file))

    df = pd.read_csv(input_file, header=0)

    #  (df,ohe) = clean_data(df)

    # remove nans in column
    # df = df[np.isfinite(df['Year of Record'])]

    train, test = train_test_split(df, test_size=0.2)
    # train = process_features(train)
    # #Output state of df to csv for debugging reasons
    # testing_file = "Testing.csv"
    # train.to_csv(testing_file, index=False)
    scaler = StandardScaler()
    (train, ohe, rep_points) = clean_train_data(train)
    # testing_file = "Testing.csv"
    # train.to_csv(testing_file, index=False)
    print(np.shape(train))
    print(np.shape(test))
    #train = reject_all_outliers

    # To potentially threshold the features based on variance of values in column
    # sel = VarianceThreshold(threshold=(.8 * (1 - .8)))

    train_y = train.iloc[:, -1]
    # print(train_y.columns)
    # print(train_y["Income in EUR"])

    # To discretize incomes
    # train_y = pd.DataFrame(train_y,columns = ["Income in EUR"])
    # train_y["Income in EUR"] = incomeBinner.fit_transform(train_y[["Income in EUR"]])
    # train_y["Income in EUR"] = incomeBinner.inverse_transform(train_y[["Income in EUR"]])
    # train_y = train_y.iloc[:,-1]

    train_X = scaler.fit_transform(train.iloc[:, :-1])

    # pca = PCA(svd_solver='auto', n_components="mle")
    # pca.fit(train_X)
    # n_components_pca_mle = pca.n_components_
    # print("best n_components by PCA MLE = %d" % n_components_pca_mle)

    # train_X = pca.transform(train_X)

    # train_X = sel.fit_transform(train_X)
    test_y = test.iloc[:, -1]

    test_X = test.iloc[:, :-1]
    test_X = clean_data(test_X, ohe, rep_points)
    test_X = scaler.transform(test_X)
    # test_X = pca.transform(test_X)
    # test_X = sel.transform(test_X)

    print(np.shape(train_X))
    print(np.shape(test_X))
    #
    # catboost = CatBoostRegressor(
    #                       task_type="GPU",
    #                        devices='0:1')

    regr = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance')
    lasso = linear_model.LassoCV(cv=5, verbose=0)

    # model = linear_model.SGDRegressor()
    # # Grid search - this will take about 1 minute.
    # param_grid = {
    #      'alpha': 10.0 ** -np.arange(1, 7),
    #      'loss': ['squared_loss', 'huber', 'epsilon_insensitive'],
    #      'penalty': ['l2', 'l1', 'elasticnet'],
    #      'learning_rate': ['constant', 'optimal', 'invscaling'],
    # }
    # sgd = GridSearchCV(model, param_grid)
    # regr = DecisionTreeRegressor()

    # Good performance
    # trees = ExtraTreesRegressor()

    # regr = linear_model.MultiTaskElasticNetCV(cv=5)
    #lasso = linear_model.LassoLarsCV(cv=5)
    # regr = linear_model.ElasticNetCV(cv=5)
    #regr = ensemble.RandomForestRegressor(n_estimators=1000)
    # regr = ensemble.GradientBoostingRegressor(n_estimators=1000, subsample=0.5)
    # regr = ensemble.VotingRegressor(estimators=[('knn', knn), ('lr', sgd)])
    # regr = linear_model.SGDRegressor(alpha =0.0001,average=False,early_stopping=False,
    #     epsilon=0.1,eta0=0.0001,fit_intercept=True,l1_ratio=0.15,learning_rate='invscaling',
    #     loss='squared_loss',max_iter=1000,n_iter_no_change=5,penalty='l2',power_t=0.25,
    #     random_state=None,shuffle=True,tol=0.001,validation_fraction=0.1,verbose=2,
    #     warm_start=False)
    #regr = linear_model.RidgeCV(cv=5)
    # regr = xgb.XGBRegressor(objective ='reg:linear', colsample_bytree = 0.3, learning_rate = 0.1,
    #             max_depth = 5, alpha = 10, n_estimators = 10)
    # regr = svm.SVR(gamma='scale')
    # Never use this; it takes too long and requires a meta-transformer as well
    selector = RFECV(lasso, step=1, cv=3)

    # This works ok but requires LassoCV basically
    #selector = SelectFromModel(regr)

    # Very bad performance
    #selector = cluster.FeatureAgglomeration(n_clusters=150)

    # the worst performance so far :(((
    #selector = SelectFromModel(ExtraTreesRegressor(n_estimators=100))

    # Performed ok
    #selector = SelectKBest(f_regression,k = 150)

    train_X = selector.fit_transform(train_X, train_y)
    test_X = selector.transform(test_X)

    print(np.shape(train_X))
    print(np.shape(test_X))

    # Train the model using the training sets
    regr.fit(train_X, train_y)

    # Make predictions using the testing set
    pred = regr.predict(test_X)
    X = test["Country"]
    le = LabelEncoder()
    X = le.fit_transform(X)
    plt.scatter(X, test_y, color='red', alpha=0.005)
    plt.scatter(X, pred, color='blue', alpha=0.005)
    plt.ylabel('Income')
    plt.xlabel('Country')
    plt.title('Predicting income')
    plt.show()
    # catboost.fit(train_X, train_y)
    # catpreds = catboost.predict(test_X)

    # The coefficients
    # print('Coefficients: \n', regr.coef_)
    # The mean squared error
    print("Root Mean squared error: %.2f" %
          sqrt(mean_squared_error(test_y, pred)))
    # Explained variance score: 1 is perfect prediction
    print('Variance score: %.2f' % r2_score(test_y, pred))
Example No. 29
    def __init__(self, estimator):
        self._rfecv = RFECV(estimator=estimator,
                            cv=StratifiedKFold(5),
                            scoring='recall')
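
    # Hypothetical delegating methods such a wrapper would plausibly expose
    # (an assumption; only __init__ appears in the original snippet):
    def fit(self, X, y):
        self._rfecv.fit(X, y)
        return self

    def transform(self, X):
        return self._rfecv.transform(X)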
Example No. 30
for model in models:
    print(model)
    rfe = RFE(model, n_features_to_select=1, verbose=2)
    fit = rfe.fit(df_raw_train[x_columns], df_raw_train[y_column])

    for number in [x * 10 for x in list(range(1, 15))]:
        indexes_to_delete = []
        for i in range(len(fit.ranking_)):
            if(fit.ranking_[i] > number):
                indexes_to_delete.append(i)
        selected_features = [i for j, i in enumerate(
            x_columns) if j not in indexes_to_delete]
        selected_features_dict[str(number) + "_" +
                               str(model)] = selected_features
    print("-------\nCV\n---------")
    rfe = RFECV(estimator=model, verbose=2)
    fit = rfe.fit(df_raw_train[x_columns], df_raw_train[y_column])

    indexes_to_delete = []
    for i in range(len(fit.ranking_)):
        if(fit.ranking_[i] != 1):
            indexes_to_delete.append(i)
    selected_features = [i for j, i in enumerate(
        x_columns) if j not in indexes_to_delete]
    selected_features_dict["A_" + str(model)] = selected_features

selected_features_dict_new = {}
for k in selected_features_dict.keys():
    new_key = ' '.join(str(k).replace("\\n", " ").split())
    selected_features_dict_new[new_key] = selected_features_dict[k]