Ejemplo n.º 1
0
def feature_select(data: pd.DataFrame, test_data: pd.DataFrame):
    """Keep only the features chosen by an ExtraTrees-based ``SelectFromModel``.

    ``run_model(data, ExtraTreesClassifier())`` is called first; its result is
    not used here. A fresh ``ExtraTreesClassifier`` is then fitted on ``data``
    (every column except ``'match'`` as features, ``'match'`` as the label)
    and wrapped in ``SelectFromModel(..., prefit=True)``. The same selector is
    applied to both ``data`` and ``test_data``.

    Args:
        data: Training frame; must contain a ``'match'`` column.
        test_data: Test frame with the same columns as ``data``.

    Returns:
        A ``(data, test_data)`` tuple of new frames. Each holds the selected
        feature columns (labelled ``0..k-1``) followed by ``'match'`` and has
        a fresh 0-based row index.
    """
    from sklearn.feature_selection import SelectFromModel

    run_model(data, ExtraTreesClassifier())

    clf = ExtraTreesClassifier()
    x = data.drop(columns=['match'])
    y = data['match']
    clf.fit(x, y)
    model = SelectFromModel(clf, prefit=True)

    x_new = model.transform(x)
    # pd.DataFrame(x_new) carries a fresh 0..n-1 index, and concat(axis=1)
    # aligns on index labels. Reattach the labels by position so a frame with
    # a non-default index (e.g. after a split or a filter) does not end up
    # with misaligned rows and NaNs. The test frame below is handled the same
    # way via np.array.
    train_y = data[['match']].reset_index(drop=True)
    data = pd.concat([pd.DataFrame(x_new), train_y], axis=1)

    print('--------- feature params -----------', model.get_params())

    test_data_x = test_data.drop(columns=['match'])
    test_data_x_new = model.transform(test_data_x)
    # Going through a plain array discards the original index of the labels.
    test_data_y = pd.DataFrame({'match': np.array(test_data['match'])})
    print('--------- test data y after feature ---------', test_data_y)
    test_data = pd.concat([pd.DataFrame(test_data_x_new), test_data_y], axis=1)

    return data, test_data
Ejemplo n.º 2
0
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV

# Feature selection experiments on X / Y (both defined outside this snippet).

# Recursive feature elimination with cross-validation, driven by a 100-tree
# random forest and scored by accuracy.
m_RFERFC = RFECV(RandomForestClassifier(n_estimators=100), scoring='accuracy')
m_RFERFC.fit(X, Y)  # fit() returns the fitted selector itself
# NOTE(review): predict() yields predicted labels, not a reduced feature
# matrix; the X_ prefix (cf. X_lasso below) suggests transform(X) may have
# been intended -- confirm before relying on X_RFERFC.
X_RFERFC = m_RFERFC.predict(X)
# Scored on the same X, Y used for fitting; the value is discarded unless this
# runs in an interactive session.
m_RFERFC.score(X, Y)

from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectFromModel
# Lasso-based selection: SelectFromModel fits LassoCV itself (no prefit).
m_lasso = SelectFromModel(LassoCV())
m_lasso.fit(X, Y)
m_lasso.transform(X).shape  # inspection only; the result is not stored
X_lasso = m_lasso.transform(X)
m_lasso.get_params()  # inspection only; the result is not stored
# Boolean mask over the columns of X marking the features that were kept.
mask = m_lasso.get_support()
print(mask)
# Draw the mask as a one-row image.
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
X.columns[mask]  # labels of the selected columns (X is indexed like a DataFrame here)
# Using CV helps reduce selection bias due to the observations in the training set.

# Disabled sketch comparing scores on all features vs. selected features; it
# references names that are not defined in this snippet.
#X_test_selected = modelfit.transform(X_test)
#predmodel = logisticRegression()
#predmodel.fit(X_train,Y_train)
#print('The score on all features: {:.3f}'.format(predmodel.score(X_test,Y_test)))
#score = predmodel.fit(X_train_selected, y_train).score(X_test_selected,y_test)
#print('The score on all features: {:.3f}'.format(score))

# RandomForestClassifier is already imported at the top of this snippet.
from sklearn.ensemble import RandomForestClassifier
# Random-forest-based selector; only constructed here, not fitted in the lines shown.
fs_SFM_RFC = SelectFromModel(RandomForestClassifier(n_estimators=100))
Ejemplo n.º 3
0
    # Accumulators, one entry appended per estimator processed below:
    # reduced training matrices, reduced validation matrices, selector params.
    X_selected_main = []
    X_val_selected_main = []
    selector_params = []  # To store dicts from `get_params()` method of each selector below.
    # `start` was set before this fragment; this reports the time elapsed since then.
    print("End of step 4, time taken: ", timer() - start, '\n')

    # Fit each estimator on all features, select features from the fitted
    # model, then refit the same estimator on the selected features only.
    # GBC
    start = timer()  # restart the clock for this step
    print("Fitting GradientBoost Classifier:")
    gbc_clf.fit(X, y)
    print("Score using all features:Training ", gbc_clf.score(X, y))
    print("Score using all features:Validation ", gbc_clf.score(X_val, y_val))

    # Wrap the already-fitted classifier (prefit=True), so no refit happens here.
    # NOTE(review): with prefit=True the selector presumably reads importances
    # from gbc_clf itself, so both transforms below must happen before the
    # refit on the reduced matrix -- confirm.
    selector = SelectFromModel(gbc_clf, prefit=True)
    selector_params.append(selector.get_params())
    X_selected = selector.transform(X)
    X_selected_main.append(X_selected)
    X_val_selected = selector.transform(X_val)
    X_val_selected_main.append(X_val_selected)
    # Report the column count before and after selection, and the difference.
    print("Shaped reduced from {} to {}, difference is {}".format(X.shape[1],
                                                                  X_selected.shape[1],
                                                                  X.shape[1] - X_selected.shape[1]))
    print("Refitting using selected features.")
    # Refit the same classifier object on the reduced training matrix.
    gbc_clf.fit(X_selected, y)
    print("Score using selected features:Training ", gbc_clf.score(X_selected, y))
    print("Score using selected features:Validation ", gbc_clf.score(X_val_selected, y_val))

    # ABC
    print("Fitting AdaBoost Classifier: ")
    abc_clf.fit(X, y)
Ejemplo n.º 4
0
    for j in range(i+1,len(cols)):
        if np.array_equal(v,train[cols[j]].values):
            remove.append(cols[j])

# Remove the columns collected in `remove` from both frames, in place.
train.drop(columns=remove, inplace=True)
test.drop(columns=remove, inplace=True)

# Set the test IDs aside; ID is dropped so it is not used as a feature.
test_id = test["ID"]
test = test.drop(columns=["ID"])

# TARGET is the label; the features are every column except TARGET and ID.
y = train["TARGET"].values
X = train.drop(columns=["TARGET", "ID"])

# Hold out 20% of the labelled rows, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1729
)
print(*(part.shape for part in (X_train, X_test, test)))

# Feature selection: fit extra-trees on the training split only, then wrap
# the fitted model in a selector (prefit=True, so it is not refitted).
clf = ExtraTreesClassifier(random_state=1729)
clf.fit(X_train, y_train)
selector = clf
fs = SelectFromModel(estimator=selector, prefit=True)

# Apply the same selection to the training split, the hold-out split and the
# unlabelled test set.
X_train, X_test, test = (fs.transform(part) for part in (X_train, X_test, test))

print(*(part.shape for part in (X_train, X_test, test)))

print(fs.get_params())
Ejemplo n.º 5
0
# Drop the columns named in `remove` from train and test (mutating both).
train.drop(columns=remove, inplace=True)
test.drop(columns=remove, inplace=True)

# Pull the ID column out of the test frame before discarding it.
test_id = test["ID"]
test = test.drop(columns="ID")

# Feature matrix (no TARGET, no ID) and label vector.
X = train.drop(columns=["TARGET", "ID"])
y = train["TARGET"].values

# 80/20 split of the labelled data with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1729
)
print(X_train.shape, X_test.shape, test.shape, sep=" ")

# Feature selection from an extra-trees model fitted on the training split;
# fit() returns the estimator itself, so `clf` and `selector` are one object.
clf = selector = ExtraTreesClassifier(random_state=1729).fit(X_train, y_train)
fs = SelectFromModel(selector, prefit=True)

# Reduce all three matrices with the same fitted selector.
X_train, X_test, test = map(fs.transform, (X_train, X_test, test))

print(X_train.shape, X_test.shape, test.shape, sep=" ")

print(fs.get_params())