# --- House-price target transform + model-based feature selection ---------
# SalePrice is right-skewed; train selectors against log1p(SalePrice) so the
# error metric behaves like a relative error.
y_train = house_train['SalePrice']
sns.distplot(y_train, hist=True)  # NOTE(review): distplot is deprecated in newer seaborn — confirm installed version
y_trans = np.log1p(y_train)
sns.distplot(y_trans, hist=True)

# sklearn scorers maximize, so the log-RMSE loss is negated via
# greater_is_better=False.
scoring = metrics.make_scorer(log_rmse, greater_is_better=False)

# union of 3 feature selectors
# Selector 1: L1-regularized linear model — features with non-zero
# coefficients survive.
lasso_estimator = linear_model.Lasso()
lasso_grid = {'alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.2, 0.5]}
lasso_selector = get_best_model(lasso_estimator, lasso_grid,
                                X_train, y_trans, scoring=scoring)
plot_feature_importances(lasso_selector, X_train, 50)
important_features_lasso = get_important_features(lasso_selector, X_train)

# Selector 2: random-forest impurity importances (random_state pinned for
# reproducibility).
rf_estimator = ensemble.RandomForestRegressor(random_state=100)
rf_grid = {
    'n_estimators': list(range(100, 501, 200)),
    'max_features': [14, 16, 18, 20],
    'max_depth': [3, 5, 7],
}
rf_selector = get_best_model(rf_estimator, rf_grid,
                             X_train, y_trans, scoring=scoring)
plot_feature_importances(rf_selector, X_train, 50)
important_features_rf = get_important_features(rf_selector, X_train)
# --- Titanic: embedded feature selection via tree-ensemble importances ----
X_train = utils.ohe(titanic_train1, cat_features)
y_train = titanic_train['Survived']

#embedded feature selectors
# Random forest: tune, then keep features whose importance is >= the mean
# importance (threshold='mean'); prefit=True reuses the tuned estimator.
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(1, 9)),
    'n_estimators': list(range(1, 300, 100)),
}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid,
                                                   X_train, y_train)
embedded_selector = feature_selection.SelectFromModel(rf_final_estimator,
                                                      prefit=True,
                                                      threshold='mean')
X_train1 = embedded_selector.transform(X_train)
utils.plot_feature_importances(rf_final_estimator, X_train)

# Gradient boosting: same embedded-selection recipe with a boosted model.
gb_estimator = ensemble.GradientBoostingClassifier()
gb_grid = {
    'max_depth': [1, 2, 3],
    'n_estimators': list(range(50, 300, 100)),
    'learning_rate': [0.001, 0.1, 1.0],
}
gb_final_estimator = cutils.grid_search_best_model(gb_estimator, gb_grid,
                                                   X_train, y_train)
embedded_selector = feature_selection.SelectFromModel(gb_final_estimator,
                                                      prefit=True,
                                                      threshold='mean')
X_train1 = embedded_selector.transform(X_train)
utils.plot_feature_importances(gb_final_estimator, X_train)
house1[imputable_cont_features]) house1.info() house2 = utils.ohe(house1, imputable_cat_features) scaler = utils.get_scaler(house2) house3 = scaler.transform(house2) house3 = pd.DataFrame(house3, columns=house2.columns) X_train = house3[:house_train.shape[0]] y_train = house_train['SalePrice'] lasso_selector = linear_model.Lasso() lasso_selector.fit(X_train, y_train) print(lasso_selector.coef_) utils.plot_feature_importances(lasso_selector, X_train, 40) X_train1 = utils.select_features(lasso_selector, X_train) utils.corr_heatmap(X_train1) lpca = decomposition.PCA(0.95) lpca.fit(X_train1) print(np.cumsum(lpca.explained_variance_ratio_)) pca_data = lpca.transform(X_train1) print(pca_data.shape) tsne = manifold.TSNE(n_components=2) tsne_data = tsne.fit_transform(pca_data) rutils.plot_data_3d_regression(tsne_data, y_train) scoring = metrics.make_scorer(log_rmse, greater_is_better=False)
# --- Feature-selection strategies demo (each one recomputes train2) -------

#filter zero-variance features
variance = feature_selection.VarianceThreshold()
train2 = variance.fit_transform(train1)

#embedded feature selection
# Tune a random forest, then keep features with importance >= mean.
rf_estimator = ensemble.RandomForestClassifier()
rf_grid = {
    'max_depth': list(range(1, 9)),
    'n_estimators': list(range(1, 300, 100)),
}
rf_final_estimator = cutils.grid_search_best_model(rf_estimator, rf_grid,
                                                   train1, y)
embedded_selector = feature_selection.SelectFromModel(rf_final_estimator,
                                                      prefit=True,
                                                      threshold='mean')
utils.plot_feature_importances(rf_final_estimator, train1, cutoff=50)
train2 = embedded_selector.transform(train1)

#statistical feature selection
# Univariate ANOVA F-test; keep the 20 highest-scoring features.
statistical_selector = feature_selection.SelectKBest(
    feature_selection.f_classif, k=20)
train2 = statistical_selector.fit_transform(train1, y)
print(statistical_selector.scores_)

#recursive feature elimination(rfe)
# Repeatedly refit, dropping 5 features per round until 10 remain.
rf_estimator = ensemble.RandomForestClassifier()
rfe_selector = feature_selection.RFE(rf_estimator,
                                     n_features_to_select=10,
                                     step=5)
train2 = rfe_selector.fit_transform(train1, y)
print(rfe_selector.ranking_)