import os

import click
import matplotlib.pyplot as plt
import numpy
from sklearn.cluster import KMeans
from sklearn.feature_selection import VarianceThreshold


def variance_sweep(features, labels, seed, save_plot, show_plot, step=500):
    """Performs a sweep across the range of variance to establish the value
    of removing low-variance pixels."""
    model = KMeans(n_clusters=10)
    variance_analysis = []
    selector = VarianceThreshold()
    selector.fit(features)

    # Sweeps through the variance range
    print("Performing sweep of variance thresholding...")
    # The bounds correspond to approximately 0 features selected and all
    # features selected:
    #   lower bound 2900
    #   upper bound 6450
    with click.progressbar(range(2900, 6450, step)) as variance_range:
        for variance in variance_range:
            selector.set_params(threshold=variance)
            selected_features = selector.transform(features)
            # `seed` is a full RNG state tuple (as returned by
            # numpy.random.get_state()), restored before each fit so every
            # KMeans run starts from the same random state
            numpy.random.set_state(seed)
            predictions = model.fit_predict(selected_features)
            variance_analysis.append(
                (variance, score_clustering(labels, predictions)))

    # Plots results from the variance sweep
    if show_plot or save_plot:
        data = list(zip(*[(x, *y.values()) for x, y in variance_analysis]))
        name = "Variance"
        handles = plt.plot(data[0], data[3], '-b', label=name + " V Score")
        handles += plt.plot(data[0], data[4], '--b', label=name + " Rand")
        plt.legend(handles=handles, loc="lower left")
        plt.xlabel("Variance Threshold for Feature Selection")
        plt.title("Effect of Variance Threshold Feature Selection")
        if save_plot is not None:
            path = os.path.join(save_plot, "variance_sweep.png")
            plt.savefig(path)
            print("")
            print("saved figure to " + path)
        if show_plot:
            plt.show()
        plt.clf()

    # Plots the variance of each pixel as a heatmap
    plt.imshow(selector.variances_.reshape(48, -1), cmap='hot',
               interpolation='lanczos')
    plt.title("Heatmap of Variances between Images")
    if save_plot is not None:
        path = os.path.join(save_plot, "variance_heatmap.png")
        plt.savefig(path)
        print("")
        print("saved figure to " + path)
    if show_plot:
        plt.show()
    plt.clf()
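
# Hedged usage sketch (illustrative; not part of the original module).
# It fabricates 48x48 "images" whose per-pixel variances span the swept
# range, so the heatmap reshape above works. `score_clustering` is assumed
# to be defined elsewhere in this module and to return a dict of
# clustering metrics.
if __name__ == "__main__":
    scales = numpy.linspace(55, 85, 48 * 48)  # per-pixel standard deviation
    features = numpy.random.randn(200, 48 * 48) * scales  # variances ~3000-7200
    labels = numpy.random.randint(0, 10, size=200)  # fake ground-truth classes
    state = numpy.random.get_state()  # full RNG state tuple, restored per fit
    variance_sweep(features, labels, state, save_plot=None, show_plot=False)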
import numpy as np
from sklearn.feature_selection import VarianceThreshold as VarThr

from niaaml.preprocessing.feature_selection.feature_selection_algorithm import \
    FeatureSelectionAlgorithm
from niaaml.utilities import MinMax, ParameterDefinition


class VarianceThreshold(FeatureSelectionAlgorithm):
    r"""Implementation of feature selection using variance threshold.

    Date:
        2020

    Author:
        Luka Pečnik

    License:
        MIT

    Documentation:
        https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html

    See Also:
        * :class:`niaaml.preprocessing.feature_selection.feature_selection_algorithm.FeatureSelectionAlgorithm`
    """
    Name = 'Variance Threshold'

    def __init__(self, **kwargs):
        r"""Initialize VarianceThreshold feature selection algorithm."""
        self._params = dict(
            threshold=ParameterDefinition(MinMax(0, 0.1), np.float64))
        self.__variance_threshold = VarThr()

    def set_parameters(self, **kwargs):
        r"""Set the parameters/arguments of the algorithm."""
        self.__variance_threshold.set_params(**kwargs)

    def select_features(self, x, y, **kwargs):
        r"""Perform the feature selection process.

        Arguments:
            x (pandas.core.frame.DataFrame): Array of original features.
            y (pandas.core.series.Series): Expected classifier results.

        Returns:
            numpy.ndarray[bool]: Mask of selected features.
        """
        self.__variance_threshold.fit(x)
        return self.__variance_threshold.get_support()

    def to_string(self):
        r"""User friendly representation of the object.

        Returns:
            str: User friendly representation of the object.
        """
        return FeatureSelectionAlgorithm.to_string(self).format(
            name=self.Name,
            args=self._parameters_to_string(
                self.__variance_threshold.get_params()))
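
# Hedged usage sketch (illustrative; not part of the module). Column 'a'
# is constant, so its variance (0.0) falls below the 0.05 threshold and
# its mask entry is False; column 'b' has variance 0.1 and survives.
#
#   import pandas as pd
#   fs = VarianceThreshold()
#   fs.set_parameters(threshold=0.05)
#   x = pd.DataFrame({'a': [1, 1, 1, 1], 'b': [0.1, 0.9, 0.3, 0.7]})
#   mask = fs.select_features(x, None)   # -> array([False,  True])
#   x_selected = x.loc[:, mask]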
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler

# PredictiveProcessor, is_running_from_ipython, and logger are assumed to be
# defined elsewhere in this module.


def main():
    # get_ipython().run_line_magic('matplotlib', 'inline')
    data_dir = 'data'
    file_name = 'credit_card_default.csv'
    # column_names = ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6,BILL_AMT1,BILL_AMT2,BILL_AMT3,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default payment next month
    file_url = ''
    if not is_running_from_ipython():
        abspath = os.path.abspath('.')
        file_url = 'file://' + abspath + os.path.sep + data_dir + os.path.sep + file_name
    else:
        file_url = os.path.join('../', data_dir, file_name)
    # logging.debug('abspath %s', abspath)
    pp = PredictiveProcessor()
    pp.file_url = file_url
    df = pp.data_read_csv()
    pp.df = df
    # pp.problem_understanding()
    pp.data_preparation()
    pp.data_analysis_exploratory()
    pp.data_model_building()
    pp.model_evaluation()
    pp.model_deployment()
    logger.info("LL: -----------------------------------------------")

    # check whether any column holds null values
    for column in df.columns:
        # logger.info("LL: column %s datatype %s is having null : %s , about %s", column, df.dtypes[column], df[column].isnull().values.any(), df[column].isnull().values.sum())
        # logger.info("LL: column %s datatype %s is having NA : %s , about %s", column, df.dtypes[column], df[column].isna().values.any(), df[column].isna().values.sum())
        if df[column].isnull().values.any():  # isna() is an alias of isnull()
            if df.dtypes[column] == 'int64':
                logger.info("LL: yes int64")
                # df_example = df[column].fillna(0)
            else:
                logger.info("LL: not int64")
    # alternatively use missingno (see the hedged sketch further below)

    logger.info("LL: df.shape \n%s", df.shape)
    logger.info("LL: df correlation \n%s", df.corr().round(2))
    logger.info("LL: df correlation columns \n%s", df.corr().columns)
    logger.info("LL: df correlation index \n%s", df.corr().index)
    logger.info("LL: df covariance \n%s", df.cov().round(2))
    logger.info("LL: df covariance columns \n%s", df.cov().columns)
    logger.info("LL: df covariance index \n%s", df.cov().index)

    # drop rows with null values
    df_dropped_rows_na = df.dropna(axis=0)
    logger.info("LL: df_dropped_rows_na.shape %s", df_dropped_rows_na.shape)
    # drop columns with null values
    df_dropped_cols_na = df.dropna(axis=1)
    logger.info("LL: df_dropped_cols_na.shape %s", df_dropped_cols_na.shape)
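
    # Hedged sketch: the optional missingno package (assumed installed; not
    # part of the original flow) visualizes the same nullity information
    # checked above.
    # import missingno as msno
    # msno.matrix(df)   # nullity matrix; white bands mark missing cells
    # msno.bar(df)      # per-column counts of non-null values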
    # impute NaN with new values
    # missing_values = the form of missing values in your data (for example
    #                  nan, 0, or "n/a")
    # strategy = how to impute (choices are "mean", "median", "most_frequent",
    #            and "constant"); if you pass strategy="constant", use the
    #            optional argument fill_value to pass your constant
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    cols_to_impute = df.columns
    out_imp = imputer.fit_transform(df[cols_to_impute])
    df_new = pd.DataFrame(data=out_imp, columns=cols_to_impute)
    # df_new = pd.concat([df_new, df[['species']]], axis=1)

    minmax_scaler = MinMaxScaler()
    cols_to_minmaxscale = df.columns
    out_scaled_minmax = minmax_scaler.fit_transform(df[cols_to_minmaxscale])

    standard_scaler = StandardScaler()
    cols_to_standardscale = df.columns
    out_scaled_standard = standard_scaler.fit_transform(
        df[cols_to_standardscale])

    # encode categorical non-int features
    categorical_features_nonint = []
    for column in df.columns:
        if df.dtypes[column] != 'int64' and df.dtypes[column] != 'int32':
            logger.info("LL: not int64 or int32")
            categorical_features_nonint.append(column)
            # df_example = df[column].fillna(0)
        else:
            logger.info("LL: yes int64 or int32")

    # Ordinal encoding
    from sklearn.preprocessing import OrdinalEncoder
    enc_ordinal = OrdinalEncoder()
    out_enc_ord_catg_feat_nonint = enc_ordinal.fit_transform(
        df[categorical_features_nonint])
    logger.info("LL: out_enc categories \n%s", enc_ordinal.categories_)
    logger.info("LL: out_enc \n%s", out_enc_ord_catg_feat_nonint)
    df[categorical_features_nonint] = out_enc_ord_catg_feat_nonint
    logger.info("LL: df_new \n%s", df.head())

    # One-hot encoding
    # (newer scikit-learn renames these to sparse_output=False and
    # get_feature_names_out)
    from sklearn.preprocessing import OneHotEncoder
    enc_onehot = OneHotEncoder(sparse=False)
    out_enc_onehot_catg_feat_nonint = enc_onehot.fit_transform(
        df[categorical_features_nonint])
    new_cols_onehot_catg_feat_nonint = enc_onehot.get_feature_names(
        categorical_features_nonint).tolist()
    logger.info("LL: new_cols \n%s", new_cols_onehot_catg_feat_nonint)

    # Label encoding (LabelEncoder expects a single 1-D array, typically
    # the target variable)
    from sklearn import preprocessing
    enc_label = preprocessing.LabelEncoder()
    out_enc_label = enc_label.fit_transform(df['default payment next month'])

    # Dimension Reduction
    #   Feature Selection
    #     Feature Filtering
    #       Variance Threshold
    #       Correlation Coefficient

    # Variance Threshold
    from sklearn.feature_selection import VarianceThreshold
    selector = VarianceThreshold()
    cols = df.columns
    # cols = categorical_features_nonint
    selector.fit(df[cols])
    # check feature variances before selection
    logger.info("LL: variance threshold \n%s", selector.variances_)
    # set threshold on the selector object
    selector.set_params(threshold=1.0)
    out_sel = selector.fit_transform(df[cols])
    logger.info("LL: selector.get_support() \n%s", selector.get_support())
    df_sel = df.iloc[:, selector.get_support()]
    # add labels to the new dataframe and sanity check
    df_sel = pd.concat([df_sel, df[['default payment next month']]], axis=1)
    logger.info("LL: df_sel.head() \n%s", df_sel.head())

    # Correlation Coefficient
    cor = df.corr()
    sns.heatmap(cor, annot=False, cmap=plt.cm.Blues)
    plt.show()
    # get correlation values with the target variable
    cor_target = abs(cor['default payment next month'])
    logger.info("LL: cor_target \n%s", cor_target)
    # For demonstration purposes, we will choose 0.6 as the threshold and
    # then filter. From the output, you should expect columns 5 and 12
    # (0.69 and 0.74) to be selected:
    selected_cols = cor_target[cor_target > 0.6]
    logger.info("LL: selected columns, correlation with target > 0.6")
    logger.info("LL: selected_cols \n%s", selected_cols)
    # filter in the selected features
    df_sel = df[selected_cols.index]
    logger.info("LL: df_sel.head() \n%s", df_sel.head())

    # Wrapper Methods
    #   Sequential Feature Selection
    #     Forward Sequential Selection and Backward Sequential Selection
    # Use LinearRegression() for continuous target variables and
    # RandomForestClassifier() for categorical target variables.
    # We will use the Support Vector Machine Classifier ("SVC") as the
    # estimator for our example RFE.
    # Now let's import our modules and define the independent (X) and
    # dependent (y) variables for the SVC:
    from sklearn.feature_selection import RFE
    from sklearn.svm import SVC

    # exclude the target from the feature matrix so it does not leak into
    # the fit
    cols = df.columns.drop('default payment next month')
    X = df[cols]
    y = df['default payment next month']
    svc = SVC(kernel="linear", C=1)
    rfe = RFE(estimator=svc, n_features_to_select=2, step=1)
    rfe.fit(X, y)
    logger.info("LL: cols \n%s", cols)
    logger.info("LL: rfe.ranking_ \n%s", rfe.ranking_)
    logger.info("LL: -----------------------------------------------")
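
    # Hedged sketch (illustrative addition): mapping the fitted RFE back to
    # column names via its standard support_ boolean mask.
    selected_cols_rfe = [col for col, keep in zip(cols, rfe.support_) if keep]
    logger.info("LL: RFE selected columns \n%s", selected_cols_rfe)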