import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support


class SupervisedClassificationModels:
    """Fits multiple supervised classification models (Logistic Regression,
    Random Forest and Support Vector Classifier) on the data.

    Args:
        predictors (numpy-matrix or pandas-dataframe): Matrix of predictor variables (or features)
        outcome (numpy-array): Array of outcome (or target variable)
        test_frac (float): Fraction of data to be considered as test set
        col_ind (list): List containing indices of columns of categorical type
        class_report (boolean): Default 'False'; if 'True' then the
            classification report is saved in a dataframe.
        matrix (boolean): Default 'False'; if 'True' the predictors are treated
            as a numpy matrix instead of a pandas dataframe.
    """

    def __init__(self, predictors, outcome, test_frac, col_ind,
                 class_report=False, matrix=False):
        self._predictors = predictors
        self._outcome = outcome
        self._test_frac = test_frac
        self._col_ind = col_ind
        self._class_report = class_report
        self._matrix = matrix
        self._predictors_temp = None
        self._rf = None  # A Random-Forest object
        self._lr = None  # A Logistic-Regression object
        self._sv = None  # A Support-Vector object
        self._classification_report_rf = None  # Classification dataframe: RF
        self._classification_report_lr = None  # Classification dataframe: LR
        self._classification_report_sv = None  # Classification dataframe: SV

    def _feature_engineering(self):
        """Convert categorical features (including binary features) into
        labels and then create dummies of those features.

        Args:
            None
        Return:
            Transformed version of predictors
        """
        if self._matrix:
            # Label-encode the categorical columns of the numpy matrix, then
            # one-hot encode them in place. Note: the `categorical_features`
            # argument requires an older scikit-learn release (it has since
            # been removed in favour of ColumnTransformer).
            self._predictors[:, self._col_ind] = np.apply_along_axis(
                lambda col: LabelEncoder().fit_transform(col), 0,
                self._predictors[:, self._col_ind])
            self._predictors = OneHotEncoder(
                categorical_features=self._col_ind,
                sparse=False).fit_transform(self._predictors)
            return self._predictors
        else:
            # Split the categorical columns off from the rest of the dataframe.
            cat_data = self._predictors.iloc[:, self._col_ind]
            self._predictors_temp = self._predictors.drop(
                self._predictors.columns[self._col_ind], axis=1)
            self._predictors = pd.DataFrame()
            dummy_col_names = []
            # Label-encode each categorical column (indexed by its position
            # within cat_data) and build names for the dummy columns.
            for i, col in enumerate(self._col_ind):
                LE = LabelEncoder()
                cat_data.iloc[:, i] = LE.fit_transform(
                    list(cat_data.iloc[:, i]))
                new_names_temp = [
                    str(LE.classes_[j]) + '_' + str(j)
                    for j in range(len(LE.classes_))
                ]
                dummy_col_names.extend(new_names_temp)
            OHE = OneHotEncoder(sparse=False)
            cat_data = OHE.fit_transform(cat_data)
            cat_data = pd.DataFrame(cat_data, columns=dummy_col_names)
            # Recombine the untouched columns with the new dummy columns.
            self._predictors = pd.concat([self._predictors_temp, cat_data],
                                         axis=1)
            return self._predictors

    def _train_test_split(self):
        """Split the data into train and test sets for further modeling.
        Returns a tuple of train and test sets.

        Args:
            None
        Return:
            X_train: Matrix of predictors - train set
            X_test: Matrix of predictors - test set
            y_train: Array of outcome - train set
            y_test: Array of outcome - test set
        """
        X_train, X_test, y_train, y_test = train_test_split(
            self._predictors, self._outcome,
            test_size=self._test_frac, random_state=0)
        return X_train, X_test, y_train, y_test

    def fit_logistic_regression(self):
        """Fit a Logistic-Regression model on the data.
        Args:
            None
        Return:
            lr: Logistic-Regression object
            cm: Confusion-Matrix
        """
        self._predictors = self._feature_engineering()
        X_train, X_test, y_train, y_test = self._train_test_split()

        # Feature scaling
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Fit the model
        lr = LogisticRegression(random_state=101)
        lr = lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)
        cm = confusion_matrix(y_pred=y_pred, y_true=y_test)
        p, r, f, s = precision_recall_fscore_support(y_true=y_test,
                                                     y_pred=y_pred)
        if self._class_report:
            self._classification_report_lr = pd.DataFrame({
                'Precision': p,
                'Recall': r,
                'F_score': f,
                'Support': s,
                'Class': np.unique(y_test),
                'Model': ['LR'] * len(set(y_test))
            })
            return lr, cm, self._classification_report_lr
        return lr, cm

    def fit_random_forest(self):
        """Fit a Random-Forest model on the data.

        Args:
            None
        Return:
            rf: Random-Forest object
            cm: Confusion-Matrix
        """
        self._predictors = self._feature_engineering()
        X_train, X_test, y_train, y_test = self._train_test_split()

        # Feature scaling
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Fit the model
        self._rf = RandomForestClassifier(random_state=101)
        self._rf = self._rf.fit(X_train, y_train)
        y_pred = self._rf.predict(X_test)
        cm = confusion_matrix(y_pred=y_pred, y_true=y_test)
        p, r, f, s = precision_recall_fscore_support(y_true=y_test,
                                                     y_pred=y_pred)
        if self._class_report:
            self._classification_report_rf = pd.DataFrame({
                'Precision': p,
                'Recall': r,
                'F_score': f,
                'Support': s,
                'Class': np.unique(y_test),
                'Model': ['RF'] * len(set(y_test))
            })
            return self._rf, cm, self._classification_report_rf
        return self._rf, cm

    def plot_feature_importance(self):
        """Plot the relative importance of the top features from the fitted
        Random-Forest model (fit_random_forest must be called first)."""
        plt.rc('ytick', labelsize=14)
        plt.rc('xtick', labelsize=12)

        # Plot feature importance for Random-Forest
        feature_importance = self._rf.feature_importances_
        features = self._predictors.columns
        ft_imp_df = pd.DataFrame({
            'Features': features,
            'Feature_Importance': feature_importance
        })
        ft_imp_df.sort_values(by='Feature_Importance', ascending=False,
                              inplace=True)
        ft_imp_df.reset_index(inplace=True, drop=True)

        fig, ax = plt.subplots(figsize=(7, 4))
        ft_imp_df.head().plot(y='Feature_Importance', x='Features', ax=ax,
                              kind='barh')
        ax.set_xlabel('Relative Importance of Features', fontsize=18)
        ax.set_ylabel('Features', fontsize=18)
        ax.set_title('RF: Feature Importance Plot \n (Customer Churn Example)',
                     fontsize=20)
        return fig, ax

    def fit_support_vector_classifier(self):
        """Fit a Support-Vector-Classifier on the data.
        Args:
            None
        Return:
            sv: Support-Vector-Classifier object
            cm: Confusion-Matrix
        """
        self._predictors = self._feature_engineering()
        X_train, X_test, y_train, y_test = self._train_test_split()

        # Feature scaling
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        # Fit the model
        sv = svm.SVC(random_state=101)
        sv = sv.fit(X_train, y_train)
        y_pred = sv.predict(X_test)
        cm = confusion_matrix(y_pred=y_pred, y_true=y_test)
        p, r, f, s = precision_recall_fscore_support(y_true=y_test,
                                                     y_pred=y_pred)
        if self._class_report:
            self._classification_report_sv = pd.DataFrame({
                'Precision': p,
                'Recall': r,
                'F_score': f,
                'Support': s,
                'Class': np.unique(y_test),
                'Model': ['SV'] * len(set(y_test))
            })
            return sv, cm, self._classification_report_sv
        return sv, cm

    @staticmethod
    def compare_models(lr_df, rf_df, sv_df):
        """Plot precision, recall and F-score of the three models side by
        side, given the classification-report dataframes returned by the
        fit_* methods when class_report=True."""
        comparison_df = pd.concat([lr_df, rf_df, sv_df])

        plt.rc('xtick', labelsize=14)
        plt.rc('legend', fontsize=14)
        fig, ax = plt.subplots(figsize=(10, 5))
        # Group the bars by model and class on the x-axis.
        comparison_df.set_index(['Model', 'Class']).plot(
            y=['Precision', 'Recall', 'F_score'], kind='bar', ax=ax)
        ax.set_xlabel('Model & Class', fontsize=18)
        ax.set_ylabel('Performance', fontsize=18)
        ax.set_ylim(0, 1.2)
        ax.set_title('Comparison of Models \n (Customer Churn Example)',
                     fontsize=20)
        ax.legend(bbox_to_anchor=(0.88, 1), ncol=3)
        return fig, ax
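
# A minimal usage sketch, not part of the original notebook: the dataframe,
# column names and values below are made up purely to illustrate the call
# order of the class above. Like the class itself, it assumes a scikit-learn
# release in which OneHotEncoder(sparse=False) is still accepted.
demo_df = pd.DataFrame({
    'tenure': [1, 5, 12, 24, 3, 48, 7, 36, 2, 60, 9, 18],
    'contract': ['month', 'month', 'year', 'year', 'month', 'two_year',
                 'month', 'year', 'month', 'two_year', 'month', 'year'],
    'monthly_charges': [29.0, 56.5, 42.3, 70.1, 99.9, 20.0,
                        61.2, 80.5, 35.4, 18.9, 77.7, 50.0],
    'churn': [1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0],
})

demo_models = SupervisedClassificationModels(
    predictors=demo_df.drop('churn', axis=1),
    outcome=demo_df['churn'],
    test_frac=0.25,
    col_ind=[1],           # 'contract' is the only categorical column here
    class_report=False)    # with class_report=True the fit_* methods also
                           # return report dataframes for compare_models()
demo_lr, demo_cm = demo_models.fit_logistic_regression()
print(demo_cm)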
# In[134]:

# Label-encode every column of the dataframe in one pass.
df = df.apply(LabelEncoder().fit_transform)


# In[135]:

from sklearn.feature_selection import chi2


# In[137]:

# Separate the predictors from the target variable.
X = df.drop(['transportation_issues', 'person_id_syn'], axis=1)
y = df['transportation_issues']


# In[138]:

X
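
# chi2 is imported above but not applied in these cells; the sketch below is
# an assumption about how it might be used for univariate feature selection,
# not code from the original notebook. SelectKBest scores the label-encoded
# (hence non-negative) predictors in X against y with the chi-squared test.
from sklearn.feature_selection import SelectKBest

selector = SelectKBest(score_func=chi2, k='all').fit(X, y)
chi2_scores = pd.DataFrame({
    'Feature': X.columns,
    'Chi2_score': selector.scores_,
    'p_value': selector.pvalues_
}).sort_values(by='Chi2_score', ascending=False)
chi2_scores.head()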