def modelPrompt(x_train, y_train, x_standard, y, classifier, name):
    """Interactively run model-based feature selection and plot the result.

    Keeps asking on stdin for a positive integer, hands it to
    ``feature_selection.modelFeatSelect`` together with ``classifier`` and
    prints whatever that helper returns.  When the returned table is not
    empty, its ``'Feature'`` column is used to subset both ``x_standard``
    and ``x_train``; a fresh ``LogisticRegression(C=1)`` is fitted on the
    training subset and passed to ``evaluate_model.plot_learning_curve``.

    Note that ``classifier`` is only forwarded to the selection helper; the
    learning curve is always drawn with logistic regression.

    Args:
        x_train: Training features, indexable by a list of feature names.
        y_train: Training targets.
        x_standard: Full feature table, indexable by a list of feature names.
        y: Targets matching ``x_standard``.
        classifier: Estimator forwarded to ``modelFeatSelect``.
        name: Label inserted into the plot title.
    """
    # Phase 1: keep prompting until the user supplies a positive integer.
    while True:
        raw = input(
            "Please choose the maximum number of features to select: ")
        try:
            max_features = int(raw)
        except ValueError:
            print("The input you entered is not an integer. Please try again.")
            continue
        if max_features > 0:
            break
        print("The integer you entered is not positive. Please try again.")

    # Phase 2: run the selection once and report it.
    selection = feature_selection.modelFeatSelect(x_train, y_train,
                                                  classifier, max_features)
    print(selection)
    if selection.empty:
        # Nothing was selected, so there is nothing to plot.
        return

    chosen = list(selection['Feature'])
    x_selected = x_standard[chosen]
    x_train_selected = x_train[chosen]
    plot_title = "Learning Curves (" + name + ", log regression model)"

    model = LogisticRegression(C=1)
    model.fit(x_train_selected, y_train)
    evaluate_model.plot_learning_curve(model, plot_title, x_selected, y, cv=5)
def uniPrompt(x_train, y_train, x_standard, y):
    """Interactively run univariate feature selection and plot the result.

    Prints a banner, then keeps asking on stdin for a positive integer.
    That number is passed to ``feature_selection.univariance``; the returned
    table is printed and its ``'Feature'`` column is used to subset
    ``x_standard`` and ``x_train``.  A ``LogisticRegression(C=1)`` is fitted
    on the training subset and handed to
    ``evaluate_model.plot_learning_curve`` with 5-fold cross-validation.

    Args:
        x_train: Training features, indexable by a list of feature names.
        y_train: Training targets.
        x_standard: Full feature table, indexable by a list of feature names.
        y: Targets matching ``x_standard``.
    """
    print("Starting Univariance Feature Selection: ")

    # Keep prompting until the user supplies a positive integer.
    while True:
        raw = input("Please choose the number of features to select: ")
        try:
            n_features = int(raw)
        except ValueError:
            print("The input you entered is not an integer. Please try again.")
            continue
        if n_features > 0:
            break
        print("The integer you entered is not positive. Please try again.")

    ranking = feature_selection.univariance(x_train, y_train, n_features)
    print(ranking)

    chosen = list(ranking['Feature'])
    x_selected = x_standard[chosen]
    x_train_selected = x_train[chosen]
    plot_title = "Learning Curves (univariance, log regression model)"

    model = LogisticRegression(C=1)
    model.fit(x_train_selected, y_train)
    evaluate_model.plot_learning_curve(model, plot_title, x_selected, y, cv=5)
def evalModelPrompt(x_train, x_test, y_train, y_test, x_standard, y,
                    classifier, name):
    """Pick a feature subset via ``getBestK`` and plot its learning curve.

    Announces the search, asks ``feature_selection.getBestK`` for the
    selected features, refits ``classifier`` in place on the training data
    restricted to those features, and passes the fitted estimator to
    ``evaluate_model.plot_learning_curve`` with 5-fold cross-validation.

    Args:
        x_train: Training features, indexable by a list of feature names.
        x_test: Test features, forwarded to ``getBestK``.
        y_train: Training targets.
        y_test: Test targets, forwarded to ``getBestK``.
        x_standard: Full feature table, indexable by a list of feature names.
        y: Targets matching ``x_standard``.
        classifier: Estimator to evaluate; it is refitted by this call.
        name: Label used in the console message and the plot title.
    """
    print("Find the best number of features using " + name + "... ")
    best_features = feature_selection.getBestK(x_train, x_test, y_train,
                                               y_test, classifier, name)

    columns = list(best_features)
    x_selected = x_standard[columns]
    x_train_selected = x_train[columns]

    plot_title = "Learning Curves with " + name + " (use selected features)"
    classifier.fit(x_train_selected, y_train)
    evaluate_model.plot_learning_curve(classifier, plot_title, x_selected, y,
                                       cv=5)
def RFEPrompt(x_train, y_train, x_standard, y, classifier, name):
    """Run RFE-based feature selection and plot the resulting learning curve.

    Prints a banner, calls ``feature_selection.RFEFeatSelect`` and prints
    its result.  If that result is empty nothing else happens.  Otherwise
    the ``'Feature'`` column is used to subset ``x_standard`` and
    ``x_train``, ``classifier`` is refitted in place on the training subset,
    and the fitted estimator is passed to
    ``evaluate_model.plot_learning_curve`` with 5-fold cross-validation.

    Args:
        x_train: Training features, indexable by a list of feature names.
        y_train: Training targets.
        x_standard: Full feature table, indexable by a list of feature names.
        y: Targets matching ``x_standard``.
        classifier: Estimator used for selection and for the learning curve.
        name: Label inserted into the plot title.
    """
    print("Starting RFE: ")
    selection = feature_selection.RFEFeatSelect(x_train, y_train, classifier)
    print(selection)

    if selection.empty:
        # No features survived, so there is nothing to fit or plot.
        return

    chosen = list(selection['Feature'])
    x_selected = x_standard[chosen]
    x_train_selected = x_train[chosen]
    plot_title = ("Learning Curves with " + name +
                  " (use current feature subset)")

    classifier.fit(x_train_selected, y_train)
    evaluate_model.plot_learning_curve(classifier, plot_title, x_selected, y,
                                       cv=5)
def _promptCsvPath():
    """Ask on stdin until the answer names an existing file ending in '.csv'."""
    while True:
        csv_path = input("Please provide a path to a csv file: ")
        if not os.path.exists(csv_path):
            print("The path that you provided is incorrect. Please try again.")
        elif not os.path.isfile(csv_path):
            print(
                "The path that you provided is not a file. Please try again.")
        elif not csv_path.endswith('.csv'):
            print(
                "The path that you provided is not a csv file. Please try again."
            )
        else:
            return csv_path


def _chooseEstimators(x_train, x_test, y_train, y_test, x_standard, y):
    """Ask for classification or regression and return the chosen models.

    Returns a list of ``(estimator, name)`` pairs as reported by
    ``feature_selection.optimalClassifier`` / ``optimalRegression``, after
    plotting a learning curve for each.  For classification the user is also
    offered ``evalModelPrompt`` per model.  The list is empty when the answer
    is neither ``'c'`` nor ``'r'``.
    """
    task = input(
        "Would you like to perform classification or regression? (c/r) ")
    if task == 'c':
        print("Begin to test classifiers... ")
        first, first_name, second, second_name = (
            feature_selection.optimalClassifier(x_train, x_test, y_train,
                                                y_test))
    elif task == 'r':
        print("Begin to test regression models... ")
        first, first_name, second, second_name = (
            feature_selection.optimalRegression(x_train, x_test, y_train,
                                                y_test))
    else:
        print("Your input doesn't match classification or regression.")
        return []

    estimators = [(first, first_name), (second, second_name)]

    # Plot the learning curve of each candidate on the full feature set.
    for estimator, name in estimators:
        title = "Learning Curves (" + name + ")"
        evaluate_model.plot_learning_curve(estimator, title, x_standard, y,
                                           cv=5)

    # The best-k search is only offered for classifiers.
    if task == 'c':
        for estimator, name in estimators:
            answer = input(
                "Do you want to find the best number of feature using " +
                name + "? (y/n) ")
            if answer == 'y':
                evalModelPrompt(x_train, x_test, y_train, y_test, x_standard,
                                y, estimator, name)
            else:
                print("Skip for " + name + ".")

    return estimators


def main():
    """Drive the interactive feature-selection workflow from the console.

    Steps, in order:

    1. Ask for a CSV path and load it with the first column as the index.
    2. Ask for the target column and split the frame with
       ``preprocessing.separateVars``.
    3. Encode the target (``encodePrompt``), clean and standardise the
       features, offer visualisation and feature transformation, then
       standardise again.
    4. Optionally split into train/test (70/30, ``random_state=0``), let the
       user pick classification or regression models, and run one feature
       selection method.

    The estimator-based methods (``RFE`` and ``optimal_classifier``) need the
    models chosen in step 4.  Previously those names were only bound on the
    classification path, so picking either method after answering ``r`` or
    an unrecognised value raised ``NameError``.  Now the regression models
    are used on the regression path, and with no models at all the user is
    told so and asked to pick another method.
    """
    print("Begin Operation... ")
    print(
        "Notice: Non-numeric data besides the target class (can be categorical) will be ignored... "
    )

    # get path to csv file
    csv_path = _promptCsvPath()
    df = pd.read_csv(csv_path, index_col=0)

    # get target class
    while True:
        target_name = input("Please provide the name of the target class: ")
        # Membership on the column index is what `in df.head()` tested.
        if target_name in df.columns:
            break
        print(
            "The name that you provided doesn't exist in this csv file. Please try again."
        )
    x, y = preprocessing.separateVars(df, target_name)

    # get target class encoding
    y = encodePrompt(y, target_name)

    # remove non numeric data from feature columns
    x = preprocessing.cleanData(x)
    print("Non-numeric columns and columns with missing values removed.")

    # normalize data
    x_standard = preprocessing.standardization(x)
    print("Data Normalized (z-score).")

    # visualize data
    visualPrompt(x_standard, y)

    # prompt for feature transformation, then re-standardise the result
    x_standard, y = transformationPrompt(x_standard, y)
    x_standard = preprocessing.standardization(x_standard)

    answer = input("Would you like to perform feature selection? (y/n) ")
    if answer == 'y':
        # split dataset to train and test
        x_train, x_test, y_train, y_test = train_test_split(x_standard,
                                                            y,
                                                            test_size=0.3,
                                                            random_state=0)

        estimators = _chooseEstimators(x_train, x_test, y_train, y_test,
                                       x_standard, y)

        # feature selection
        while True:
            method = input(
                "Please choose a feature selection method(RFE/univariance/lasso/optimal_classifier/MRMR): "
            )
            if method in ('RFE', 'optimal_classifier') and not estimators:
                # Without a chosen model these methods cannot run; re-prompt
                # instead of failing on an unbound name.
                print(
                    "This method needs a classification or regression model, but none was selected. Please try again. "
                )
                continue

            if method == 'RFE':
                for estimator, name in estimators:
                    print("Using " + name)
                    RFEPrompt(x_train, y_train, x_standard, y, estimator,
                              name)
                break
            elif method == 'univariance':
                uniPrompt(x_train, y_train, x_standard, y)
                break
            elif method == 'lasso':
                print("Using lasso")
                modelPrompt(x_train, y_train, x_standard, y, LassoCV(cv=5),
                            'Lasso')
                break
            elif method == 'optimal_classifier':
                for estimator, name in estimators:
                    print("Using " + name)
                    modelPrompt(x_train, y_train, x_standard, y, estimator,
                                name)
                break
            elif method == 'MRMR':
                result = feature_selection.mrmr(x_standard, y)
                print(result)
                break
            else:
                print(
                    "The method you provide is not an option. Please try again. "
                )
    else:
        print("Feature Selection is skipped. ")

    print("End of Operation.")