import pandas as pd

import Utils  # project-local helper module


def prepare_data_for_training(file_dir):
    dataframe = pd.read_csv(file_dir)

    # replace the string labels with numeric classes
    dataframe['feature_class'] = dataframe['feature_class'].replace(
        {'ASD': 1.0, 'TD': -1.0})

    # split the data into train and test sets
    x_train, x_test, y_train, y_test = Utils.train_test_split(
        dataframe=dataframe, test_size=0.2)

    # convert the dataframes to numpy arrays and return
    return (x_train.to_numpy(), x_test.to_numpy(),
            y_train.to_numpy(), y_test.to_numpy())
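# The function above delegates the split to a project-local
# Utils.train_test_split, which is not part of this excerpt. A minimal sketch
# of what such a helper might look like, assuming 'feature_class' is the label
# column and the split is a plain shuffled hold-out (the signature and
# behaviour are assumptions, not the project's actual code):
def train_test_split(dataframe, test_size):
    """Shuffle, then split a dataframe into train/test features and labels."""
    shuffled = dataframe.sample(frac=1).reset_index(drop=True)
    n_test = int(len(shuffled) * test_size)

    test, train = shuffled.iloc[:n_test], shuffled.iloc[n_test:]
    x_train = train.drop(columns='feature_class')
    x_test = test.drop(columns='feature_class')
    return x_train, x_test, train['feature_class'], test['feature_class']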
def process_training_data(fl_dir, min_max_scalar=None):
    df = pd.read_csv(fl_dir)

    # replace labels
    df['feature_class'] = df['feature_class'].replace({'ASD': 1.0, 'TD': -1.0})

    x_train, x_test, y_train, y_test = Utils.train_test_split(df, 0.2)

    # optional min-max normalization of both splits
    # if min_max_scalar is not None:
    #     x_train = Utils.normalize_dataset(x_train, min_max_scalar)
    #     x_test = Utils.normalize_dataset(x_test, min_max_scalar)

    return x_train, x_test, y_train, y_test
def process_training_data(fl_dir):
    df = pd.read_csv(fl_dir)

    # find the min/max of each column for normalization
    # min_max_scalar = Utils.calculate_min_max_scalar(pd.read_csv(fl_dir))

    # replace labels
    df['feature_class'] = df['feature_class'].replace({'ASD': 1.0, 'TD': -1.0})

    X_train, X_test, y_train, y_test = Utils.train_test_split(df, 0.2)

    # X_train = Utils.normalize_dataset(X_train, min_max_scalar)
    # X_test = Utils.normalize_dataset(X_test, min_max_scalar)

    # convert the dataframes to numpy arrays
    X_train = X_train.to_numpy()
    X_test = X_test.to_numpy()
    y_train = y_train.to_numpy()
    y_test = y_test.to_numpy()

    return X_train, X_test, y_train, y_test
def process_training_data(fl_dir, min_max_scalar):
    dataframe = pd.read_csv(fl_dir)

    # calculate the min and max value of each column in the dataframe
    # min_max_scalar = Utils.calculate_min_max_scalar(dataset=dataframe)

    # replace the labels
    dataframe['feature_class'] = dataframe['feature_class'].replace(
        {'ASD': 1.0, 'TD': -1.0})

    # split the data into train and test sets
    x_train, x_test, y_train, y_test = Utils.train_test_split(
        dataframe=dataframe, test_size=0.2)

    # normalize train and test data
    # x_train = Utils.normalize_dataset(df=x_train, min_max=min_max_scalar)
    # x_test = Utils.normalize_dataset(df=x_test, min_max=min_max_scalar)
    #
    # insert the intercept column, the 'b' in (W * Xi + b)
    # x_train.insert(loc=len(x_train.columns), column='intercept', value=1)
    # x_test.insert(loc=len(x_test.columns), column='intercept', value=1)

    return (x_train.to_numpy(), x_test.to_numpy(),
            y_train.to_numpy(), y_test.to_numpy())
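# The commented-out normalization steps above call two more Utils helpers that
# are not shown here. A plausible sketch, assuming the "scalar" is a
# {column: (min, max)} mapping (the names and the dict layout are assumptions):
def calculate_min_max_scalar(dataset):
    """Record each feature column's min and max for later rescaling."""
    return {col: (dataset[col].min(), dataset[col].max())
            for col in dataset.columns if col != 'feature_class'}


def normalize_dataset(df, min_max):
    """Rescale every recorded feature column to the [0, 1] range."""
    df = df.copy()
    for col, (lo, hi) in min_max.items():
        if col in df.columns and hi != lo:
            df[col] = (df[col] - lo) / (hi - lo)
    return df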
def is_leaf_node(self):
    # a node is a leaf once it stores a predicted value
    return self.value is not None


if __name__ == "__main__":
    df = pd.read_csv("D:/TrainingDataset_YEAR_PROJECT/TrainingSet.csv")

    # replace labels
    df['feature_class'] = df['feature_class'].replace({'ASD': 1.0, 'TD': -1.0})

    X_train, X_test, y_train, y_test = Utils.train_test_split(df, 0.2)
    X_train = X_train.values
    X_test = X_test.values
    y_train = y_train.values
    y_test = y_test.values

    decision_tree = DecisionTree(max_depth=10)
    decision_tree.train_decision_tree(X_train, y_train)
    y_pred = decision_tree.predict(X_test)

    accuracy_score = Utils.calculate_accuracy_score(y_test, y_pred)
    print("Accuracy: ", accuracy_score)
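# Utils.calculate_accuracy_score is another project-local helper that this
# excerpt does not define. A minimal sketch, assuming it simply returns the
# fraction of predictions that match the true labels:
import numpy as np


def calculate_accuracy_score(y_true, y_pred):
    """Fraction of predictions equal to the true labels."""
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    return float(np.mean(y_true == y_pred))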
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier


def train_svm_model(fl_dir):
    # X_train, X_test, Y_train, Y_test = SupportVectorMachine.proc_CSV_data(fl_dir)
    dataframe = pd.read_csv(fl_dir)
    dataframe["feature_class"] = dataframe["feature_class"].replace(
        {"ASD": 1, "TD": 0})

    # shuffle the rows
    dataframe = dataframe.sample(frac=1)

    # DATA NORMALIZATION
    # min_max_scalar = Utils.calculate_min_max_scalar(dataframe)
    # dataframe = Utils.normalize_dataset(dataframe, min_max_scalar)
    #
    # X = dataframe.drop(labels="feature_class", axis=1)
    # Y = dataframe['feature_class']
    #
    # # feature selection BEGIN
    # print("Shape before feature selection", X.shape)
    #
    # # L1-based feature selection
    # lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, Y)
    # model = SelectFromModel(lsvc, prefit=True)
    # X = model.transform(X)
    #
    # # Univariate feature selection
    # X = SelectKBest(chi2, k=6).fit_transform(X, Y)
    #
    # # Tree-based feature selection
    # clf = ExtraTreesClassifier(n_estimators=50)
    # clf = clf.fit(X, Y)
    # clf.feature_importances_
    # model = SelectFromModel(clf, prefit=True)
    # X = model.transform(X)
    # print("Shape after feature selection", X.shape)
    # # feature selection END

    # train/test split
    # X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20)
    X_train, X_test, Y_train, Y_test = Utils.train_test_split(
        dataframe=dataframe, test_size=0.20)

    # SVM .............................................................
    # scalar = MinMaxScaler()
    # X_train = pd.DataFrame(scalar.fit_transform(X_train.values))
    # X_test = pd.DataFrame(scalar.transform(X_test.values))
    #
    # ns_probs = [0 for _ in range(len(Y_test))]
    # svm_model_linear = SVC(kernel='rbf', probability=True).fit(X_train, Y_train)  # RBF kernel
    #
    # # load the saved model
    # # load_model = joblib.load(filename=saved_mdl_path)
    #
    # svm_prediction = svm_model_linear.predict(X_test)
    #
    # accuracy = svm_model_linear.score(X_test, Y_test)
    # print(accuracy)  # debug
    #
    # # creating a confusion matrix
    # cm = confusion_matrix(Y_test, svm_prediction)
    # print(cm)
    # print(classification_report(Y_test, svm_prediction))
    #
    # # predict probabilities
    # lr_probs = svm_model_linear.predict_proba(X_test)
    # # keep probabilities for the positive outcome only
    # lr_probs = lr_probs[:, 1]
    # # calculate scores
    # ns_auc = roc_auc_score(Y_test, ns_probs)
    # lr_auc = roc_auc_score(Y_test, lr_probs)
    # # summarize scores
    # print('No Skill: ROC AUC=%.3f' % ns_auc)
    # print('SVM: ROC AUC=%.3f' % lr_auc)
    # # calculate roc curves
    # ns_fpr, ns_tpr, _ = roc_curve(Y_test, ns_probs)
    # lr_fpr, lr_tpr, _ = roc_curve(Y_test, lr_probs)
    # # plot the roc curve for the model
    # pyplot.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
    # pyplot.plot(lr_fpr, lr_tpr, marker='.', label='SVM')
    # # axis labels
    # pyplot.xlabel('False Positive Rate')
    # pyplot.ylabel('True Positive Rate')
    #
    # # save the model
    # # saved_mdl_path = 'normal_leaf_model.sav'
    # # joblib.dump(svm_model_linear, saved_mdl_path)
    #
    # # show the legend and the plot
    # pyplot.legend()
    # pyplot.show()

    # RANDOM FOREST CLASSIFIER ........................................
    # scalar = MinMaxScaler()
    # X_train = pd.DataFrame(scalar.fit_transform(X_train.values))
    # X_test = pd.DataFrame(scalar.transform(X_test.values))
    #
    # ns_probs = [0 for _ in range(len(Y_test))]
    # regressor = RandomForestClassifier(n_estimators=18, max_depth=10).fit(X_train, Y_train)
    # y_pred = regressor.predict(X_test)
    #
    # print(confusion_matrix(Y_test, y_pred))
    # print(classification_report(Y_test, y_pred))
    # # print(Y_test)
    # # print(y_pred)
    # print(SupportVectorMachine.calc_accuracy_score(Y_test, y_pred))
    #
    # accuracy = regressor.score(X_test, Y_test)
    # print(accuracy)  # debug
    #
    # print(accuracy_score(Y_test, y_pred.round(), normalize=False))
    # NN ..............................................................
    # mlp = MLPClassifier(hidden_layer_sizes=9, activation='relu',
    #                     max_iter=400, solver='adam').fit(X_train, Y_train)
    #
    # predictions = mlp.predict(X_test)
    # print(confusion_matrix(Y_test, predictions))
    # print(classification_report(Y_test, predictions))
    # print(Utils.calculate_accuracy_score(Y_test, predictions))

    # KNN .............................................................
    # scalar = MinMaxScaler()
    # X_train = pd.DataFrame(scalar.fit_transform(X_train.values))
    # X_test = pd.DataFrame(scalar.transform(X_test.values))

    model = KNeighborsClassifier(n_neighbors=2).fit(X_train, Y_train)
    y_pred = model.predict(X_test)

    print(confusion_matrix(Y_test, y_pred))
    print(classification_report(Y_test, y_pred))

    accuracy = model.score(X_test, Y_test)
    print(accuracy)  # debug
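# A follow-up sketch on the normalization left commented out above: if the
# MinMaxScaler step is re-enabled, it must be fitted on the training fold only
# and then applied to the test fold, so no test statistics leak into the
# scaling. Hypothetical helper (not part of the original code), using the same
# sklearn calls:
from sklearn.preprocessing import MinMaxScaler


def train_knn_normalized(X_train, X_test, Y_train, Y_test, k=2):
    """Fit KNN on min-max scaled features; the scaler sees train data only."""
    scalar = MinMaxScaler()
    X_train_scaled = scalar.fit_transform(X_train)  # learn min/max from train
    X_test_scaled = scalar.transform(X_test)        # reuse the same min/max

    model = KNeighborsClassifier(n_neighbors=k).fit(X_train_scaled, Y_train)
    return model, model.score(X_test_scaled, Y_test)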