def preprocess(df):
    """
    Split the data and standardise the features separately for each gender.

    :param df: data frame which contains mean activation values in 116 brain areas
    :return: ``x_train, y_train, x_test, y_test`` where male rows come first and female rows second.
        The test features of each gender are standardised with the mean and variance learned from
        the training features of that same gender.
    """
    # 'gender' and 'age' are not features; only the regional activation values are kept.
    non_feature_columns = ['gender', 'age']
    # Gender coding used by the data set: 1 = male, 2 = female.
    gender_codes = (1, 2)

    def _features_and_labels(frame, gender_code):
        # Rows of one gender, without the non-feature columns, as an (X, y) pair.
        subset = frame.loc[frame["gender"] == gender_code]
        subset = subset.drop(non_feature_columns, axis=1, errors='ignore')
        return mlu.get_features_labels(subset)

    # Train/test split (80% / 20% according to the original comment; the ratio is not set here).
    train, test = mlu.train_test_split(df)

    train_pairs = [_features_and_labels(train, code) for code in gender_codes]
    test_pairs = [_features_and_labels(test, code) for code in gender_codes]

    x_train_parts, y_train_parts = [], []
    x_test_parts, y_test_parts = [], []
    for (x_tr, y_tr), (x_te, y_te) in zip(train_pairs, test_pairs):
        # One scaler per gender: fitted on the training rows, re-used for the test rows.
        scaler = StandardScaler()
        x_train_parts.append(scaler.fit_transform(x_tr, y_tr))
        x_test_parts.append(scaler.transform(x_te))
        y_train_parts.append(y_tr)
        y_test_parts.append(y_te)

    # Recombine both genders, male first.
    x_train = np.concatenate(x_train_parts)
    y_train = np.concatenate(y_train_parts)
    x_test = np.concatenate(x_test_parts)
    y_test = np.concatenate(y_test_parts)
    return x_train, y_train, x_test, y_test
def run_perm_test(df, contrast_name, classifier_no, out, n_iterations):
    """
    Score several classifiers over repeated splits and run a permutation test on the last iteration.

    In every iteration the data is split and preprocessed by ``mlu.preprocess_remove_gender``. Each
    model is fitted on the training part and its balanced accuracy on the test part is stored as one
    row in ``<out>permutation_result_<contrast_name>_<classifier_no>.csv``. If that file already
    exists, the new row is appended to its contents, so the file accumulates the accuracies of all
    iterations and models. During the final iteration ``mlu.permutation_test`` is additionally run
    for every model and the permutation scores are written to
    ``<out><first and last character of contrast_name>_<classifier_no>_<model name>.csv``.

    :param df: data frame with features and labels, as expected by the ``mlu`` helpers
    :param contrast_name: non-empty contrast name; used in the result rows and in the file names
    :param classifier_no: identifier of the classification task; used in result rows and file names
    :param out: prefix that is prepended verbatim to the output file names
    :param n_iterations: number of split / fit / score repetitions
    :return: None
    """
    models = ["svm_kernel_default", "svm_kernel_tuned", "rfc", 'logistic_regression']
    result_columns = ['contrast', 'class', 'Model', 'original_accuracy']
    result_path = out + "permutation_result_%s_%s.csv" % (contrast_name, classifier_no)

    X, y = mlu.get_features_labels(df)
    # Best SVM hyper-parameters; computed on first use and then shared by all iterations.
    tuned_params = None

    for i in range(n_iterations):
        x_train, y_train, x_test, y_test = mlu.preprocess_remove_gender(df)
        is_last_iteration = i == n_iterations - 1

        for model_name in models:
            if model_name == "svm_kernel_default":
                model = svm.SVC(kernel='rbf', C=4, gamma=2 ** -5)
            elif model_name == "svm_kernel_tuned":
                if tuned_params is None:
                    # The search only depends on the complete data set (X, y), which does not
                    # change between iterations, so it is run once instead of once per iteration.
                    param_grid = {'C': [0.1, 1, 10, 100, 1000],
                                  'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 2 ** -5, 2 ** -10, 2 ** 5],
                                  'kernel': ['rbf']}
                    # NOTE(review): `iid` was deprecated in scikit-learn 0.22 and removed in 0.24;
                    # drop the argument when running on a newer release.
                    grid = GridSearchCV(svm.SVC(), param_grid, refit=True, cv=10, iid=False)
                    grid.fit(X, y)
                    tuned_params = grid.best_params_
                model = svm.SVC(kernel=tuned_params['kernel'], C=tuned_params['C'],
                                gamma=tuned_params['gamma'])
            elif model_name == "rfc":
                model = RandomForestClassifier(n_estimators=200)
            elif model_name == "logistic_regression":
                model = LogisticRegression(solver="liblinear", multi_class='auto')

            trained_model = model.fit(x_train, y_train)
            scores = mlu.balanced_accuracy(trained_model.predict(x_test), y_test)

            # Accumulate in the same CSV that is written below, so that the rows of earlier
            # models / iterations are kept. The file is rewritten after every fit, which keeps
            # partial results on disk if a later step fails.
            new_row = pd.DataFrame(
                [{'contrast': contrast_name, 'class': classifier_no, 'Model': model_name,
                  'original_accuracy': scores}],
                columns=result_columns)
            if os.path.isfile(result_path):
                df_res = pd.concat([pd.read_csv(result_path), new_row], ignore_index=True, sort=False)
            else:
                df_res = new_row
            df_res.to_csv(result_path, index=False)

            # Only at the last iteration the permutation test is run (10000 permutations). Together
            # with the mean non-permuted accuracy over n_iterations the p-value can be calculated.
            if is_last_iteration:
                _, permutation_scores, _ = mlu.permutation_test(X, y, model, 10000, 10)
                # str() so that a numeric classifier id works here as it does in result_path.
                performance_file = contrast_name[0] + contrast_name[-1] + "_" + str(classifier_no) + "_" + model_name
                np.savetxt(out + "%s.csv" % performance_file, permutation_scores, fmt="%10.18f")
def main():
    """
    Train and evaluate an RBF-kernel SVM on the "Faces_con_0001.mat" contrast and plot the scores.

    The data is loaded through ``tools.data_extraction`` from "../Data" and missing values are
    replaced by the column means. A default SVM is fitted first and its confusion matrix and
    classification report are printed. A 10-fold grid search over C and gamma follows; its best
    parameters and the reports of the tuned estimator are printed as well. An SVM with the best
    parameters is then fitted again, and its train and test accuracy are collected in a score frame.
    Finally the test scores are drawn as box plots on a 2x2 grid of axes.

    :return: None
    """
    data_dir = "../Data"
    df, contrast_name = tools.data_extraction(data_dir, 3, "Faces_con_0001.mat")
    df.fillna(df.mean(), inplace=True)

    # 'Model' holds the model name.
    score_columns = ['Score', 'Type', 'Model', 'Classifier', 'Contrast_name']
    score_rows = []

    for _ in range(1):
        train, test = mlu.train_test_split(df)
        X, y = mlu.get_features_labels(train)
        tX, ty = mlu.get_features_labels(test)

        # Baseline: SVM with fixed hyper-parameters.
        model = svm.SVC(kernel='rbf', C=4, gamma=2**-5)
        model.fit(X, y)
        predictions = model.predict(tX)
        print(len(ty))
        print(confusion_matrix(ty, predictions))
        print(classification_report(ty, predictions))

        # Hyper-parameter search on the training split.
        param_grid = {
            'C': [0.1, 1, 10, 100, 1000],
            'gamma': [1, 0.1, 0.01, 0.001, 0.0001, 0.00001, 2**-5, 2**-10, 2**5],
            'kernel': ['rbf']
        }
        grid = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=3, cv=10)
        grid.fit(X, y)
        best_param = grid.best_params_
        print((best_param))
        grid_predictions = grid.predict(tX)
        print(confusion_matrix(ty, grid_predictions))
        print(classification_report(ty, grid_predictions))

        # Scores after hyper-parameter tuning.
        model = svm.SVC(kernel=best_param['kernel'], C=best_param['C'], gamma=best_param['gamma'])
        model.fit(X, y)
        train_score = model.score(X, y)
        test_score = model.score(tX, ty)

        score_rows.append({
            'Score': train_score,
            'Type': 'train',
            'Model': 'svm_kernel',
            'Classifier': 123,
            'Contrast_name': contrast_name
        })
        score_rows.append({
            'Score': test_score,
            'Type': 'test',
            'Model': 'svm_kernel',
            'Classifier': 123,
            'Contrast_name': contrast_name
        })

    # Built once from the collected rows; DataFrame.append no longer exists in pandas >= 2.0.
    scoresdf = pd.DataFrame(score_rows, columns=score_columns)

    _fig, axes = plt.subplots(nrows=2, ncols=2)
    test_scores = scoresdf[(scoresdf['Type'] == 'test') & (scoresdf['Model'] == 'svm_kernel')]
    # The same selection is drawn on each of the four axes.
    # NOTE(review): the figure is neither shown nor saved inside this function.
    for ax in axes.ravel():
        sns.boxplot(x='Model', y='Score', data=test_scores, ax=ax)
def run_basic_ml(df, options, n, scoresdf, contrast_name):
    """
    Repeatedly split the data, fit the requested model(s) and collect train and test scores.

    For each of ``options.number_iterations`` iterations a new train/test split is drawn. Every
    requested model is fitted through ``mlu.model_fitting`` and two rows (Type 'train' and 'test')
    are recorded with the columns 'Score', 'Type', 'Model', 'Classifier', 'Contrast_name' and
    'Balanced_accuracy'.

    :param df: data frame with features and labels, as expected by the ``mlu`` helpers
    :param options: object providing ``number_iterations``, ``model`` ("all" or one model name),
        ``kFold`` and ``normalize``
    :param n: value stored in the 'Classifier' column
    :param scoresdf: data frame with the scores collected so far; it is not modified
    :param contrast_name: value stored in the 'Contrast_name' column (also printed)
    :return: a data frame with the rows of ``scoresdf`` followed by the new rows (``scoresdf``
        itself when no model was fitted)
    """
    print(contrast_name)
    all_models = [
        "svm_kernel_default", "svm_kernel_tuned", "naive_bayes",
        "decision_tree", "rfc", 'logistic_regression'
    ]

    run_all = options.model == "all"
    model_names = all_models if run_all else [options.model]
    # A single, explicitly named model is always fitted with normalisation switched on.
    normalize = options.normalize if run_all else True

    def _row(score, kind, model_name, balanced_score):
        # One result row in the layout of the score frame.
        return {
            'Score': score,
            'Type': kind,
            'Model': model_name,
            'Classifier': n,
            'Contrast_name': contrast_name,
            'Balanced_accuracy': balanced_score
        }

    new_rows = []
    for _ in range(options.number_iterations):
        train, test = mlu.train_test_split(df)
        x_train, y_train = mlu.get_features_labels(train)
        x_test, y_test = mlu.get_features_labels(test)

        for model_name in model_names:
            train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                model_name, x_train, y_train, options.kFold, normalize)

            # Scale into a per-model variable: overwriting x_test would make every following
            # model see test data that has already been scaled by the previous models.
            x_eval = min_max_scaler.transform(x_test) if normalize else x_test

            test_score = trained_model.score(x_eval, y_test)
            test_balanced_score = mlu.balanced_accuracy(trained_model.predict(x_eval), y_test)

            new_rows.append(_row(train_score, 'train', model_name, train_balanced_score))
            new_rows.append(_row(test_score, 'test', model_name, test_balanced_score))

    if not new_rows:
        return scoresdf
    # One concatenation instead of row-wise DataFrame.append (removed in pandas 2.0).
    return pd.concat([scoresdf, pd.DataFrame(new_rows)], ignore_index=True, sort=False)
def run_gender_cor(df, options, n, scoresdf, contrast_name, label):
    """
    Predict gender (classification) or age (regression) from the features and collect the scores.

    The original 'label' column and the other demographic column are dropped and the ``label``
    column is renamed to 'label', so that the ``mlu`` helpers treat it as the target. For each of
    ``options.number_iterations`` iterations a new train/test split is drawn; for classification
    the features are standardised with a scaler fitted on the training split. Two rows (Type
    'train' and 'test') are recorded per fitted model. In "all" mode the 'Balanced_accuracy' column
    of the test row holds the balanced accuracy for classification and the mean squared error for
    regression.

    :param df: data frame containing the features and the 'label', 'gender' and 'age' columns; the
        caller's frame is left untouched
    :param options: object providing ``number_iterations``, ``model`` ('all' or one model name),
        ``kFold`` and ``normalize``
    :param n: value stored in the 'Classifier' column
    :param scoresdf: data frame with the scores collected so far; it is not modified
    :param contrast_name: value stored in the 'Contrast_name' column
    :param label: name of the target column, 'gender' or 'age'
    :return: a data frame with the rows of ``scoresdf`` followed by the new rows (``scoresdf``
        itself when no model was fitted)
    :raises ValueError: if ``options.model`` is 'all' and ``label`` is neither 'gender' nor 'age'
    """
    classification = True
    models = None
    # Non-inplace drops: the frame passed in by the caller keeps all of its columns, so the
    # function can be called again on the same frame with another label.
    if label == 'gender':
        df = df.drop(['label', 'age'], axis=1)
        models = ["svm_kernel_default", "svm_kernel_tuned", "naive_bayes", "decision_tree", "rfc",
                  'logistic_regression']
    elif label == 'age':
        df = df.drop(['label', 'gender'], axis=1)
        models = ['svr_kernel_default', 'svr_kernel_tuned', 'gpr_default']
        classification = False
    df = df.rename(columns={label: 'label'})

    if options.model == 'all' and models is None:
        raise ValueError("no model list is defined for label %r; expected 'gender' or 'age'" % (label,))

    def _row(score, kind, model_name, balanced_score):
        # One result row in the layout of the score frame.
        return {'Score': score, 'Type': kind, 'Model': model_name, 'Classifier': n,
                'Contrast_name': contrast_name, 'Balanced_accuracy': balanced_score}

    new_rows = []
    for _ in range(options.number_iterations):
        train, test = mlu.train_test_split(df)
        x_train, y_train = mlu.get_features_labels(train)
        x_test, y_test = mlu.get_features_labels(test)

        if classification:
            scaler = StandardScaler()
            x_train = scaler.fit_transform(x_train, y_train)
            x_test = scaler.transform(x_test)

        if options.model == 'all':
            for model_name in models:
                train_score, train_balanced_score, trained_model, min_max_scaler = mlu.model_fitting(
                    model_name, x_train, y_train, options.kFold, options.normalize)

                # Scale into a per-model variable: overwriting x_test would make every following
                # model see test data that has already been scaled by the previous models.
                x_eval = min_max_scaler.transform(x_test) if options.normalize else x_test

                test_score = trained_model.score(x_eval, y_test)
                if classification:
                    test_balanced_score = mlu.balanced_accuracy(trained_model.predict(x_eval), y_test)
                else:
                    if model_name == "gpr_default":
                        # The predictive standard deviation is returned as well but is not used.
                        pred, _ = trained_model.predict(x_eval, return_std=True)
                    else:
                        pred = trained_model.predict(x_eval)
                    # For regression the column carries the mean squared error instead.
                    test_balanced_score = mean_squared_error(y_test, pred, multioutput='raw_values')

                new_rows.append(_row(train_score, 'train', model_name, train_balanced_score))
                new_rows.append(_row(test_score, 'test', model_name, test_balanced_score))
        else:
            # NOTE(review): this branch relies on the default kFold / normalize arguments of
            # mlu.model_fitting and scores regression models with balanced accuracy - confirm
            # that this is intended.
            train_score, train_balanced_score, trained_model, _ = mlu.model_fitting(
                options.model, x_train, y_train)
            test_score = trained_model.score(x_test, y_test)
            test_balanced_score = mlu.balanced_accuracy(trained_model.predict(x_test), y_test)

            new_rows.append(_row(train_score, 'train', options.model, train_balanced_score))
            new_rows.append(_row(test_score, 'test', options.model, test_balanced_score))

    if not new_rows:
        return scoresdf
    # One concatenation instead of row-wise DataFrame.append (removed in pandas 2.0).
    return pd.concat([scoresdf, pd.DataFrame(new_rows)], ignore_index=True, sort=False)