def run_3(): fold = 1 for train, test in StratifiedKFold(subject_labels, n_folds=5): # Create empty table for holding predictions predictions = dict() predictions['diagnosis'] = subject_labels[train] # For each ROI, get voxels corresponding to training subjects for roi_name in roi_names: predictions[roi_name] = [] X, y = get_xy( rois[roi_name].loc[subject_ids[train]], label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2']) # Get out-of-sample predictions for each fold in the CV scores = [] for train1, test1 in StratifiedKFold(subject_labels[train], n_folds=4): classifier = SVC() classifier.fit(X[train1], y[train1]) y_pred = classifier.predict(X[test1]) predictions[roi_name].extend(y_pred) scores.append(accuracy_score(y[test1], y_pred)) print('mean score: {}'.format(np.mean(scores))) print('complete score: {}'.format(accuracy_score(y, predictions[roi_name]))) # Create data frame from the predictions and save it to file predictions = pd.DataFrame(predictions, index=subject_ids[train]) predictions.to_csv('outputs/roi_predictions_fold{}.txt'.format(fold)) fold += 1 # Now we have, for each ROI, out-of-sample predictions for all training points # in the initial training set. Next, fit the second model to the out-of-sample # predictions. X, y = get_xy(predictions, label_column='diagnosis', exclude_columns=['diagnosis']) classifier_combi = SVC(kernel='rbf') classifier_combi.fit(X, y) # Now we train a classifier on all training points of each ROI classifiers = {} for roi_name in roi_names: X, y = get_xy( rois[roi_name].loc[subject_ids[train]], label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2']) classifier = SVC() classifier.fit(X, y) classifiers[roi_name] = classifier # Next, we apply the ROI classifiers and combined classifier to the test data predictions = dict() predictions['diagnosis'] = subject_labels[test] for roi_name in roi_names: predictions[roi_name] = [] X, y = get_xy( rois[roi_name].loc[subject_ids[test]], label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2']) y_pred = classifiers[roi_name].predict(X) predictions[roi_name].extend(y_pred) predictions = pd.DataFrame(predictions) X, y = get_xy(predictions, label_column='diagnosis', exclude_columns=['diagnosis']) y_pred = classifier_combi.predict(X) print('overall score: {}'.format(accuracy_score(y, y_pred)))
def get_predictions(rois, roi_name, subject_ids, classifier): X, y = get_xy( rois[roi_name].loc[subject_ids], label_column='diagnosis1', exclude_columns=['age', 'gender', 'diagnosis1', 'diagnosis2']) return np.array(classifier.predict(X))
def get_probabilities(rois, roi_name, subject_ids, classifier): X, y = get_xy( rois[roi_name].loc[subject_ids], label_column='diagnosis1', exclude_columns=['age', 'gender', 'diagnosis1', 'diagnosis2']) probabilities = classifier.predict_proba(X) probabilities = np.transpose(probabilities) return probabilities[0], probabilities[1]
def save_classifier(rois, roi_name, subject_ids, probability): X, y = get_xy( rois[roi_name].loc[subject_ids], label_column='diagnosis1', exclude_columns=['age', 'gender', 'diagnosis1', 'diagnosis2']) param_grid = [{ 'C': [2**x for x in range(-5, 15, 2)]}] classifier = GridSearchCV(SVC(kernel='linear', probability=probability), param_grid=param_grid) classifier.fit(X, y) if not os.path.isdir('outputs/' + roi_name): os.mkdir('outputs/' + roi_name) joblib.dump(classifier, 'outputs/' + roi_name + '/classifier.pkl')
def save_classifier(rois, roi_name, subject_ids, probability): X, y = get_xy( rois[roi_name].loc[subject_ids], label_column='diagnosis1', exclude_columns=['age', 'gender', 'diagnosis1', 'diagnosis2']) param_grid = [{'C': [2**x for x in range(-5, 15, 2)]}] classifier = GridSearchCV(SVC(kernel='linear', probability=probability), param_grid=param_grid) classifier.fit(X, y) if not os.path.isdir('outputs/' + roi_name): os.mkdir('outputs/' + roi_name) joblib.dump(classifier, 'outputs/' + roi_name + '/classifier.pkl')
def train_classifier(rois, roi_name, subject_ids, queue, probability): X, y = get_xy( rois[roi_name].loc[subject_ids], label_column='diagnosis1', exclude_columns=['age', 'gender', 'diagnosis1', 'diagnosis2']) param_grid = [{ 'C': [2**x for x in range(-5, 15, 2)]}] scores = [] for train, test in StratifiedKFold(y, n_folds=10, shuffle=True): classifier = GridSearchCV(SVC(kernel='linear', probability=probability), param_grid=param_grid) classifier.fit(X[train], y[train]) score = accuracy_score(y[test], classifier.predict(X[test])) scores.append(score) queue.put((roi_name, np.mean(scores)))
def train_classifier(rois, roi_name, subject_ids, queue, probability): X, y = get_xy( rois[roi_name].loc[subject_ids], label_column='diagnosis1', exclude_columns=['age', 'gender', 'diagnosis1', 'diagnosis2']) param_grid = [{'C': [2**x for x in range(-5, 15, 2)]}] scores = [] for train, test in StratifiedKFold(y, n_folds=10, shuffle=True): classifier = GridSearchCV(SVC(kernel='linear', probability=probability), param_grid=param_grid) classifier.fit(X[train], y[train]) score = accuracy_score(y[test], classifier.predict(X[test])) scores.append(score) queue.put((roi_name, np.mean(scores)))
def run_3(): fold = 1 for train, test in StratifiedKFold(subject_labels, n_folds=5): # Create empty table for holding predictions predictions = dict() predictions['diagnosis'] = subject_labels[train] # For each ROI, get voxels corresponding to training subjects for roi_name in roi_names: predictions[roi_name] = [] X, y = get_xy(rois[roi_name].loc[subject_ids[train]], label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2']) # Get out-of-sample predictions for each fold in the CV scores = [] for train1, test1 in StratifiedKFold(subject_labels[train], n_folds=4): classifier = SVC() classifier.fit(X[train1], y[train1]) y_pred = classifier.predict(X[test1]) predictions[roi_name].extend(y_pred) scores.append(accuracy_score(y[test1], y_pred)) print('mean score: {}'.format(np.mean(scores))) print('complete score: {}'.format( accuracy_score(y, predictions[roi_name]))) # Create data frame from the predictions and save it to file predictions = pd.DataFrame(predictions, index=subject_ids[train]) predictions.to_csv('outputs/roi_predictions_fold{}.txt'.format(fold)) fold += 1 # Now we have, for each ROI, out-of-sample predictions for all training points # in the initial training set. Next, fit the second model to the out-of-sample # predictions. X, y = get_xy(predictions, label_column='diagnosis', exclude_columns=['diagnosis']) classifier_combi = SVC(kernel='rbf') classifier_combi.fit(X, y) # Now we train a classifier on all training points of each ROI classifiers = {} for roi_name in roi_names: X, y = get_xy(rois[roi_name].loc[subject_ids[train]], label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2']) classifier = SVC() classifier.fit(X, y) classifiers[roi_name] = classifier # Next, we apply the ROI classifiers and combined classifier to the test data predictions = dict() predictions['diagnosis'] = subject_labels[test] for roi_name in roi_names: predictions[roi_name] = [] X, y = get_xy(rois[roi_name].loc[subject_ids[test]], label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2']) y_pred = classifiers[roi_name].predict(X) predictions[roi_name].extend(y_pred) predictions = pd.DataFrame(predictions) X, y = get_xy(predictions, label_column='diagnosis', exclude_columns=['diagnosis']) y_pred = classifier_combi.predict(X) print('overall score: {}'.format(accuracy_score(y, y_pred)))
def run(): # Create log file and grab script text create_log() script_text = get_file_text('run.py') # Create output directory if it does not exist if not os.path.isdir(OUTPUTS_DIR): os.mkdir(OUTPUTS_DIR) # The code below follows a performance estimation procedure suggested by the following # post on Stack Overflow: https://stats.stackexchange.com/questions/102631/k-fold-cross-validation-of-ensemble-learning # Load ROIs roi_names = load_roi_names(FILE_HC_SZ) rois = {} for roi_name in roi_names: roi = load_roi( os.path.join(ROIS_DIR, 'hc_sz', roi_name + '_age_matched.txt')) for i in roi.index: diagnosis = roi.loc[i, 'diagnosis1'] roi.set_value(i, 'diagnosis1', 0 if diagnosis == 'HC' else 1) roi['diagnosis1'] = roi['diagnosis1'].astype(int) rois[roi_name] = roi log('added ROI: {}'.format(roi_name)) # Define parameter range for grid search later param_grid = [{'C': [2**x for x in range(-5, 15, 2)]}] # Get subject IDs and labels roi = rois[roi_names[0]] subject_ids = roi.index subject_labels = roi['diagnosis1'] log('nr. subjects: {}'.format(len(subject_ids))) scores_pred = [] scores_dist = [] fold = 1 # This outer CV loop is meant for averaging scores for train, test in StratifiedKFold(subject_labels, n_folds=10, shuffle=True): predictions_file = 'outputs/predictions_train{}.txt'.format(fold) distances_file = 'outputs/distances_train{}.txt'.format(fold) if not os.path.isfile(predictions_file): # Create empty tables for holding predictions and distances predictions = dict() predictions['diagnosis'] = subject_labels[train] distances = dict() distances['diagnosis'] = subject_labels[train] # Run through all ROIs for roi_name in roi_names: log('calculating out-of-sample predictions for {}'.format( roi_name)) # Initialize prediction table for this ROI's column predictions[roi_name] = [] distances[roi_name] = [] # Get training data from the data frame X, y = get_xy(rois[roi_name].loc[subject_ids[train]], label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2']) # Use 4-fold CV to get out-of-sample predictions for all training points i = 1 for train1, test1 in StratifiedKFold(subject_labels[train], n_folds=4): # Do grid search to find optimal C parameter classifier = GridSearchCV(SVC(kernel='linear'), param_grid=param_grid, cv=5) classifier.fit(X[train1], y[train1]) # Store predictions and distances for this ROI y_pred = classifier.predict(X[test1]) predictions[roi_name].extend(y_pred) y_dist = classifier.decision_function(X[test1]) distances[roi_name].extend(y_dist) print(' step {} - {}'.format(i, 4)) i += 1 # Save predictions to file log('saving file: {}'.format(predictions_file)) predictions = pd.DataFrame(predictions, index=subject_ids[train]) predictions.to_csv(predictions_file, index_label='id') # Save distances to file log('saving file: {}'.format(distances_file)) distances = pd.DataFrame(distances, index=subject_ids[train]) distances.to_csv(distances_file, index_label='id') # --------------------- param_grid_rbf = [{ 'C': [2**x for x in range(-5, 15, 2)], 'gamma': [2**x for x in range(-15, 4, 2)] }] # Train classifier on predictions log('training level-2 prediction classifier') predictions = pd.read_csv(predictions_file, index_col='id') X, y = get_xy(predictions, label_column='diagnosis', exclude_columns=['diagnosis']) classifier_pred = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid_rbf, cv=5) classifier_pred.fit(X, y) log('saving level-2 prediction classifier') joblib.dump(classifier_pred, 'outputs/classifier_pred{}.pkl'.format(fold)) # Train classifier on distances log('training level-2 distance classifier') distances = pd.read_csv(distances_file, index_col='id') X, y = get_xy(distances, label_column='diagnosis', exclude_columns=['diagnosis']) classifier_dist = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid_rbf, cv=5) classifier_dist.fit(X, y) log('saving level-2 distance classifier') joblib.dump(classifier_pred, 'outputs/classifier_dist{}.pkl'.format(fold)) # --------------------- # Train each ROI classifier on all training points and save it to disk for roi_name in roi_names: log('training {} on all training points'.format(roi_name)) # Skip this step if exported classifier already exists classifier_file = 'outputs/classifier_' + roi_name + '_train{}.pkl'.format( fold) if os.path.isfile(classifier_file): continue # Get training data for this fold X, y = get_xy(rois[roi_name].loc[subject_ids[train]], label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2']) # Train classifier using grid search classifier = GridSearchCV(SVC(kernel='linear'), param_grid=param_grid, cv=5) classifier.fit(X, y) # Save best classifier to file log('saving {} classifier to disk'.format(roi_name)) joblib.dump(classifier, classifier_file) # --------------------- # Load ROI classifiers from file classifiers = {} for roi_name in roi_names: classifier_file = 'outputs/classifier_' + roi_name + '_train{}.pkl'.format( fold) classifiers[roi_name] = joblib.load(classifier_file) # --------------------- predictions_test_file = 'outputs/predictions_test{}.txt'.format(fold) distances_test_file = 'outputs/distances_test{}.txt'.format(fold) if not os.path.isfile(predictions_test_file): predictions_test = dict() predictions_test['diagnosis'] = subject_labels[test] distances_test = dict() distances_test['diagnosis'] = subject_labels[test] for roi_name in roi_names: predictions_test[roi_name] = [] distances_test[roi_name] = [] # Get test data from the data frame X, y = get_xy(rois[roi_name].loc[subject_ids[test]], label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2']) log('calculating predictions and distances for {}'.format( roi_name)) # Store predictions and distances y_pred = classifiers[roi_name].predict(X) predictions_test[roi_name].extend(y_pred) y_dist = classifiers[roi_name].decision_function(X) distances_test[roi_name].extend(y_dist) # Save predictions to file log('saving predictions to file') predictions_test = pd.DataFrame(predictions_test, index=subject_ids[test]) predictions_test.to_csv(predictions_test_file, index_label='id') # Save distances to file log('saving distances to file') distances_test = pd.DataFrame(distances_test, index=subject_ids[test]) distances_test.to_csv(distances_test_file, index_label='id') # --------------------- # Load prediction classifier and run it on test predictions predictions_test = pd.read_csv(predictions_test_file, index_col='id') X_test, y_test = get_xy(predictions_test, label_column='diagnosis', exclude_columns=['diagnosis']) classifier_pred = joblib.load( 'outputs/classifier_pred{}.pkl'.format(fold)) y_pred = classifier_pred.predict(X_test) scores_pred.append(accuracy_score(y_test, y_pred)) log('score: {} (predictions)'.format(scores_pred[-1])) # Load distance classifier and run it on test distances distances_test = pd.read_csv(distances_file, index_col='id') X_test, y_test = get_xy(distances_test, label_column='diagnosis', exclude_columns=['diagnosis']) classifier_dist = joblib.load( 'outputs/classifier_dist{}.pkl'.format(fold)) y_pred = classifier_dist.predict(X_test) scores_dist.append(accuracy_score(y_test, y_pred)) log('score: {} (distances)'.format(scores_dist[-1])) fold += 1 log('overall score: {} (predictions)'.format(np.mean(scores_pred))) log('overall score: {} (distances)'.format(np.mean(scores_dist))) # Append script to log and close it add_text_to_log(script_text) finish_log()
def run(): # Create log file and grab script text create_log() script_text = get_file_text('run.py') # Create output directory if it does not exist if not os.path.isdir(OUTPUTS_DIR): os.mkdir(OUTPUTS_DIR) # The code below follows a performance estimation procedure suggested by the following # post on Stack Overflow: https://stats.stackexchange.com/questions/102631/k-fold-cross-validation-of-ensemble-learning # Load ROIs roi_names = load_roi_names(FILE_HC_SZ) rois = {} for roi_name in roi_names: roi = load_roi(os.path.join(ROIS_DIR, 'hc_sz', roi_name + '_age_matched.txt')) for i in roi.index: diagnosis = roi.loc[i, 'diagnosis1'] roi.set_value(i, 'diagnosis1', 0 if diagnosis == 'HC' else 1) roi['diagnosis1'] = roi['diagnosis1'].astype(int) rois[roi_name] = roi log('added ROI: {}'.format(roi_name)) # Define parameter range for grid search later param_grid = [{ 'C': [2**x for x in range(-5, 15, 2)]}] # Get subject IDs and labels roi = rois[roi_names[0]] subject_ids = roi.index subject_labels = roi['diagnosis1'] log('nr. subjects: {}'.format(len(subject_ids))) scores_pred = [] scores_dist = [] fold = 1 # This outer CV loop is meant for averaging scores for train, test in StratifiedKFold(subject_labels, n_folds=10, shuffle=True): predictions_file = 'outputs/predictions_train{}.txt'.format(fold) distances_file = 'outputs/distances_train{}.txt'.format(fold) if not os.path.isfile(predictions_file): # Create empty tables for holding predictions and distances predictions = dict() predictions['diagnosis'] = subject_labels[train] distances = dict() distances['diagnosis'] = subject_labels[train] # Run through all ROIs for roi_name in roi_names: log('calculating out-of-sample predictions for {}'.format(roi_name)) # Initialize prediction table for this ROI's column predictions[roi_name] = [] distances[roi_name] = [] # Get training data from the data frame X, y = get_xy( rois[roi_name].loc[subject_ids[train]], label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2']) # Use 4-fold CV to get out-of-sample predictions for all training points i = 1 for train1, test1 in StratifiedKFold(subject_labels[train], n_folds=4): # Do grid search to find optimal C parameter classifier = GridSearchCV(SVC(kernel='linear'), param_grid=param_grid, cv=5) classifier.fit(X[train1], y[train1]) # Store predictions and distances for this ROI y_pred = classifier.predict(X[test1]) predictions[roi_name].extend(y_pred) y_dist = classifier.decision_function(X[test1]) distances[roi_name].extend(y_dist) print(' step {} - {}'.format(i, 4)) i += 1 # Save predictions to file log('saving file: {}'.format(predictions_file)) predictions = pd.DataFrame(predictions, index=subject_ids[train]) predictions.to_csv(predictions_file, index_label='id') # Save distances to file log('saving file: {}'.format(distances_file)) distances = pd.DataFrame(distances, index=subject_ids[train]) distances.to_csv(distances_file, index_label='id') # --------------------- param_grid_rbf = [{ 'C': [2**x for x in range(-5, 15, 2)], 'gamma': [2**x for x in range(-15, 4, 2)]}] # Train classifier on predictions log('training level-2 prediction classifier') predictions = pd.read_csv(predictions_file, index_col='id') X, y = get_xy(predictions, label_column='diagnosis', exclude_columns=['diagnosis']) classifier_pred = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid_rbf, cv=5) classifier_pred.fit(X, y) log('saving level-2 prediction classifier') joblib.dump(classifier_pred, 'outputs/classifier_pred{}.pkl'.format(fold)) # Train classifier on distances log('training level-2 distance classifier') distances = pd.read_csv(distances_file, index_col='id') X, y = get_xy(distances, label_column='diagnosis', exclude_columns=['diagnosis']) classifier_dist = GridSearchCV(SVC(kernel='rbf'), param_grid=param_grid_rbf, cv=5) classifier_dist.fit(X, y) log('saving level-2 distance classifier') joblib.dump(classifier_pred, 'outputs/classifier_dist{}.pkl'.format(fold)) # --------------------- # Train each ROI classifier on all training points and save it to disk for roi_name in roi_names: log('training {} on all training points'.format(roi_name)) # Skip this step if exported classifier already exists classifier_file = 'outputs/classifier_' + roi_name + '_train{}.pkl'.format(fold) if os.path.isfile(classifier_file): continue # Get training data for this fold X, y = get_xy( rois[roi_name].loc[subject_ids[train]], label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2']) # Train classifier using grid search classifier = GridSearchCV(SVC(kernel='linear'), param_grid=param_grid, cv=5) classifier.fit(X, y) # Save best classifier to file log('saving {} classifier to disk'.format(roi_name)) joblib.dump(classifier, classifier_file) # --------------------- # Load ROI classifiers from file classifiers = {} for roi_name in roi_names: classifier_file = 'outputs/classifier_' + roi_name + '_train{}.pkl'.format(fold) classifiers[roi_name] = joblib.load(classifier_file) # --------------------- predictions_test_file = 'outputs/predictions_test{}.txt'.format(fold) distances_test_file = 'outputs/distances_test{}.txt'.format(fold) if not os.path.isfile(predictions_test_file): predictions_test = dict() predictions_test['diagnosis'] = subject_labels[test] distances_test = dict() distances_test['diagnosis'] = subject_labels[test] for roi_name in roi_names: predictions_test[roi_name] = [] distances_test[roi_name] = [] # Get test data from the data frame X, y = get_xy( rois[roi_name].loc[subject_ids[test]], label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2']) log('calculating predictions and distances for {}'.format(roi_name)) # Store predictions and distances y_pred = classifiers[roi_name].predict(X) predictions_test[roi_name].extend(y_pred) y_dist = classifiers[roi_name].decision_function(X) distances_test[roi_name].extend(y_dist) # Save predictions to file log('saving predictions to file') predictions_test = pd.DataFrame(predictions_test, index=subject_ids[test]) predictions_test.to_csv(predictions_test_file, index_label='id') # Save distances to file log('saving distances to file') distances_test = pd.DataFrame(distances_test, index=subject_ids[test]) distances_test.to_csv(distances_test_file, index_label='id') # --------------------- # Load prediction classifier and run it on test predictions predictions_test = pd.read_csv(predictions_test_file, index_col='id') X_test, y_test = get_xy(predictions_test, label_column='diagnosis', exclude_columns=['diagnosis']) classifier_pred = joblib.load('outputs/classifier_pred{}.pkl'.format(fold)) y_pred = classifier_pred.predict(X_test) scores_pred.append(accuracy_score(y_test, y_pred)) log('score: {} (predictions)'.format(scores_pred[-1])) # Load distance classifier and run it on test distances distances_test = pd.read_csv(distances_file, index_col='id') X_test, y_test = get_xy(distances_test, label_column='diagnosis', exclude_columns=['diagnosis']) classifier_dist = joblib.load('outputs/classifier_dist{}.pkl'.format(fold)) y_pred = classifier_dist.predict(X_test) scores_dist.append(accuracy_score(y_test, y_pred)) log('score: {} (distances)'.format(scores_dist[-1])) fold += 1 log('overall score: {} (predictions)'.format(np.mean(scores_pred))) log('overall score: {} (distances)'.format(np.mean(scores_dist))) # Append script to log and close it add_text_to_log(script_text) finish_log()
def run(): """ This script runs both linear and nonlinear SVMs on each ROI and creates a list ranking the ROIs ordered by classification power. :return: None """ # Create new log file create_log(subdir=LOGS_DIR) # Grab script text for inclusion in the log file at the end script_text = get_file_text('run.py') # Load ROI file names roi_names = [] with open(ROI_FILE_HC_SZ, 'r') as f: for line in f.readlines(): if line.startswith('#'): continue roi_name = line.strip().split('.')[0] roi_names.append(roi_name) # Initialize dictionary with ROI info like performance scores roi_info = init_output_dict(roi_names) # Run through each ROI file name for roi_name in roi_names: # BOTH GENDERS roi_file_age_matched = roi_name + '_age_matched.txt' log('loading {}'.format(roi_file_age_matched)) features = pd.read_csv(ROI_ROOT_DIR + '/hc_sz/' + roi_file_age_matched, index_col='id') # Make sure to select only schizophrenia patients and controls features_HC = features[features['diagnosis1'] == 'HC'] features_SZ = features[features['diagnosis1'] == 'SZ'] features = pd.concat([features_HC, features_SZ]) # Get X, y for both genders X, y = get_xy( features=features, label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2', 'age', 'gender']) # Run SVM scores, cs, gammas = run_svm(X, y, get_accuracy, kernel='linear') log('accuracy: {} +/- {}'.format(np.mean(scores), np.std(scores))) roi_info[roi_name]['all']['linear']['accuracy']['mean'] = np.mean( scores) roi_info[roi_name]['all']['linear']['accuracy']['stddev'] = np.std( scores) roi_info[roi_name]['all']['linear']['cs'] = cs # MALES roi_file_M_age_matched = roi_name + '_male_age_matched.txt' log('loading {}'.format(roi_file_M_age_matched)) features = pd.read_csv(ROI_ROOT_DIR + '/hc_sz/' + roi_file_M_age_matched, index_col='id') # Select only schizophrenia patients and controls features_HC = features[features['diagnosis1'] == 'HC'] features_SZ = features[features['diagnosis1'] == 'SZ'] features = pd.concat([features_HC, features_SZ]) # Get X, y for males X, y = get_xy( features=features, label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2', 'age', 'gender']) # Run linear SVM scores, cs, gammas = run_svm(X, y, get_accuracy, kernel='linear') log('accuracy: {} +/- {} (linear)'.format(np.mean(scores), np.std(scores))) roi_info[roi_name]['males']['linear']['accuracy']['mean'] = np.mean( scores) roi_info[roi_name]['males']['linear']['accuracy']['stddev'] = np.std( scores) roi_info[roi_name]['males']['linear']['cs'] = cs # FEMALES roi_file_F_age_matched = roi_name + '_female_age_matched.txt' log('loading {}'.format(roi_file_F_age_matched)) features = pd.read_csv(ROI_ROOT_DIR + '/hc_sz/' + roi_file_F_age_matched, index_col='id') # Select only schizophrenia patients and controls features_HC = features[features['diagnosis1'] == 'HC'] features_SZ = features[features['diagnosis1'] == 'SZ'] features = pd.concat([features_HC, features_SZ]) # Get X, y for females X, y = get_xy( features=features, label_column='diagnosis1', exclude_columns=['diagnosis1', 'diagnosis2', 'age', 'gender']) # Run linear SVM scores, cs, gammas = run_svm(X, y, get_accuracy, kernel='linear') log('accuracy: {} +/- {} (linear)'.format(np.mean(scores), np.std(scores))) roi_info[roi_name]['females']['linear']['accuracy']['mean'] = np.mean( scores) roi_info[roi_name]['females']['linear']['accuracy']['stddev'] = np.std( scores) roi_info[roi_name]['females']['linear']['cs'] = cs # Write scores to JSON if not os.path.isdir(OUTPUTS_DIR): os.mkdir(OUTPUTS_DIR) output_file = os.path.join(OUTPUTS_DIR, get_log_timestamp() + '_scores.json') write_dict_to_json(output_file, roi_info) # Append this script's text to the log file for backup add_text_to_log(script_text) finish_log()