def main():
    """Run the phase-phase and phase-amplitude coupling analyses.

    Both analyses follow the same steps: compute the first-level table,
    write it to Excel and read it back, run ``second_level`` and
    ``cron_alpha_test`` on it, save all three tables in one workbook, and
    draw the grouped boxplot.
    """
    pdata = pu.proj_data()
    rois = pdata.roiLabels
    meg_subj, meg_sess = pdata.get_meg_metadata()
    phase_amp_file = '../data/MEG_phase_amp_data.hdf5'
    attn_rois = ['IPS1_R', 'FEF_R', 'TPOJ1_R', 'AVI_R', '7m_R']

    def _run_pipeline(first_level_func, first_level_path, second_level_path,
                      boxplot_path):
        # The first-level table is written out and then reloaded, so every
        # later step works on the copy stored on disk.
        computed = first_level_func(phase_amp_file, meg_subj, meg_sess, rois,
                                    attn_rois)
        computed.to_excel(first_level_path)
        first_level_tests = pd.read_excel(first_level_path, index_col=0)

        second_level_res = second_level(first_level_tests)
        cron_alpha_res = cron_alpha_test(first_level_tests, attn_rois,
                                         meg_sess)

        workbook = {
            'first_level_tests': first_level_tests,
            'second_level_tests': second_level_res,
            'cron_alpha_tests': cron_alpha_res
        }
        pu.save_xls(workbook, second_level_path)

        plot_grouped_boxplot(first_level_tests,
                             attn_rois,
                             cron_alpha_df=cron_alpha_res,
                             fname=boxplot_path)

    # --Phase-phase coupling-- #
    _run_pipeline(first_level_ppc,
                  '../data/attention_networks/ppc_first_level.xlsx',
                  '../data/attention_networks/ppc_second_level.xlsx',
                  '../figures/attention_networks/ppc_boxplot.pdf')

    # --Phase-amplitude coupling-- #
    _run_pipeline(first_level_pac,
                  '../data/attention_networks/pac_first_level.xlsx',
                  '../data/attention_networks/pac_second_level.xlsx',
                  '../figures/attention_networks/pac_boxplot.pdf')
def save_output(output_dir, behavior, scores, confusion_matrices, features,
                grid_df=None, model=None, resamp_method=None,
                covariates=True):
    """Save the results of one classification run in its own sub-folder.

    The folder is named ``"<behavior> <model> <resamp_method> <cov_check>"``
    and is created under ``output_dir`` when it does not exist yet.

    Parameters
    ----------
    output_dir : str
        Parent directory for the result folder.
    behavior, model, resamp_method
        Values used (via ``str``) to build the folder name.
    scores
        Passed to ``pu.save_xls`` as ``performance.xlsx``.
    confusion_matrices : dict
        Per-fold confusion matrices; saved raw and row-normalized.
    features
        Saved as ``coefficients.xlsx`` unless it is ``None``.
    grid_df
        Accepted for interface compatibility; not used here.
    covariates : bool
        Selects the ``with_covariates`` / ``without_covariates`` name suffix.
    """
    cov_check = 'with_covariates' if covariates else 'without_covariates'
    name_parts = (behavior, model, resamp_method, cov_check)
    res_dir = join(output_dir, ' '.join(str(part) for part in name_parts))

    if not isdir(res_dir):
        mkdir(res_dir)

    pu.save_xls(scores, join(res_dir, 'performance.xlsx'))

    if features is not None:
        pu.save_xls(features, join(res_dir, 'coefficients.xlsx'))

    pu.save_xls(confusion_matrices, join(res_dir, 'confusion_matrices.xlsx'))

    # Divide every row by its total so each row expresses proportions.
    normalized_cms = {
        fold: cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        for fold, cm in confusion_matrices.items()
    }
    pu.save_xls(normalized_cms,
                join(res_dir, 'confusion_matrices_normalized.xlsx'))
def get_variable_data():
    """Tabulate class counts of the categorical target variables.

    Counts are computed for tinnitus side, tinnitus type, TQ distress
    (high/low and graded), and gender, then saved as one workbook
    (``tin_variables_classcount.xlsx``) in ``./../data/eeg_classification``.
    """

    def _count_data(data_to_count, vartype):
        # Occurrences of each distinct label, via a one-column frame.
        frame = pd.DataFrame(data_to_count, columns=[vartype])
        return frame[vartype].value_counts()

    output_dir = './../data/eeg_classification'
    if not isdir(output_dir):
        mkdir(output_dir)

    behavior_data, _ = pu.load_data_full_subjects()

    side_labels = pu.convert_tin_to_str(
        behavior_data['tinnitus_side'].values.astype(float), 'tinnitus_side')
    type_labels = pu.convert_tin_to_str(
        behavior_data['tinnitus_type'].values.astype(float), 'tinnitus_type')

    tq_scores = behavior_data['distress_TQ'].values

    # With right=True a score equal to a threshold falls in the lower bin,
    # so bin indices 0 and 1 (scores up to and including 46) are 'Low'.
    high_low_bins = np.digitize(tq_scores, bins=[0, 46, 84], right=True)
    high_low_labels = [
        'Low' if bin_idx < 2 else 'High' for bin_idx in high_low_bins
    ]

    grade_bins = np.digitize(tq_scores, bins=[0, 30, 46, 59, 84], right=True)
    grade_labels = ['Grade_%d' % bin_idx for bin_idx in grade_bins]

    gender_labels = [
        'Male' if value > 0 else 'Female'
        for value in behavior_data['sex'].values
    ]

    output = {
        'side': _count_data(side_labels, 'Side'),
        'type': _count_data(type_labels, 'Type'),
        'tq_high_low': _count_data(high_low_labels, 'TQ (High/Low)'),
        'tq_grade': _count_data(grade_labels, 'TQ (Grade)'),
        'gender': _count_data(gender_labels, 'Gender')
    }
    pu.save_xls(output, join(output_dir, 'tin_variables_classcount.xlsx'))
def pls_psqi_with_power(sessions, rois, fig_dir, run_check=False):
    """Relate PSQI sleep components to MEG power with PLSC and save results.

    Parameters
    ----------
    sessions, rois
        Forwarded to ``organize_brain_sals`` / ``plot_roi_saliences``.
    fig_dir : str
        Output directory (created if missing); also where the stored
        ``pls_sleep.pkl`` is read from when ``run_check`` is False.
    run_check : bool
        If True, ``run_pls`` is executed; otherwise its stored output is
        loaded from ``fig_dir``.
    """
    logging.info('%s: Running PLSC on PSQI components with power' %
                 pu.ctime())

    if not os.path.isdir(fig_dir):
        os.mkdir(fig_dir)

    meg_df = pd.read_excel('../data/MEG_infraslow_power.xlsx',
                           sheet_name='location',
                           index_col=0)
    sleep_df, sleep_variables = load_psqi_data()

    if not run_check:
        logging.info('%s: Loading raw output' % pu.ctime())
        with open(fig_dir + '/pls_sleep.pkl', 'rb') as pkl_file:
            stored = pkl.load(pkl_file)
        pres = stored['permutation tests']
        bres = stored['bootstrap_tests']
    else:
        pres, bres = run_pls(x=meg_df.values,
                             y=sleep_df.values,
                             output_dir=fig_dir)

    # Latent variables are kept when their permutation p-value is below alpha.
    alpha = .001
    significant = np.where(pres['p_values'] < alpha)[0]
    nv = len(significant)
    latent_vars = ['LV_%d' % number for number in range(1, nv + 1)]

    pls_functions.plot_scree(eigs=pres['true_eigs'],
                             pvals=pres['p_values'],
                             alpha=alpha,
                             fname=fig_dir + '/scree.png')

    behavior_df = pd.DataFrame(bres['y_zscores'][:nv, :],
                               index=latent_vars,
                               columns=sleep_variables)
    behavior_df.to_excel(fig_dir + '/behavior_res.xlsx')

    brain_res = organize_brain_sals(bres['x_zscores'],
                                    rois,
                                    sessions,
                                    latent_vars,
                                    comp='sign')
    pu.save_xls(brain_res, fig_dir + '/brain_res.xlsx')

    plot_roi_saliences(rois,
                       brain_res['brain_conjunction'],
                       fig_dir,
                       maxv=120,
                       create_rois=False)

    logging.info('%s: Finished' % pu.ctime())
def _multilabel_score_frames(f1_scores, balanced_acc, chance_acc, n_splits,
                             class_labels):
    """Build the accuracy and F1 tables: one row per fold plus 'Average'."""
    rownames = ['Fold %02d' % (n + 1) for n in range(n_splits)]
    rownames.append('Average')

    f1_array = np.asarray(f1_scores)
    f1_data = np.vstack((f1_array, np.mean(f1_array, axis=0)))

    accuracy_data = np.asarray([
        list(balanced_acc) + [np.mean(balanced_acc)],
        list(chance_acc) + [np.mean(chance_acc)],
    ]).T

    score_df = pd.DataFrame(data=accuracy_data,
                            index=rownames,
                            columns=['Balanced accuracy', 'Chance accuracy'])
    f1_df = pd.DataFrame(data=f1_data, index=rownames, columns=class_labels)
    return {'accuracy scores': score_df, 'f1 scores': f1_df}


def _multilabel_confusion_frames(y_true, y_pred, class_labels):
    """Return the raw and row-normalized confusion matrices as DataFrames."""
    cm = metrics.confusion_matrix(y_true, y_pred)
    normalized_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    raw_df = pd.DataFrame(cm, index=class_labels, columns=class_labels)
    norm_df = pd.DataFrame(normalized_cm,
                           index=class_labels,
                           columns=class_labels)
    return raw_df, norm_df


def eeg_multilabel_classify(ml_data, target_data, target_type, model, outdir):
    """Cross-validate a two-output (anxiety, depression) classifier.

    The data are resampled with ``resample_multilabel``, split with a
    10-fold stratified splitter, feature-selected per fold, and classified.
    Per-fold scores, confusion matrices, coefficients (extra-trees only) and
    the fitted classifiers are written to ``<outdir>/<target_type>``.

    Parameters
    ----------
    ml_data : pandas.DataFrame
        Feature table; column names are used as feature names. A column
        named ``categorical_sex_male`` switches on covariate handling.
    target_data
        Targets forwarded to ``resample_multilabel``.
    target_type : str
        Name of the output sub-folder and label used in progress messages.
    model : str
        ``'extra_trees'`` or ``'knn'``.
    outdir : str
        Parent output directory.

    Raises
    ------
    ValueError
        If ``model`` is not one of the supported names.
    """
    # Compare by value: identity checks against string literals are
    # unreliable, and an unknown name used to fail later with an unbound
    # variable.
    if model not in ('extra_trees', 'knn'):
        raise ValueError(
            "model must be 'extra_trees' or 'knn', got %r" % (model,))

    target_outdir = join(outdir, target_type)
    if not isdir(target_outdir):
        mkdir(target_outdir)

    feature_names = list(ml_data)

    # Column positions do not change between folds, so resolve them once.
    has_covariates = "categorical_sex_male" in feature_names
    if has_covariates:
        continuous_indices = [
            ml_data.columns.get_loc(f) for f in feature_names
            if 'categorical' not in f
        ]
        categorical_indices = [
            ml_data.columns.get_loc(f) for f in feature_names
            if 'categorical' in f
        ]

    # Shuffling is off, so a seed would have no effect on the splits; newer
    # scikit-learn releases reject random_state without shuffle=True.
    n_splits = 10
    skf = model_selection.StratifiedKFold(n_splits=n_splits)

    # Note: the LP-transformation has to be applied for resampling, even
    # though the problem is not treated as one-vs-rest.
    x_res, y_res, y_res_lp_transformed = resample_multilabel(
        ml_data, target_data)

    classifier_objects, classifier_coefficients = {}, {}
    anx_balanced_acc, anx_chance_acc, anx_f1_scores = [], [], []
    dep_balanced_acc, dep_chance_acc, dep_f1_scores = [], [], []
    anx_cm_dict, anx_norm_cm_dict = {}, {}
    dep_cm_dict, dep_norm_cm_dict = {}, {}

    splits = skf.split(x_res, y_res_lp_transformed)
    for fold_count, (train_idx, test_idx) in enumerate(splits, start=1):
        print('%s: Running FOLD %d for %s' %
              (pu.ctime(), fold_count, target_type))
        foldname = 'Fold %02d' % fold_count

        # Stratified k-fold splitting
        x_train, x_test = x_res[train_idx], x_res[test_idx, :]
        y_train, y_test = y_res[train_idx], y_res[test_idx, :]

        if has_covariates:
            x_train_fs, x_test_fs, cleaned_features = \
                feature_selection_with_covariates(
                    x_train, x_test, y_train, continuous_indices,
                    categorical_indices, feature_names)
        else:
            x_train_fs, x_test_fs, cleaned_features = \
                feature_selection_without_covariates(
                    x_train, x_test, y_train, feature_names)

        if model == 'extra_trees':
            predicted, feature_importances, clf = extra_trees(
                x_train_fs, y_train, x_test_fs, cleaned_features)
            classifier_coefficients[foldname] = feature_importances
        else:  # 'knn'
            predicted, clf = knn(x_train_fs, y_train, x_test_fs)

        classifier_objects[foldname] = clf

        # clf.classes_ holds one label list per output; flatten it and split
        # by which target each label name mentions.
        classes = []
        for subclass_list in clf.classes_:
            classes.extend(list(subclass_list))
        anx_classes = [c for c in classes if 'anxiety' in c]
        dep_classes = [c for c in classes if 'depression' in c]

        # Anxiety predictions (first output column)
        yt, pred = y_test[:, 0], predicted[:, 0]
        balanced, chance, f1 = calc_scores(yt, pred)
        anx_balanced_acc.append(balanced)
        anx_chance_acc.append(chance)
        anx_f1_scores.append(f1)
        anx_cm_dict[foldname], anx_norm_cm_dict[foldname] = \
            _multilabel_confusion_frames(yt, pred, anx_classes)

        # Depression predictions (second output column)
        yt, pred = y_test[:, 1], predicted[:, 1]
        balanced, chance, f1 = calc_scores(yt, pred)
        dep_balanced_acc.append(balanced)
        dep_chance_acc.append(chance)
        dep_f1_scores.append(f1)
        dep_cm_dict[foldname], dep_norm_cm_dict[foldname] = \
            _multilabel_confusion_frames(yt, pred, dep_classes)

    # Saving performance scores
    anx_scores = _multilabel_score_frames(anx_f1_scores, anx_balanced_acc,
                                          anx_chance_acc, n_splits,
                                          anx_classes)
    pu.save_xls(anx_scores, join(target_outdir, 'anxiety_performance.xlsx'))

    dep_scores = _multilabel_score_frames(dep_f1_scores, dep_balanced_acc,
                                          dep_chance_acc, n_splits,
                                          dep_classes)
    pu.save_xls(dep_scores,
                join(target_outdir, 'depression_performance.xlsx'))

    # Saving coefficients (only filled in by the extra-trees branch)
    if classifier_coefficients:
        pu.save_xls(classifier_coefficients,
                    join(target_outdir, 'coefficients.xlsx'))

    # Saving confusion matrices
    pu.save_xls(anx_cm_dict,
                join(target_outdir, 'anxiety_confusion_matrices.xlsx'))
    pu.save_xls(
        anx_norm_cm_dict,
        join(target_outdir, 'anxiety_confusion_matrices_normalized.xlsx'))
    pu.save_xls(dep_cm_dict,
                join(target_outdir, 'depression_confusion_matrices.xlsx'))
    pu.save_xls(
        dep_norm_cm_dict,
        join(target_outdir, 'depression_confusion_matrices_normalized.xlsx'))

    # Saving classifier objects
    with open(join(target_outdir, 'classifier_object.pkl'), 'wb') as pkl_file:
        pkl.dump(classifier_objects, pkl_file)
for subj in subjects: f = h5py.File('../data/downsampled_MEG_truncated.hdf5', 'r') data = f[subj + '/MEG/' + sess + '/resampled_truncated'][...] f.close() data = _butter_filter(data, fs=500, cutoffs=[.01, .1]) fft_power = np.absolute(np.fft.rfft(data, axis=0)) ** 2 average_power = np.mean(fft_power, axis=0) session_df.loc[subj] = average_power df_list.append(session_df) grand_df = pd.concat(df_list, axis=1) return grand_df print('%s: Finished' % pu.ctime()) if __name__ == "__main__": location_df = calc_phase_amp_power(how='location') bp_df = calc_phase_amp_power(how='bandpass') trunc_df = calc_phase_amp_power(how='truncated') power_dict = {'location': location_df, 'bandpass': bp_df, 'truncated': trunc_df} pu.save_xls(power_dict, '../data/MEG_infraslow_power.xlsx')
def pls_psqi_with_ppc_roi_version(fig_dir, run_check=False):
    """Relate PSQI sleep components to phase-phase coupling with PLSC.

    First-level PPC columns are read from
    ``../data/attention_networks/ppc_first_level.xlsx``; columns whose name
    contains one of the "mirror" labels returned by ``mirror_strfind`` are
    dropped before the analysis. Results are written to ``fig_dir`` together
    with a scree plot and a ROI-by-ROI heatmap of the first latent variable.

    Parameters
    ----------
    fig_dir : str
        Output directory (created if missing); also where the stored
        ``pls_sleep.pkl`` is read from when ``run_check`` is False.
    run_check : bool
        If True, ``run_pls`` is executed; otherwise its stored output is
        loaded from ``fig_dir``.
    """
    import matplotlib.pyplot as plt
    from seaborn import heatmap

    logging.info(
        '%s: Running PLSC on PSQI components with phase-phase coupling' %
        pu.ctime())
    if not os.path.isdir(fig_dir):
        os.mkdir(fig_dir)

    ppc_first_level = pd.read_excel(
        '../data/attention_networks/ppc_first_level.xlsx', index_col=0)

    # Column names are split on the first space and the second token is
    # treated as the connection label; the part before '-' is the ROI name.
    colnames = list(ppc_first_level)
    connections = [c.split(' ')[1] for c in colnames]
    rois = pd.unique([c.split('-')[0].replace('\n', '') for c in connections])

    _, mirror = mirror_strfind(rois)
    columns_to_drop = [c for m in mirror for c in colnames if m in c]
    meg_df = ppc_first_level.drop(columns=columns_to_drop)

    sessions = pd.unique([t.split(' ')[0] for t in list(meg_df)])
    connections = pd.unique([t.split(' ')[1] for t in list(meg_df)])

    sleep_df, sleep_variables = load_psqi_data()

    if run_check:
        pres, bres = run_pls(x=meg_df.values,
                             y=sleep_df.values,
                             output_dir=fig_dir)
    else:
        logging.info('%s: Loading raw output' % pu.ctime())
        with open(fig_dir + '/pls_sleep.pkl', 'rb') as pkl_file:
            res = pkl.load(pkl_file)
        pres = res['permutation tests']
        bres = res['bootstrap_tests']

    print(pres['p_values'])

    alpha = .001
    # Fixed to the first latent variable rather than derived from p-values
    # (i.e. not len(np.where(pres['p_values'] < alpha)[0])).
    nv = 1
    latent_vars = ['LV_%d' % (v + 1) for v in range(nv)]

    pls_functions.plot_scree(eigs=pres['true_eigs'],
                             pvals=pres['p_values'],
                             alpha=alpha,
                             fname=fig_dir + '/scree.png')

    behavior_df = pd.DataFrame(bres['y_zscores'][:nv, :],
                               index=latent_vars,
                               columns=sleep_variables)
    behavior_df.to_excel(fig_dir + '/behavior_res.xlsx')

    brain_res = organize_brain_sals(bres['x_zscores'],
                                    connections,
                                    sessions,
                                    latent_vars,
                                    comp='sign')
    pu.save_xls(brain_res, fig_dir + '/brain_res.xlsx')

    conj_res = brain_res['brain_conjunction']

    heatmap_data = pd.DataFrame(np.full(shape=(len(rois), len(rois)),
                                        fill_value=np.nan),
                                index=rois,
                                columns=rois)
    for roi1 in rois:
        for roi2 in rois:
            idx_label = '%s-%s' % (roi1, roi2)
            if idx_label not in conj_res.index:
                continue
            # Assign with a single .loc[row, col]: chained indexing
            # (.loc[row][col] = ...) can write to a temporary object and
            # leave heatmap_data unchanged.
            heatmap_data.loc[roi2, roi1] = conj_res.loc[idx_label, 'LV_1']

    fig, ax = plt.subplots(figsize=(8, 6))
    heatmap(data=heatmap_data,
            cmap='coolwarm',
            center=0.0,
            annot=True,
            fmt='.2f',
            cbar=True,
            square=True,
            ax=ax)
    fig.savefig(fig_dir + '/heatmap.svg')

    logging.info('%s: Finished' % pu.ctime())
def pls_psqi_with_bold_alpha_pac(fig_dir, run_check=True):
    """Relate PSQI sleep components to phase-amplitude coupling with PLSC.

    For every session, subject and ROI in
    ``../data/MEG_phase_amp_coupling.hdf5`` the entry ``[0, 3]`` of the
    ``r_vals`` dataset is collected into one subject-by-ROI table per
    session; the session tables are concatenated column-wise and analysed
    against the PSQI data.

    Parameters
    ----------
    fig_dir : str
        Output directory (created if missing); also where the stored
        ``pls_sleep.pkl`` is read from when ``run_check`` is False.
    run_check : bool
        If True, ``run_pls`` is executed; otherwise its stored output is
        loaded from ``fig_dir``.
    """
    logging.info(
        '%s: Running PLSC on PSQI components with phase-amplitude coupling' %
        pu.ctime())
    if not os.path.isdir(fig_dir):
        os.mkdir(fig_dir)

    # Row/column of r_vals to extract. NOTE(review): the names suggest the
    # BOLD phase band and the alpha amplitude band - confirm against the code
    # that wrote the file.
    bold_pac_index = 0
    alpha_pac_index = 3

    # Open the file once, explicitly read-only, and let the context manager
    # close it even if a key is missing.
    meg_data = []
    with h5py.File('../data/MEG_phase_amp_coupling.hdf5', 'r') as h5_file:
        # Extracting metadata
        sessions = list(h5_file)
        meg_subj = list(h5_file[sessions[0]])
        rois = list(h5_file[sessions[0] + '/' + meg_subj[0]])

        for sess in sessions:
            session_df = pd.DataFrame(index=meg_subj, columns=rois)
            for roi in rois:
                for subj in meg_subj:
                    key = sess + '/' + subj + '/' + roi + '/r_vals'
                    dset = h5_file[key][...]
                    # Single .loc[row, col] assignment; the chained form
                    # .loc[row][col] = ... may not write into session_df.
                    session_df.loc[subj, roi] = dset[bold_pac_index,
                                                     alpha_pac_index]
            meg_data.append(session_df)

    meg_df = pd.concat(meg_data, axis=1)

    sleep_df, sleep_variables = load_psqi_data()

    if run_check:
        pres, bres = run_pls(x=meg_df.values,
                             y=sleep_df.values,
                             output_dir=fig_dir)
    else:
        logging.info('%s: Loading raw output' % pu.ctime())
        with open(fig_dir + '/pls_sleep.pkl', 'rb') as pkl_file:
            res = pkl.load(pkl_file)
        pres = res['permutation tests']
        bres = res['bootstrap_tests']

    # Latent variables are kept when their permutation p-value is below alpha.
    alpha = .001
    nv = len(np.where(pres['p_values'] < alpha)[0])
    latent_vars = ['LV_%d' % (v + 1) for v in range(nv)]

    pls_functions.plot_scree(eigs=pres['true_eigs'],
                             pvals=pres['p_values'],
                             alpha=alpha,
                             fname=fig_dir + '/scree.png')

    behavior_df = pd.DataFrame(bres['y_zscores'][:nv, :],
                               index=latent_vars,
                               columns=sleep_variables)
    behavior_df.to_excel(fig_dir + '/behavior_res.xlsx')

    # Absolute brain z-scores are passed on here.
    brain_res = organize_brain_sals(np.abs(bres['x_zscores']),
                                    rois,
                                    sessions,
                                    latent_vars,
                                    comp='sign')
    pu.save_xls(brain_res, fig_dir + '/brain_res.xlsx')

    conj = brain_res['brain_conjunction']
    plot_roi_saliences(rois, conj, fig_dir, maxv=120, create_rois=False)

    logging.info('%s: Finished' % pu.ctime())
def eeg_classify(eeg_data,
                 target_data,
                 target_type,
                 model,
                 outdir=None,
                 resample='SMOTE'):
    """Cross-validate a single-target classifier on (resampled) EEG features.

    The data are optionally resampled, split with a 50-fold stratified
    splitter, feature-selected per fold and classified. Scores are always
    returned; when ``outdir`` is given they are also written to disk together
    with coefficients, confusion matrices and the fitted classifiers.

    Parameters
    ----------
    eeg_data : pandas.DataFrame
        Feature table; column names are used as feature names. A column
        named ``categorical_sex_male`` switches on covariate handling.
    target_data
        Class labels, one per row of ``eeg_data``.
    target_type : str
        Label used in the output folder name and progress messages.
    model : str
        ``'svm'``, ``'extra_trees'`` or ``'knn'``.
    outdir : str or None
        Parent output directory; nothing is saved when None.
    resample : str
        ``'no_resample'``, ``'ROS'``, ``'SMOTE'`` or ``'RUS'``.

    Returns
    -------
    dict
        ``{'accuracy scores': DataFrame, 'f1 scores': DataFrame}`` as built
        by ``save_scores``.

    Raises
    ------
    ValueError
        If ``model`` or ``resample`` is not one of the supported names.
    """
    # Option strings are compared by value: identity checks against string
    # literals are unreliable, and unknown names used to fail later with an
    # unbound variable.
    if model not in ('svm', 'extra_trees', 'knn'):
        raise ValueError(
            "model must be 'svm', 'extra_trees' or 'knn', got %r" % (model,))

    feature_names = list(eeg_data)
    has_covariates = "categorical_sex_male" in feature_names
    cv_check = 'with_covariates' if has_covariates else 'without_covariates'

    if resample == 'no_resample':

        class NoResample:  # for convenience
            @staticmethod
            def fit_resample(a, b):
                return a.values, np.asarray(b)

        resampler = NoResample()
    elif resample == 'ROS':
        resampler = RandomOverSampler(sampling_strategy='not majority',
                                      random_state=seed)
    elif resample == 'SMOTE':
        resampler = SMOTE(sampling_strategy='not majority',
                          random_state=seed)
    elif resample == 'RUS':
        resampler = RandomUnderSampler(sampling_strategy='not minority',
                                       random_state=seed)
    else:
        raise ValueError(
            "resample must be 'no_resample', 'ROS', 'SMOTE' or 'RUS', "
            "got %r" % (resample,))

    x_res, y_res = resampler.fit_resample(eeg_data, target_data)
    # Fold indices are positional, so make sure plain arrays are indexed even
    # if the resampler hands back pandas objects.
    x_res, y_res = np.asarray(x_res), np.asarray(y_res)

    model_outdir = None
    if outdir is not None:
        model_outdir = join(
            outdir, '%s %s %s %s' % (target_type, model, cv_check, resample))
        if not isdir(model_outdir):
            mkdir(model_outdir)

    print('%s: Running classification - %s %s %s %s' %
          (pu.ctime(), target_type, model, cv_check, resample))

    # Column positions do not change between folds, so resolve them once.
    if has_covariates:
        continuous_indices = [
            eeg_data.columns.get_loc(f) for f in feature_names
            if 'categorical' not in f
        ]
        categorical_indices = [
            eeg_data.columns.get_loc(f) for f in feature_names
            if 'categorical' in f
        ]

    # Shuffling is off, so a seed would have no effect on the splits; newer
    # scikit-learn releases reject random_state without shuffle=True.
    n_splits = 50
    skf = model_selection.StratifiedKFold(n_splits=n_splits)

    classifier_objects, classifier_coefficients = {}, {}
    cm_dict, norm_cm_dict = {}, {}
    balanced_acc, chance_acc, f1_scores = [], [], []

    splits = skf.split(x_res, y_res)
    for fold_count, (train_idx, test_idx) in enumerate(splits, start=1):
        print('%s: Running FOLD %d for %s' %
              (pu.ctime(), fold_count, target_type))
        foldname = 'Fold %02d' % fold_count

        # Stratified k-fold splitting
        x_train, x_test = x_res[train_idx], x_res[test_idx]
        y_train, y_test = y_res[train_idx], y_res[test_idx]

        if has_covariates:
            x_train_fs, x_test_fs, cleaned_features = \
                feature_selection_with_covariates(
                    x_train, x_test, y_train, continuous_indices,
                    categorical_indices, feature_names)
        else:
            x_train_fs, x_test_fs, cleaned_features = \
                feature_selection_without_covariates(
                    x_train, x_test, y_train, feature_names)

        if model == 'svm':
            predicted, coef_df, clf = svmc(x_train_fs, y_train, x_test_fs,
                                           cleaned_features)
            classifier_coefficients[foldname] = coef_df
        elif model == 'extra_trees':
            predicted, feature_importances, clf = extra_trees(
                x_train_fs, y_train, x_test_fs, cleaned_features)
            classifier_coefficients[foldname] = feature_importances
        else:  # 'knn'
            predicted, clf = knn(x_train_fs, y_train, x_test_fs)

        classifier_objects[foldname] = clf

        # Calculating fold performance scores
        balanced, chance, f1 = calc_scores(y_test, predicted)
        balanced_acc.append(balanced)
        chance_acc.append(chance)
        f1_scores.append(f1)

        # Calculating fold confusion matrix (raw and row-normalized)
        cm = metrics.confusion_matrix(y_test, predicted)
        normalized_cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

        cm_dict[foldname] = pd.DataFrame(cm,
                                         index=clf.classes_,
                                         columns=clf.classes_)
        norm_cm_dict[foldname] = pd.DataFrame(normalized_cm,
                                              index=clf.classes_,
                                              columns=clf.classes_)

    # Building performance score tables
    f1_df, score_df = save_scores(f1_scores,
                                  balanced_acc,
                                  chance_acc,
                                  class_labels=clf.classes_)
    scores_dict = {'accuracy scores': score_df, 'f1 scores': f1_df}

    # Saving is best-effort: a failed write must not discard the computed
    # scores, but it is reported instead of being silently ignored.
    if model_outdir is not None:
        try:
            pu.save_xls(scores_dict, join(model_outdir, 'performance.xlsx'))

            # Saving coefficients
            if classifier_coefficients:
                pu.save_xls(classifier_coefficients,
                            join(model_outdir, 'coefficients.xlsx'))

            pu.save_xls(cm_dict, join(model_outdir,
                                      'confusion_matrices.xlsx'))
            pu.save_xls(
                norm_cm_dict,
                join(model_outdir, 'confusion_matrices_normalized.xlsx'))

            # Saving classifier objects
            with open(join(model_outdir, 'classifier_object.pkl'),
                      'wb') as pkl_file:
                pkl.dump(classifier_objects, pkl_file)
        except Exception as err:
            print('%s: Could not save results to %s (%s: %s)' %
                  (pu.ctime(), model_outdir, type(err).__name__, err))

    return scores_dict