def start_actors(self):
    """Create one remote ``RayActor`` per job currently in the input queue.

    The number of actors is ``self.in_queue.qsize()`` at call time. Every
    actor receives the data object plus the input, output and status queues.
    The resulting handles are stored in ``self.actors``.
    """
    n_jobs = self.in_queue.qsize()
    printv("Starting actors for {} jobs...".format(n_jobs))
    handles = []
    for _ in range(n_jobs):
        handles.append(
            RayActor.remote(self.data_object, self.in_queue, self.out_queue,
                            self.status_queue))
    self.actors = handles
def add_kfold_indices(self, n_folds, clean=True):
    """Compute k-fold indices and store them under ``data_dict['kfold_indices']``.

    Args:
        n_folds: Number of folds passed to ``get_kfold_indices``.
        clean: If true, the indices are post-processed by
            ``clean_kfold_indices`` together with ``self.behav_data``.

    The subject IDs are taken from the index of ``self.data_dict['data']``.
    """
    indices = get_kfold_indices(self.data_dict['data'].index, n_folds)
    if clean:
        indices = clean_kfold_indices(indices, self.behav_data)
    self.data_dict['kfold_indices'] = indices
    printv("You need to (re-) upload data after this operation.")
def get_suprathr_edges_new(df_dict,
                           p_thresh_pos=None,
                           p_thresh_neg=None,
                           r_thresh_pos=None,
                           r_thresh_neg=None,
                           percentile_neg=None,
                           percentile_pos=None,
                           top_n_pos=None,
                           top_n_neg=None):
    """Build positive/negative suprathreshold edge masks for every fold.

    Exactly one selection criterion is used per call, chosen in this order of
    priority: p-value thresholds, r thresholds, percentiles, top-n. A pair
    counts as "given" only when both of its values are truthy (so ``None``
    and ``0`` are treated as not given).

    Args:
        df_dict: Mapping ``fold -> DataFrame`` with an ``'r'`` column (and a
            ``'p-val'`` column when p-value thresholds are used). Fold keys
            must support ``fold + 1`` because they are used in the log
            message.
        p_thresh_pos, p_thresh_neg: Keep edges with ``r > 0`` / ``r < 0`` and
            ``p-val <= threshold``.
        r_thresh_pos, r_thresh_neg: Keep edges with ``r > r_thresh_pos`` /
            ``r < -abs(r_thresh_neg)``; the negative threshold may be given
            with either sign.
        percentile_pos, percentile_neg: Per-fold cut-offs; the positive
            cut-off is the ``percentile_pos`` percentile of all ``r`` values,
            the negative one the ``100 - percentile_neg`` percentile of the
            negative ``r`` values.
        top_n_pos, top_n_neg: Keep the ``top_n_pos`` largest and the
            ``top_n_neg`` smallest non-NaN ``r`` values (capped at the number
            of non-NaN values).

    Returns:
        ``{fold: {'pos': array, 'neg': array}}`` where each array is a float
        vector of length ``len(df_dict[fold])`` holding 0.0/1.0. An empty
        ``df_dict`` yields an empty dict.

    Raises:
        TypeError: If no complete criterion pair is given.
    """
    masks_dict = {}
    for fold, pcorr_df in df_dict.items():
        r_vals = pcorr_df['r']
        n_edges = len(pcorr_df)

        if p_thresh_pos and p_thresh_neg:
            p_vals = pcorr_df['p-val']
            pos_mask = (r_vals > 0) & (p_vals <= p_thresh_pos)
            neg_mask = (r_vals < 0) & (p_vals <= p_thresh_neg)
        elif r_thresh_pos and r_thresh_neg:
            pos_mask = r_vals > r_thresh_pos
            neg_mask = r_vals < -abs(r_thresh_neg)
        elif percentile_pos and percentile_neg:
            # Keep the per-fold cut-offs in locals. Writing them back into
            # r_thresh_pos/r_thresh_neg would make the r-threshold branch
            # fire for every later fold with the first fold's cut-offs.
            pos_cutoff = np.nanpercentile(r_vals, percentile_pos)
            neg_cutoff = np.nanpercentile(r_vals[r_vals < 0],
                                          100 - percentile_neg)
            pos_mask = r_vals > pos_cutoff
            neg_mask = r_vals < -abs(neg_cutoff)
        elif top_n_pos and top_n_neg:
            r_array = np.asarray(r_vals, dtype=float)
            # argpartition works on the NaN-free values, so its positions
            # must be mapped back to positions in the full edge vector.
            valid_idx = np.flatnonzero(~np.isnan(r_array))
            valid_r = r_array[valid_idx]
            pos_mask = np.zeros(n_edges, dtype=bool)
            neg_mask = np.zeros(n_edges, dtype=bool)
            n_pos = min(top_n_pos, valid_r.size)
            n_neg = min(top_n_neg, valid_r.size)
            if n_pos > 0:
                largest = np.argpartition(valid_r, -n_pos)[-n_pos:]
                pos_mask[valid_idx[largest]] = True
            if n_neg > 0:
                smallest = np.argpartition(valid_r, n_neg - 1)[:n_neg]
                neg_mask[valid_idx[smallest]] = True
        else:
            raise TypeError(
                'Either p_thresh_{neg, pos} or r_thresh_{neg, pos} or percentile_{neg, pos} or top_n_{pos, neg} needs to be defined.'
            )

        printv(
            "Fold {}: Pos/neg suprathreshold edges (max r pos/max r neg): {}/{} ({}/{})"
            .format(fold + 1, pos_mask.sum(), neg_mask.sum(), r_vals.max(),
                    r_vals.min()))

        masks_dict[fold] = {}
        for tail, mask in (('pos', pos_mask), ('neg', neg_mask)):
            masks_dict[fold][tail] = np.zeros(n_edges)
            masks_dict[fold][tail][:] = mask.astype(bool)
    return masks_dict
def get_fselection_results(self):
    """Fetch feature-selection results from the output queue and file them.

    Each retrieved item is indexed as ``(fold, perm, df)`` and stored as
    ``self.fselection_results[perm][fold] = df``.
    """
    fetched = self.get_results(self.out_queue)
    total = len(fetched)
    printv("\n")
    for position, item in enumerate(fetched, start=1):
        fold, perm, df = item[0], item[1], item[2]
        printv("Rearranging result {} of {}".format(position, total),
               update=True)
        self.fselection_results[perm][fold] = df
def perform_cpm(all_fc_data, all_behav_data, behav, k=10, **cpm_kwargs):
    """Run a k-fold CPM and collect observed/predicted behavior and masks.

    Args:
        all_fc_data: DataFrame of edge vectors; its index is used as the
            subject list and its column count as the number of edges.
        all_behav_data: Behavioral data whose index must equal that of
            ``all_fc_data``.
        behav: Name of the behavior; used to select ``all_behav_data[behav]``
            and to build the output column names.
        k: Number of folds.
        **cpm_kwargs: Forwarded to ``select_features``.

    Returns:
        Tuple ``(behav_obs_pred, all_masks)``: a DataFrame with one
        "predicted" column per tail (pos, neg, glm) plus an "observed"
        column, and a dict with ``'pos'``/``'neg'`` arrays of shape
        ``(k, n_edges)``.

    Folds for which ``build_model`` returns a falsy value are skipped; their
    masks are still stored.
    """
    from hcpsuite import timer

    timer('tic', name='Linear CPM')

    assert all_fc_data.index.equals(
        all_behav_data.index
    ), "Row (subject) indices of FC vcts and behavior don't match!"

    subj_list = all_fc_data.index
    indices = create_kfold_indices(subj_list, k=k)

    # Output frame: one prediction column per tail, then the observed values
    observed_col = behav + " observed"
    col_list = [
        behav + " predicted (" + tail + ")" for tail in ("pos", "neg", "glm")
    ]
    col_list.append(observed_col)
    behav_obs_pred = pd.DataFrame(index=subj_list, columns=col_list)

    # One row of edge flags per fold and tail
    n_edges = all_fc_data.shape[1]
    all_masks = {}
    for tail in ("pos", "neg"):
        all_masks[tail] = np.zeros((k, n_edges))

    n_folds_completed = 0
    for fold in range(k):
        printv("Doing fold {} of {} (successful folds: {})...".format(
            fold + 1, k, n_folds_completed))

        train_subs, test_subs = split_train_test(subj_list,
                                                 indices,
                                                 test_fold=fold)
        train_vcts, train_behav, test_vcts = get_train_test_data(
            all_fc_data, train_subs, test_subs, all_behav_data, behav=behav)

        mask_dict = select_features(train_vcts, train_behav, **cpm_kwargs)
        for tail in ("pos", "neg"):
            all_masks[tail][fold, :] = mask_dict[tail]

        model_dict = build_model(train_vcts, mask_dict, train_behav)
        if model_dict:
            behav_pred = apply_model(test_vcts, mask_dict, model_dict)
            for tail, predictions in behav_pred.items():
                behav_obs_pred.loc[test_subs, behav + " predicted (" + tail +
                                   ")"] = predictions
            n_folds_completed += 1
        else:
            # build_model signals invalid arrays with a falsy return value
            printv(" - Fold failed -> continuing with next fold...")

    print("\nCPM completed. Successful folds: {}".format(n_folds_completed))
    behav_obs_pred.loc[subj_list, observed_col] = all_behav_data[behav]
    timer('toc')
    return behav_obs_pred, all_masks
def create_fold(fold, subj_list, indices, all_fc_data, all_behav_data, behav):
    """Assemble the training and test data for a single fold.

    Args:
        fold: Zero-based fold number; used as ``test_fold`` and shown as
            ``fold + 1`` in the progress message.
        subj_list, indices: Passed to ``split_train_test``.
        all_fc_data, all_behav_data, behav: Passed to
            ``get_train_test_data``.

    Returns:
        Tuple ``(train_vcts, train_behav, test_vcts, test_subs)``.
    """
    printv("Creating fold {}...".format(fold + 1), update=True)
    train_subs, test_subs = split_train_test(subj_list,
                                             indices,
                                             test_fold=fold)
    fold_data = get_train_test_data(all_fc_data,
                                    train_subs,
                                    test_subs,
                                    all_behav_data,
                                    behav=behav)
    train_vcts, train_behav, test_vcts = fold_data
    return train_vcts, train_behav, test_vcts, test_subs
def do_perm(n, n_perm, train_vcts, train_behav, test_vcts, test_behav,
            **cpm_kwargs):
    """Run one permutation: shuffle training behavior, fit, and score the GLM.

    Args:
        n: Zero-based permutation number (reported as ``n + 1``).
        n_perm: Total number of permutations (for the progress message).
        train_vcts: Training edge vectors passed to ``select_features`` and
            ``build_model``.
        train_behav: Container with an ``'obs'`` entry holding the observed
            training behavior. It is copied first, so the caller's data is
            not permuted.
        test_vcts: Test edge vectors passed to ``apply_model``.
        test_behav: Receives the GLM predictions in ``test_behav['glm']``
            (this in-place write is kept from the original behavior).
        **cpm_kwargs: Forwarded to ``select_features``.

    Returns:
        First element of ``get_r_value(test_behav, tail='glm')``.
    """
    global verbose
    printv("Doing permutation {} of {} ({} %)".format(
        n + 1, n_perm, round(((n + 1) / n_perm) * 100, 2)))

    # Shuffle on a copy so repeated calls do not scramble the caller's data
    train_behav = train_behav.copy()
    train_behav['obs'] = np.random.permutation(train_behav['obs'])
    permuted_behav = train_behav['obs']

    # Silence select_features; restore the flag even if it raises
    previous_verbose = verbose
    verbose = False
    try:
        mask_dict = select_features(train_vcts, permuted_behav, **cpm_kwargs)
    finally:
        verbose = previous_verbose

    # NOTE(review): other callers treat a falsy build_model result as a
    # failed fold; that case is not handled here -- confirm whether it can
    # occur during permutations.
    model_dict = build_model(train_vcts, mask_dict, permuted_behav)
    behav_pred = apply_model(test_vcts, mask_dict, model_dict)
    test_behav['glm'] = behav_pred[
        'glm']  # We're only interested in GLM at this point
    return get_r_value(test_behav, tail='glm')[0]
def get_results(self, queue, n=100):
    """Drain ``queue`` in batches and return all retrieved items as a list.

    Common get function used by get_{prediction,fselection}_results.

    Args:
        queue: Object providing ``empty()``, ``qsize()`` and
            ``get_nowait_batch(count)``.
        n: Maximum number of items fetched per batch.

    Returns:
        List with every item retrieved until ``queue.empty()`` was true.
    """
    N_total = 0
    results = []
    while not queue.empty():
        N = queue.qsize()
        N_total = max(N_total, N)
        # Shrink only this batch; overwriting ``n`` would keep all later
        # batches small even if the queue fills up again.
        batch_size = min(n, N)
        printv("Retrieving results: {} of {}".format(
            len(results) + batch_size, N_total),
               update=True)
        results.extend(queue.get_nowait_batch(batch_size))
    return results
def status(self, verbose=True):
    """Refresh the actor status table, print it, and report queue sizes.

    All pending entries of the status queue are read; each is indexed as
    ``(pid, node, msg)`` and stored in ``self.status_dict[pid]``. Actors
    with a truthy message are printed and numbered consecutively.

    Args:
        verbose: Not used by this method.

    Returns:
        Tuple ``(out_size, in_size)``: sizes of the output and input queues.
    """
    n_pending = self.status_queue.size()
    updates = self.status_queue.get_nowait_batch(n_pending)
    printv("Retrieving {} items from status queue...".format(n_pending))
    for update in updates:
        pid, node, msg = update[0], update[1], update[2]
        self.status_dict[pid] = {"msg": msg, "node": node}

    actor_no = 1
    for pid, info in self.status_dict.items():
        if not info['msg']:
            continue  # Only print alive actors (-> msg != None)
        print("Actor {} [{}@{}]: {}".format(actor_no, pid, info['node'],
                                            info['msg']))
        actor_no += 1
    print("\n")

    out_size = self.out_queue.qsize()
    in_size = self.in_queue.qsize()
    print("Jobs done: {}".format(out_size))
    print("Jobs remaining in queue: {}".format(in_size))
    return out_size, in_size
def convert_matrices_to_dataframe(array, subj_ids):
    """Flatten per-subject matrices into a DataFrame with one row per subject.

    Args:
        array: NumPy array indexed as ``array[subject, :, :]``
            (subjects:parcels:parcels).
        subj_ids: Subject IDs, one per entry along the first axis of
            ``array``; they become the DataFrame index.

    Returns:
        DataFrame whose rows hold the strict upper triangle (``k=1``) of
        each subject's matrix.
    """
    n_subjects = len(subj_ids)
    assert array.shape[
        0] == n_subjects, "Number of subject IDs is not equal to number of subjects in neuroimage file"

    fc_data = {}
    for position, subj_id in enumerate(subj_ids):
        printv("Flattening matrix of subject {} ({} of {}...)".format(
            subj_id, position + 1, len(subj_ids)),
               update=True)
        matrix = array[position, :, :]  # Matrix of a single subject
        # Only the upper triangle of the symmetric matrix is kept
        upper = np.triu_indices_from(matrix, k=1)
        fc_data[subj_id] = matrix[upper]

    printv("\nCreating DataFrame from matrices...")
    return pd.DataFrame.from_dict(fc_data, orient='index')
def do_fold(fold, train_vcts, train_behav, test_vcts, test_subs, subj_list,
            all_behav_data, behav, **cpm_kwargs):
    """Run one CPM fold and write its results into module-level globals.

    Updates the globals ``all_masks``, ``behav_obs_pred`` and
    ``n_folds_completed``.

    Args:
        fold: Zero-based fold number; selects the mask row to fill.
        train_vcts, train_behav: Training data for ``select_features`` and
            ``build_model``.
        test_vcts: Test data for ``apply_model``.
        test_subs: Row labels that receive the predictions.
        subj_list: Row labels that receive the observed behavior.
        all_behav_data, behav: Source and name of the observed behavior.
        **cpm_kwargs: Forwarded to ``select_features``.

    Returns:
        ``False`` if ``build_model`` returned a falsy value, otherwise
        ``None``.
    """
    global all_masks
    global behav_obs_pred
    global n_folds_completed

    printv("Doing fold {}...".format(fold + 1))
    mask_dict = select_features(train_vcts, train_behav, **cpm_kwargs)

    # Without a mask dict the fold's rows are filled with NaN
    has_masks = isinstance(mask_dict, dict)
    for tail in ("pos", "neg"):
        all_masks[tail][fold, :] = mask_dict[tail] if has_masks else np.nan

    model_dict = build_model(train_vcts, mask_dict, train_behav)
    if not model_dict:
        # build_model returns False instead of a dict if an array is not valid
        printv(" - Fold failed -> continuing with next fold...")
        return False

    behav_pred = apply_model(test_vcts, mask_dict, model_dict)
    for tail, predictions in behav_pred.items():
        column = behav + " predicted (" + tail + ")"
        behav_obs_pred.loc[test_subs, column] = predictions
    n_folds_completed += 1

    behav_obs_pred.loc[subj_list, behav + " observed"] = all_behav_data[behav]
def select_features(train_vcts,
                    train_behav,
                    r_thresh=0.2,
                    corr_type='pearson'):
    """Run the CPM feature selection step.

    Correlates each edge (column of ``train_vcts``) with behavior and returns
    one mask per tail marking the edges whose correlation exceeds the
    threshold.

    Args:
        train_vcts: Edge vectors; its index must equal ``train_behav.index``.
        train_behav: Behavior values for the same rows.
        r_thresh: Edges with ``r > r_thresh`` go into the positive mask,
            edges with ``r < -r_thresh`` into the negative mask.
        corr_type: ``'pearson'`` or ``'spearman'``.

    Returns:
        Dict with boolean masks under ``'pos'`` and ``'neg'``.

    Raises:
        ValueError: If ``corr_type`` is neither ``'pearson'`` nor
            ``'spearman'``.
    """
    assert train_vcts.index.equals(
        train_behav.index), "Row indices of FC vcts and behavior don't match!"

    # Correlate all edges with behav vector
    if corr_type == 'pearson':
        cov = np.dot(train_behav.T - train_behav.mean(),
                     train_vcts - train_vcts.mean(axis=0)) / (
                         train_behav.shape[0] - 1)
        corr = cov / np.sqrt(
            np.var(train_behav, ddof=1) * np.var(train_vcts, axis=0, ddof=1))
    elif corr_type == 'spearman':
        # Must be an array: a plain list supports neither the element-wise
        # threshold comparison nor .max()/.min() below.
        corr = np.asarray([
            sp.stats.spearmanr(train_vcts.loc[:, edge], train_behav)[0]
            for edge in train_vcts.columns
        ],
                          dtype=float)
    else:
        raise ValueError(
            "corr_type must be 'pearson' or 'spearman', got {!r}".format(
                corr_type))

    # Define positive and negative masks
    mask_dict = {}
    mask_dict["pos"] = corr > r_thresh
    mask_dict["neg"] = corr < -r_thresh

    printv(
        "  - Found ({}/{}) edges positively/negatively correlated (threshold: {}) with behavior in the training set"
        .format(mask_dict["pos"].sum(), mask_dict["neg"].sum(),
                r_thresh))  # for debugging
    printv("  - Max r pos: {}, max r neg: {}".format(corr.max(), corr.min()))

    return mask_dict
def start_workers(self, n_workers):
    """Create ``n_workers`` remote ``RayWorker`` handles.

    Every worker receives the data object plus the input, output and status
    queues. The handles are stored in ``self.workers``.
    """
    printv("Starting {} workers".format(n_workers))
    handles = []
    for _ in range(n_workers):
        handles.append(
            RayWorker.remote(self.data_object, self.in_queue, self.out_queue,
                             self.status_queue))
    self.workers = handles
def get_suprathr_edges(df_dict,
                       perm=-1,
                       p_thresh_pos=None,
                       p_thresh_neg=None,
                       r_thresh_pos=None,
                       r_thresh_neg=None,
                       percentile_neg=None,
                       percentile_pos=None,
                       top_n_pos=None,
                       top_n_neg=None):
    """Build positive/negative suprathreshold edge masks for one permutation.

    Exactly one selection criterion is used per call, chosen in this order of
    priority: p-value thresholds, r thresholds, percentiles, top-n. A pair
    counts as "given" only when both of its values are truthy (so ``None``
    and ``0`` are treated as not given).

    Args:
        df_dict: Nested mapping ``perm -> fold -> DataFrame`` with an ``'r'``
            column (and a ``'p-val'`` column when p-value thresholds are
            used). Fold keys are used directly as row indices of the result
            arrays and in ``fold + 1`` for the log message.
        perm: Key of the permutation to process.
        p_thresh_pos, p_thresh_neg: Keep edges with ``r > 0`` / ``r < 0`` and
            ``p-val <= threshold``.
        r_thresh_pos, r_thresh_neg: Keep edges with ``r > r_thresh_pos`` /
            ``r < -abs(r_thresh_neg)``; the negative threshold may be given
            with either sign.
        percentile_pos, percentile_neg: Per-fold cut-offs; the positive
            cut-off is the ``percentile_pos`` percentile of all ``r`` values,
            the negative one the ``100 - percentile_neg`` percentile of the
            negative ``r`` values.
        top_n_pos, top_n_neg: Keep the ``top_n_pos`` largest and the
            ``top_n_neg`` smallest non-NaN ``r`` values (capped at the number
            of non-NaN values).

    Returns:
        Dict with ``'pos'`` and ``'neg'`` float arrays of shape
        ``(n_folds, n_edges)`` holding 0.0/1.0, where ``n_edges`` is the
        length of the first fold's DataFrame.

    Raises:
        TypeError: If no complete criterion pair is given.
    """
    perm_dfs = df_dict[perm]
    folds_list = list(perm_dfs.keys())
    n_folds = len(folds_list)
    n_edges = len(perm_dfs[folds_list[0]])
    all_masks = {
        'pos': np.zeros((n_folds, n_edges)),
        'neg': np.zeros((n_folds, n_edges)),
    }

    for fold in folds_list:
        pcorr_df = perm_dfs[fold]
        r_vals = pcorr_df['r']

        if p_thresh_pos and p_thresh_neg:
            p_vals = pcorr_df['p-val']
            pos_mask = (r_vals > 0) & (p_vals <= p_thresh_pos)
            neg_mask = (r_vals < 0) & (p_vals <= p_thresh_neg)
        elif r_thresh_pos and r_thresh_neg:
            pos_mask = r_vals > r_thresh_pos
            neg_mask = r_vals < -abs(r_thresh_neg)
        elif percentile_pos and percentile_neg:
            # Keep the per-fold cut-offs in locals. Writing them back into
            # r_thresh_pos/r_thresh_neg would make the r-threshold branch
            # fire for every later fold with the first fold's cut-offs.
            pos_cutoff = np.nanpercentile(r_vals, percentile_pos)
            neg_cutoff = np.nanpercentile(r_vals[r_vals < 0],
                                          100 - percentile_neg)
            pos_mask = r_vals > pos_cutoff
            neg_mask = r_vals < -abs(neg_cutoff)
        elif top_n_pos and top_n_neg:
            r_array = np.asarray(r_vals, dtype=float)
            # argpartition works on the NaN-free values, so its positions
            # must be mapped back to positions in the full edge vector.
            valid_idx = np.flatnonzero(~np.isnan(r_array))
            valid_r = r_array[valid_idx]
            pos_mask = np.zeros(pcorr_df.shape[0], dtype=bool)
            neg_mask = np.zeros(pcorr_df.shape[0], dtype=bool)
            n_pos = min(top_n_pos, valid_r.size)
            n_neg = min(top_n_neg, valid_r.size)
            if n_pos > 0:
                largest = np.argpartition(valid_r, -n_pos)[-n_pos:]
                pos_mask[valid_idx[largest]] = True
            if n_neg > 0:
                smallest = np.argpartition(valid_r, n_neg - 1)[:n_neg]
                neg_mask[valid_idx[smallest]] = True
        else:
            raise TypeError(
                'Either p_thresh_{neg, pos} or r_thresh_{neg, pos} or percentile_{neg, pos} or top_n_{pos, neg} needs to be defined.'
            )

        printv(
            "Fold {}: Pos/neg suprathreshold edges (max r pos/max r neg): {}/{} ({}/{})"
            .format(fold + 1, pos_mask.sum(), neg_mask.sum(), r_vals.max(),
                    r_vals.min()))
        all_masks['pos'][fold, :] = pos_mask.astype(bool)
        all_masks['neg'][fold, :] = neg_mask.astype(bool)
    return all_masks


def start_autosave(path, ray_handler, save_size=1000):
    """Create an ``AutoSaveActor`` wrapped by ``force_on_current_node``.

    Args:
        path, ray_handler, save_size: Passed unchanged to
            ``AutoSaveActor.remote``.

    Returns:
        The handle returned by ``.remote(...)``.
    """
    from ray.util.ml_utils.node import force_on_current_node

    # Use a new local name: assigning to ``AutoSaveActor`` here would make it
    # a local variable and reading it on the same line would raise
    # UnboundLocalError.
    local_actor_cls = force_on_current_node(AutoSaveActor)
    autosave_actor = local_actor_cls.remote(path, ray_handler, save_size)
    return autosave_actor
def plot_consistent_edges_loo(r_mat,
                              thresh=0.13,
                              tail='pos',
                              consistency=0.8,
                              coords=None,
                              save=False,
                              fname='consistent_edges.svg',
                              **plot_connectome_kwargs):
    """Plot edges obtained in a leave-one out CPM above a defined threshold
    that are selected in at least a defined percentage of subjects.

    Args:
        r_mat: Array of r matrices; its last axis is moved to the front and
            iterated over as the per-subject axis. The caller's array is not
            modified.
        thresh: Edges count as suprathreshold when ``r > thresh`` (positive)
            or ``r < -thresh`` (negative).
        tail: ``'pos'`` marks consistent positive edges with 1, ``'neg'``
            marks consistent negative edges with -1, ``'combined'`` marks
            edges with ``abs(r) > thresh`` with 1; any other value marks
            positive edges with 1 and negative edges with -1.
        consistency: An edge is kept when it is suprathreshold in more than
            ``consistency * n_matrices`` matrices.
        coords: Passed to ``plotting.plot_connectome`` as ``node_coords``.
        save: If true, the figure is written with ``plt.savefig(fname)``.
        fname: Output file name used when ``save`` is true.
        **plot_connectome_kwargs: Forwarded to ``plotting.plot_connectome``.
    """
    r_mat = np.moveaxis(r_mat, -1, 0)
    # NaNs must be zeroed, otherwise symmetrizing won't work. np.where
    # builds a new array, so the caller's data stays untouched (moveaxis
    # only returns a view).
    r_mat = np.where(np.isnan(r_mat), 0, r_mat)
    if not is_symmetric(r_mat[0]):
        r_mat = symmetrize_matrices(r_mat, mirror_lower=True)

    n_matrices = r_mat.shape[0]
    r_mat_flat = r_mat.reshape(n_matrices, r_mat.shape[1] * r_mat.shape[2])
    min_count = n_matrices * consistency

    def _is_consistent(suprathr):
        """Flag edges that are suprathreshold in more than min_count rows."""
        return suprathr.sum(axis=0) > min_count

    edges_mask = np.zeros(r_mat_flat.shape[1])
    if tail == 'pos':
        edges_mask[_is_consistent(r_mat_flat > thresh)] = 1
    elif tail == 'neg':
        edges_mask[_is_consistent(r_mat_flat < -thresh)] = -1
    elif tail == 'combined':
        edges_mask[_is_consistent(np.abs(r_mat_flat) > thresh)] = 1
    else:  # aka tail='both'
        edges_mask[_is_consistent(r_mat_flat > thresh)] = 1
        edges_mask[_is_consistent(r_mat_flat < -thresh)] = -1

    nodes_mask = edges_mask.reshape((r_mat.shape[1], r_mat.shape[2]))
    printv("There are {} suprathreshold (> {}) edges in {} % of the subjects".
           format(np.count_nonzero(edges_mask) / 2, thresh,
                  consistency * 100))

    # Degree of every node, taken from the actual matrix size instead of a
    # fixed node count
    degrees = np.abs(nodes_mask).sum(axis=1)
    plotting.plot_connectome(nodes_mask,
                             node_coords=coords,
                             display_mode='lzry',
                             node_size=[degree * 20 for degree in degrees],
                             edge_kwargs={"linewidth": 2},
                             **plot_connectome_kwargs)
    if save:
        plt.savefig(fname)