def get_df_stats(scores_df, analyspar):
    """
    get_df_stats(scores_df, analyspar)

    Returns statistics (mean/median and error) for each data column.

    Required args:
        - scores_df (pd.DataFrame):
            dataframe where each column contains data for which statistics
            should be measured
        - analyspar (AnalysPar):
            named tuple containing analysis parameters

    Returns:
        - stats_df (pd.DataFrame):
            dataframe with only one data row containing data stats for each
            original column under "{col}_stat" and "{col}_err"
    """

    # take statistics
    stats_df = pd.DataFrame()
    for col in scores_df.columns:
        # get stats
        stat = math_util.mean_med(
            scores_df[col].to_numpy(), stats=analyspar.stats, nanpol="omit"
            )
        err = math_util.error_stat(
            scores_df[col].to_numpy(), stats=analyspar.stats,
            error=analyspar.error, nanpol="omit"
            )

        if isinstance(err, np.ndarray):
            err = err.tolist()
            stats_df = gen_util.set_object_columns(
                stats_df, [f"{col}_err"], in_place=True
                )

        stats_df.loc[0, f"{col}_stat"] = stat
        stats_df.at[0, f"{col}_err"] = err

    return stats_df
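# Illustrative usage sketch (not part of the original module): a minimal
# call to get_df_stats() on a toy dataframe. The namedtuple stand-in for
# AnalysPar is hypothetical, with only the "stats" and "error" fields that
# get_df_stats() reads; it assumes math_util accepts "mean"/"sem" values.
def _example_get_df_stats():
    from collections import namedtuple

    _AnalysPar = namedtuple("AnalysPar", ["stats", "error"])  # hypothetical
    analyspar = _AnalysPar(stats="mean", error="sem")

    scores_df = pd.DataFrame({
        "balanced_accuracy": [0.61, 0.58, 0.64, np.nan],  # NaNs are omitted
        "test_score"       : [0.55, 0.57, 0.52, 0.59],
    })

    # yields one row, with "{col}_stat" and "{col}_err" columns
    return get_df_stats(scores_df, analyspar)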
def add_CI_p_vals(shuffle_df, stats_data_df, permpar):
    """
    add_CI_p_vals(shuffle_df, stats_data_df, permpar)

    Returns confidence intervals from shuffled data, and p-values for real
    data.

    Required args:
        - shuffle_df (pd.DataFrame):
            dataframe where each row contains data for different data
            shuffles, and each column contains data to use to construct null
            distributions.
        - stats_data_df (pd.DataFrame):
            dataframe with only one data row containing real data stats for
            each shuffle_df column. Columns should have the same names as
            shuffle_df, as "{col}_stat" and "{col}_err".
        - permpar (PermPar):
            named tuple containing permutation parameters

    Returns:
        - stats_df (pd.DataFrame):
            dataframe with only one data row containing real data stats,
            shuffled data stats, and p-values for real data test set results.
    """

    if len(stats_data_df) != 1:
        raise ValueError("Expected stats_data_df to have length 1.")

    multcomp = 1 if not permpar.multcomp else permpar.multcomp
    p_thresh_corr = permpar.p_val / multcomp
    percs = math_util.get_percentiles(
        CI=(1 - p_thresh_corr), tails=permpar.tails
        )[0]
    percs = [percs[0], 50, percs[1]]

    stats_df = pd.DataFrame()
    for col in shuffle_df.columns:
        # add real data
        stat_key = f"{col}_stat"
        err_key = f"{col}_err"
        if (stat_key not in stats_data_df.columns or
            err_key not in stats_data_df.columns):
            raise KeyError(
                f"{stat_key} and {err_key} not found in stats_data_df."
                )
        stats_df[stat_key] = stats_data_df[stat_key]
        stats_df[err_key] = stats_data_df[err_key]

        # get and add null CI data
        shuffle_data = shuffle_df[col].to_numpy()
        shuffle_data = shuffle_data[~np.isnan(shuffle_data)]  # remove NaN data

        rand_util.check_n_rand(len(shuffle_data), p_thresh_corr)
        null_CI = [np.percentile(shuffle_data, p) for p in percs]

        null_key = f"{col}_null_CIs"
        stats_df = gen_util.set_object_columns(
            stats_df, [null_key], in_place=True
            )
        stats_df.at[0, null_key] = null_CI

        # get and add p-value
        if "test" in col:
            perc = scist.percentileofscore(
                shuffle_data, stats_data_df.loc[0, stat_key], kind="mean"
                )
            if perc > 50:
                perc = 100 - perc
            p_val = perc / 100
            stats_df.loc[0, f"{col}_p_vals"] = p_val

    return stats_df
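# Illustrative sketch of the p-value logic above, using scipy directly: the
# real statistic's percentile within the null distribution is folded around
# 50 (two-tailed), then converted to a proportion. Toy numbers only.
def _example_percentile_p_val():
    rng = np.random.RandomState(0)
    null_dist = rng.normal(0, 1, 10000)  # stand-in for shuffled statistics
    real_stat = 1.8

    perc = scist.percentileofscore(null_dist, real_stat, kind="mean")
    if perc > 50:  # fold the upper tail onto the lower tail
        perc = 100 - perc
    return perc / 100  # tail proportion, as in add_CI_p_vals()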
def add_relative_resp_data(resp_data_df, analyspar, rel_sess=1,
                           in_place=False):
    """
    add_relative_resp_data(resp_data_df, analyspar)

    Adds relative response data to input dataframe for any column with "exp"
    in the name, optionally in place.

    Required args:
        - resp_data_df (pd.DataFrame):
            dataframe with one row per session, and response stats
            (2D array, ROI x stats) under keys for expected ("exp") and
            unexpected ("unexp") data, separated by Gabor frame
            (e.g., "exp_3", "unexp_G") if applicable.
        - analyspar (AnalysPar):
            named tuple containing analysis parameters

    Optional args:
        - rel_sess (int):
            number of session relative to which data should be scaled, for
            each mouse
            default: 1
        - in_place (bool):
            if True, dataframe is modified in place
            default: False

    Returns:
        - resp_data_df (pd.DataFrame):
            input dataframe, with "rel_{}" columns added for each input
            column with "exp" in its name
    """

    if not in_place:
        resp_data_df = resp_data_df.copy(deep=True)

    nanpol = None if analyspar.rem_bad else "omit"

    source_columns = [col for col in resp_data_df.columns if "exp" in col]
    rel_columns = [f"rel_{col}" for col in source_columns]
    resp_data_df = gen_util.set_object_columns(
        resp_data_df, rel_columns, in_place=True
        )

    # calculate relative value for each mouse
    for mouse_n, resp_mouse_df in resp_data_df.groupby("mouse_ns"):
        resp_data_df = resp_data_df.sort_values("sess_ns")

        # find the reference session, and check that there is only 1
        rel_sess_idx = resp_mouse_df.loc[
            resp_mouse_df["sess_ns"] == rel_sess
            ].index
        mouse_n_idxs = resp_mouse_df.index
        if len(rel_sess_idx) != 1:
            raise RuntimeError(
                f"Expected to find session {rel_sess} data for each mouse, "
                f"but it is missing for mouse {mouse_n}."
                )

        mouse_row = resp_mouse_df.loc[rel_sess_idx[0]]
        for source_col in source_columns:
            rel_col = source_col.replace("unexp", "exp")
            rel_data = math_util.mean_med(
                mouse_row[rel_col], analyspar.stats, nanpol=nanpol
                )
            for mouse_n_idx in mouse_n_idxs:
                resp_data_df.at[mouse_n_idx, f"rel_{source_col}"] = \
                    resp_data_df.loc[mouse_n_idx, source_col] / rel_data

    return resp_data_df
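# Illustrative sketch of the relative scaling above: each mouse's response
# data is divided by the mean expected ("exp") response from its reference
# session (rel_sess), so a value of 1 means "equal to the reference". Toy
# numbers only.
def _example_relative_scaling():
    ref_exp_responses = np.asarray([0.8, 1.2, 1.0])  # ROI stats, ref session
    later_responses = np.asarray([1.6, 2.4, 2.0])    # same ROIs, later session

    rel_data = np.mean(ref_exp_responses)  # reference value (here, 1.0)
    return later_responses / rel_data      # -> [1.6, 2.4, 2.0]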
def get_resp_df(sessions, analyspar, stimpar, rel_sess=1, parallel=False):
    """
    get_resp_df(sessions, analyspar, stimpar)

    Returns relative response dataframe for requested sessions.

    Required args:
        - sessions (list):
            session objects
        - analyspar (AnalysPar):
            named tuple containing analysis parameters
        - stimpar (StimPar):
            named tuple containing stimulus parameters

    Optional args:
        - rel_sess (int):
            number of session relative to which data should be scaled, for
            each mouse. If None, relative data is not added.
            default: 1
        - parallel (bool):
            if True, some of the analysis is run in parallel across CPU cores
            default: False

    Returns:
        - resp_data_df (pd.DataFrame):
            dataframe with response stats (2D array, ROI x stats) under keys
            for expected ("exp") and unexpected ("unexp") data, separated by
            Gabor frame (e.g., "exp_3", "unexp_G") if
            stimpar.stimtype == "gabors", and with "rel_{}" columns added
            for each input column with "exp" in its name if rel_sess is not
            None.
    """

    if analyspar.tracked:
        misc_analys.check_sessions_complete(sessions, raise_err=True)

    sessids = [sess.sessid for sess in sessions]
    resp_data_df = misc_analys.get_check_sess_df(sessions, analyspar=analyspar)

    # double check that sessions are in correct order
    if resp_data_df["sessids"].tolist() != sessids:
        raise NotImplementedError(
            "Implementation error. Sessions must appear in correct order in "
            "resp_data_df."
            )

    logger.info("Loading data for each session...", extra={"spacing": TAB})
    data_dicts = gen_util.parallel_wrap(
        get_sess_integ_resp_dict, sessions, args_list=[analyspar, stimpar],
        parallel=parallel
        )

    # add data to df
    misc_analys.get_check_sess_df(sessions, resp_data_df)
    for i, idx in enumerate(resp_data_df.index):
        for key, value in data_dicts[i].items():
            if i == 0:
                resp_data_df = gen_util.set_object_columns(
                    resp_data_df, [key], in_place=True
                    )
            resp_data_df.at[idx, key] = value[:, 0]  # retain stat only, not error

    # add relative data
    if rel_sess is not None:
        resp_data_df = add_relative_resp_data(
            resp_data_df, analyspar, rel_sess=rel_sess, in_place=True
            )

    return resp_data_df
def corr_scatterplots(sessions, analyspar, stimpar, basepar, idxpar, permpar,
                      permute="sess", sig_only=False, randst=None,
                      n_bins=200, parallel=False):
    """
    corr_scatterplots(sessions, analyspar, stimpar, basepar, idxpar, permpar)

    Returns ROI index correlation scatterplot data for each
    line/plane/session comparison.

    Required args:
        - sessions (list):
            Session objects
        - analyspar (AnalysPar):
            named tuple containing analysis parameters
        - stimpar (StimPar):
            named tuple containing stimulus parameters
        - basepar (BasePar):
            named tuple containing baseline parameters
        - idxpar (IdxPar):
            named tuple containing index parameters
        - permpar (PermPar):
            named tuple containing permutation parameters

    Optional args:
        - permute (str):
            type of permutation to use ("tracking", "sess" or "all")
            default: "sess"
        - sig_only (bool):
            if True, only ROIs with significant USIs are included
            (only possible if analyspar.tracked is True)
            default: False
        - randst (int or np.random.RandomState):
            seed value to use. (-1 treated as None)
            default: None
        - n_bins (int):
            number of bins for random data
            default: 200
        - parallel (bool):
            if True, some of the analysis is run in parallel across CPU cores
            default: False

    Returns:
        - idx_corr_df (pd.DataFrame):
            dataframe with one row per line/plane, and the following columns,
            in addition to the basic sess_df columns:

            for correlation data (normalized if corr_type is "diff_corr")
            for session comparisons (x, y), e.g. 1v2
            - binned_rand_stats (list): number of random correlation values
                per bin (xs x ys)
            - corr_data_xs (list): USI values for x
            - corr_data_ys (list): USI values for y
            - corrs (float): correlation between session data (x and y)
            - p_vals (float): p-value for correlation, corrected for
                multiple comparisons and tails
            - rand_corr_meds (float): median of the random correlations
            - raw_p_vals (float): p-value for intersession correlations
            - regr_coefs (float): regression correlation coefficient (slope)
            - regr_intercepts (float): regression correlation intercept
            - x_bin_mids (list): x mid point for each random correlation bin
            - y_bin_mids (list): y mid point for each random correlation bin
    """

    lp_idx_df = get_lp_idx_df(
        sessions,
        analyspar=analyspar,
        stimpar=stimpar,
        basepar=basepar,
        idxpar=idxpar,
        permpar=permpar,
        sig_only=sig_only,
        randst=randst,
        parallel=parallel,
        )

    idx_corr_df = get_basic_idx_corr_df(
        lp_idx_df, consec_only=False, null_CI_cols=False
        )

    # get correlation pairs
    corr_ns = get_corr_pairs(lp_idx_df)
    if len(corr_ns) != 1:
        raise ValueError("Expected only 1 session correlation pair.")
    sess_pair = corr_ns[0]

    # get norm information
    norm = False
    corr_type = "corr"
    if permute in ["sess", "all"]:
        corr_type = "diff_corr"
        norm = True

    # add array columns
    columns = ["corr_data_xs", "corr_data_ys", "binned_rand_stats",
               "x_bin_mids", "y_bin_mids"]
    idx_corr_df = gen_util.set_object_columns(idx_corr_df, columns)

    logger.info(
        "Calculating ROI USI correlations across sessions...",
        extra={"spacing": TAB}
        )

    group_columns = ["lines", "planes"]
    for grp_vals, grp_df in lp_idx_df.groupby(group_columns):
        grp_df = grp_df.sort_values("sess_ns")  # mice already aggregated
        line, plane = grp_vals

        row_idx = idx_corr_df.loc[
            (idx_corr_df["lines"] == line) &
            (idx_corr_df["planes"] == plane)
            ].index
        if len(row_idx) != 1:
            raise RuntimeError("Expected exactly one row to match.")
        row_idx = row_idx[0]

        if len(grp_df) > 2:
            raise RuntimeError("Expected no more than 2 rows to correlate.")
        if len(grp_df) < 2:  # no pair
            continue

        use_randst = copy.deepcopy(randst)  # reset each time

        # obtain correlation data
        args_dict = {
            "data_df"    : grp_df,
            "analyspar"  : analyspar,
            "permpar"    : permpar,
            "permute"    : permute,
            "corr_type"  : corr_type,
            "absolute"   : False,
            "norm"       : norm,
            "randst"     : use_randst,
            "return_data": True,
            "return_rand": True,
            "n_rand_ex"  : 1000,
        }

        all_corr_data = get_corr_data(sess_pair, **args_dict)
        [roi_corr, _, null_CI, p_val, corr_data, _, rand_exs, _] = \
            all_corr_data

        regr = LinearRegression().fit(
            corr_data[0].reshape(-1, 1), corr_data[1]
            )

        # bin data
        rand_stats, x_edge, y_edge = np.histogram2d(
            rand_exs[0].reshape(-1), rand_exs[1].reshape(-1),
            bins=n_bins, density=False
            )
        x_mids = np.diff(x_edge) / 2 + x_edge[:-1]
        y_mids = np.diff(y_edge) / 2 + y_edge[:-1]
        rand_binned = scind.gaussian_filter(
            rand_stats, n_bins / 20, mode="constant"
            )

        idx_corr_df.loc[row_idx, "corrs"] = roi_corr
        idx_corr_df.loc[row_idx, "rand_corr_meds"] = null_CI[1]
        idx_corr_df.loc[row_idx, "regr_coefs"] = regr.coef_
        idx_corr_df.loc[row_idx, "regr_intercepts"] = regr.intercept_

        idx_corr_df.at[row_idx, "corr_data_xs"] = corr_data[0].tolist()
        idx_corr_df.at[row_idx, "corr_data_ys"] = corr_data[1].tolist()
        idx_corr_df.at[row_idx, "binned_rand_stats"] = rand_binned.tolist()
        idx_corr_df.at[row_idx, "x_bin_mids"] = x_mids.tolist()
        idx_corr_df.at[row_idx, "y_bin_mids"] = y_mids.tolist()

        idx_corr_df.loc[row_idx, "p_vals"] = p_val

    # add corrected p-values
    idx_corr_df = misc_analys.add_corr_p_vals(idx_corr_df, permpar)

    return idx_corr_df
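# Illustrative sketch of the binning step in corr_scatterplots(): random
# point clouds are binned with np.histogram2d, bin mid points are recovered
# from the edges, and counts are Gaussian-smoothed for plotting. The random
# normal data is a stand-in for the randomized USI pairs.
def _example_bin_random_points(n_bins=200):
    rng = np.random.RandomState(0)
    xs, ys = rng.normal(size=(2, 50000))  # stand-in random USI pairs

    counts, x_edge, y_edge = np.histogram2d(xs, ys, bins=n_bins, density=False)
    x_mids = np.diff(x_edge) / 2 + x_edge[:-1]  # mid point of each x bin
    y_mids = np.diff(y_edge) / 2 + y_edge[:-1]  # mid point of each y bin
    smoothed = scind.gaussian_filter(counts, n_bins / 20, mode="constant")
    return smoothed, x_mids, y_mids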
def get_ex_traces_df(sessions, analyspar, stimpar, basepar, n_ex=6,
                     rolling_win=4, randst=None, parallel=False):
    """
    get_ex_traces_df(sessions, analyspar, stimpar, basepar)

    Returns example ROI traces dataframe.

    Required args:
        - sessions (list):
            Session objects
        - analyspar (AnalysPar):
            named tuple containing analysis parameters
        - stimpar (StimPar):
            named tuple containing stimulus parameters
        - basepar (BasePar):
            named tuple containing baseline parameters

    Optional args:
        - n_ex (int):
            number of example traces to retain
            default: 6
        - rolling_win (int):
            window to use in rolling mean over individual trial traces
            default: 4
        - randst (int or np.random.RandomState):
            random state or seed value to use. (-1 treated as None)
            default: None
        - parallel (bool):
            if True, some of the analysis is run in parallel across CPU cores
            default: False

    Returns:
        - selected_roi_data (pd.DataFrame):
            dataframe with a row for each ROI, and the following columns,
            in addition to the basic sess_df columns:
            - time_values (list): values for each frame, in seconds
                (only 0 to stimpar.post, unless split is "by_exp")
            - roi_ns (list): selected ROI number
            - traces_sm (list): selected ROI sequence traces, smoothed, with
                dims: seq x frames
            - trace_stats (list): selected ROI trace mean or median
    """

    retained_traces_df = misc_analys.get_check_sess_df(
        sessions, None, analyspar
        )
    initial_columns = retained_traces_df.columns

    logger.info(
        "Identifying example ROIs for each session...",
        extra={"spacing": TAB}
        )
    retained_roi_data = gen_util.parallel_wrap(
        get_sess_ex_traces, sessions,
        [analyspar, stimpar, basepar, rolling_win], parallel=parallel
        )

    randst = rand_util.get_np_rand_state(randst, set_none=True)

    # add data to dataframe
    new_columns = list(retained_roi_data[0])
    retained_traces_df = gen_util.set_object_columns(
        retained_traces_df, new_columns, in_place=True
        )
    for i, sess in enumerate(sessions):
        row_idx = retained_traces_df.loc[
            retained_traces_df["sessids"] == sess.sessid
            ].index
        if len(row_idx) != 1:
            raise RuntimeError(
                "Expected exactly one dataframe row to match session ID."
                )
        row_idx = row_idx[0]
        for column, value in retained_roi_data[i].items():
            retained_traces_df.at[row_idx, column] = value

    # select a few ROIs per line/plane/session
    columns = retained_traces_df.columns.tolist()
    columns = [column.replace("roi_trace", "trace") for column in columns]
    selected_traces_df = pd.DataFrame(columns=columns)

    group_columns = ["lines", "planes", "sess_ns"]
    for _, trace_grp_df in retained_traces_df.groupby(group_columns):
        trace_grp_df = trace_grp_df.sort_values("mouse_ns")
        grp_indices = trace_grp_df.index
        n_per = np.asarray([len(roi_ns) for roi_ns in trace_grp_df["roi_ns"]])
        roi_ns = np.concatenate(trace_grp_df["roi_ns"].tolist())
        concat_idxs = np.sort(randst.choice(len(roi_ns), n_ex, replace=False))

        for concat_idx in concat_idxs:
            row_idx = len(selected_traces_df)
            sess_idx = np.where(concat_idx < np.cumsum(n_per))[0][0]
            source_row = trace_grp_df.loc[grp_indices[sess_idx]]
            for column in initial_columns:
                selected_traces_df.at[row_idx, column] = source_row[column]
            selected_traces_df.at[row_idx, "time_values"] = \
                source_row["time_values"].tolist()

            roi_idx = concat_idx - n_per[: sess_idx].sum()
            for col in ["roi_ns", "traces_sm", "trace_stats"]:
                source_col = col.replace("trace", "roi_trace")
                selected_traces_df.at[row_idx, col] = \
                    source_row[source_col][roi_idx].tolist()

    for column in [
        "mouse_ns", "mouseids", "sess_ns", "sessids", "nrois", "roi_ns"
        ]:
        selected_traces_df[column] = selected_traces_df[column].astype(int)

    return selected_traces_df
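# Illustrative sketch of the index arithmetic in get_ex_traces_df(): ROI
# lists from several sessions are concatenated, indices are drawn into the
# concatenated array, and np.cumsum() maps each index back to its source
# session and within-session position. Toy numbers only.
def _example_concat_index_lookup():
    n_per = np.asarray([3, 5, 2])  # ROIs retained per session
    concat_idx = 6                 # index into the concatenated ROI list

    sess_idx = np.where(concat_idx < np.cumsum(n_per))[0][0]  # -> session 1
    within_idx = concat_idx - n_per[: sess_idx].sum()         # -> ROI 3
    return sess_idx, within_idx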
def get_ex_idx_corr_norm_df(sessions, analyspar, stimpar, basepar, idxpar,
                            permpar, permute="sess", sig_only=False,
                            n_bins=40, randst=None, parallel=False):
    """
    get_ex_idx_corr_norm_df(sessions, analyspar, stimpar, basepar, idxpar,
                            permpar)

    Returns example correlation normalization data.

    Required args:
        - sessions (list):
            Session objects
        - analyspar (AnalysPar):
            named tuple containing analysis parameters
        - stimpar (StimPar):
            named tuple containing stimulus parameters
        - basepar (BasePar):
            named tuple containing baseline parameters
        - idxpar (IdxPar):
            named tuple containing index parameters
        - permpar (PermPar):
            named tuple containing permutation parameters

    Optional args:
        - permute (str):
            type of permutation to use ("tracking", "sess" or "all")
            default: "sess"
        - sig_only (bool):
            if True, only ROIs with significant USIs are included
            (only possible if analyspar.tracked is True)
            default: False
        - n_bins (int):
            number of bins
            default: 40
        - randst (int or np.random.RandomState):
            seed value to use. (-1 treated as None)
            default: None
        - parallel (bool):
            if True, some of the analysis is run in parallel across CPU cores
            default: False

    Returns:
        - idx_corr_norm_df (pd.DataFrame):
            dataframe with one row for a line/plane, and the following
            columns, in addition to the basic sess_df columns:

            for a specific session comparison, e.g. 1v2
            - {}v{}_corrs (float): unnormalized intersession ROI index
                correlations
            - {}v{}_norm_corrs (float): normalized intersession ROI index
                correlations
            - {}v{}_rand_ex_corrs (float): unnormalized intersession ROI
                index correlations for an example of randomized data
            - {}v{}_rand_corr_meds (float): median of randomized correlations
            - {}v{}_corr_data (list): intersession values to correlate
            - {}v{}_rand_ex (list): intersession values for an example of
                randomized data
            - {}v{}_rand_corrs_binned (list): binned random unnormalized
                intersession ROI index correlations
            - {}v{}_rand_corrs_bin_edges (list): bin edges
    """

    nanpol = None if analyspar.rem_bad else "omit"

    initial_columns = misc_analys.get_sess_df_columns(sessions[0], analyspar)

    lp_idx_df = get_lp_idx_df(
        sessions,
        analyspar=analyspar,
        stimpar=stimpar,
        basepar=basepar,
        idxpar=idxpar,
        permpar=permpar,
        sig_only=sig_only,
        randst=randst,
        parallel=parallel,
        )

    idx_corr_norm_df = get_basic_idx_corr_df(lp_idx_df, consec_only=False)
    if len(idx_corr_norm_df) != 1:
        raise ValueError("Sessions should be from the same line/plane.")

    # get correlation pairs
    corr_ns = get_corr_pairs(lp_idx_df)
    if len(corr_ns) != 1:
        raise ValueError("Sessions should allow only one pair.")
    sess_pair = corr_ns[0]
    corr_name = f"{sess_pair[0]}v{sess_pair[1]}"

    drop_columns = [
        col for col in idx_corr_norm_df.columns if col not in initial_columns
        ]
    idx_corr_norm_df = idx_corr_norm_df.drop(columns=drop_columns)

    logger.info(
        "Calculating ROI USI correlations for a single session pair...",
        extra={"spacing": TAB}
        )

    corr_type = "diff_corr"
    returns = get_corr_data(
        sess_pair,
        data_df=lp_idx_df,
        analyspar=analyspar,
        permpar=permpar,
        permute=permute,
        corr_type=corr_type,
        absolute=False,
        norm=False,
        return_data=True,
        return_rand=True,
        n_rand_ex=1,
        randst=randst
        )

    roi_corr, _, _, _, corr_data, rand_corrs, rand_exs, rand_ex_corrs = \
        returns
    rand_ex = rand_exs[..., 0]
    rand_ex_corr = rand_ex_corrs[0]

    rand_corr_med = math_util.mean_med(
        rand_corrs, stats="median", nanpol=nanpol
        )
    norm_roi_corr = float(
        get_norm_corrs(roi_corr, med=rand_corr_med, corr_type=corr_type)
        )

    row_idx = idx_corr_norm_df.index[0]

    idx_corr_norm_df.loc[row_idx, f"{corr_name}_corrs"] = roi_corr
    idx_corr_norm_df.loc[row_idx, f"{corr_name}_rand_ex_corrs"] = rand_ex_corr
    idx_corr_norm_df.loc[row_idx, f"{corr_name}_rand_corr_meds"] = \
        rand_corr_med
    idx_corr_norm_df.loc[row_idx, f"{corr_name}_norm_corrs"] = norm_roi_corr

    cols = [
        f"{corr_name}_{col_name}"
        for col_name in
        ["corr_data", "rand_ex", "rand_corrs_binned", "rand_corrs_bin_edges"]
        ]
    idx_corr_norm_df = gen_util.set_object_columns(
        idx_corr_norm_df, cols, in_place=True
        )

    idx_corr_norm_df.at[row_idx, f"{corr_name}_corr_data"] = corr_data.tolist()
    idx_corr_norm_df.at[row_idx, f"{corr_name}_rand_ex"] = rand_ex.tolist()

    fcts = [np.min, np.max] if nanpol is None else [np.nanmin, np.nanmax]
    bounds = [fct(rand_corrs) for fct in fcts]
    bins = np.linspace(*bounds, n_bins + 1)
    rand_corrs_binned = np.histogram(rand_corrs, bins=bins)[0]

    idx_corr_norm_df.at[row_idx, f"{corr_name}_rand_corrs_bin_edges"] = \
        [bounds[0], bounds[-1]]
    idx_corr_norm_df.at[row_idx, f"{corr_name}_rand_corrs_binned"] = \
        rand_corrs_binned.tolist()

    return idx_corr_norm_df
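# Illustrative sketch of the final binning in get_ex_idx_corr_norm_df():
# random correlations are histogrammed into n_bins equal-width bins between
# their min and max, and only the two outer bounds are stored as bin edges.
def _example_bin_rand_corrs(n_bins=40):
    rng = np.random.RandomState(0)
    rand_corrs = rng.uniform(-0.4, 0.4, 5000)  # stand-in null correlations

    bounds = [np.min(rand_corrs), np.max(rand_corrs)]
    bins = np.linspace(*bounds, n_bins + 1)  # n_bins + 1 edges
    counts = np.histogram(rand_corrs, bins=bins)[0]
    return counts, bounds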
def get_roi_tracking_ex_df(sessions, analyspar, parallel=False):
    """
    get_roi_tracking_ex_df(sessions, analyspar)

    Returns ROI tracking example information for the requested sessions,
    showing the different ROI matches identified depending on the order in
    which the sessions are matched. Only sessions from certain mice have the
    requisite data stored in their nway-match files.

    Required args:
        - sessions (list):
            Session objects
        - analyspar (AnalysPar):
            named tuple containing analysis parameters

    Optional args:
        - parallel (bool):
            if True, some of the analysis is run in parallel across CPU cores
            default: False

    Returns:
        - roi_mask_df (pd.DataFrame):
            dataframe with a row for each mouse, and the following columns,
            in addition to the basic sess_df columns:
            - "roi_mask_shapes" (list): shape into which ROI mask indices
                index (sess x hei x wid)
            - "union_n_conflicts" (int): number of conflicts after union

            for "union", "fewest" and "most" tracked ROIs:
            - "{}_registered_roi_mask_idxs" (list): list of mask indices,
                registered across sessions, for each session (flattened
                across ROIs) ((sess, hei, wid) x val), ordered by
                {}_sess_ns if "fewest" or "most"
            - "{}_n_tracked" (int): number of tracked ROIs

            for "fewest", "most" tracked ROIs:
            - "{}_sess_ns" (list): ordered session numbers
    """

    perm_types = ["fewest", "most"]
    add_cols = ["union_n_conflicts"]
    for perm_type in perm_types:
        add_cols.append(f"{perm_type}_registered_roi_mask_idxs")
        add_cols.append(f"{perm_type}_n_tracked")
        add_cols.append(f"{perm_type}_sess_ns")

    # collect ROI mask information
    roi_mask_df = get_roi_tracking_df(
        sessions, analyspar, reg_only=True, parallel=parallel
        )
    roi_mask_df = gen_util.set_object_columns(
        roi_mask_df, add_cols, in_place=True
        )
    roi_mask_df = roi_mask_df.rename(
        columns={"registered_roi_mask_idxs": "union_registered_roi_mask_idxs"}
        )

    all_sessids = [sess.sessid for sess in sessions]
    for row_idx in roi_mask_df.index:
        sess_ns = roi_mask_df.loc[row_idx, "sess_ns"]
        sessids = roi_mask_df.loc[row_idx, "sessids"]
        mouse_sessions = [
            sessions[all_sessids.index(sessid)] for sessid in sessids
            ]

        masks, ordered_sess_ns, n_tracked = \
            collect_roi_tracking_example_data(mouse_sessions)

        roi_mask_df.loc[row_idx, "union_n_tracked"] = n_tracked["union"]
        roi_mask_df.loc[row_idx, "union_n_conflicts"] = n_tracked["conflict"]

        for perm_type in perm_types:
            if set(ordered_sess_ns[perm_type]) != set(sess_ns):
                raise RuntimeError("Session numbers do not match.")
            roi_mask_df.at[
                row_idx, f"{perm_type}_registered_roi_mask_idxs"
                ] = [idxs.tolist() for idxs in np.where(masks[perm_type])]
            roi_mask_df.at[row_idx, f"{perm_type}_sess_ns"] = \
                ordered_sess_ns[perm_type]
            roi_mask_df.loc[row_idx, f"{perm_type}_n_tracked"] = \
                n_tracked[perm_type]

    int_cols = [col for col in roi_mask_df.columns if "_n_" in col]
    for col in int_cols:
        roi_mask_df[col] = roi_mask_df[col].astype(int)

    return roi_mask_df
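# Illustrative sketch of the mask storage format in get_roi_tracking_ex_df():
# boolean ROI masks (sess x hei x wid) are stored as np.where() index lists,
# and can be reconstructed from those lists and the stored mask shape.
def _example_mask_index_round_trip():
    masks = np.zeros((2, 4, 4), dtype=bool)  # sess x hei x wid
    masks[0, 1, 2] = masks[1, 3, 0] = True

    idxs = [axis_idxs.tolist() for axis_idxs in np.where(masks)]  # stored form

    restored = np.zeros(masks.shape, dtype=bool)
    restored[tuple(np.asarray(axis_idxs) for axis_idxs in idxs)] = True
    assert (restored == masks).all()
    return idxs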
def add_stim_roi_stats(stim_stats_df, sessions, analyspar, stimpar, permpar,
                       comp_sess=[1, 3], in_place=False, randst=None):
    """
    add_stim_roi_stats(stim_stats_df, sessions, analyspar, stimpar, permpar)

    Adds to dataframe comparison of absolute fractional data changes
    between sessions for different stimuli, calculated for individual ROIs.

    Required args:
        - stim_stats_df (pd.DataFrame):
            dataframe with one row per line/plane, and the basic sess_df
            columns, as well as stimulus columns for each comp_sess:
            - {stimpar.stimtype}_s{comp_sess[0]}:
                first comp_sess data for each ROI
            - {stimpar.stimtype}_s{comp_sess[1]}:
                second comp_sess data for each ROI
        - sessions (list):
            session objects
        - analyspar (AnalysPar):
            named tuple containing analysis parameters
        - stimpar (StimPar):
            named tuple containing stimulus parameters
        - permpar (PermPar):
            named tuple containing permutation parameters

    Optional args:
        - comp_sess (list):
            sessions for which to obtain absolute fractional change
            [x, y] => |(y - x) / x|
            default: [1, 3]
        - in_place (bool):
            if True, stim_stats_df is modified in place. Otherwise, a deep
            copy is modified. stim_stats_df is returned in either case.
            default: False
        - randst (int or np.random.RandomState):
            random state or seed value to use. (-1 treated as None)
            default: None

    Returns:
        - stim_stats_df (pd.DataFrame):
            dataframe with one row per line/plane and one for all line/planes
            together, and the basic sess_df columns, in addition to the input
            columns, and for each stimtype:
            - {stimtype} (list): absolute fractional change statistics
                (me, err)
            - p_vals (float): p-value for data differences between stimulus
                types, corrected for multiple comparisons and tails
    """

    nanpol = None if analyspar.rem_bad else "omit"

    if analyspar.tracked:
        misc_analys.check_sessions_complete(sessions, raise_err=True)
    else:
        raise ValueError(
            "If analysis is run for individual ROIs and not population "
            "statistics, analyspar.tracked must be set to True."
            )

    if not in_place:
        stim_stats_df = stim_stats_df.copy(deep=True)

    stimtypes = gen_util.list_if_not(stimpar.stimtype)
    stim_stats_df = gen_util.set_object_columns(
        stim_stats_df, stimtypes, in_place=True
        )

    # compile all data
    full_data = dict()
    for stimtype in stimpar.stimtype:
        for n in comp_sess:
            stim_col = f"{stimtype}_s{n}"
            full_data[stim_col] = np.concatenate(stim_stats_df[stim_col])

    row_idx = len(stim_stats_df)
    for col in stim_stats_df.columns:
        stim_stats_df.loc[row_idx, col] = "all"
        if col in full_data.keys():
            stim_stats_df.at[row_idx, col] = full_data[col]

    # take statistics
    for row_idx in stim_stats_df.index:
        comp_data = [None, None]
        for s, stimtype in enumerate(stimpar.stimtype):
            stim_data = []
            for n in comp_sess:
                data_col = f"{stimtype}_s{n}"
                stim_data.append(stim_stats_df.loc[row_idx, data_col])
            comp_data[s] = abs_fractional_diff(stim_data)

            # get stats and add to dataframe
            stim_stats_df.at[row_idx, stimtype] = \
                math_util.get_stats(
                    comp_data[s], analyspar.stats, analyspar.error,
                    nanpol=nanpol
                    ).tolist()

        # obtain p-values
        stim_stats_df.loc[row_idx, "p_vals"] = rand_util.get_op_p_val(
            comp_data, permpar.n_perms, stats=analyspar.stats, paired=True,
            nanpol=nanpol, randst=randst
            )

    # remove full data columns
    data_cols = []
    for stimtype in stimpar.stimtype:
        for n in comp_sess:
            data_cols.append(f"{stimtype}_s{n}")
    stim_stats_df = stim_stats_df.drop(data_cols, axis=1)

    return stim_stats_df
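# Illustrative sketch of the absolute fractional change used above,
# following the docstring formula [x, y] => |(y - x) / x|. The module's
# abs_fractional_diff() is assumed to implement this element-wise.
def _example_abs_fractional_diff():
    sess_1 = np.asarray([0.5, 1.0, 2.0])   # comp_sess[0] data
    sess_3 = np.asarray([0.75, 0.8, 3.0])  # comp_sess[1] data

    return np.absolute((sess_3 - sess_1) / sess_1)  # -> [0.5, 0.2, 0.5]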
def add_stim_pop_stats(stim_stats_df, sessions, analyspar, stimpar, permpar,
                       comp_sess=[1, 3], in_place=False, randst=None):
    """
    add_stim_pop_stats(stim_stats_df, sessions, analyspar, stimpar, permpar)

    Adds to dataframe comparison of absolute fractional data changes
    between sessions for different stimuli, calculated for population
    statistics.

    Required args:
        - stim_stats_df (pd.DataFrame):
            dataframe with one row per line/plane, and the basic sess_df
            columns, as well as stimulus columns for each comp_sess:
            - {stimpar.stimtype}_s{comp_sess[0]}:
                first comp_sess data for each ROI
            - {stimpar.stimtype}_s{comp_sess[1]}:
                second comp_sess data for each ROI
        - sessions (list):
            session objects
        - analyspar (AnalysPar):
            named tuple containing analysis parameters
        - stimpar (StimPar):
            named tuple containing stimulus parameters
        - permpar (PermPar):
            named tuple containing permutation parameters

    Optional args:
        - comp_sess (list):
            sessions for which to obtain absolute fractional change
            [x, y] => |(y - x) / x|
            default: [1, 3]
        - in_place (bool):
            if True, stim_stats_df is modified in place. Otherwise, a deep
            copy is modified. stim_stats_df is returned in either case.
            default: False
        - randst (int or np.random.RandomState):
            random state or seed value to use. (-1 treated as None)
            default: None

    Returns:
        - stim_stats_df (pd.DataFrame):
            dataframe with one row per line/plane and one for all line/planes
            together, and the basic sess_df columns, in addition to the input
            columns, and for each stimtype:
            - {stimtype} (list): absolute fractional change statistics
                (me, err)
            - p_vals (float): p-value for data differences between stimulus
                types, corrected for multiple comparisons and tails
    """

    nanpol = None if analyspar.rem_bad else "omit"

    if analyspar.tracked:
        misc_analys.check_sessions_complete(sessions, raise_err=False)

    if not in_place:
        stim_stats_df = stim_stats_df.copy(deep=True)

    stimtypes = gen_util.list_if_not(stimpar.stimtype)
    stim_stats_df = gen_util.set_object_columns(
        stim_stats_df, stimtypes, in_place=True
        )

    if analyspar.stats != "mean" or analyspar.error != "std":
        raise NotImplementedError(
            "For population statistics analysis, analyspar.stats must be "
            "set to 'mean', and analyspar.error must be set to 'std'."
            )

    # initialize arrays for all data
    n_linpla = len(stim_stats_df)
    n_stims = len(stimpar.stimtype)
    n_bootstrp = misc_analys.N_BOOTSTRP

    all_stats = np.full((n_linpla, n_stims), np.nan)
    all_btstrap_stats = np.full((n_linpla, n_stims, n_bootstrp), np.nan)
    all_rand_stat_diffs = np.full((n_linpla, permpar.n_perms), np.nan)

    for i, row_idx in enumerate(stim_stats_df.index):
        full_comp_data = [[], []]
        for s, stimtype in enumerate(stimpar.stimtype):
            comp_data, btstrap_comp_data = [], []
            choices = None
            for n in comp_sess:
                data_col = f"{stimtype}_s{n}"

                # get data
                data = stim_stats_df.loc[row_idx, data_col]

                # get session stats
                comp_data.append(
                    math_util.mean_med(data, analyspar.stats, nanpol=nanpol)
                    )

                # get bootstrapped data
                returns = rand_util.bootstrapped_std(
                    data,
                    randst=randst,
                    n_samples=n_bootstrp,
                    return_rand=True,
                    return_choices=analyspar.tracked,
                    choices=choices,
                    nanpol=nanpol
                    )
                btstrap_data = returns[1]
                if analyspar.tracked:
                    choices = returns[-1]  # use same choices across sessions

                btstrap_comp_data.append(btstrap_data)
                full_comp_data[s].append(data)  # retain full data

            # compute absolute fractional change stats (bootstrapped std)
            all_stats[i, s] = abs_fractional_diff(comp_data)
            all_btstrap_stats[i, s] = abs_fractional_diff(btstrap_comp_data)
            error = np.std(all_btstrap_stats[i, s])

            # add to dataframe
            stim_stats_df.at[row_idx, stimtype] = [all_stats[i, s], error]

        # obtain p-values for real data wrt random data
        stim_stat_diff = all_stats[i, 1] - all_stats[i, 0]

        # permute data for each session across stimtypes
        sess_rand_stats = []  # sess x stim
        for j in range(len(comp_sess)):
            rand_concat = [stim_data[j] for stim_data in full_comp_data]
            rand_concat = np.stack(rand_concat).T
            rand_stats = rand_util.permute_diff_ratio(
                rand_concat,
                div=None,
                n_perms=permpar.n_perms,
                stats=analyspar.stats,
                op="none",
                paired=True,  # pair stimuli
                nanpol=nanpol,
                randst=randst
                )
            sess_rand_stats.append(rand_stats)

        # obtain stats per stimtype, then differences between stimtypes
        stim_rand_stats = list(zip(*sess_rand_stats))  # stim x sess
        all_rand_stats = []
        for rand_stats in stim_rand_stats:
            all_rand_stats.append(abs_fractional_diff(rand_stats))
        all_rand_stat_diffs[i] = all_rand_stats[1] - all_rand_stats[0]

        # calculate p-value
        p_val = rand_util.get_p_val_from_rand(
            stim_stat_diff, all_rand_stat_diffs[i], tails=permpar.tails,
            nanpol=nanpol
            )
        stim_stats_df.loc[row_idx, "p_vals"] = p_val

    # collect stats for all line/planes
    row_idx = len(stim_stats_df)
    for col in stim_stats_df.columns:
        stim_stats_df.loc[row_idx, col] = "all"

    # average across line/planes
    all_data = []
    for data in [all_stats, all_btstrap_stats, all_rand_stat_diffs]:
        all_data.append(
            math_util.mean_med(data, analyspar.stats, nanpol=nanpol, axis=0)
            )
    stat, btstrap_stats, rand_stat_diffs = all_data

    for s, stimtype in enumerate(stimpar.stimtype):
        error = np.std(btstrap_stats[s])
        stim_stats_df.at[row_idx, stimtype] = [stat[s], error]

    p_val = rand_util.get_p_val_from_rand(
        stat[1] - stat[0], rand_stat_diffs, tails=permpar.tails,
        nanpol=nanpol
        )
    stim_stats_df.loc[row_idx, "p_vals"] = p_val

    return stim_stats_df
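# Illustrative sketch of the bootstrapped error used in add_stim_pop_stats():
# the standard deviation of a statistic is estimated by recomputing it over
# resamples drawn with replacement. rand_util.bootstrapped_std() is assumed
# to follow this standard scheme; this sketch uses numpy directly.
def _example_bootstrapped_std(n_samples=1000):
    rng = np.random.RandomState(0)
    data = rng.normal(1.0, 0.5, 200)  # stand-in ROI data

    choices = rng.choice(len(data), (len(data), n_samples), replace=True)
    btstrap_stats = data[choices].mean(axis=0)  # one mean per resample
    return np.std(btstrap_stats)  # bootstrapped std of the mean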
def check_init_stim_data_df(data_df, sessions, stimpar, comp_sess=[1, 3],
                            stim_data_df=None, analyspar=None):
    """
    check_init_stim_data_df(data_df, sessions, stimpar)

    Checks existing stimulus dataframe or creates one for each line/plane.

    Required args:
        - data_df (pd.DataFrame):
            dataframe with one row per session, and the basic sess_df columns
        - sessions (list):
            session objects
        - stimpar (StimPar):
            named tuple containing stimulus parameters

    Optional args:
        - comp_sess (list):
            sessions for which to obtain absolute fractional change
            [x, y] => |(y - x) / x|
            default: [1, 3]
        - stim_data_df (pd.DataFrame):
            dataframe with one row per line/plane, and the basic sess_df
            columns
            default: None
        - analyspar (AnalysPar):
            named tuple containing analysis parameters
            default: None

    Returns:
        - stim_data_df (pd.DataFrame):
            dataframe with one row per line/plane, and the basic sess_df
            columns, as well as stimulus columns for each comp_sess:
            - {stimpar.stimtype}_s{comp_sess[0]}: for first comp_sess data
            - {stimpar.stimtype}_s{comp_sess[1]}: for second comp_sess data
    """

    initial_columns = misc_analys.get_sess_df_columns(sessions[0], analyspar)
    stimtype_cols = [f"{stimpar.stimtype}_s{i}" for i in comp_sess]

    if stim_data_df is None:
        new_df = True
        if analyspar is None:
            raise ValueError(
                "If stim_data_df is None, analyspar must be provided."
                )
        columns = initial_columns + stimtype_cols
        stim_data_df = pd.DataFrame(columns=columns)
    else:
        new_df = False
        if stimpar.stimtype in stim_data_df:
            raise KeyError(
                f"{stimpar.stimtype} should not already be in stim_data_df."
                )
        stim_data_df = gen_util.set_object_columns(
            stim_data_df, stimtype_cols, in_place=True
            )

    group_columns = ["lines", "planes"]
    aggreg_cols = [
        col for col in initial_columns
        if col not in group_columns + ["sess_ns"]
        ]

    # populate dataframe
    for grp_vals, grp_df in data_df.groupby(group_columns):
        grp_df = grp_df.sort_values(["sess_ns", "mouse_ns"])
        line, plane = grp_vals
        if new_df:
            row_idx = len(stim_data_df)
            for g, group_column in enumerate(group_columns):
                stim_data_df.loc[row_idx, group_column] = grp_vals[g]
        else:
            row_idxs = stim_data_df.loc[
                (stim_data_df["lines"] == line) &
                (stim_data_df["planes"] == plane)
                ].index
            if len(row_idxs) != 1:
                raise ValueError(
                    "Expected exactly one row to match line/plane."
                    )
            row_idx = row_idxs[0]

        # add aggregated values for initial columns
        ext_stim_data_df = misc_analys.aggreg_columns(
            grp_df, stim_data_df, aggreg_cols, row_idx=row_idx,
            in_place=new_df
            )

        # check data was added correctly
        if not new_df:
            for col in aggreg_cols:
                if (ext_stim_data_df.loc[row_idx, col] !=
                    stim_data_df.loc[row_idx, col]):
                    raise RuntimeError(
                        "If stim_data_df is not None, it must contain "
                        "columns generated from data_df. This does not "
                        f"appear to be the case, as the values in {col} do "
                        "not match the values that would be added if "
                        "stim_data_df was None."
                        )

    if new_df:
        stim_data_df = ext_stim_data_df

    return stim_data_df