def date_perspective(speed_df, red_light_df, traffic_crash_df):
    """
    Calculate and display the correlation of red-light violations, speed
    violations and total violations against crashes, aggregated by date.

    :param speed_df: dataframe consisting of speed violation data
    :param red_light_df: dataframe consisting of red light violation data
    :param traffic_crash_df: dataframe consisting of traffic crash data
    :return: the three correlation figures (red-light, speed, total)
    """
    date_red_light_frame = red_light_df[['VIOLATION DATE', 'VIOLATIONS']]
    date_speed_frame = speed_df[['VIOLATION DATE', 'VIOLATIONS']]
    date_traffic_crash = traffic_crash_df[["Date"]]
    date_red_light_frame = date_red_light_frame.groupby(
        'VIOLATION DATE', sort=False, as_index=False)['VIOLATIONS'].sum()
    date_speed_frame = date_speed_frame.groupby(
        'VIOLATION DATE', sort=False, as_index=False)['VIOLATIONS'].sum()
    date_traffic_crash['count'] = date_traffic_crash.groupby(
        'Date')['Date'].transform('count')
    date_traffic_crash.rename(columns={'count': 'Crashes'}, inplace=True)
    # Label each aggregated frame with the kind of violation it actually holds.
    date_speed_frame.rename(columns={
        'VIOLATION DATE': 'Date',
        'VIOLATIONS': 'Speed_Limit_Violations'
    }, inplace=True)
    date_red_light_frame.rename(columns={
        'VIOLATION DATE': 'Date',
        'VIOLATIONS': 'Red_Light_Violations'
    }, inplace=True)
    res = pd.merge(date_traffic_crash, date_speed_frame, how='left',
                   on='Date', sort=True).drop_duplicates()
    res = pd.merge(res, date_red_light_frame, how='left', on='Date',
                   sort=True).drop_duplicates()
    # Fill missing violation counts before summing so dates with only one
    # violation type still get a sensible total.
    res.fillna(0, inplace=True)
    res['Total_Violations'] = res['Red_Light_Violations'] + res[
        'Speed_Limit_Violations']
    cor1 = pg.corr(x=res['Red_Light_Violations'], y=res['Crashes'])
    cor2 = pg.corr(x=res['Speed_Limit_Violations'], y=res['Crashes'])
    cor3 = pg.corr(x=res['Total_Violations'], y=res['Crashes'])
    f1 = visualize(res["Red_Light_Violations"], res["Crashes"],
                   "Red Light Violations", "Crashes", cor1, 'blue')
    f2 = visualize(res["Speed_Limit_Violations"], res["Crashes"],
                   "Speed-limit Violations", "Crashes", cor2, 'blue')
    f3 = visualize(res["Total_Violations"], res["Crashes"],
                   "Total Violations", "Crashes", cor3, 'blue')
    return f1, f2, f3
def create_correlation_to_target_value(df_corr,
                                       feature_column_list,
                                       target_column='react_per_100_audience'):
    """
    Creates a table of correlation metrics between each numerical feature and the target column.

    input:
        df_corr: the original dataframe from which to build the correlations
        feature_column_list: features to be included in the correlation analysis
        target_column: the column that stores the target value
    output:
        a table of different correlation metrics for the specified features & target value
    """
    if target_column in feature_column_list:
        feature_column_list.remove(target_column)
    list_of_dfs = []
    for i in feature_column_list:
        try:
            df_pg = pg.corr(x=df_corr[i], y=df_corr[target_column])
            df_pg.index = [i]
            list_of_dfs.append(df_pg)
        except Exception as e:
            print('correlation did not work for {}: {}'.format(i, e))
    df_complete_corr = pd.concat(list_of_dfs, ignore_index=False)
    df_complete_corr = df_complete_corr[[
        'n', 'r', 'CI95%', 'p-val', 'BF10'
    ]].rename(
        columns={
            'n': '# sample',
            'r': 'correlation coefficient',
            'CI95%': '95% confidence interval',
            'p-val': 'p value',
            'BF10': 'Bayes Factor'
        })
    return df_complete_corr
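# A minimal usage sketch of the helper above on a toy DataFrame; the feature
# column names ('post_length', 'hour_of_day') are hypothetical and only
# pandas, numpy and pingouin (as pd, np, pg) are assumed.
import numpy as np
import pandas as pd
import pingouin as pg

rng = np.random.default_rng(0)
toy = pd.DataFrame({
    'react_per_100_audience': rng.normal(size=50),
    'post_length': rng.normal(size=50),
    'hour_of_day': rng.integers(0, 24, size=50).astype(float),
})
# Each feature is correlated against the default target column.
print(create_correlation_to_target_value(toy, ['post_length', 'hour_of_day']))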
def cross_correlation(data_as_array): correlations = [] r_temp = [] p_temp = [] ci95range_temp = [] power_temp = [] df_temp = [] for ki1 in range(len(data_as_array)): for ki2 in range(len(data_as_array)): if ki2 > ki1: correlations.append( pearsonr(data_as_array[ki1], data_as_array[ki2])[0]**2) pg_ = pg.corr(data_as_array[ki1], data_as_array[ki2]) aa = pg_['CI95%'][0][0]**2 bb = pg_['CI95%'][0][1]**2 lower_ = np.min([aa, bb]) upper_ = np.max([aa, bb]) p_ = pg_['p-val'][0] r_ = pg_['r'][0] power_ = pg_['power'][0] df_ = pg_['n'][0] - 2 r_temp.append(r_) p_temp.append(p_) temp___ = (upper_ - lower_) ci95range_temp.append(temp___) power_temp.append(power_) df_temp.append(df_) return correlations, r_temp, p_temp, ci95range_temp, power_temp, df_temp
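# Hedged usage sketch for cross_correlation above: a list of equal-length
# measurement vectors (synthetic here) yields one entry per unordered pair.
import numpy as np

rng = np.random.default_rng(1)
vectors = [rng.normal(size=40) for _ in range(3)]
r2s, rs, ps, ci_ranges, powers, dofs = cross_correlation(vectors)
print(r2s)   # squared Pearson correlations, one per pair
print(dofs)  # degrees of freedom (n - 2) for each pair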
def preprocess_pandas(df):
    # Min-max scale the numeric columns of interest before computing associations.
    scaler = MinMaxScaler()
    df[["income", "insured", "pm25"]] = scaler.fit_transform(
        df[["income", "insured", "pm25"]])
    covariance = cov(df['income'], df['pm25'])
    pearson_r, _ = pearsonr(df['income'], df['pm25'])
    spearman_result = spearmanr(df['income'], df['pm25'])
    # pm25 is negated here, which flips the sign of the reported correlation.
    print(pg.corr(x=df['income'], y=-df['pm25']))
def corr(input_data, sigmas, dissimilarityMatrix): ndims, ninstrus = input_data.shape[0], input_data.shape[1] idx_triu = np.triu_indices(dissimilarityMatrix.shape[0], k=1) target_v = dissimilarityMatrix[idx_triu] mean_target = np.mean(target_v) std_target = np.std(target_v) no_samples = ninstrus * (ninstrus - 1) / 2 kernel = np.zeros((ninstrus, ninstrus)) idx = [i for i in range(len(input_data))] sigmas = np.clip(sigmas, a_min=1.0, a_max=1e15) for i in range(ninstrus): for j in range(i + 1, ninstrus): kernel[i, j] = -np.sum( np.power( np.divide(input_data[idx, i] - input_data[idx, j], (sigmas[idx] + np.finfo(float).eps)), 2)) kernel_v = kernel[idx_triu] mean_kernel = np.mean(kernel_v) std_kernel = np.std(kernel_v) Jn = np.sum(np.multiply(kernel_v - mean_kernel, target_v - mean_target)) Jd = no_samples * std_target * std_kernel # corr_, p___ = pearsonr(np.asarray(target_v),np.asarray(kernel_v)) pg_ = pg.corr(np.asarray(target_v), np.asarray(kernel_v)) aa = pg_['CI95%'][0][0]**2 bb = pg_['CI95%'][0][1]**2 lower = np.min([aa, bb]) upper = np.max([aa, bb]) p = pg_['p-val'][0] r = pg_['r'][0] power = pg_['power'][0] df = pg_['n'][0] - 2 return r, p, lower, upper, power, df
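# For reference, a small sketch (synthetic data) of the pingouin.corr output
# that the snippets above and below index into: a one-row DataFrame whose
# columns include 'n', 'r', 'CI95%', 'p-val' and 'power'.
import numpy as np
import pingouin as pg

rng = np.random.default_rng(2)
x = rng.normal(size=30)
y = x + rng.normal(scale=0.5, size=30)
res = pg.corr(x, y)                 # index is the method name, e.g. 'pearson'
r = res['r'][0]                     # correlation coefficient
p = res['p-val'][0]                 # p-value
ci_low, ci_high = res['CI95%'][0]   # 95% confidence interval bounds
power = res['power'][0]             # achieved statistical power
dof = res['n'][0] - 2               # degrees of freedom, as computed above
print(r, p, ci_low, ci_high, power, dof)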
def location_perspective(speed_df, red_light_df, traffic_crash_df):
    """
    Display the correlation of red-light violations, speed violations and
    total violations against crashes, aggregated by street.

    :param speed_df: dataframe consisting of speed violation data
    :param red_light_df: dataframe consisting of red light violation data
    :param traffic_crash_df: dataframe consisting of traffic crash data
    :return: the three correlation figures (red-light, speed, total)
    """
    speed_frame_sample = speed_df[['STREET_NAME', 'VIOLATIONS'
                                   ]].groupby('STREET_NAME',
                                              as_index=False).sum()
    red_light_frame_sample = red_light_df[['STREET_NAME', 'VIOLATIONS'
                                           ]].groupby('STREET_NAME',
                                                      as_index=False).sum()
    traffic_frame_sample = traffic_crash_df[['STREET_NAME', 'Date']].groupby(
        'STREET_NAME', as_index=False).count()
    red_light_frame_sample.columns = ["STREET_NAME", "REDLIGHT_VIOLATIONS"]
    speed_frame_sample.columns = ["STREET_NAME", "SPEED_VIOLATIONS"]
    traffic_frame_sample.columns = ["STREET_NAME", "Crashes"]
    res = pd.merge(traffic_frame_sample, speed_frame_sample, how='left',
                   on='STREET_NAME')
    res = pd.merge(res, red_light_frame_sample, how='left', on='STREET_NAME')
    # Fill missing counts before summing the total.
    res['REDLIGHT_VIOLATIONS'].fillna(0, inplace=True)
    res['SPEED_VIOLATIONS'].fillna(0, inplace=True)
    res['Crashes'].fillna(0, inplace=True)
    res["Total_violations"] = res["REDLIGHT_VIOLATIONS"] + res[
        "SPEED_VIOLATIONS"]
    cor1 = pg.corr(x=res['REDLIGHT_VIOLATIONS'], y=res['Crashes'])
    cor2 = pg.corr(x=res['SPEED_VIOLATIONS'], y=res['Crashes'])
    cor3 = pg.corr(x=res['Total_violations'], y=res['Crashes'])
    f1 = visualize(res["REDLIGHT_VIOLATIONS"], res["Crashes"],
                   "Red Light Violations", "Crashes", cor1, 'red')
    f2 = visualize(res["SPEED_VIOLATIONS"], res["Crashes"],
                   "Speed Camera Violations", "Crashes", cor2, 'orange')
    f3 = visualize(res["Total_violations"], res["Crashes"],
                   "Total Violations", "Crashes", cor3, 'lightblue')
    return f1, f2, f3
def getstats(aggDict): animals = list(aggDict['0NP']['sub_id']) cols = list(aggDict['0NP'].columns.values) phaseCols = cols[3:19] # Getting phase names sequences = ['0NP', '1NP', '(0,1,~) D', '(0,-1,~)'] # Loading results of manual analysis mxl = pd.read_excel('Impulsivity strategies.xlsx', sheet_name=sequences, index_col=0, nrows=25, usecols=np.arange(0, 19), keep_default_na=False) # Auto. vs Man. correlation corrDict = {} meanCorrs = {} # Just the mean for kind, kval in enumerate(sequences): corrDict[kval] = {} meanCorrs[kval] = 0 df = aggDict[kval].set_index('sub_id') for animal in animals: corRes = pg.corr(mxl[kval].loc[animal, '1L':'8D'].astype('float64'), df.loc[animal, '1L':'8D'].astype('float64'), method='pearson') corrDict[kval][animal] = corRes.loc['pearson'] meanCorrs[kval] += corRes.loc['pearson']['r'] / len(animals) # For export compDict = {} for ind, sheetname in enumerate(sequences): compDict[sheetname] = pd.DataFrame.from_dict(corrDict[sheetname]).T # Between subjects pairwise t-tests betDict = {'treat': {}, 'phen': {}} for bet in betDict.keys(): for key in sequences: # Converting to long format df = aggDict[key].loc[:, :'8D'].melt(id_vars=['sub_id', 'treat', 'phen'], value_vars=phaseCols, var_name='phase', value_name='dv') df['dv'] = pd.to_numeric(df['dv']) df = pg.pairwise_ttests(data=df, subject='sub_id', dv='dv', within='phase', between=[bet], return_desc=True).round(6) betDict[bet][key] = df # Wilcoxon for impulsive sequence, for phases 6D:7L df = aggDict['(0,-1,~)'].loc[:, :'8D'] phenotype = {'epi': df[df.phen == 'epi'], 'non': df[df.phen == 'non']} for phenkey, phenval in phenotype.items(): w, p = stats.wilcoxon(phenval.loc[:, '6D'], phenval.loc[:, '7L'], mode='approx') phenotype[phenkey] = pd.DataFrame.from_dict({'w': [w], 'p': [p]}) # Descriptive stats for self-control cDrink = aggDict['(0,1,~) D'].loc[:, :'8D'] cStats = [(cDrink.mean(axis=0), cDrink.sem(axis=0))] return corrDict, meanCorrs, compDict, betDict, phenotype, cStats
def get_correlation(df_st, df_sc, pval_cut=0.05, log2fc_cut=1., method='pearson'):
    """Calculate the correlation between two DGE result tables.

    Parameters
    ----------
    df_st : pandas.DataFrame
        Data frame containing the results of the first dge analysis
    df_sc : pandas.DataFrame
        Data frame containing the results of the second dge analysis
    pval_cut : float
        p-value cut-off
    log2fc_cut : float
        effect size (log2 fold change) cut-off
    method : str
        Correlation method to use

    Returns
    -------
    pandas.DataFrame
        Correlation results as returned by pingouin.corr

    """
    # 1. Get genes passing the significance and effect-size cut-offs in both data sets
    m_sig_st = (df_st['pval'].values <= pval_cut) & (abs(
        df_st['log2fc'].values) >= log2fc_cut)
    m_sig_sc = (df_sc['pval'].values <= pval_cut) & (abs(
        df_sc['log2fc'].values) >= log2fc_cut)
    mask_siggenes = np.logical_and(m_sig_st, m_sig_sc)

    # 2. Calculate correlation and p-value on the signed p-values of those genes
    sig_r = pingouin.corr(x=df_st['signed_pval'][mask_siggenes],
                          y=df_sc['signed_pval'][mask_siggenes],
                          method=method)
    print(sig_r['p-val'].values[0])
    print(sig_r['r'])

    return sig_r
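# Hypothetical call on synthetic DGE tables; the 'pval', 'log2fc' and
# 'signed_pval' columns (and the signed p-value formula used here) are
# assumptions made only for this sketch.
import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
n_genes = 1000
pvals = rng.uniform(size=n_genes)
log2fc = rng.normal(scale=2, size=n_genes)
df_st_toy = pd.DataFrame({
    'pval': pvals,
    'log2fc': log2fc,
    'signed_pval': -np.log10(pvals) * np.sign(log2fc),
})
df_sc_toy = df_st_toy.copy()
df_sc_toy['signed_pval'] += rng.normal(scale=0.5, size=n_genes)
print(get_correlation(df_st_toy, df_sc_toy))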
font_scale = expand.slider("Font scale", 0.0, 4.0, 1.0, 0.1) start_x = float(df[x_var1].max() / 10) start_y = float(df[x_var2].max() - df[x_var2].max() / 10) max_x = float(df[x_var1].max()) max_y = float(df[x_var2].max()) x_pos = expand.slider("X position for the label", 0.0, max_x, start_x, (max_x / 100 + 0.1)) y_pos = expand.slider("Y position for the label", 0.0, max_y, start_y, (max_y / 100 + 0.1)) sns.set(style='white', font_scale=font_scale) st.success("Correlation results") corr_result = pg.corr(x=df[x_var1], y=df[x_var2], method=method_selected) st.write(corr_result) st.success("Correlation matrices") st.write( pg.pairwise_corr(df, padjust='bonf', method=method_selected).sort_values(by=['p-unc'])) st.write(df.rcorr(padjust='bonf')) st.success("Correlation plot with distributions is being generated") fig = plt.figure(figsize=(12, 6)) g = sns.JointGrid(data=df, x=x_var1, y=x_var2, height=6) g = g.plot_joint(sns.regplot, color="xkcd:muted blue") g = g.plot_marginals(sns.distplot,
def correlation_plot(df_counts_cytoresps, genes_dict, save_folder): """ Plot correlation of counts distribution for each cytokine over all samples :param df_counts_cytoresps: :param genes_dict: :param save_folder: :return: """ # df_counts_cytoresps = df_counts_cytoresps.replace({'0': np.nan, 0: np.nan}) df_counts_cytoresps = df_counts_cytoresps.replace({np.nan: 0}) for cyto in genes_dict.keys(): for cyto_reps in genes_dict.keys(): resp_name = "_".join([cyto_reps, 'responder']) temp_df = df_counts_cytoresps[[cyto, resp_name, 'disease']] # stats: hypothesis here will be that the counts of a cytokine is correlated to the counts of its responders sig_r = pingouin.corr(x=temp_df[resp_name], y=temp_df[cyto], method='pearson') # Plot Correlation # 2. Plot correlation fig, ax = plt.subplots(figsize=fig_size) ax.grid(False) sns.regplot(data=temp_df, x=resp_name, y=cyto, ax=ax, scatter=False, color="black", label=None) ax.scatter(data=temp_df, x=resp_name, y=cyto, c='k') # Axis params ax.set_xlabel(" ".join(["Responder Counts"]), fontsize=xy_fontsize) if cyto == 'IFNG': ax.set_ylabel(r'IFN-$\gamma$ Counts', fontsize=xy_fontsize) else: ax.set_ylabel(" ".join([cyto, 'Counts']), fontsize=xy_fontsize) if temp_df.max()[0] < 10: ax.set_yticks(np.arange(0, temp_df.max()[0] + 1, 1)) # Add text: Correlation value and p-value ax.text(temp_df.max()[1] / 2 - temp_df.max()[1] / 10, temp_df.max()[0], 'r = {:.2f}; p = {:.2e}'.format( sig_r['r'].values[0], sig_r['p-val'].values[0]), fontstyle='italic', fontsize=text_fontsize) else: ax.set_yticks(np.arange(0, temp_df.max()[0] + 2, 2)) # Add text: Correlation value and p-value ax.text(temp_df.max()[1] / 2 - temp_df.max()[1] / 10, temp_df.max()[0] + 1, 'r = {:.2f}; p = {:.2e}'.format( sig_r['r'].values[0], sig_r['p-val'].values[0]), fontstyle='italic', fontsize=text_fontsize) ax.set_xlim([-0.5, temp_df.max()[1] + temp_df.max()[1] / 20]) ax.set_ylim([-0.5, temp_df.max()[0] + temp_df.max()[0] / 20]) plt.tight_layout() # remove upper and right edge lines in plot sns.despine(ax=ax) # 3. Save figure fig.savefig( os.path.join(save_folder, "_".join(['Fig4A', cyto, resp_name, fileformat]))) plt.close()
########################################### Endplate results gt_slopes_dir = "../data/PredictionsVsGroundTruth/EndplateSlopes_GroundTruthEndplates.csv" gt_slopes_data = genfromtxt(gt_slopes_dir, delimiter=',') pred_slopes_dir = "../data/PredictionsVsGroundTruth/EndplateSlopes.csv" pred_slopes_data = genfromtxt(pred_slopes_dir, delimiter=',') slopesDiff = pred_slopes_data - gt_slopes_data slopesAbsDiff = abs(pred_slopes_data - gt_slopes_data) SD = np.std(slopesDiff) slopesDiffMean = np.mean(slopesDiff) slopesAbsDiffMean = np.mean(slopesAbsDiff) slopesCorr = pg.corr(pred_slopes_data.reshape(-1),gt_slopes_data.reshape(-1)) plt.figure() sns.distplot(slopesDiff.reshape(-1)) plt.xlabel("Difference in Endplate Slope (Degrees)") plt.ylabel("Density") plt.title("Difference between Predicted and Ground-truth Endplate Slopes") plt.show() plt.figure() sns.scatterplot(x=gt_slopes_data.reshape(-1), y=pred_slopes_data.reshape(-1)) plt.xlabel("Ground-truth Endplate Slope (Degrees)") plt.ylabel("Predicted Endplate Slope (Degrees)") plt.title("Ground-truth vs. Predicted Endplate Slopes") plt.show()
AD2 = abs(gt_angle_data[:, 1] - pred_angle_data[:, 1]) MAD2 = np.mean(AD2) AD3 = abs(gt_angle_data[:, 2] - pred_angle_data[:, 2]) MAD3 = np.mean(AD3) ########## This is shorter MAD = np.mean(abs(gt_angle_data.reshape(-1) - pred_angle_data.reshape(-1))) D = pred_angle_data - gt_angle_data MD = np.mean(D) SD = np.std(D) corr = pg.corr(pred_angle_data.reshape(-1), gt_angle_data.reshape(-1)) print(corr.to_string()) plt.figure() # sns.distplot(D[:,0], label="Proximal-thoracic") # sns.distplot(D[:,1], label="Main thoracic") # sns.distplot(D[:,2], label="Lumbar") sns.distplot(D.reshape(-1)) plt.xlabel("Difference in Cobb Angle (Degrees)") plt.ylabel("Density") # plt.legend() plt.title("Difference between Predicted and Ground-truth Cobb Angles") plt.show() ########## Shapiro-Wilk test ShapiroWilk = pg.normality(data=D.reshape(-1))
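# For reference (synthetic data): pingouin.normality returns a DataFrame with
# 'W', 'pval' and 'normal' columns, so the test decision used above can be
# read off directly.
import numpy as np
import pingouin as pg

rng = np.random.default_rng(5)
sw = pg.normality(rng.normal(size=60))
print(sw)
print(bool(sw['normal'].iloc[0]))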
print('Mutual Median pairwise correlations') print('Full STRF') p_tab = [] power_tab = [] CI95range_tab = [] for iDataset in range(nbDatasets): for jDataset in range(iDataset + 1, nbDatasets): strfTensorI = np.reshape(sigmasTab[iDataset], (128, 11, 22)) strf_scale_rateI, strf_freq_rateI, strf_freq_scaleI = avgvec2strfavg( strf2avgvec(strfTensorI)) strfTensorJ = np.reshape(sigmasTab[jDataset], (128, 11, 22)) strf_scale_rateJ, strf_freq_rateJ, strf_freq_scaleJ = avgvec2strfavg( strf2avgvec(strfTensorJ)) # pairwisePearsonMatrixFullTensor[iDataset][jDataset] = spearmanr(strfTensorI.flatten(),strfTensorJ.flatten())[0]**2 pg_ = pg.corr(strfTensorI.flatten(), strfTensorJ.flatten(), method="spearman") aa = pg_['CI95%'][0][0]**2 bb = pg_['CI95%'][0][1]**2 lower = np.min([aa, bb]) upper = np.max([aa, bb]) p = pg_['p-val'][0] r = pg_['r'][0] power = pg_['power'][0] df = pg_['n'][0] - 2 # print(str(iDataset)+' '+str(jDataset)+' '+str(df)+' '+"%.2f" %(r**2)+' '+"%.3f" %p+' ['+"%.3f" %(lower)+';'+"%.3f" %(upper)+'] '+"%.3f" %(power)+' '+"%.3f" %(upper-lower)) p_tab.append(p) power_tab.append(power) CI95range_tab.append(upper - lower) print('Full Statistics Ranges') print('p_median=' + "%.2f" % (np.median(p_tab)) + ' p_min=' + "%.2f" %
def get_crossval_sigmas_from_folder(timbre_spaces=[ 'Barthet2010', 'Grey1977', 'Grey1978', 'Iverson1993_Onset', 'Iverson1993_Remainder', 'Iverson1993_Whole', 'McAdams1995', 'Patil2012_A3', 'Patil2012_DX4', 'Patil2012_GD4', 'Lakatos2000_Harm', 'Lakatos2000_Comb', 'Lakatos2000_Perc', 'Siedenburg2016_e2set1', 'Siedenburg2016_e2set2', 'Siedenburg2016_e2set3', 'Siedenburg2016_e3' ], representation=['auditory_strf'], folder='results_light', averaging='avg_time_avg_freq', early_stopping=True, folder_old='all'): sigmas = [] correlation_testing = [] correlation_training = [] correlation_with_all = [] cross_corr = [] for tsp in timbre_spaces: for root, dirs, files in os.walk(os.path.join(folder, tsp.lower())): for f in files: if averaging + '_' + tsp.lower() in f: results = pickle.load(open(os.path.join(root, f), 'rb')) # print(files) metricOnAll = pickle.load( open(os.path.join(root, 'all.pkl'), 'rb')) metricOnAll = metricOnAll['sigmas'].flatten() sigmas_ = [ results['sigmas'][fold] for fold in range(len(results['correlations'])) ] sigmas_ = np.array(sigmas_) corr_ = [ pearsonr(results['sigmas'][fold].flatten(), metricOnAll)[0] for fold in range(len(results['correlations'])) ] r_withall_temp = [] p_withall_temp = [] ci95range_withall_temp = [] df_withall_temp = [] power_withall_temp = [] for fold in range(len(results['correlations'])): pearsonr(results['sigmas'][fold].flatten(), metricOnAll)[0] pg_ = pg.corr(results['sigmas'][fold].flatten(), metricOnAll) aa = pg_['CI95%'][0][0]**2 bb = pg_['CI95%'][0][1]**2 lower_ = np.min([aa, bb]) upper_ = np.max([aa, bb]) p_ = pg_['p-val'][0] r_ = pg_['r'][0] power_ = pg_['power'][0] df_ = pg_['n'][0] - 2 r_withall_temp.append(r_) p_withall_temp.append(p_) temp___ = (upper_ - lower_) ci95range_withall_temp.append(temp___) power_withall_temp.append(power_) df_withall_temp.append(df_) file_ = 'resultsOptims_strf' + f.split('results')[1].split( '.pkl')[0] + '.pkl' # resultsOptims_strf_avg_time_Grey1978_F311.pkl sigmas__ = pickle.load( open(os.path.join('./out_aud_STRF_crossval/', file_), 'rb')) r_training_temp = [] p_training_temp = [] ci95range_training_temp = [] power_training_temp = [] df_training_temp = [] r_testing_temp = [] p_testing_temp = [] ci95range_testing_temp = [] power_testing_temp = [] df_testing_temp = [] pg_training = [] pg_testing = [] for fold in range(len(results['correlations'])): # training data train_idx = [ i for i in range( sigmas__['representations'].shape[1]) if i != fold ] input_data_training = sigmas__[ 'representations'][:, train_idx] target_data_training = sigmas__['dissimilarities'][ train_idx, :] target_data_training = target_data_training[:, train_idx] r_, p_, lower_, upper_, power_, df_ = corr( input_data_training, results['sigmas'][fold], target_data_training) r_training_temp.append(r_) p_training_temp.append(p_) temp___ = (upper_ - lower_) ci95range_training_temp.append(temp___) power_training_temp.append(power_) df_training_temp.append(df_) # pg_training.append(corr(input_data_training,results['sigmas'][fold],target_data_training)) # testing data test_idx = [fold] ninstrus = sigmas__['representations'].shape[1] input_data_testing = sigmas__[ 'representations'][:, test_idx[0]] target_data_testing = np.zeros((ninstrus - 1, 1)) cpt_i = 0 for i in range(ninstrus): if i > fold: target_data_testing[cpt_i] = sigmas__[ 'dissimilarities'][fold, i] cpt_i += 1 elif i < fold: target_data_testing[cpt_i] = sigmas__[ 'dissimilarities'][i, fold] cpt_i += 1 # print(input_data_testing) # print(target_data_testing) # 
pg_testing.append(corr(input_data_testing,results['sigmas'][fold],target_data_testing)) test_input = input_data_testing test_target = target_data_testing mean_target_test = np.mean(test_target) std_target_test = np.std(test_target) kernel_test = np.zeros((ninstrus - 1, 1)) # print(input_data_training.shape) for i in range(len(kernel_test)): # print(i) kernel_test[i, 0] = -np.sum( np.power( np.divide( test_input - input_data_training[:, i], (results['sigmas'][fold] + np.finfo(float).eps)), 2)) # pg_testing.append(pearsonr(kernel_test[:,0], test_target[:,0])[0]) pg_ = pg.corr(np.asarray(kernel_test[:, 0]), np.asarray(test_target[:, 0])) aa = pg_['CI95%'][0][0]**2 bb = pg_['CI95%'][0][1]**2 lower_ = np.min([aa, bb]) upper_ = np.max([aa, bb]) p_ = pg_['p-val'][0] r_ = pg_['r'][0] power_ = pg_['power'][0] df_ = pg_['n'][0] - 2 r_testing_temp.append(r_) p_testing_temp.append(p_) temp___ = (upper_ - lower_) ci95range_testing_temp.append(temp___) power_testing_temp.append(power_) df_testing_temp.append(df_) # pg_testing.append(corr(input_data_testing,results['sigmas'][fold],target_data_testing)) # print(np.median(np.asarray(pg_training)**2)) # print(iqr(np.asarray(pg_training)**2)) # print(np.median(np.asarray(pg_testing)**2)) # print(iqr(np.asarray(pg_testing)**2)) # print(iqr(np.asarray(results['correlations_testing'])**2)) # print('{} - corr={:.3f} (std={:.3f})'.format( # tsp, # np.mean(cross_correlation(sigmas_)), # np.std(cross_correlation(sigmas_)))) # sigmas.append(np.mean(sigmas_, axis=0)) print( "%i" % (np.median(np.asarray(df_training_temp))) + ' ' # +"%.2f" %np.median(np.asarray(r_training_temp)**2)+' ' # +"%.2f" %np.min(np.asarray(r_training_temp)**2)+' ' # +"%.2f" %np.max(np.asarray(r_training_temp)**2)+' ' + "%.3f" % np.median(np.asarray(p_training_temp)) + ' ' + "%.3f" % np.min(np.asarray(p_training_temp)) + ' ' + "%.3f" % np.max(np.asarray(p_training_temp)) + ' ' + "%.3f" % np.median(np.asarray(ci95range_training_temp)) + ' ' + "%.3f" % np.min(np.asarray(ci95range_training_temp)) + ' ' + "%.3f" % np.max(np.asarray(ci95range_training_temp)) + ' ' + "%.3f" % np.median(np.asarray(power_training_temp)) + ' ' + "%.3f" % np.min(np.asarray(power_training_temp)) + ' ' + "%.3f" % np.max(np.asarray(power_training_temp))) print( "%i" % (np.median(np.asarray(df_testing_temp))) + ' ' # +"%.2f" %np.median(np.asarray(r_training_temp)**2)+' ' # +"%.2f" %np.min(np.asarray(r_training_temp)**2)+' ' # +"%.2f" %np.max(np.asarray(r_training_temp)**2)+' ' + "%.3f" % np.median(np.asarray(p_testing_temp)) + ' ' + "%.3f" % np.min(np.asarray(p_testing_temp)) + ' ' + "%.3f" % np.max(np.asarray(p_testing_temp)) + ' ' + "%.3f" % np.median(np.asarray(ci95range_testing_temp)) + ' ' + "%.3f" % np.min(np.asarray(ci95range_testing_temp)) + ' ' + "%.3f" % np.max(np.asarray(ci95range_testing_temp)) + ' ' + "%.3f" % np.median(np.asarray(power_testing_temp)) + ' ' + "%.3f" % np.min(np.asarray(power_testing_temp)) + ' ' + "%.3f" % np.max(np.asarray(power_testing_temp))) _, r_temp_within, p_temp_within, ci95range_temp_within, power_temp_within, df_temp_within = cross_correlation( sigmas_) print( "%i" % (np.median(np.asarray(df_testing_temp))) + ' ' # +"%.2f" %np.median(np.asarray(r_training_temp)**2)+' ' # +"%.2f" %np.min(np.asarray(r_training_temp)**2)+' ' # +"%.2f" %np.max(np.asarray(r_training_temp)**2)+' ' + "%.3f" % np.median(np.asarray(p_temp_within)) + ' ' + "%.3f" % np.min(np.asarray(p_temp_within)) + ' ' + "%.3f" % np.max(np.asarray(p_temp_within)) + ' ' + "%.3f" % np.median(np.asarray(ci95range_temp_within)) + ' 
' + "%.3f" % np.min(np.asarray(ci95range_temp_within)) + ' ' + "%.3f" % np.max(np.asarray(ci95range_temp_within)) + ' ' + "%.3f" % np.median(np.asarray(power_temp_within)) + ' ' + "%.3f" % np.min(np.asarray(power_temp_within)) + ' ' + "%.3f" % np.max(np.asarray(power_temp_within))) print( "%i" % (np.median(np.asarray(df_withall_temp))) + ' ' # +"%.2f" %np.median(np.asarray(r_training_temp)**2)+' ' # +"%.2f" %np.min(np.asarray(r_training_temp)**2)+' ' # +"%.2f" %np.max(np.asarray(r_training_temp)**2)+' ' + "%.3f" % np.median(np.asarray(p_withall_temp)) + ' ' + "%.3f" % np.min(np.asarray(p_withall_temp)) + ' ' + "%.3f" % np.max(np.asarray(p_withall_temp)) + ' ' + "%.3f" % np.median(np.asarray(ci95range_withall_temp)) + ' ' + "%.3f" % np.min(np.asarray(ci95range_withall_temp)) + ' ' + "%.3f" % np.max(np.asarray(ci95range_withall_temp)) + ' ' + "%.3f" % np.median(np.asarray(power_withall_temp)) + ' ' + "%.3f" % np.min(np.asarray(power_withall_temp)) + ' ' + "%.3f" % np.max(np.asarray(power_withall_temp))) # print('{} correlations_training - corr: Mdn={:.2f} (IQR={:.3f})'.format( # tsp, # np.median(np.asarray(results['correlations'])**2), # iqr(np.asarray(results['correlations'])**2))) # correlation_training.append(np.median(np.asarray(results['correlations'])**2)) # print('{} correlations_testing - corr: Mdn={:.2f} (IQR={:.3f})'.format( # tsp, # np.median(np.asarray(results['correlations_testing'])**2), # iqr(np.asarray(results['correlations_testing'])**2))) # correlation_testing.append(np.median(np.asarray(results['correlations_testing'])**2)) # print('{} within_corr - corr: Mdn={:.2f} (IQR={:.3f})'.format( # tsp, # np.median(np.asarray(cross_correlation(sigmas_))), # iqr(np.asarray(cross_correlation(sigmas_))))) # cross_corr.append(np.median(np.asarray(cross_correlation(sigmas_)))) # sigmas.append(np.mean(sigmas_, axis=0)) # print('{} correlation with all - corr: Mdn={:.2f} (IQR={:.3f})'.format( # tsp, # np.median(np.asarray(corr_)**2), # iqr(np.asarray(corr_)**2))) # print() # correlation_with_all.append(np.median(np.asarray(corr_)**2)) # sigmas.append(np.mean(sigmas_, axis=0)) print(correlation_training) print(correlation_testing) print('Cross val correlation: ' + str(pearsonr(correlation_training, correlation_testing)[0]**2) + ' ' + str(pearsonr(correlation_training, correlation_testing)[1])) plt.scatter(np.asarray(correlation_training), np.asarray(correlation_testing)) plt.show() print(np.median(correlation_training)) print(iqr(correlation_training)) print(np.median(correlation_testing)) print(iqr(correlation_testing)) print(np.median(cross_corr)) print(iqr(cross_corr)) print(np.median(correlation_with_all)) print(iqr(correlation_with_all)) mds_data = [ 0.9368, 0.6845, 0.5935, 0.2046, 0.2371, 0.5662, 0.6901, 0.8612, 0.5610, 0.4604, 0.3068, 0.7646, 0.3152, 0.6535, 0.7005, 0.7070, 0.3616 ] plt.scatter(np.asarray(mds_data), np.asarray(correlation_testing)) plt.show() print(pearsonr(mds_data, correlation_testing)) return sigmas
from src.conf import * output_dir = figures_dir / "panels" output_dir.mkdir() myelo = matrix.columns[matrix.columns.str.contains("MDSC/All_CD45_(PBMC)", regex=False)] fig, axes = plt.subplots(3, 2, figsize=(7, 10), tight_layout=True) for ax, pop in zip(axes, myelo): for cbc, name, ax in zip(["lymph_CBC", "neutrophils"], ["Lymphocytes", "Neutrophils"], ax): p = meta[[cbc]].join(matrix[pop]).dropna() sns.regplot(p[cbc], p[pop], scatter_kws=dict(s=2, alpha=0.5), ax=ax) res = pg.corr(p[cbc], p[pop], method="spearman").squeeze() f = np.array([0.1, 1.1]) ax.set( title= f"r = {res['r']:.2f}; ci = {res['CI95%']}; p = {res['p-val']:.2e}", xlabel=f"{name} (%, Sysmex CBC)", ylabel=pop, # xlim=(-10, 110), xlim=np.asarray(ax.get_xlim()) * f, # ylim=(-10, 110), ylim=np.asarray(ax.get_ylim()) * f, ) fig.savefig(output_dir / "sysmex.neutrophil_lymphocyte.svg", **figkws) lr = pg.linear_regression(p[pop], p["neutrophils"])
def time_series_with_biometric_bar_plot(biometric_source_data_1, biometric_source_data_2, sample_source_data_1, sample_source_data_2, view_1, view_2, selection_dict): # parse selections biometric = selection_dict['biometric'] metabolite = selection_dict['metabolite'] user = selection_dict['user'] scale = selection_dict['scale'] start_time = pd.to_datetime('8/22/2018') end_time = pd.to_datetime('9/1/2018') # set up data and relevant stats if user == "Both": # run rm_corr title = 'Daily total/average: {}'.format(biometric) # .tolist() causing refresh error x = list(biometric_source_data_1.data[biometric]) + list( biometric_source_data_2.data[biometric]) y = list(np.log2(biometric_source_data_1.data[metabolite])) \ + list(np.log2(biometric_source_data_2.data[metabolite])) subject = ["Subject1"] * len(biometric_source_data_1.data[metabolite]) \ + ["Subject2"] * len(biometric_source_data_2.data[metabolite]) df = pd.DataFrame({ 'x': x, 'y': y, 'subject': subject, }) r, p, dof = pg.rm_corr(data=df, x='x', y='y', subject='subject') title = "Daily total/average: {} vs. log2 (Avg. Int.) {}; RM Corr : r = {}, p = {}".format( biometric, metabolite, round(r, 3), round(p, 3)) # get biometric max biometric_max = max(x) # get metabolite intensity min metabolite_intensities = list(sample_source_data_1.data[metabolite]) \ + list(sample_source_data_2.data[metabolite]) intensity_min, intensity_max = min(metabolite_intensities), \ max(metabolite_intensities) elif user == "Subject1": # calculate Spearman's Rho for Subject1 x = biometric_source_data_1.data[biometric] y = np.log2(biometric_source_data_1.data[metabolite]) corr_df = pg.corr(x, y, method='skipped') coef = corr_df.iloc[0]['r'] p = corr_df.iloc[0]['p-val'] #print(corr_df) title = "Daily average {} vs. log2(Avg. Int.) {}; Spearman's Rho: {}, p = {}".format( biometric, metabolite, round(coef, 3), round(p, 5)) # get biometric max biometric_max = max(x) # get metabolite intensity min metabolite_intensities = list(sample_source_data_1.data[metabolite]) intensity_min, intensity_max = min(metabolite_intensities), \ max(metabolite_intensities) elif user == "Subject2": # calculate Spearman's Rho x = biometric_source_data_2.data[biometric] y = np.log2(biometric_source_data_2.data[metabolite]) corr_df = pg.corr(x, y, method='skipped') coef = corr_df.iloc[0]['r'] p = corr_df.iloc[0]['p-val'] #print(corr_df) title = "Daily average {} vs. log2(Avg. Int.) {}; Spearman's Rho: {}, p = {}".format( biometric, metabolite, round(coef, 3), round(p, 5)) # get biometric max biometric_max = max(x) # get metabolite intensity range metabolite_intensities = sample_source_data_2.data[metabolite] intensity_min, intensity_max = min(metabolite_intensities), \ max(metabolite_intensities) # Set up figure and formatting p = figure( title=title, tools=tools, x_axis_type="datetime", plot_width=800, plot_height=400, x_range=[start_time, end_time], y_range=[intensity_min, intensity_max], ) #tooltips = [("sample", "@SampleID")]) # Setting the second y axis range name and range biometric_max_start = biometric_max * 0.10 biometric_range_end = biometric_max * 1.10 p.extra_y_ranges = { "biometric_axis": Range1d(start=biometric_max_start, end=biometric_range_end) } # Adding the second axis to the plot. 
p.add_layout(LinearAxis(y_range_name="biometric_axis"), 'right') p.xaxis.ticker = DaysTicker(days=np.arange(1, 59)) p.xaxis.formatter = DatetimeTickFormatter( hours=["%d %B %Y"], days=["%d %B %Y"], months=["%d %B %Y"], years=["%d %B %Y"], ) p.output_backend = "svg" p.xaxis.axis_label = None p.toolbar.logo = None p.xaxis.major_label_orientation = pi / 4 p.xgrid.grid_line_color = None p.ygrid.grid_line_color = None p.outline_line_color = None p.yaxis.axis_label = metabolite + " {}".format(scale) p.yaxis[1].axis_label = biometric ### Now for actual data ### # Have to make width huge, since Datetime has millisecond resolution: # https://stackoverflow.com/questions/45711567/categorical-y-axis-and-datetime-x-axis-with-bokeh-vbar-plot # 1 hr * 4 millisecond_width = 3600000 * 24 if user == "Both" or user == "Subject1": # time series data legend_title = "Subject 1 [{}]".format(metabolite) p.line('Datetime', metabolite, source=sample_source_data_1, color='red') p.circle('Datetime', metabolite, source=sample_source_data_1, color="red", size=5, alpha=0.5, view=view_1, hover_color="black") # biometric data legend_title = "Subject 1 Daily Total {}".format(biometric) p.step('Datetime', y=biometric, color="red", mode="center", line_dash="dashed", source=biometric_source_data_1, legend=legend_title, y_range_name="biometric_axis") p.vbar('Datetime', top=biometric, fill_color="red", width=millisecond_width, line_color=None, alpha=0.3, source=biometric_source_data_1, y_range_name="biometric_axis") # overwrite if user == "Both" or user == "Subject2": # time series data legend_title = "Subject 2 [{}]".format(metabolite) p.line('Datetime', metabolite, source=sample_source_data_2, color='blue') p.circle('Datetime', metabolite, source=sample_source_data_2, color="blue", size=5, alpha=0.5, view=view_2, hover_color="black") # biometric data legend_title = "Subject 2 Daily Total {}".format(biometric) p.step('Datetime', y=biometric, color="blue", mode="center", line_dash="dashed", source=biometric_source_data_2, legend=legend_title, y_range_name="biometric_axis") p.vbar(x='Datetime', top=biometric, fill_color="blue", width=millisecond_width, line_color=None, alpha=0.3, source=biometric_source_data_2, y_range_name="biometric_axis") # Light cycle formatting, this needs to come second for tool tips to render vline_list = [] for datetime in pd.date_range(start='8/22/2018', end='9/1/2018'): vline = Span( location=datetime, dimension='height', line_color='grey', #this should creat a ~6 hr window around midnight, to simulate # the dark cycle during this time period line_width=24, line_dash='solid', line_alpha=0.3) vline_list.append(vline) p.renderers.extend(vline_list) return p
def get_crossval_sigmas_from_folder( timbre_spaces = ['Barthet2010','Grey1977','Grey1978','Iverson1993_Onset', 'Iverson1993_Remainder','Iverson1993_Whole', 'McAdams1995','Patil2012_A3', 'Patil2012_DX4','Patil2012_GD4','Siedenburg2016_e3', 'Lakatos2000_Harm', 'Lakatos2000_Comb','Lakatos2000_Perc','Siedenburg2016_e2set1','Siedenburg2016_e2set2', 'Siedenburg2016_e2set3'], representation = ['auditory_strf'], folder='results_light', averaging='avg_time_avg_freq', early_stopping=True, folder_old='all'): sigmas = [] correlation_testing = [] correlation_training = [] correlation_with_all = [] cross_corr = [] for tsp in timbre_spaces: for root, dirs, files in os.walk(os.path.join(folder, tsp.lower())): for f in files: if averaging + '_' + tsp.lower() in f: results = pickle.load(open(os.path.join(root, f), 'rb')) # print(results) metricOnAll = pickle.load(open(os.path.join(root, 'all.pkl'), 'rb')) metricOnAll = metricOnAll['sigmas'].flatten() # print(results) sigmas_ = [results['sigmas'][fold] for fold in range(len(results['correlations']))] sigmas_ = np.array(sigmas_) corr_ = [pearsonr(results['sigmas'][fold].flatten(),metricOnAll)[0] for fold in range(len(results['correlations']))] # print('{} - corr={:.3f} (std={:.3f})'.format( # tsp, # np.mean(cross_correlation(sigmas_)), # np.std(cross_correlation(sigmas_)))) # sigmas.append(np.mean(sigmas_, axis=0)) print('{} correlations_training - corr={:.2f} (std={:.3f})'.format( tsp, np.median(np.asarray(results['correlations'])**2), iqr(np.asarray(results['correlations'])**2))) correlation_training.append(np.median(np.asarray(results['correlations'])**2)) print('{} correlations_testing - corr={:.2f} (std={:.3f})'.format( tsp, np.median(np.asarray(results['correlations_testing'])**2), iqr(results['correlations_testing'])**2)) correlation_testing.append(np.median(np.asarray(results['correlations_testing'])**2)) print('{} within_corr - corr={:.2f} (std={:.3f})'.format( tsp, np.median(np.asarray(cross_correlation(sigmas_))), iqr(np.asarray(cross_correlation(sigmas_))))) cross_corr.append(np.median(np.asarray(cross_correlation(sigmas_)))) sigmas.append(np.mean(sigmas_, axis=0)) print('{} correlation with all - corr={:.2f} (std={:.3f})'.format( tsp, np.median(np.asarray(corr_)**2), iqr(np.asarray(corr_)**2))) print() correlation_with_all.append(np.median(np.asarray(corr_)**2)) # sigmas.append(np.mean(sigmas_, axis=0)) print(correlation_training) print(correlation_testing) print('Cross val correlation: '+str(pearsonr(correlation_training,correlation_testing)[0]**2)+' '+str(pearsonr(correlation_training,correlation_testing)[1])) print(pg.corr(correlation_training,correlation_testing)) plt.scatter(np.asarray(correlation_training),np.asarray(correlation_testing)) plt.show() print(np.median(correlation_training)) print(iqr(correlation_training)) print(np.median(correlation_testing)) print(iqr(correlation_testing)) print(np.median(cross_corr)) print(iqr(cross_corr)) print(np.median(correlation_with_all)) print(iqr(correlation_with_all)) mds_data = [0.9368,0.6845,0.5935,0.2046,0.2371,0.5662,0.6901,0.8612,0.5610,0.4604,0.3068,0.7646,0.3152,0.6535,0.7005,0.7070,0.3616] plt.scatter(np.asarray(mds_data),np.asarray(correlation_testing)) plt.show() print(pearsonr(mds_data,correlation_testing)) return sigmas
pass if data1[j, 0] != 0 and index_2 == -1: index_2 = j pass if index_1 != -1 and index_2 != -1: break i -= 1 j -= 1 data = data[-index_1:, :] data1 = data1[-index_2:, :] data = data[-2000:, :] data1 = data1[-2000:, :] x = pg.corr(x=data[:, 0], y=data1[:, 0]) print(x) # print(data.tostring()) # print(data1.tostring()) # data = data[:,:] # data1 = data1[:,:] # data = data.reshape(data.shape[0],1) # data1 = data1.reshape(data1.shape[0],1) # data = data[-10000:,:] # data1 = data1[-10000:,:] # print(data1.shape[1]) # df = pd.DataFrame(data,data1) # print(df.head())
def plot__spatial_correlation(df_counts, cytokine_responders, save_folder, distance): """Calculate spatial (Pearson) Correlation between each Cyto+ spot and its nn responder genes spots for each cluster -> Plot for Workflow figure (Fig 1) Parameters ---------- df_counts : pandas.Dataframe cytokine_responders : dict save_folder : str distance : int Returns ------- list of p-values """ p_vals = [] for cyto in cytokine_responders: resp_name = "_".join([cyto, 'responder']) temp_df = df_counts[[cyto, resp_name]].copy() temp_df = temp_df[~np.isnan(temp_df[cyto].values.astype(np.float64))] temp_df[cyto] = temp_df[cyto].values.astype(np.float) temp_df["_".join([cyto, 'responder'])] = temp_df["_".join([cyto, 'responder'])].values.astype(np.float) # 1. Calculate correlation # Always report the correlation and the p-value: # -> Use pearson correlation # -> at low statistics the p-value might be infiltrated sig_r = pingouin.corr(x=temp_df["_".join([cyto, 'responder'])], y=temp_df[cyto], method='pearson') p_vals.append(sig_r['p-val'].values[0]) # 2. Plot correlation fig, ax = plt.subplots(figsize=fig_size) ax.grid(False) sns.regplot(data=temp_df, x=resp_name, y=cyto, ax=ax, scatter=False, color="black", label=None) ax.scatter(data=temp_df, x=resp_name, y=cyto, c='k') # Add text: Correlation value and p-value ax.text(temp_df.max()[1] / 2 - temp_df.max()[1]/10, temp_df.max()[0], 'r = {:.2f}; p = {:.2e}'.format(sig_r['r'].values[0], sig_r['p-val'].values[0]), fontstyle='italic', fontsize=text_fontsize) # Axis params ax.set_xlabel(" ".join(["Responder Counts"]), fontsize=axis_label_fontsize) if cyto == 'IFNG': ax.set_ylabel(r'IFN-$\gamma$ Counts', fontsize=axis_label_fontsize) else: ax.set_ylabel(" ".join([cyto, 'Counts']), fontsize=axis_label_fontsize) ax.set_xlim([-0.5, temp_df.max()[1] + temp_df.max()[1] / 50]) ax.set_ylim([-0.5, temp_df.max()[0] + temp_df.max()[0] / 50]) ax.tick_params(labelsize=xy_ticks) plt.tight_layout() # remove upper and right edge lines in plot sns.despine(ax=ax) # 3. Save figure fig.savefig(os.path.join(save_folder, "_".join(['Fig1', str(distance), cyto, resp_name, fileformat]))) plt.close() return p_vals
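# Possible follow-up (not part of the original function): correct the list of
# p-values returned above for multiple testing with pingouin.multicomp; the
# p-values below are stand-ins for the returned p_vals list.
import pingouin

toy_pvals = [0.001, 0.03, 0.2, 0.04]
reject, p_adj = pingouin.multicomp(toy_pvals, method='fdr_bh')
print(reject, p_adj)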
def read_tables(form_path_1, key_path_1, form_path_2, key_path_2): """ Reads-in human judgements and reports results. """ # Read-in forms form_1 = open(form_path_1, 'r', encoding='utf8') form_2 = open(form_path_2, 'r', encoding='utf8') # Read in keys with open(key_path_1, 'r', encoding='utf8') as kp1: keys_1 = json.load(kp1) with open(key_path_2, 'r', encoding='utf8') as kp2: keys_2 = json.load(kp2) # Trackers correct_sense_pick = { 'wmt': { 'natural': [], 'synthetic': [] }, 'os': { 'natural': [], 'synthetic': [] } } is_ambiguous = { 'wmt': { 'natural': [], 'synthetic': [] }, 'os': { 'natural': [], 'synthetic': [] } } is_natural = { 'wmt': { 'natural': [], 'synthetic': [] }, 'os': { 'natural': [], 'synthetic': [] } } shared_correct_sense_pick_1 = { 'wmt': { 'natural': [], 'synthetic': [] }, 'os': { 'natural': [], 'synthetic': [] } } shared_is_ambiguous_1 = { 'wmt': { 'natural': [], 'synthetic': [] }, 'os': { 'natural': [], 'synthetic': [] } } shared_is_natural_1 = { 'wmt': { 'natural': [], 'synthetic': [] }, 'os': { 'natural': [], 'synthetic': [] } } shared_correct_sense_pick_2 = { 'wmt': { 'natural': [], 'synthetic': [] }, 'os': { 'natural': [], 'synthetic': [] } } shared_is_ambiguous_2 = { 'wmt': { 'natural': [], 'synthetic': [] }, 'os': { 'natural': [], 'synthetic': [] } } shared_is_natural_2 = { 'wmt': { 'natural': [], 'synthetic': [] }, 'os': { 'natural': [], 'synthetic': [] } } # Go through annotations line by line for form, keys, shared_correct_sense_pick, shared_is_ambiguous, shared_is_natural in \ [(form_1, keys_1, shared_correct_sense_pick_1, shared_is_ambiguous_1, shared_is_natural_1), (form_2, keys_2, shared_correct_sense_pick_2, shared_is_ambiguous_2, shared_is_natural_2)]: for line_id, line in enumerate(form): if line_id < 2: continue key = keys[str(line_id - 1)] domain, prv, sns_1, sns_2 = key sns_tpl = (sns_1, sns_2) sns_pick, amb_pick, nat_pick = line.split('\t')[-3:] # Assign to trackers if line_id < 1002: if sns_pick in ['BOTH', 'NONE']: pass else: correct_sense_pick[domain][prv].append( int(sns_tpl[int(sns_pick) - 1])) if amb_pick == 'UNSURE': pass else: is_ambiguous[domain][prv].append(int(amb_pick == 'NO')) is_natural[domain][prv].append(int(nat_pick)) else: # Assign to trackers shared_correct_sense_pick[domain][prv].append(sns_pick) shared_is_ambiguous[domain][prv].append(amb_pick) shared_is_natural[domain][prv].append(int(nat_pick)) # Report summary print('Correct sense picked:') all_natural = list() all_synthetic = list() for domain in correct_sense_pick.keys(): for prv in correct_sense_pick[domain]: if prv == 'natural': all_natural += correct_sense_pick[domain][prv] else: all_synthetic += correct_sense_pick[domain][prv] total = len(correct_sense_pick[domain][prv]) pos = sum(correct_sense_pick[domain][prv]) neg = total - pos print( '{:s} | {:s} : Yes {:d} ({:.3f}%) | No {:d} ({:.3f}%)'.format( domain, prv, pos, (pos / total) * 100, neg, (neg / total) * 100)) for tag, scores in [('all natural', all_natural), ('all synthetic', all_synthetic)]: total = len(scores) pos = sum(scores) neg = total - pos print('{:s} : Yes {:d} ({:.3f}%) | No {:d} ({:.3f}%)'.format( tag, pos, (pos / total) * 100, neg, (neg / total) * 100)) print('=' * 20) print('Homograph is NOT ambiguous:') all_natural = list() all_synthetic = list() for domain in is_ambiguous.keys(): for prv in is_ambiguous[domain]: if prv == 'natural': all_natural += is_ambiguous[domain][prv] else: all_synthetic += is_ambiguous[domain][prv] total = len(is_ambiguous[domain][prv]) pos = sum(is_ambiguous[domain][prv]) neg = 
total - pos print( '{:s} | {:s} : Yes {:d} ({:.3f}%) | No {:d} ({:.3f}%)'.format( domain, prv, pos, (pos / total) * 100, neg, (neg / total) * 100)) for tag, scores in [('all natural', all_natural), ('all synthetic', all_synthetic)]: total = len(scores) pos = sum(scores) neg = total - pos print('{:s} : Yes {:d} ({:.3f}%) | No {:d} ({:.3f}%)'.format( tag, pos, (pos / total) * 100, neg, (neg / total) * 100)) print('=' * 20) print('Naturalness scores:') all_natural = list() all_synthetic = list() for domain in is_natural.keys(): for prv in is_natural[domain]: if prv == 'natural': all_natural += is_natural[domain][prv] else: all_synthetic += is_natural[domain][prv] print('{:s} | {:s} : {:.3f}'.format( domain, prv, np.mean(is_natural[domain][prv]))) for tag, scores in [('all natural', all_natural), ('all synthetic', all_synthetic)]: print('{:s} : {:.3f}'.format(tag, np.mean(scores))) print('=' * 20) print('Rater agreement - Cohen\'s (weighted) kappa:') all_1 = list() all_2 = list() print('Correct sense picked:') for domain in shared_correct_sense_pick_1.keys(): for prv in shared_correct_sense_pick_1[domain]: all_1 += shared_correct_sense_pick_1[domain][prv] all_2 += shared_correct_sense_pick_2[domain][prv] ck_sns = cohen_kappa_score(all_1, all_2, labels=['1', '2', 'NONE', 'BOTH']) ck_sns = 1. if math.isnan(ck_sns) else ck_sns print(ck_sns) print('Homograph is NOT ambiguous:') all_1 = list() all_2 = list() for domain in shared_is_ambiguous_1.keys(): for prv in shared_is_ambiguous_1[domain]: all_1 += shared_is_ambiguous_1[domain][prv] all_2 += shared_is_ambiguous_2[domain][prv] ck_amb = cohen_kappa_score(all_1, all_2, labels=['YES', 'NO', 'UNSURE']) ck_amb = 1. if math.isnan(ck_amb) else ck_amb print(ck_amb) print('Naturalness scores:') all_1 = list() all_2 = list() for domain in shared_is_natural_1.keys(): for prv in shared_is_natural_1[domain]: all_1 += shared_is_natural_1[domain][prv] all_2 += shared_is_natural_2[domain][prv] ck_nat = cohen_kappa_score(all_1, all_2, labels=[1, 2, 3, 4, 5], weights='linear') ck_nat = 1. if math.isnan(ck_nat) else ck_nat print(ck_nat) print(corr(all_1, all_2, method='pearson').round(3)) print('Mean agreement: {:.3f}'.format((ck_sns + ck_amb + ck_nat) / 3))
str(math.pow(spearmanr(coef1, coef2)[0], 2))) print("full vs. scale/rate: " + str(math.pow(spearmanr(coefFull, coef0)[0], 2))) print("full vs. freq/rate: " + str(math.pow(spearmanr(coefFull, coef1)[0], 2))) print("full vs. freq/scale: " + str(math.pow(spearmanr(coefFull, coef2)[0], 2))) tabCorr.append(math.pow(spearmanr(coef0, coef1)[0], 2)) tabCorr.append(math.pow(spearmanr(coef0, coef2)[0], 2)) tabCorr.append(math.pow(spearmanr(coef1, coef2)[0], 2)) tabCorr.append(math.pow(spearmanr(coefFull, coef0)[0], 2)) tabCorr.append(math.pow(spearmanr(coefFull, coef1)[0], 2)) tabCorr.append(math.pow(spearmanr(coefFull, coef2)[0], 2)) ## pg_ = pg.corr(coef0, coef1, method='spearman') aa = pg_['CI95%'][0][0]**2 bb = pg_['CI95%'][0][1]**2 lower_ = np.min([aa, bb]) upper_ = np.max([aa, bb]) p_ = pg_['p-val'][0] r_ = pg_['r'][0] power_ = pg_['power'][0] df_ = pg_['n'][0] - 2 r_withall_temp.append(r_) p_withall_temp.append(p_) temp___ = (upper_ - lower_) ci95range_withall_temp.append(temp___) power_withall_temp.append(power_) df_withall_temp.append(df_) ##
########################################### Angle results gt_lenke_prob_dir = "../data/PredictionsVsGroundTruth/LenkeCurveTypeProbabilities_GroundTruthEndplates.csv" gt_lenke_prob_data = genfromtxt(gt_lenke_prob_dir, delimiter=',') pred_lenke_prob_dir = "../data/PredictionsVsGroundTruth/LenkeCurveTypeProbabilities.csv" pred_lenke_prob_data = genfromtxt(pred_lenke_prob_dir, delimiter=',') MAD = np.mean( abs(gt_lenke_prob_data.reshape(-1) - pred_lenke_prob_data.reshape(-1))) D = pred_lenke_prob_data - gt_lenke_prob_data MD = np.mean(D) SD = np.std(D) corr = pg.corr(pred_lenke_prob_data.reshape(-1), gt_lenke_prob_data.reshape(-1)) print(corr.to_string()) plt.figure() # sns.distplot(D[:,0], label="Proximal-thoracic") # sns.distplot(D[:,1], label="Main thoracic") # sns.distplot(D[:,2], label="Lumbar") sns.distplot(D.reshape(-1)) plt.xlabel("Difference in Probability") plt.ylabel("Density") # plt.legend() plt.title( "Difference between Predicted and Ground-truth Lenke Curve Type Probabilities" ) plt.show()
def part_corr(data=None, x=None, y=None, covar=None, x_covar=None, y_covar=None,
              tail='two-sided', method='pearson'):
    from pingouin.utils import _flatten_list
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'
    assert isinstance(x, (str, tuple)), 'x must be a string or tuple.'
    assert isinstance(y, (str, tuple)), 'y must be a string or tuple.'
    assert isinstance(covar, (str, list, type(None)))
    assert isinstance(x_covar, (str, list, type(None)))
    assert isinstance(y_covar, (str, list, type(None)))
    if covar is not None and (x_covar is not None or y_covar is not None):
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    assert x != covar, 'x and covar must be independent'
    assert y != covar, 'y and covar must be independent'
    assert x != y, 'x and y must be independent'
    # Check that columns exist
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    if isinstance(covar, str):
        covar = [covar]
    if isinstance(x_covar, str):
        x_covar = [x_covar]
    if isinstance(y_covar, str):
        y_covar = [y_covar]
    assert all([c in data for c in col]), 'columns are not in dataframe.'
    # Check that columns are numeric
    assert all([data[c].dtype.kind in 'bfiu' for c in col])
    # Drop rows with NaN
    data = data[col].dropna()
    assert data.shape[0] > 2, 'Data must have at least 3 non-NAN samples.'
    # Standardize (= no need for an intercept in least-square regression).
    # Standardizing does not work for constant columns such as dummy-coded
    # plate covariates, so only standardize columns with non-zero variance.
    for c in col:
        if data[c].std(axis=0) != 0:
            data[c] = (data[c] - data[c].mean(axis=0)) / data[c].std(axis=0)
    if covar is not None:
        # PARTIAL CORRELATION
        cvar = np.atleast_2d(data[covar].to_numpy())
        beta_x = np.linalg.lstsq(cvar, data[x].to_numpy(), rcond=None)[0]
        beta_y = np.linalg.lstsq(cvar, data[y].to_numpy(), rcond=None)[0]
        res_x = data[x].to_numpy() - cvar @ beta_x
        res_y = data[y].to_numpy() - cvar @ beta_y
    else:
        # SEMI-PARTIAL CORRELATION
        # Initialize "fake" residuals
        res_x, res_y = data[x].to_numpy(), data[y].to_numpy()
        if x_covar is not None:
            cvar = np.atleast_2d(data[x_covar].to_numpy())
            beta_x = np.linalg.lstsq(cvar, data[x].to_numpy(), rcond=None)[0]
            res_x = data[x].to_numpy() - cvar @ beta_x
        if y_covar is not None:
            cvar = np.atleast_2d(data[y_covar].to_numpy())
            beta_y = np.linalg.lstsq(cvar, data[y].to_numpy(), rcond=None)[0]
            res_y = data[y].to_numpy() - cvar @ beta_y
    return pg.corr(res_x, res_y, method=method, tail=tail)
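# Usage sketch on synthetic data (column names are made up). With a single
# `covar`, the result should closely match pingouin.partial_corr.
import numpy as np
import pandas as pd
import pingouin as pg

rng = np.random.default_rng(4)
z = rng.normal(size=100)
demo = pd.DataFrame({
    'x': z + rng.normal(scale=0.5, size=100),
    'y': z + rng.normal(scale=0.5, size=100),
    'z': z,
})
print(part_corr(data=demo, x='x', y='y', covar='z'))
print(pg.partial_corr(data=demo, x='x', y='y', covar='z'))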
lf = [ 'dayofyear', 'lograin3T', 'lograin7T', 'wet3', 'wet7', 'upwelling', 'spring_tide', 'days_since_full_moon' ] FC = hf.groupby('event').mean()['logFC'] ENT = hf.groupby('event').mean()['logENT'] FCv = hf.groupby('event').var()['logFC'] ENTv = hf.groupby('event').var()['logENT'] for l in lf: print('\n' + l) # Means print('mean (ENT/FC):') print(pg.corr(ENT, hf.groupby('event').max()[l])[['r', 'p-val']]) print(pg.corr(FC, hf.groupby('event').max()[l])[['r', 'p-val']]) # Variances print('var (ENT/FC):') print(pg.corr(ENTv, hf.groupby('event').max()[l])[['r', 'p-val']]) print(pg.corr(FCv, hf.groupby('event').max()[l])[['r', 'p-val']]) #%% 2 N = len(EV) # What will be the angle of each axis in the plot? (we divide the plot / number of variable) angles = [n / float(N) * 2 * np.pi for n in range(N)] angles += angles[:1] for b in hf.beach.unique(): ENT_corrs = list(hf[hf.beach == b].corr().loc['logENT'][EV])
# In[ ]: import pandas as pd import pingouin as pg # In[ ]: #Read in data file df = pd.read_csv('../data/responses-processed.csv') # In[ ]: #Do people who use Signal use security in choosing #instant messaging tools? #Simple row-to-row correlation pg.corr(x=df['Q3-17'], y=df['Q34-31']) # In[ ]: pg.corr(x=df['Q40-0'], y=df['Q3-16']) # In[ ]: corr = pg.pairwise_corr(df, columns=[['Q7-7'], [ 'Q3-0', 'Q3-1', 'Q3-2', 'Q3-3', 'Q3-4', 'Q3-5', 'Q3-6', 'Q3-7', 'Q3-8', 'Q3-9', 'Q3-10', 'Q3-11', 'Q3-12', 'Q3-13', 'Q3-14', 'Q3-15', 'Q3-16', 'Q3-17', 'Q3-18'
def main(subject, session, smoothed, pca_confounds, n_voxels=1000, bids_folder='/data', mask='wang15_ips'): target_dir = op.join(bids_folder, 'derivatives', 'decoded_pdfs.volume') if smoothed: target_dir += '.smoothed' if pca_confounds: target_dir += '.pca_confounds' target_dir = op.join(target_dir, f'sub-{subject}', 'func') if not op.exists(target_dir): os.makedirs(target_dir) sub = Subject(subject, bids_folder) paradigm = sub.get_behavior(sessions=session, drop_no_responses=False) paradigm['log(n1)'] = np.log(paradigm['n1']) paradigm = paradigm.droplevel(['subject', 'session']) data = get_single_trial_volume(subject, session, bids_folder=bids_folder, mask=mask, smoothed=smoothed, pca_confounds=pca_confounds).astype( np.float32) data.index = paradigm.index print(data) pdfs = [] runs = range(1, 9) for test_run in runs: test_data, test_paradigm = data.loc[test_run].copy( ), paradigm.loc[test_run].copy() train_data, train_paradigm = data.drop( test_run, level='run').copy(), paradigm.drop(test_run, level='run').copy() pars = get_prf_parameters_volume(subject, session, cross_validated=True, smoothed=smoothed, pca_confounds=pca_confounds, run=test_run, mask=mask, bids_folder=bids_folder) # pars = get_prf_parameters_volume(subject, session, cross_validated=False, mask=mask, bids_folder=bids_folder) print(pars) model = GaussianPRF(parameters=pars) pred = model.predict( paradigm=train_paradigm['log(n1)'].astype(np.float32)) r2 = get_rsq(train_data, pred) print(r2.describe()) r2_mask = r2.sort_values(ascending=False).index[:n_voxels] train_data = train_data[r2_mask] test_data = test_data[r2_mask] print(r2.loc[r2_mask]) model.apply_mask(r2_mask) model.init_pseudoWWT(stimulus_range, model.parameters) residfit = ResidualFitter(model, train_data, train_paradigm['log(n1)'].astype(np.float32)) omega, dof = residfit.fit(init_sigma2=10.0, method='t', max_n_iterations=10000) print('DOF', dof) bins = stimulus_range.astype(np.float32) pdf = model.get_stimulus_pdf(test_data, bins, model.parameters, omega=omega, dof=dof) print(pdf) E = (pdf * pdf.columns).sum(1) / pdf.sum(1) print(pd.concat((E, test_paradigm['log(n1)']), axis=1)) print(pingouin.corr(E, test_paradigm['log(n1)'])) pdfs.append(pdf) pdfs = pd.concat(pdfs) target_fn = op.join( target_dir, f'sub-{subject}_ses-{session}_mask-{mask}_nvoxels-{n_voxels}_space-{space}_pars.tsv' ) pdfs.to_csv(target_fn, sep='\t')
'rb')) # load sigmas sigmasTab.append(sigmas['sigmas'].flatten()) represenationsVariances = np.std( sigmas['representations'], axis=1) # compute variances of representations represenationsVariancesTensor = np.reshape(represenationsVariances, (128, 11, 22)) strf_scale_rateVar, strf_freq_rateVar, strf_freq_scaleVar = avgvec2strfavg( strf2avgvec(represenationsVariancesTensor) ) # compute representations variances projections sigmas_ = sigmas['sigmas'] # load sigmas sigmasTensor = np.reshape(sigmas_, (128, 11, 22)) strf_scale_rateSig, strf_freq_rateSig, strf_freq_scaleSig = avgvec2strfavg( strf2avgvec(sigmasTensor)) # compute metrics projections pg_ = pg.corr(sigmas_.flatten(), represenationsVariances.flatten()) aa = pg_['CI95%'][0][0]**2 bb = pg_['CI95%'][0][1]**2 lower = np.min([aa, bb]) upper = np.max([aa, bb]) p = pg_['p-val'][0] r = pg_['r'][0] power = pg_['power'][0] df = pg_['n'][0] - 2 # print(file+' r^2='+str(r**2)+' p='+str(p)+' lower='+str(lower)+' upper='+str(upper)+' power='+str(power)+' df='+str(df)) print( str(df) + ' ' + "%.2f" % (r**2) + ' ' + "%.3f" % p + ' [' + "%.3f" % (lower) + ';' + "%.3f" % (upper) + '] ' + "%.3f" % (power)) print()
def main(subject, session, n_voxels=250, bids_folder='/data', mask='wang15_ips'): session1 = session[:2] + '1' session2 = session[:2] + '2' pars = get_prf_parameters_volume(subject, session1, cross_validated=False, mask=mask, bids_folder=bids_folder).astype( np.float32) behavior = get_task_behavior(subject, session2, bids_folder) data = get_single_trial_volume(subject, session2, bids_folder=bids_folder, mask=mask).astype(np.float32) print(data) paradigm = behavior[['log(n1)']].astype(np.float32) paradigm.index = data.index print(paradigm) pdfs = [] runs = range(1, 9) for test_run in runs: test_data, test_paradigm = data.xs( test_run, level='run').copy(), paradigm.xs(test_run, level='run').copy() train_data, train_paradigm = data.drop( test_run, level='run').copy(), paradigm.drop(test_run, level='run').copy() model = GaussianPRF(parameters=pars, paradigm=train_paradigm) parfitter = ParameterFitter(model, train_data, train_paradigm) new_pars = parfitter.refine_baseline_and_amplitude(pars) new_pars = parfitter.fit(init_pars=new_pars, fixed_pars=['mu', 'sd']) print(new_pars) model.parameters = new_pars.astype(np.float32) pred = model.predict() r2 = get_rsq(train_data, pred) print(r2.describe()) r2_mask = r2.sort_values(ascending=False).index[:n_voxels] train_data = train_data[r2_mask] test_data = test_data[r2_mask] print(r2.loc[r2_mask]) model.apply_mask(r2_mask) model.init_pseudoWWT(stimulus_range, model.parameters) residfit = ResidualFitter(model, train_data, train_paradigm['log(n1)'].astype(np.float32)) omega, dof = residfit.fit(init_sigma2=10.0, method='t', max_n_iterations=10000) print('DOF', dof) bins = np.linspace(np.log(5), np.log(80), 150, endpoint=True).astype(np.float32) pdf = model.get_stimulus_pdf(test_data, bins, model.parameters, omega=omega, dof=dof) print(pdf) E = (pdf * pdf.columns).sum(1) / pdf.sum(1) print(pd.concat((E, test_paradigm['log(n1)']), axis=1)) print(pingouin.corr(E, test_paradigm['log(n1)'])) pdfs.append(pdf) pdfs = pd.concat(pdfs) target_dir = op.join(bids_folder, 'derivatives', 'decoded_pdfs.volume.across_session') target_dir = op.join(target_dir, f'sub-{subject}', 'func') if not op.exists(target_dir): os.makedirs(target_dir) target_fn = op.join( target_dir, f'sub-{subject}_ses-{session2}_mask-{mask}_nvoxels-{n_voxels}_space-T1w_pars.tsv' ) pdfs.to_csv(target_fn, sep='\t')
def main(subject, session, smoothed, n_verts=100, bids_folder='/data', mask='wang15_ips'): target_dir = op.join(bids_folder, 'derivatives', 'decoded_pdfs') if smoothed: target_dir += '.smoothed' target_dir = op.join(target_dir, f'sub-{subject}', 'func') if not op.exists(target_dir): os.makedirs(target_dir) paradigm = [ pd.read_csv(op.join( bids_folder, f'sub-{subject}', f'ses-{session}', 'func', f'sub-{subject}_ses-{session}_task-task_run-{run}_events.tsv'), sep='\t') for run in range(1, 9) ] paradigm = pd.concat(paradigm, keys=range(1, 9), names=['run']).droplevel(1) paradigm = paradigm[paradigm.trial_type == 'stimulus 1'].set_index( 'trial_nr', append=True) paradigm['log(n1)'] = np.log(paradigm['n1']) print(paradigm) data = get_single_trial_surf_data(subject, session, bids_folder, mask=mask, smoothed=smoothed, space=space) data.index = paradigm.index # np.random.seed(666) # resample_mask = np.random.choice(data.columns, n_verts) # data = data[resample_mask].astype(np.float32) pdfs = [] runs = range(1, 9) for test_run in runs: test_data, test_paradigm = data.loc[test_run].copy( ), paradigm.loc[test_run].copy() train_data, train_paradigm = data.drop( test_run, level='run').copy(), paradigm.drop(test_run, level='run').copy() pars = get_prf_parameters(subject, session, run=test_run, mask=mask, bids_folder=bids_folder, smoothed=smoothed, space=space) # pars = pars.loc[resample_mask] model = GaussianPRF(parameters=pars) pred = model.predict( paradigm=train_paradigm['log(n1)'].astype(np.float32)) r2 = get_rsq(train_data, pred) print(r2.describe()) print(r2.sort_values(ascending=False)) r2_mask = r2.sort_values(ascending=False).index[:n_verts] model.apply_mask(r2_mask) train_data = train_data[r2_mask].astype(np.float32) test_data = test_data[r2_mask].astype(np.float32) print(model.parameters) print(train_data) model.init_pseudoWWT(stimulus_range, model.parameters) residfit = ResidualFitter(model, train_data, train_paradigm['log(n1)'].astype(np.float32)) omega, dof = residfit.fit(init_sigma2=10.0, method='t', max_n_iterations=10000) print('DOF', dof) bins = np.linspace(np.log(5), np.log(80), 150, endpoint=True).astype(np.float32) pdf = model.get_stimulus_pdf(test_data, bins, model.parameters, omega=omega, dof=dof) print(pdf) E = (pdf * pdf.columns).sum(1) / pdf.sum(1) print(pd.concat((E, test_paradigm['log(n1)']), axis=1)) print(pingouin.corr(E, test_paradigm['log(n1)'])) pdfs.append(pdf) pdfs = pd.concat(pdfs) target_fn = op.join( target_dir, f'sub-{subject}_ses-{session}_mask-{mask}_nverts-{n_verts}_space-{space}_pars.tsv' ) pdfs.to_csv(target_fn, sep='\t')