def date_perspective(speed_df, red_light_df, traffic_crash_df):
    """
    Method to calculate and display correlation between redlight violations, speed violations and total violation against crashes with
    respect to date.
    :param speed_df: dataframe consisting of speed violation data
    :param red_light_df: dataframe consisting of red light violation data
    :param traffic_crash_df: dataframe consisting of traffic crash data
    :return: None
    """

    date_red_light_frame = red_light_df[['VIOLATION DATE', 'VIOLATIONS']]
    date_speed_frame = speed_df[['VIOLATION DATE', 'VIOLATIONS']]
    date_traffic_crash = traffic_crash_df[["Date"]].copy()  # copy so the count column can be added safely
    date_red_light_frame = date_red_light_frame.groupby(
        'VIOLATION DATE', sort=False, as_index=False)['VIOLATIONS'].sum()
    date_speed_frame = date_speed_frame.groupby(
        'VIOLATION DATE', sort=False, as_index=False)['VIOLATIONS'].sum()
    date_traffic_crash['count'] = date_traffic_crash.groupby(
        'Date')['Date'].transform('count')
    date_traffic_crash.rename(columns={'count': 'Crashes'}, inplace=True)
    date_speed_frame.rename(columns={
        'VIOLATION DATE': 'Date',
        'VIOLATIONS': 'Speed_Limit_Violations'
    },
                            inplace=True)
    date_red_light_frame.rename(columns={
        'VIOLATION DATE': 'Date',
        'VIOLATIONS': 'Red_Light_Violations'
    },
                                inplace=True)

    res = pd.merge(date_traffic_crash,
                   date_speed_frame,
                   how='left',
                   on='Date',
                   sort=True).drop_duplicates()
    res = pd.merge(res, date_red_light_frame, how='left', on='Date',
                   sort=True).drop_duplicates()
    res.fillna(0, inplace=True)
    res['Total_Violations'] = res['Red_Light_Violations'] + res[
        'Speed_Limit_Violations']
    cor1 = pg.corr(x=res['Red_Light_Violations'], y=res['Crashes'])
    cor2 = pg.corr(x=res['Speed_Limit_Violations'], y=res['Crashes'])
    cor3 = pg.corr(x=res['Total_Violations'], y=res['Crashes'])

    f1 = visualize(res["Red_Light_Violations"], res["Crashes"],
                   "Red Light Violations", "Crashes", cor1, 'blue')
    f2 = visualize(res["Speed_Limit_Violations"], res["Crashes"],
                   "Speed-limit Violations", "Crashes", cor2, 'blue')
    f3 = visualize(res["Total_Violations"], res["Crashes"], "Total Violations",
                   "Crashes", cor3, 'blue')

    return f1, f2, f3
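# A minimal, hedged sketch of the same date-level aggregation and pg.corr call as in
# date_perspective() above, on tiny made-up frames. Only the column names mirror the
# function; the dates and counts are invented purely for illustration.
import pandas as pd
import pingouin as pg

speed = pd.DataFrame({
    'VIOLATION DATE': ['2020-01-01', '2020-01-02', '2020-01-03',
                       '2020-01-04', '2020-01-05', '2020-01-06'],
    'VIOLATIONS': [12, 7, 20, 15, 9, 18],
})
crashes = pd.DataFrame({
    'Date': ['2020-01-01', '2020-01-01', '2020-01-02', '2020-01-03',
             '2020-01-04', '2020-01-05', '2020-01-05', '2020-01-06'],
})

speed_by_date = (speed.groupby('VIOLATION DATE', as_index=False)['VIOLATIONS'].sum()
                 .rename(columns={'VIOLATION DATE': 'Date',
                                  'VIOLATIONS': 'Speed_Limit_Violations'}))
crash_by_date = crashes.groupby('Date').size().reset_index(name='Crashes')

merged = pd.merge(crash_by_date, speed_by_date, how='left', on='Date').fillna(0)
print(pg.corr(x=merged['Speed_Limit_Violations'], y=merged['Crashes']))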
Example #2
def create_correlation_to_target_value(df_corr,
                                       feature_column_list,
                                       target_column='react_per_100_audience'):
    """
    Creates a correlation of each numerical feature against the target value
    input:
            df_corr: the original dataframe used to build the correlations
            feature_column_list: features to be included in the correlation analysis
            target_column: the column that stores the target value
    output:
            a table of different correlation metrics for the specified features & target value
    """

    if target_column in feature_column_list:
        feature_column_list.remove(target_column)
    list_of_dfs = []
    for i in feature_column_list:
        try:
            df_pg = pg.corr(x=df_corr[i], y=df_corr[target_column])
            df_pg.index = [i]
            list_of_dfs.append(df_pg)
        except Exception as e:
            print('correlation did not work for {}: {}'.format(i, e))
    df_complete_corr = pd.concat(list_of_dfs, ignore_index=False)
    df_complete_corr = df_complete_corr[[
        'n', 'r', 'CI95%', 'p-val', 'BF10'
    ]].rename(
        columns={
            'n': '# sample',
            'r': 'correlation coefficient',
            'CI95%': '95% confidence interval',
            'p-val': 'p value',
            'BF10': 'Bayes Factor'
        })
    return df_complete_corr
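# Hedged usage sketch for the helper above. The feature columns 'likes' and 'shares'
# are invented; only 'react_per_100_audience' matches the default target_column.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
demo = pd.DataFrame({
    'likes': rng.poisson(20, 200).astype(float),
    'shares': rng.poisson(5, 200).astype(float),
    'react_per_100_audience': rng.normal(3.0, 1.0, 200),
})
# One row per feature, with the renamed pg.corr columns
# (# sample, correlation coefficient, 95% confidence interval, p value, Bayes Factor).
print(create_correlation_to_target_value(demo, ['likes', 'shares']))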
def cross_correlation(data_as_array):
    correlations = []
    r_temp = []
    p_temp = []
    ci95range_temp = []
    power_temp = []
    df_temp = []
    for ki1 in range(len(data_as_array)):
        for ki2 in range(len(data_as_array)):
            if ki2 > ki1:
                correlations.append(
                    pearsonr(data_as_array[ki1], data_as_array[ki2])[0]**2)
                pg_ = pg.corr(data_as_array[ki1], data_as_array[ki2])
                aa = pg_['CI95%'][0][0]**2
                bb = pg_['CI95%'][0][1]**2
                lower_ = np.min([aa, bb])
                upper_ = np.max([aa, bb])
                p_ = pg_['p-val'][0]
                r_ = pg_['r'][0]
                power_ = pg_['power'][0]
                df_ = pg_['n'][0] - 2
                r_temp.append(r_)
                p_temp.append(p_)
                temp___ = (upper_ - lower_)
                ci95range_temp.append(temp___)
                power_temp.append(power_)
                df_temp.append(df_)
    return correlations, r_temp, p_temp, ci95range_temp, power_temp, df_temp
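# Hedged usage sketch: cross_correlation() above appears to expect an array-like of
# equally long 1-D signals (rows = variables, columns = observations). The data below
# are random and purely illustrative.
import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(size=(4, 60))  # 4 signals, 60 samples each -> 6 unique pairs
r2s, rs, ps, ci_ranges, powers, dfs = cross_correlation(data)
print(len(r2s), rs[0], ps[0])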
Example #4
def preprocess_pandas(df):
    scaler = MinMaxScaler()
    df[["income", "insured", "pm25"]] = scaler.fit_transform((df[["income", "insured", "pm25"]]))
    covariance = cov(df['income'], df['pm25'])
    pear_cov, _ = pearsonr(df['income'], df['pm25'])
    spearmans_cov = spearmanr(df['income'], df['pm25'])
    print(pg.corr(x=df['income'], y=-df['pm25']))
def corr(input_data, sigmas, dissimilarityMatrix):
    ndims, ninstrus = input_data.shape[0], input_data.shape[1]
    idx_triu = np.triu_indices(dissimilarityMatrix.shape[0], k=1)
    target_v = dissimilarityMatrix[idx_triu]
    mean_target = np.mean(target_v)
    std_target = np.std(target_v)
    no_samples = ninstrus * (ninstrus - 1) / 2
    kernel = np.zeros((ninstrus, ninstrus))
    idx = [i for i in range(len(input_data))]
    sigmas = np.clip(sigmas, a_min=1.0, a_max=1e15)
    for i in range(ninstrus):
        for j in range(i + 1, ninstrus):
            kernel[i, j] = -np.sum(
                np.power(
                    np.divide(input_data[idx, i] - input_data[idx, j],
                              (sigmas[idx] + np.finfo(float).eps)), 2))
    kernel_v = kernel[idx_triu]
    mean_kernel = np.mean(kernel_v)
    std_kernel = np.std(kernel_v)
    Jn = np.sum(np.multiply(kernel_v - mean_kernel, target_v - mean_target))
    Jd = no_samples * std_target * std_kernel
    # corr_, p___ = pearsonr(np.asarray(target_v),np.asarray(kernel_v))
    pg_ = pg.corr(np.asarray(target_v), np.asarray(kernel_v))
    aa = pg_['CI95%'][0][0]**2
    bb = pg_['CI95%'][0][1]**2
    lower = np.min([aa, bb])
    upper = np.max([aa, bb])
    p = pg_['p-val'][0]
    r = pg_['r'][0]
    power = pg_['power'][0]
    df = pg_['n'][0] - 2
    return r, p, lower, upper, power, df
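# Hedged usage sketch for corr() above. Assumed shapes: input_data is (ndims, ninstrus),
# sigmas is (ndims,), and dissimilarityMatrix is a symmetric (ninstrus, ninstrus) matrix;
# every value below is a random placeholder.
import numpy as np

rng = np.random.default_rng(1)
ndims, ninstrus = 8, 6
input_data = rng.normal(size=(ndims, ninstrus))
sigmas = np.ones(ndims)
dissim = rng.random((ninstrus, ninstrus))
dissim = (dissim + dissim.T) / 2.0   # make it symmetric
np.fill_diagonal(dissim, 0.0)

r, p, lower, upper, power, dof = corr(input_data, sigmas, dissim)
print(r, p, dof)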
def location_perspective(speed_df, red_light_df, traffic_crash_df):
    """
    Method to display correlation between red light violations, speed violations and total violation against crashes with
    respect to location.
    :param speed_df: dataframe consisting of speed violation data
    :param red_light_df: dataframe consisting of red light violation data
    :param traffic_crash_df: dataframe consisting of traffic crash data
    :return: None
    """
    speed_frame_sample = speed_df[['STREET_NAME', 'VIOLATIONS'
                                   ]].groupby('STREET_NAME',
                                              as_index=False).sum()
    red_light_frame_sample = red_light_df[['STREET_NAME', 'VIOLATIONS'
                                           ]].groupby('STREET_NAME',
                                                      as_index=False).sum()
    traffic_frame_sample = traffic_crash_df[['STREET_NAME', 'Date']].groupby(
        'STREET_NAME', as_index=False).count()

    red_light_frame_sample.columns = ["STREET_NAME", "REDLIGHT_VIOLATIONS"]
    speed_frame_sample.columns = ["STREET_NAME", "SPEED_VIOLATIONS"]
    res = pd.merge(traffic_frame_sample,
                   speed_frame_sample,
                   how='left',
                   on='STREET_NAME')
    res = pd.merge(res, red_light_frame_sample, how='left', on='STREET_NAME')
    # merge order is crash counts, then speed violations, then red light violations
    res.columns = [
        "STREET_NAME", "Crashes", "SPEED_VIOLATIONS", "REDLIGHT_VIOLATIONS"
    ]
    res['REDLIGHT_VIOLATIONS'].fillna(0, inplace=True)
    res['SPEED_VIOLATIONS'].fillna(0, inplace=True)
    res['Crashes'].fillna(0, inplace=True)
    res["Total_violations"] = res["REDLIGHT_VIOLATIONS"] + res[
        "SPEED_VIOLATIONS"]

    cor1 = pg.corr(x=res['REDLIGHT_VIOLATIONS'], y=res['Crashes'])
    cor2 = pg.corr(x=res['SPEED_VIOLATIONS'], y=res['Crashes'])
    cor3 = pg.corr(x=res['Total_violations'], y=res['Crashes'])

    f1 = visualize(res["REDLIGHT_VIOLATIONS"], res["Crashes"],
                   "Red Light Violations", "Crashes", cor1, 'red')
    f2 = visualize(res["SPEED_VIOLATIONS"], res["Crashes"],
                   "Speed Camera Violations", "Crashes", cor2, 'orange')
    f3 = visualize(res["SPEED_VIOLATIONS"], res["Crashes"], "Total Violations",
                   "Crashes", cor3, 'lightblue')

    return f1, f2, f3
Example #7
def getstats(aggDict):
    
    animals = list(aggDict['0NP']['sub_id'])
    cols = list(aggDict['0NP'].columns.values)
    phaseCols = cols[3:19] # Getting phase names

    sequences = ['0NP', '1NP', '(0,1,~) D', '(0,-1,~)']
    
    # Loading results of manual analysis
    mxl = pd.read_excel('Impulsivity strategies.xlsx', sheet_name=sequences, index_col=0, nrows=25,
                        usecols=np.arange(0, 19), keep_default_na=False)

    # Auto. vs Man. correlation
    corrDict = {}
    meanCorrs = {} # Just the mean

    for kind, kval in enumerate(sequences):
        corrDict[kval] = {}
        meanCorrs[kval] = 0
        df = aggDict[kval].set_index('sub_id')
        for animal in animals:
            corRes = pg.corr(mxl[kval].loc[animal, '1L':'8D'].astype('float64'),
                             df.loc[animal, '1L':'8D'].astype('float64'), method='pearson')
            corrDict[kval][animal] = corRes.loc['pearson']
            meanCorrs[kval] += corRes.loc['pearson']['r'] / len(animals)

    # For export
    compDict = {}
    for ind, sheetname in enumerate(sequences):
        compDict[sheetname] = pd.DataFrame.from_dict(corrDict[sheetname]).T

    # Between subjects pairwise t-tests
    betDict = {'treat': {}, 'phen': {}}
    for bet in betDict.keys():
        for key in sequences:
            
            # Converting to long format
            df = aggDict[key].loc[:, :'8D'].melt(id_vars=['sub_id', 'treat', 'phen'], value_vars=phaseCols,
                                                 var_name='phase', value_name='dv')
            df['dv'] = pd.to_numeric(df['dv'])
            df = pg.pairwise_ttests(data=df, subject='sub_id', dv='dv', within='phase',
                                    between=[bet], return_desc=True).round(6)
            betDict[bet][key] = df
            
    # Wilcoxon for impulsive sequence, for phases 6D:7L
    df = aggDict['(0,-1,~)'].loc[:, :'8D']
    phenotype = {'epi': df[df.phen == 'epi'], 'non': df[df.phen == 'non']}
    for phenkey, phenval in phenotype.items():
        w, p = stats.wilcoxon(phenval.loc[:, '6D'], phenval.loc[:, '7L'], mode='approx')
        phenotype[phenkey] = pd.DataFrame.from_dict({'w': [w], 'p': [p]})

    # Descriptive stats for self-control
    cDrink = aggDict['(0,1,~) D'].loc[:, :'8D']
    cStats = [(cDrink.mean(axis=0), cDrink.sem(axis=0))]

    return corrDict, meanCorrs, compDict, betDict, phenotype, cStats
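# Hedged sketch of the pairwise t-test step used in getstats() above, on invented
# long-format data (subject column 'sub_id', within-factor 'phase', between-factor
# 'treat', dependent variable 'dv'); none of the values come from the real study.
import numpy as np
import pandas as pd
import pingouin as pg

rng = np.random.default_rng(2)
subjects = ['s{}'.format(i) for i in range(8)]
phases = ['1L', '1D', '2L', '2D']
long_df = pd.DataFrame({
    'sub_id': np.repeat(subjects, len(phases)),
    'phase': phases * len(subjects),
    'treat': np.repeat(['A'] * 4 + ['B'] * 4, len(phases)),
    'dv': rng.normal(size=len(subjects) * len(phases)),
})
mixed = pg.pairwise_ttests(data=long_df, dv='dv', subject='sub_id',
                           within='phase', between='treat',
                           return_desc=True).round(6)
print(mixed.head())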
Example #8
def get_correlation(df_st,
                    df_sc,
                    pval_cut=0.05,
                    log2fc_cut=1.,
                    method='pearson'):
    """Calculate Pearson Correlation

    Parameters
    ----------
    df_st : pandas.Dataframe
        Data frame containing the results of the dge analysis
    df_sc : pandas.Dataframe
        Data frame containing the results of the dge analysis
    pval_cut : float
        p-value cut parameter
    log2fc_cut : float
        effect size cut parameter
    method : str
        Correlation method to use

    Returns
    -------
    pandas.DataFrame
        One-row correlation table returned by pingouin.corr

    """
    # 1. get up-regulated genes in cyto+ group in both data sets
    m_sig_st = (df_st['pval'].values <= pval_cut) & (abs(
        df_st['log2fc'].values) >= log2fc_cut)
    m_sig_sc = (df_sc['pval'].values <= pval_cut) & (abs(
        df_sc['log2fc'].values) >= log2fc_cut)
    mask_siggenes = np.logical_and(m_sig_st, m_sig_sc)

    # 2. Calculate Pearson Correlation and p-value
    sig_r = pingouin.corr(x=df_st['signed_pval'][mask_siggenes],
                          y=df_sc['signed_pval'][mask_siggenes],
                          method=method)
    print(sig_r['p-val'].values[0])
    print(sig_r['r'])

    return sig_r
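# Hedged usage sketch for get_correlation() above. The two frames are synthetic
# stand-ins for DGE result tables; they only carry the 'pval', 'log2fc' and
# 'signed_pval' columns that the function reads.
import numpy as np
import pandas as pd

rng = np.random.default_rng(3)
n_genes = 200
log2fc = rng.normal(0.0, 2.0, n_genes)
pval = rng.uniform(0.001, 0.04, n_genes)           # all below the default p-value cut
df_st = pd.DataFrame({'pval': pval, 'log2fc': log2fc,
                      'signed_pval': np.sign(log2fc) * -np.log10(pval)})
df_sc = df_st.copy()
df_sc['signed_pval'] += rng.normal(0.0, 0.5, n_genes)  # a noisier second data set

result = get_correlation(df_st, df_sc)   # prints p-value and r, returns the pg.corr table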
Example #9
    font_scale = expand.slider("Font scale", 0.0, 4.0, 1.0, 0.1)

    start_x = float(df[x_var1].max() / 10)
    start_y = float(df[x_var2].max() - df[x_var2].max() / 10)
    max_x = float(df[x_var1].max())
    max_y = float(df[x_var2].max())
    x_pos = expand.slider("X position for the label", 0.0, max_x, start_x,
                          (max_x / 100 + 0.1))
    y_pos = expand.slider("Y position for the label", 0.0, max_y, start_y,
                          (max_y / 100 + 0.1))

    sns.set(style='white', font_scale=font_scale)

    st.success("Correlation results")

    corr_result = pg.corr(x=df[x_var1], y=df[x_var2], method=method_selected)
    st.write(corr_result)

    st.success("Correlation matrices")

    st.write(
        pg.pairwise_corr(df, padjust='bonf',
                         method=method_selected).sort_values(by=['p-unc']))

    st.write(df.rcorr(padjust='bonf'))

    st.success("Correlation plot with distributions is being generated")
    fig = plt.figure(figsize=(12, 6))
    g = sns.JointGrid(data=df, x=x_var1, y=x_var2, height=6)
    g = g.plot_joint(sns.regplot, color="xkcd:muted blue")
    g = g.plot_marginals(sns.distplot,
Example #10
def correlation_plot(df_counts_cytoresps, genes_dict, save_folder):
    """
    Plot correlation of counts distribution for each cytokine over all samples

    :param df_counts_cytoresps: dataframe with cytokine counts, responder counts and a 'disease' column
    :param genes_dict: dict mapping each cytokine to its responder genes
    :param save_folder: folder in which the figures are saved
    :return: None
    """

    # df_counts_cytoresps = df_counts_cytoresps.replace({'0': np.nan, 0: np.nan})
    df_counts_cytoresps = df_counts_cytoresps.replace({np.nan: 0})

    for cyto in genes_dict.keys():
        for cyto_reps in genes_dict.keys():
            resp_name = "_".join([cyto_reps, 'responder'])
            temp_df = df_counts_cytoresps[[cyto, resp_name, 'disease']]

            # stats: the hypothesis here is that the counts of a cytokine are correlated with the counts of its responders
            sig_r = pingouin.corr(x=temp_df[resp_name],
                                  y=temp_df[cyto],
                                  method='pearson')

            # 2. Plot correlation
            fig, ax = plt.subplots(figsize=fig_size)
            ax.grid(False)
            sns.regplot(data=temp_df,
                        x=resp_name,
                        y=cyto,
                        ax=ax,
                        scatter=False,
                        color="black",
                        label=None)
            ax.scatter(data=temp_df, x=resp_name, y=cyto, c='k')

            # Axis params
            ax.set_xlabel(" ".join(["Responder Counts"]), fontsize=xy_fontsize)
            if cyto == 'IFNG':
                ax.set_ylabel(r'IFN-$\gamma$ Counts', fontsize=xy_fontsize)
            else:
                ax.set_ylabel(" ".join([cyto, 'Counts']), fontsize=xy_fontsize)

            if temp_df.max()[0] < 10:
                ax.set_yticks(np.arange(0, temp_df.max()[0] + 1, 1))
                # Add text: Correlation value and p-value
                ax.text(temp_df.max()[1] / 2 - temp_df.max()[1] / 10,
                        temp_df.max()[0],
                        'r = {:.2f}; p = {:.2e}'.format(
                            sig_r['r'].values[0], sig_r['p-val'].values[0]),
                        fontstyle='italic',
                        fontsize=text_fontsize)
            else:
                ax.set_yticks(np.arange(0, temp_df.max()[0] + 2, 2))
                # Add text: Correlation value and p-value
                ax.text(temp_df.max()[1] / 2 - temp_df.max()[1] / 10,
                        temp_df.max()[0] + 1,
                        'r = {:.2f}; p = {:.2e}'.format(
                            sig_r['r'].values[0], sig_r['p-val'].values[0]),
                        fontstyle='italic',
                        fontsize=text_fontsize)
            ax.set_xlim([-0.5, temp_df.max()[1] + temp_df.max()[1] / 20])
            ax.set_ylim([-0.5, temp_df.max()[0] + temp_df.max()[0] / 20])

            plt.tight_layout()
            # remove upper and right edge lines in plot
            sns.despine(ax=ax)

            # 3. Save figure
            fig.savefig(
                os.path.join(save_folder,
                             "_".join(['Fig4A', cyto, resp_name, fileformat])))
            plt.close()
Example #11
########################################### Endplate results
gt_slopes_dir = "../data/PredictionsVsGroundTruth/EndplateSlopes_GroundTruthEndplates.csv"
gt_slopes_data = genfromtxt(gt_slopes_dir, delimiter=',')

pred_slopes_dir = "../data/PredictionsVsGroundTruth/EndplateSlopes.csv"
pred_slopes_data = genfromtxt(pred_slopes_dir, delimiter=',')

slopesDiff = pred_slopes_data - gt_slopes_data
slopesAbsDiff = abs(pred_slopes_data - gt_slopes_data)

SD = np.std(slopesDiff)

slopesDiffMean = np.mean(slopesDiff)
slopesAbsDiffMean = np.mean(slopesAbsDiff)
slopesCorr = pg.corr(pred_slopes_data.reshape(-1), gt_slopes_data.reshape(-1))

plt.figure()
sns.distplot(slopesDiff.reshape(-1))
plt.xlabel("Difference in Endplate Slope (Degrees)")
plt.ylabel("Density")
plt.title("Difference between Predicted and Ground-truth Endplate Slopes")
plt.show()

plt.figure()
sns.scatterplot(x=gt_slopes_data.reshape(-1), y=pred_slopes_data.reshape(-1))
plt.xlabel("Ground-truth Endplate Slope (Degrees)")
plt.ylabel("Predicted Endplate Slope (Degrees)")
plt.title("Ground-truth vs. Predicted Endplate Slopes")
plt.show()
AD2 = abs(gt_angle_data[:, 1] - pred_angle_data[:, 1])
MAD2 = np.mean(AD2)

AD3 = abs(gt_angle_data[:, 2] - pred_angle_data[:, 2])
MAD3 = np.mean(AD3)

########## This is shorter
MAD = np.mean(abs(gt_angle_data.reshape(-1) - pred_angle_data.reshape(-1)))

D = pred_angle_data - gt_angle_data
MD = np.mean(D)

SD = np.std(D)

corr = pg.corr(pred_angle_data.reshape(-1), gt_angle_data.reshape(-1))
print(corr.to_string())

plt.figure()
# sns.distplot(D[:,0], label="Proximal-thoracic")
# sns.distplot(D[:,1], label="Main thoracic")
# sns.distplot(D[:,2], label="Lumbar")
sns.distplot(D.reshape(-1))
plt.xlabel("Difference in Cobb Angle (Degrees)")
plt.ylabel("Density")
# plt.legend()
plt.title("Difference between Predicted and Ground-truth Cobb Angles")
plt.show()

########## Shapiro-Wilk test
ShapiroWilk = pg.normality(data=D.reshape(-1))
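# For reference, a small self-contained sketch of how the fields of the pg.corr result
# printed above can be accessed individually (synthetic data; 'pearson' is the index
# pingouin uses for the default method).
import numpy as np
import pingouin as pg

rng = np.random.default_rng(7)
gt = rng.normal(10.0, 3.0, 50)
pred = gt + rng.normal(0.0, 1.0, 50)      # predictions = ground truth + noise

res = pg.corr(pred, gt)                   # one-row DataFrame
r = res.loc['pearson', 'r']
ci_low, ci_high = res.loc['pearson', 'CI95%']
p = res.loc['pearson', 'p-val']
print("r = %.3f, 95%% CI = [%.3f, %.3f], p = %.2e" % (r, ci_low, ci_high, p))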
Example #13
print('Mutual Median pairwise correlations')
print('Full STRF')
p_tab = []
power_tab = []
CI95range_tab = []
for iDataset in range(nbDatasets):
    for jDataset in range(iDataset + 1, nbDatasets):
        strfTensorI = np.reshape(sigmasTab[iDataset], (128, 11, 22))
        strf_scale_rateI, strf_freq_rateI, strf_freq_scaleI = avgvec2strfavg(
            strf2avgvec(strfTensorI))
        strfTensorJ = np.reshape(sigmasTab[jDataset], (128, 11, 22))
        strf_scale_rateJ, strf_freq_rateJ, strf_freq_scaleJ = avgvec2strfavg(
            strf2avgvec(strfTensorJ))
        # pairwisePearsonMatrixFullTensor[iDataset][jDataset]  = spearmanr(strfTensorI.flatten(),strfTensorJ.flatten())[0]**2
        pg_ = pg.corr(strfTensorI.flatten(),
                      strfTensorJ.flatten(),
                      method="spearman")
        aa = pg_['CI95%'][0][0]**2
        bb = pg_['CI95%'][0][1]**2
        lower = np.min([aa, bb])
        upper = np.max([aa, bb])
        p = pg_['p-val'][0]
        r = pg_['r'][0]
        power = pg_['power'][0]
        df = pg_['n'][0] - 2
        # print(str(iDataset)+' '+str(jDataset)+' '+str(df)+' '+"%.2f" %(r**2)+' '+"%.3f" %p+' ['+"%.3f" %(lower)+';'+"%.3f" %(upper)+'] '+"%.3f" %(power)+' '+"%.3f" %(upper-lower))
        p_tab.append(p)
        power_tab.append(power)
        CI95range_tab.append(upper - lower)
print('Full Statistics Ranges')
print('p_median=' + "%.2f" % (np.median(p_tab)) + ' p_min=' + "%.2f" %
      (np.min(p_tab)))
def get_crossval_sigmas_from_folder(timbre_spaces=[
    'Barthet2010', 'Grey1977', 'Grey1978', 'Iverson1993_Onset',
    'Iverson1993_Remainder', 'Iverson1993_Whole', 'McAdams1995',
    'Patil2012_A3', 'Patil2012_DX4', 'Patil2012_GD4', 'Lakatos2000_Harm',
    'Lakatos2000_Comb', 'Lakatos2000_Perc', 'Siedenburg2016_e2set1',
    'Siedenburg2016_e2set2', 'Siedenburg2016_e2set3', 'Siedenburg2016_e3'
],
                                    representation=['auditory_strf'],
                                    folder='results_light',
                                    averaging='avg_time_avg_freq',
                                    early_stopping=True,
                                    folder_old='all'):
    sigmas = []
    correlation_testing = []
    correlation_training = []
    correlation_with_all = []
    cross_corr = []
    for tsp in timbre_spaces:
        for root, dirs, files in os.walk(os.path.join(folder, tsp.lower())):
            for f in files:
                if averaging + '_' + tsp.lower() in f:
                    results = pickle.load(open(os.path.join(root, f), 'rb'))
                    # print(files)
                    metricOnAll = pickle.load(
                        open(os.path.join(root, 'all.pkl'), 'rb'))
                    metricOnAll = metricOnAll['sigmas'].flatten()

                    sigmas_ = [
                        results['sigmas'][fold]
                        for fold in range(len(results['correlations']))
                    ]
                    sigmas_ = np.array(sigmas_)
                    corr_ = [
                        pearsonr(results['sigmas'][fold].flatten(),
                                 metricOnAll)[0]
                        for fold in range(len(results['correlations']))
                    ]
                    r_withall_temp = []
                    p_withall_temp = []
                    ci95range_withall_temp = []
                    df_withall_temp = []
                    power_withall_temp = []

                    for fold in range(len(results['correlations'])):
                        pearsonr(results['sigmas'][fold].flatten(),
                                 metricOnAll)[0]
                        pg_ = pg.corr(results['sigmas'][fold].flatten(),
                                      metricOnAll)
                        aa = pg_['CI95%'][0][0]**2
                        bb = pg_['CI95%'][0][1]**2
                        lower_ = np.min([aa, bb])
                        upper_ = np.max([aa, bb])
                        p_ = pg_['p-val'][0]
                        r_ = pg_['r'][0]
                        power_ = pg_['power'][0]
                        df_ = pg_['n'][0] - 2
                        r_withall_temp.append(r_)
                        p_withall_temp.append(p_)
                        temp___ = (upper_ - lower_)
                        ci95range_withall_temp.append(temp___)
                        power_withall_temp.append(power_)
                        df_withall_temp.append(df_)

                    file_ = 'resultsOptims_strf' + f.split('results')[1].split(
                        '.pkl')[0] + '.pkl'
                    # resultsOptims_strf_avg_time_Grey1978_F311.pkl

                    sigmas__ = pickle.load(
                        open(os.path.join('./out_aud_STRF_crossval/', file_),
                             'rb'))
                    r_training_temp = []
                    p_training_temp = []
                    ci95range_training_temp = []
                    power_training_temp = []
                    df_training_temp = []

                    r_testing_temp = []
                    p_testing_temp = []
                    ci95range_testing_temp = []
                    power_testing_temp = []
                    df_testing_temp = []

                    pg_training = []
                    pg_testing = []

                    for fold in range(len(results['correlations'])):
                        # training data
                        train_idx = [
                            i for i in range(
                                sigmas__['representations'].shape[1])
                            if i != fold
                        ]
                        input_data_training = sigmas__[
                            'representations'][:, train_idx]
                        target_data_training = sigmas__['dissimilarities'][
                            train_idx, :]
                        target_data_training = target_data_training[:,
                                                                    train_idx]
                        r_, p_, lower_, upper_, power_, df_ = corr(
                            input_data_training, results['sigmas'][fold],
                            target_data_training)
                        r_training_temp.append(r_)
                        p_training_temp.append(p_)
                        temp___ = (upper_ - lower_)
                        ci95range_training_temp.append(temp___)
                        power_training_temp.append(power_)
                        df_training_temp.append(df_)

                        # pg_training.append(corr(input_data_training,results['sigmas'][fold],target_data_training))

                        # testing data
                        test_idx = [fold]
                        ninstrus = sigmas__['representations'].shape[1]
                        input_data_testing = sigmas__[
                            'representations'][:, test_idx[0]]
                        target_data_testing = np.zeros((ninstrus - 1, 1))

                        cpt_i = 0
                        for i in range(ninstrus):
                            if i > fold:
                                target_data_testing[cpt_i] = sigmas__[
                                    'dissimilarities'][fold, i]
                                cpt_i += 1
                            elif i < fold:
                                target_data_testing[cpt_i] = sigmas__[
                                    'dissimilarities'][i, fold]
                                cpt_i += 1

                        # print(input_data_testing)
                        # print(target_data_testing)
                        # pg_testing.append(corr(input_data_testing,results['sigmas'][fold],target_data_testing))

                        test_input = input_data_testing
                        test_target = target_data_testing

                        mean_target_test = np.mean(test_target)
                        std_target_test = np.std(test_target)
                        kernel_test = np.zeros((ninstrus - 1, 1))
                        # print(input_data_training.shape)
                        for i in range(len(kernel_test)):
                            # print(i)
                            kernel_test[i, 0] = -np.sum(
                                np.power(
                                    np.divide(
                                        test_input - input_data_training[:, i],
                                        (results['sigmas'][fold] +
                                         np.finfo(float).eps)), 2))

                        # pg_testing.append(pearsonr(kernel_test[:,0], test_target[:,0])[0])

                        pg_ = pg.corr(np.asarray(kernel_test[:, 0]),
                                      np.asarray(test_target[:, 0]))
                        aa = pg_['CI95%'][0][0]**2
                        bb = pg_['CI95%'][0][1]**2
                        lower_ = np.min([aa, bb])
                        upper_ = np.max([aa, bb])
                        p_ = pg_['p-val'][0]
                        r_ = pg_['r'][0]
                        power_ = pg_['power'][0]
                        df_ = pg_['n'][0] - 2
                        r_testing_temp.append(r_)
                        p_testing_temp.append(p_)
                        temp___ = (upper_ - lower_)
                        ci95range_testing_temp.append(temp___)
                        power_testing_temp.append(power_)
                        df_testing_temp.append(df_)

                        # pg_testing.append(corr(input_data_testing,results['sigmas'][fold],target_data_testing))

                    # print(np.median(np.asarray(pg_training)**2))
                    # print(iqr(np.asarray(pg_training)**2))
                    # print(np.median(np.asarray(pg_testing)**2))
                    # print(iqr(np.asarray(pg_testing)**2))
                    # print(iqr(np.asarray(results['correlations_testing'])**2))

                    # print('{} - corr={:.3f} (std={:.3f})'.format(
                    #     tsp,
                    #     np.mean(cross_correlation(sigmas_)),
                    #     np.std(cross_correlation(sigmas_))))
                    # sigmas.append(np.mean(sigmas_, axis=0))

                    print(
                        "%i" % (np.median(np.asarray(df_training_temp))) + ' '

                        # +"%.2f" %np.median(np.asarray(r_training_temp)**2)+' '
                        # +"%.2f" %np.min(np.asarray(r_training_temp)**2)+' '
                        # +"%.2f" %np.max(np.asarray(r_training_temp)**2)+' '
                        + "%.3f" % np.median(np.asarray(p_training_temp)) +
                        ' ' + "%.3f" % np.min(np.asarray(p_training_temp)) +
                        ' ' + "%.3f" % np.max(np.asarray(p_training_temp)) +
                        ' ' + "%.3f" %
                        np.median(np.asarray(ci95range_training_temp)) + ' ' +
                        "%.3f" % np.min(np.asarray(ci95range_training_temp)) +
                        ' ' +
                        "%.3f" % np.max(np.asarray(ci95range_training_temp)) +
                        ' ' +
                        "%.3f" % np.median(np.asarray(power_training_temp)) +
                        ' ' +
                        "%.3f" % np.min(np.asarray(power_training_temp)) +
                        ' ' + "%.3f" % np.max(np.asarray(power_training_temp)))

                    print(
                        "%i" % (np.median(np.asarray(df_testing_temp))) + ' '

                        # +"%.2f" %np.median(np.asarray(r_training_temp)**2)+' '
                        # +"%.2f" %np.min(np.asarray(r_training_temp)**2)+' '
                        # +"%.2f" %np.max(np.asarray(r_training_temp)**2)+' '
                        + "%.3f" % np.median(np.asarray(p_testing_temp)) +
                        ' ' + "%.3f" % np.min(np.asarray(p_testing_temp)) +
                        ' ' + "%.3f" % np.max(np.asarray(p_testing_temp)) +
                        ' ' + "%.3f" %
                        np.median(np.asarray(ci95range_testing_temp)) + ' ' +
                        "%.3f" % np.min(np.asarray(ci95range_testing_temp)) +
                        ' ' +
                        "%.3f" % np.max(np.asarray(ci95range_testing_temp)) +
                        ' ' +
                        "%.3f" % np.median(np.asarray(power_testing_temp)) +
                        ' ' + "%.3f" % np.min(np.asarray(power_testing_temp)) +
                        ' ' + "%.3f" % np.max(np.asarray(power_testing_temp)))

                    _, r_temp_within, p_temp_within, ci95range_temp_within, power_temp_within, df_temp_within = cross_correlation(
                        sigmas_)
                    print(
                        "%i" % (np.median(np.asarray(df_testing_temp))) + ' '

                        # +"%.2f" %np.median(np.asarray(r_training_temp)**2)+' '
                        # +"%.2f" %np.min(np.asarray(r_training_temp)**2)+' '
                        # +"%.2f" %np.max(np.asarray(r_training_temp)**2)+' '
                        + "%.3f" % np.median(np.asarray(p_temp_within)) + ' ' +
                        "%.3f" % np.min(np.asarray(p_temp_within)) + ' ' +
                        "%.3f" % np.max(np.asarray(p_temp_within)) + ' ' +
                        "%.3f" % np.median(np.asarray(ci95range_temp_within)) +
                        ' ' +
                        "%.3f" % np.min(np.asarray(ci95range_temp_within)) +
                        ' ' +
                        "%.3f" % np.max(np.asarray(ci95range_temp_within)) +
                        ' ' +
                        "%.3f" % np.median(np.asarray(power_temp_within)) +
                        ' ' + "%.3f" % np.min(np.asarray(power_temp_within)) +
                        ' ' + "%.3f" % np.max(np.asarray(power_temp_within)))

                    print(
                        "%i" % (np.median(np.asarray(df_withall_temp))) + ' '

                        # +"%.2f" %np.median(np.asarray(r_training_temp)**2)+' '
                        # +"%.2f" %np.min(np.asarray(r_training_temp)**2)+' '
                        # +"%.2f" %np.max(np.asarray(r_training_temp)**2)+' '
                        + "%.3f" % np.median(np.asarray(p_withall_temp)) +
                        ' ' + "%.3f" % np.min(np.asarray(p_withall_temp)) +
                        ' ' + "%.3f" % np.max(np.asarray(p_withall_temp)) +
                        ' ' + "%.3f" %
                        np.median(np.asarray(ci95range_withall_temp)) + ' ' +
                        "%.3f" % np.min(np.asarray(ci95range_withall_temp)) +
                        ' ' +
                        "%.3f" % np.max(np.asarray(ci95range_withall_temp)) +
                        ' ' +
                        "%.3f" % np.median(np.asarray(power_withall_temp)) +
                        ' ' + "%.3f" % np.min(np.asarray(power_withall_temp)) +
                        ' ' + "%.3f" % np.max(np.asarray(power_withall_temp)))

                    # print('{} correlations_training - corr: Mdn={:.2f} (IQR={:.3f})'.format(
                    #     tsp,
                    #     np.median(np.asarray(results['correlations'])**2),
                    #     iqr(np.asarray(results['correlations'])**2)))
                    # correlation_training.append(np.median(np.asarray(results['correlations'])**2))

                    # print('{} correlations_testing - corr: Mdn={:.2f} (IQR={:.3f})'.format(
                    #     tsp,
                    #     np.median(np.asarray(results['correlations_testing'])**2),
                    #     iqr(np.asarray(results['correlations_testing'])**2)))
                    # correlation_testing.append(np.median(np.asarray(results['correlations_testing'])**2))

                    # print('{} within_corr - corr: Mdn={:.2f} (IQR={:.3f})'.format(
                    #     tsp,
                    #     np.median(np.asarray(cross_correlation(sigmas_))),
                    #     iqr(np.asarray(cross_correlation(sigmas_)))))
                    # cross_corr.append(np.median(np.asarray(cross_correlation(sigmas_))))
                    # sigmas.append(np.mean(sigmas_, axis=0))

                    # print('{} correlation with all - corr: Mdn={:.2f} (IQR={:.3f})'.format(
                    #     tsp,
                    #     np.median(np.asarray(corr_)**2),
                    #     iqr(np.asarray(corr_)**2)))
                    # print()
                    # correlation_with_all.append(np.median(np.asarray(corr_)**2))
                    # sigmas.append(np.mean(sigmas_, axis=0))

    print(correlation_training)
    print(correlation_testing)
    print('Cross val correlation: ' +
          str(pearsonr(correlation_training, correlation_testing)[0]**2) +
          ' ' + str(pearsonr(correlation_training, correlation_testing)[1]))
    plt.scatter(np.asarray(correlation_training),
                np.asarray(correlation_testing))
    plt.show()
    print(np.median(correlation_training))
    print(iqr(correlation_training))
    print(np.median(correlation_testing))
    print(iqr(correlation_testing))
    print(np.median(cross_corr))
    print(iqr(cross_corr))
    print(np.median(correlation_with_all))
    print(iqr(correlation_with_all))

    mds_data = [
        0.9368, 0.6845, 0.5935, 0.2046, 0.2371, 0.5662, 0.6901, 0.8612, 0.5610,
        0.4604, 0.3068, 0.7646, 0.3152, 0.6535, 0.7005, 0.7070, 0.3616
    ]
    plt.scatter(np.asarray(mds_data), np.asarray(correlation_testing))
    plt.show()
    print(pearsonr(mds_data, correlation_testing))

    return sigmas
from src.conf import *

output_dir = figures_dir / "panels"
output_dir.mkdir(parents=True, exist_ok=True)

myelo = matrix.columns[matrix.columns.str.contains("MDSC/All_CD45_(PBMC)",
                                                   regex=False)]

fig, axes = plt.subplots(3, 2, figsize=(7, 10), tight_layout=True)
for row_axes, pop in zip(axes, myelo):
    for cbc, name, ax in zip(["lymph_CBC", "neutrophils"],
                             ["Lymphocytes", "Neutrophils"], row_axes):
        p = meta[[cbc]].join(matrix[pop]).dropna()
        sns.regplot(p[cbc], p[pop], scatter_kws=dict(s=2, alpha=0.5), ax=ax)
        res = pg.corr(p[cbc], p[pop], method="spearman").squeeze()
        f = np.array([0.1, 1.1])
        ax.set(
            title=
            f"r = {res['r']:.2f}; ci = {res['CI95%']}; p = {res['p-val']:.2e}",
            xlabel=f"{name} (%, Sysmex CBC)",
            ylabel=pop,
            # xlim=(-10, 110),
            xlim=np.asarray(ax.get_xlim()) * f,
            # ylim=(-10, 110),
            ylim=np.asarray(ax.get_ylim()) * f,
        )
fig.savefig(output_dir / "sysmex.neutrophil_lymphocyte.svg", **figkws)

lr = pg.linear_regression(p[pop], p["neutrophils"])
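# Hedged sketch of what pg.linear_regression returns, on synthetic data (the real call
# above regresses neutrophil fractions on the selected population; the names here are
# invented).
import numpy as np
import pandas as pd
import pingouin as pg

rng = np.random.default_rng(11)
x = pd.Series(rng.normal(size=100), name='predictor')
y = pd.Series(2.0 * x.values + rng.normal(scale=0.5, size=100), name='response')

lm = pg.linear_regression(x, y)           # DataFrame: one row per coefficient
print(lm[['names', 'coef', 'se', 'pval', 'r2']])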
def time_series_with_biometric_bar_plot(biometric_source_data_1,
                                        biometric_source_data_2,
                                        sample_source_data_1,
                                        sample_source_data_2, view_1, view_2,
                                        selection_dict):

    # parse selections
    biometric = selection_dict['biometric']
    metabolite = selection_dict['metabolite']
    user = selection_dict['user']
    scale = selection_dict['scale']
    start_time = pd.to_datetime('8/22/2018')
    end_time = pd.to_datetime('9/1/2018')

    # set up data and relevant stats
    if user == "Both":

        # run rm_corr
        title = 'Daily total/average: {}'.format(biometric)
        # .tolist() causing refresh error
        x = list(biometric_source_data_1.data[biometric]) + list(
            biometric_source_data_2.data[biometric])
        y = list(np.log2(biometric_source_data_1.data[metabolite])) \
            + list(np.log2(biometric_source_data_2.data[metabolite]))
        subject = ["Subject1"] * len(biometric_source_data_1.data[metabolite]) \
            + ["Subject2"] * len(biometric_source_data_2.data[metabolite])
        df = pd.DataFrame({
            'x': x,
            'y': y,
            'subject': subject,
        })
        r, p, dof = pg.rm_corr(data=df, x='x', y='y', subject='subject')
        title = "Daily total/average: {} vs. log2 (Avg. Int.) {}; RM Corr : r = {}, p = {}".format(
            biometric, metabolite, round(r, 3), round(p, 3))

        #  get biometric max
        biometric_max = max(x)
        # get metabolite intensity min
        metabolite_intensities = list(sample_source_data_1.data[metabolite]) \
            + list(sample_source_data_2.data[metabolite])
        intensity_min, intensity_max = min(metabolite_intensities), \
            max(metabolite_intensities)

    elif user == "Subject1":
        # calculate a skipped (robust, Spearman-based) correlation for Subject1
        x = biometric_source_data_1.data[biometric]
        y = np.log2(biometric_source_data_1.data[metabolite])
        corr_df = pg.corr(x, y, method='skipped')
        coef = corr_df.iloc[0]['r']
        p = corr_df.iloc[0]['p-val']
        #print(corr_df)
        title = "Daily average {} vs. log2(Avg. Int.) {}; Spearman's Rho: {}, p = {}".format(
            biometric, metabolite, round(coef, 3), round(p, 5))

        #  get biometric max
        biometric_max = max(x)
        # get metabolite intensity min
        metabolite_intensities = list(sample_source_data_1.data[metabolite])
        intensity_min, intensity_max = min(metabolite_intensities), \
            max(metabolite_intensities)

    elif user == "Subject2":
        # calculate a skipped (robust, Spearman-based) correlation for Subject2
        x = biometric_source_data_2.data[biometric]
        y = np.log2(biometric_source_data_2.data[metabolite])
        corr_df = pg.corr(x, y, method='skipped')
        coef = corr_df.iloc[0]['r']
        p = corr_df.iloc[0]['p-val']
        #print(corr_df)
        title = "Daily average {} vs. log2(Avg. Int.) {}; Spearman's Rho: {}, p = {}".format(
            biometric, metabolite, round(coef, 3), round(p, 5))

        #  get biometric max
        biometric_max = max(x)
        # get metabolite intensity range
        metabolite_intensities = sample_source_data_2.data[metabolite]
        intensity_min, intensity_max = min(metabolite_intensities), \
            max(metabolite_intensities)

    # Set up figure and formatting
    p = figure(
        title=title,
        tools=tools,
        x_axis_type="datetime",
        plot_width=800,
        plot_height=400,
        x_range=[start_time, end_time],
        y_range=[intensity_min, intensity_max],
    )
    #tooltips = [("sample", "@SampleID")])

    # Setting the second y axis range name and range
    biometric_max_start = biometric_max * 0.10
    biometric_range_end = biometric_max * 1.10
    p.extra_y_ranges = {
        "biometric_axis":
        Range1d(start=biometric_max_start, end=biometric_range_end)
    }

    # Adding the second axis to the plot.
    p.add_layout(LinearAxis(y_range_name="biometric_axis"), 'right')

    p.xaxis.ticker = DaysTicker(days=np.arange(1, 59))
    p.xaxis.formatter = DatetimeTickFormatter(
        hours=["%d %B %Y"],
        days=["%d %B %Y"],
        months=["%d %B %Y"],
        years=["%d %B %Y"],
    )

    p.output_backend = "svg"
    p.xaxis.axis_label = None
    p.toolbar.logo = None
    p.xaxis.major_label_orientation = pi / 4
    p.xgrid.grid_line_color = None
    p.ygrid.grid_line_color = None
    p.outline_line_color = None

    p.yaxis.axis_label = metabolite + "  {}".format(scale)
    p.yaxis[1].axis_label = biometric

    ### Now for actual data ###

    # Have to make width huge, since Datetime has millisecond resolution:
    # https://stackoverflow.com/questions/45711567/categorical-y-axis-and-datetime-x-axis-with-bokeh-vbar-plot
    # 1 hr * 4
    millisecond_width = 3600000 * 24
    if user == "Both" or user == "Subject1":

        # time series data
        legend_title = "Subject 1 [{}]".format(metabolite)
        p.line('Datetime',
               metabolite,
               source=sample_source_data_1,
               color='red')
        p.circle('Datetime',
                 metabolite,
                 source=sample_source_data_1,
                 color="red",
                 size=5,
                 alpha=0.5,
                 view=view_1,
                 hover_color="black")

        # biometric data
        legend_title = "Subject 1 Daily Total {}".format(biometric)
        p.step('Datetime',
               y=biometric,
               color="red",
               mode="center",
               line_dash="dashed",
               source=biometric_source_data_1,
               legend=legend_title,
               y_range_name="biometric_axis")

        p.vbar('Datetime',
               top=biometric,
               fill_color="red",
               width=millisecond_width,
               line_color=None,
               alpha=0.3,
               source=biometric_source_data_1,
               y_range_name="biometric_axis")

    # overwrite
    if user == "Both" or user == "Subject2":

        # time series data
        legend_title = "Subject 2 [{}]".format(metabolite)
        p.line('Datetime',
               metabolite,
               source=sample_source_data_2,
               color='blue')
        p.circle('Datetime',
                 metabolite,
                 source=sample_source_data_2,
                 color="blue",
                 size=5,
                 alpha=0.5,
                 view=view_2,
                 hover_color="black")

        # biometric data
        legend_title = "Subject 2 Daily Total {}".format(biometric)
        p.step('Datetime',
               y=biometric,
               color="blue",
               mode="center",
               line_dash="dashed",
               source=biometric_source_data_2,
               legend=legend_title,
               y_range_name="biometric_axis")

        p.vbar(x='Datetime',
               top=biometric,
               fill_color="blue",
               width=millisecond_width,
               line_color=None,
               alpha=0.3,
               source=biometric_source_data_2,
               y_range_name="biometric_axis")

    # Light cycle formatting, this needs to come second for tool tips to render
    vline_list = []
    for datetime in pd.date_range(start='8/22/2018', end='9/1/2018'):
        vline = Span(
            location=datetime,
            dimension='height',
            line_color='grey',
            # this should create a ~6 hr window around midnight, to simulate
            # the dark cycle during this time period
            line_width=24,
            line_dash='solid',
            line_alpha=0.3)
        vline_list.append(vline)
    p.renderers.extend(vline_list)

    return p
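# Hedged sketch of pingouin's repeated-measures correlation as used in the "Both" branch
# above. The tuple unpacking there (r, p, dof = pg.rm_corr(...)) suggests an older pingouin
# release; recent versions return a one-row DataFrame, which is what this sketch uses.
# The three subjects and all values below are synthetic.
import numpy as np
import pandas as pd
import pingouin as pg

rng = np.random.default_rng(5)
n = 30
frames = []
for subj, offset in [('Subject1', 0.0), ('Subject2', 3.0), ('Subject3', -2.0)]:
    x = rng.normal(offset, 1.0, n)
    y = 0.8 * x + rng.normal(0.0, 0.5, n)
    frames.append(pd.DataFrame({'subject': subj, 'x': x, 'y': y}))
df_rm = pd.concat(frames, ignore_index=True)

res = pg.rm_corr(data=df_rm, x='x', y='y', subject='subject')
print(res[['r', 'dof', 'pval']])          # one row, indexed by 'rm_corr'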
def get_crossval_sigmas_from_folder(
    timbre_spaces = ['Barthet2010','Grey1977','Grey1978','Iverson1993_Onset',
                 'Iverson1993_Remainder','Iverson1993_Whole', 'McAdams1995','Patil2012_A3',
                 'Patil2012_DX4','Patil2012_GD4','Siedenburg2016_e3', 'Lakatos2000_Harm',
                 'Lakatos2000_Comb','Lakatos2000_Perc','Siedenburg2016_e2set1','Siedenburg2016_e2set2',
                 'Siedenburg2016_e2set3'],
    representation = ['auditory_strf'],
    folder='results_light',
    averaging='avg_time_avg_freq',
    early_stopping=True,
    folder_old='all'):
    sigmas = []
    correlation_testing = []
    correlation_training = []
    correlation_with_all = []
    cross_corr = []
    for tsp in timbre_spaces:
        for root, dirs, files in os.walk(os.path.join(folder, tsp.lower())):
            for f in files:
                if averaging + '_' + tsp.lower() in f:
                    results = pickle.load(open(os.path.join(root, f), 'rb'))
                    # print(results)
                    metricOnAll = pickle.load(open(os.path.join(root, 'all.pkl'), 'rb'))
                    metricOnAll = metricOnAll['sigmas'].flatten()
                    # print(results)
                    sigmas_ = [results['sigmas'][fold] for fold in range(len(results['correlations']))]
                    sigmas_ = np.array(sigmas_)
                    corr_ = [pearsonr(results['sigmas'][fold].flatten(),metricOnAll)[0] for fold in range(len(results['correlations']))]
                    # print('{} - corr={:.3f} (std={:.3f})'.format(
                    #     tsp, 
                    #     np.mean(cross_correlation(sigmas_)), 
                    #     np.std(cross_correlation(sigmas_))))
                    # sigmas.append(np.mean(sigmas_, axis=0))                    
                    print('{} correlations_training - corr: Mdn={:.2f} (IQR={:.3f})'.format(
                        tsp, 
                        np.median(np.asarray(results['correlations'])**2), 
                        iqr(np.asarray(results['correlations'])**2)))
                    correlation_training.append(np.median(np.asarray(results['correlations'])**2))                    

                    print('{} correlations_testing - corr: Mdn={:.2f} (IQR={:.3f})'.format(
                        tsp, 
                        np.median(np.asarray(results['correlations_testing'])**2), 
                        iqr(np.asarray(results['correlations_testing'])**2)))
                    correlation_testing.append(np.median(np.asarray(results['correlations_testing'])**2))


                    print('{} within_corr - corr: Mdn={:.2f} (IQR={:.3f})'.format(
                        tsp, 
                        np.median(np.asarray(cross_correlation(sigmas_))), 
                        iqr(np.asarray(cross_correlation(sigmas_)))))
                    cross_corr.append(np.median(np.asarray(cross_correlation(sigmas_))))                    
                    sigmas.append(np.mean(sigmas_, axis=0))

                    print('{} correlation with all - corr: Mdn={:.2f} (IQR={:.3f})'.format(
                        tsp, 
                        np.median(np.asarray(corr_)**2), 
                        iqr(np.asarray(corr_)**2)))
                    print()
                    correlation_with_all.append(np.median(np.asarray(corr_)**2))                    
                    # sigmas.append(np.mean(sigmas_, axis=0))                    

    print(correlation_training)
    print(correlation_testing)
    print('Cross val correlation: '+str(pearsonr(correlation_training,correlation_testing)[0]**2)+' '+str(pearsonr(correlation_training,correlation_testing)[1]))
    print(pg.corr(correlation_training,correlation_testing))

    plt.scatter(np.asarray(correlation_training),np.asarray(correlation_testing))
    plt.show()
    print(np.median(correlation_training))
    print(iqr(correlation_training))
    print(np.median(correlation_testing))
    print(iqr(correlation_testing))    
    print(np.median(cross_corr))
    print(iqr(cross_corr))    
    print(np.median(correlation_with_all))
    print(iqr(correlation_with_all))




    mds_data = [0.9368,0.6845,0.5935,0.2046,0.2371,0.5662,0.6901,0.8612,0.5610,0.4604,0.3068,0.7646,0.3152,0.6535,0.7005,0.7070,0.3616]
    plt.scatter(np.asarray(mds_data),np.asarray(correlation_testing))
    plt.show() 
    print(pearsonr(mds_data,correlation_testing))

    return sigmas
Example #18
        pass
    if data1[j, 0] != 0 and index_2 == -1:
        index_2 = j
        pass
    if index_1 != -1 and index_2 != -1:
        break
    i -= 1
    j -= 1

data = data[-index_1:, :]
data1 = data1[-index_2:, :]

data = data[-2000:, :]
data1 = data1[-2000:, :]

x = pg.corr(x=data[:, 0], y=data1[:, 0])
print(x)

# print(data.tostring())
# print(data1.tostring())

# data = data[:,:]
# data1 = data1[:,:]
# data = data.reshape(data.shape[0],1)
# data1 = data1.reshape(data1.shape[0],1)
# data = data[-10000:,:]
# data1 = data1[-10000:,:]
# print(data1.shape[1])

# df = pd.DataFrame(data,data1)
# print(df.head())
Example #19
def plot__spatial_correlation(df_counts, cytokine_responders, save_folder, distance):
    """Calculate spatial (Pearson) Correlation between each Cyto+ spot and its nn responder genes spots for each cluster
    -> Plot for Workflow figure (Fig 1)
    Parameters
    ----------
    df_counts : pandas.Dataframe
    cytokine_responders : dict
    save_folder : str
    distance : int

    Returns
    -------
    list of p-values

    """

    p_vals = []
    for cyto in cytokine_responders:
        resp_name = "_".join([cyto, 'responder'])
        temp_df = df_counts[[cyto, resp_name]].copy()

        temp_df = temp_df[~np.isnan(temp_df[cyto].values.astype(np.float64))]
        temp_df[cyto] = temp_df[cyto].values.astype(np.float64)
        temp_df["_".join([cyto, 'responder'])] = temp_df["_".join([cyto, 'responder'])].values.astype(np.float64)

        # 1. Calculate correlation
        # Always report the correlation and the p-value:
        # -> Use pearson correlation
        # -> at low sample sizes the p-value can be unreliable
        sig_r = pingouin.corr(x=temp_df["_".join([cyto, 'responder'])], y=temp_df[cyto], method='pearson')
        p_vals.append(sig_r['p-val'].values[0])

        #  2. Plot correlation
        fig, ax = plt.subplots(figsize=fig_size)
        ax.grid(False)
        sns.regplot(data=temp_df, x=resp_name, y=cyto, ax=ax, scatter=False, color="black", label=None)
        ax.scatter(data=temp_df, x=resp_name, y=cyto, c='k')

        # Add text: Correlation value and p-value
        ax.text(temp_df.max()[1] / 2 - temp_df.max()[1]/10, temp_df.max()[0],
                'r = {:.2f}; p = {:.2e}'.format(sig_r['r'].values[0], sig_r['p-val'].values[0]),
                fontstyle='italic', fontsize=text_fontsize)

        # Axis params
        ax.set_xlabel(" ".join(["Responder Counts"]), fontsize=axis_label_fontsize)
        if cyto == 'IFNG':
            ax.set_ylabel(r'IFN-$\gamma$ Counts', fontsize=axis_label_fontsize)
        else:
            ax.set_ylabel(" ".join([cyto, 'Counts']), fontsize=axis_label_fontsize)
        ax.set_xlim([-0.5, temp_df.max()[1] + temp_df.max()[1] / 50])
        ax.set_ylim([-0.5, temp_df.max()[0] + temp_df.max()[0] / 50])
        ax.tick_params(labelsize=xy_ticks)

        plt.tight_layout()
        # remove upper and right edge lines in plot
        sns.despine(ax=ax)

        # 3. Save figure
        fig.savefig(os.path.join(save_folder, "_".join(['Fig1', str(distance), cyto, resp_name, fileformat])))
        plt.close()

    return p_vals
def read_tables(form_path_1, key_path_1, form_path_2, key_path_2):
    """ Reads-in human judgements and reports results. """

    # Read-in forms
    form_1 = open(form_path_1, 'r', encoding='utf8')
    form_2 = open(form_path_2, 'r', encoding='utf8')

    # Read in keys
    with open(key_path_1, 'r', encoding='utf8') as kp1:
        keys_1 = json.load(kp1)
    with open(key_path_2, 'r', encoding='utf8') as kp2:
        keys_2 = json.load(kp2)

    # Trackers
    correct_sense_pick = {
        'wmt': {
            'natural': [],
            'synthetic': []
        },
        'os': {
            'natural': [],
            'synthetic': []
        }
    }
    is_ambiguous = {
        'wmt': {
            'natural': [],
            'synthetic': []
        },
        'os': {
            'natural': [],
            'synthetic': []
        }
    }
    is_natural = {
        'wmt': {
            'natural': [],
            'synthetic': []
        },
        'os': {
            'natural': [],
            'synthetic': []
        }
    }

    shared_correct_sense_pick_1 = {
        'wmt': {
            'natural': [],
            'synthetic': []
        },
        'os': {
            'natural': [],
            'synthetic': []
        }
    }
    shared_is_ambiguous_1 = {
        'wmt': {
            'natural': [],
            'synthetic': []
        },
        'os': {
            'natural': [],
            'synthetic': []
        }
    }
    shared_is_natural_1 = {
        'wmt': {
            'natural': [],
            'synthetic': []
        },
        'os': {
            'natural': [],
            'synthetic': []
        }
    }

    shared_correct_sense_pick_2 = {
        'wmt': {
            'natural': [],
            'synthetic': []
        },
        'os': {
            'natural': [],
            'synthetic': []
        }
    }
    shared_is_ambiguous_2 = {
        'wmt': {
            'natural': [],
            'synthetic': []
        },
        'os': {
            'natural': [],
            'synthetic': []
        }
    }
    shared_is_natural_2 = {
        'wmt': {
            'natural': [],
            'synthetic': []
        },
        'os': {
            'natural': [],
            'synthetic': []
        }
    }

    # Go through annotations line by line
    for form, keys, shared_correct_sense_pick, shared_is_ambiguous, shared_is_natural in \
            [(form_1, keys_1, shared_correct_sense_pick_1, shared_is_ambiguous_1, shared_is_natural_1),
             (form_2, keys_2, shared_correct_sense_pick_2, shared_is_ambiguous_2, shared_is_natural_2)]:
        for line_id, line in enumerate(form):
            if line_id < 2:
                continue
            key = keys[str(line_id - 1)]
            domain, prv, sns_1, sns_2 = key
            sns_tpl = (sns_1, sns_2)
            sns_pick, amb_pick, nat_pick = line.split('\t')[-3:]

            # Assign to trackers
            if line_id < 1002:
                if sns_pick in ['BOTH', 'NONE']:
                    pass
                else:
                    correct_sense_pick[domain][prv].append(
                        int(sns_tpl[int(sns_pick) - 1]))
                if amb_pick == 'UNSURE':
                    pass
                else:
                    is_ambiguous[domain][prv].append(int(amb_pick == 'NO'))
                    is_natural[domain][prv].append(int(nat_pick))

            else:
                # Assign to trackers
                shared_correct_sense_pick[domain][prv].append(sns_pick)
                shared_is_ambiguous[domain][prv].append(amb_pick)
                shared_is_natural[domain][prv].append(int(nat_pick))

    # Report summary
    print('Correct sense picked:')
    all_natural = list()
    all_synthetic = list()
    for domain in correct_sense_pick.keys():
        for prv in correct_sense_pick[domain]:
            if prv == 'natural':
                all_natural += correct_sense_pick[domain][prv]
            else:
                all_synthetic += correct_sense_pick[domain][prv]
            total = len(correct_sense_pick[domain][prv])
            pos = sum(correct_sense_pick[domain][prv])
            neg = total - pos
            print(
                '{:s} | {:s} : Yes {:d} ({:.3f}%) | No {:d} ({:.3f}%)'.format(
                    domain, prv, pos, (pos / total) * 100, neg,
                    (neg / total) * 100))
    for tag, scores in [('all natural', all_natural),
                        ('all synthetic', all_synthetic)]:
        total = len(scores)
        pos = sum(scores)
        neg = total - pos
        print('{:s} : Yes {:d} ({:.3f}%) | No {:d} ({:.3f}%)'.format(
            tag, pos, (pos / total) * 100, neg, (neg / total) * 100))
    print('=' * 20)

    print('Homograph is NOT ambiguous:')
    all_natural = list()
    all_synthetic = list()
    for domain in is_ambiguous.keys():
        for prv in is_ambiguous[domain]:
            if prv == 'natural':
                all_natural += is_ambiguous[domain][prv]
            else:
                all_synthetic += is_ambiguous[domain][prv]
            total = len(is_ambiguous[domain][prv])
            pos = sum(is_ambiguous[domain][prv])
            neg = total - pos
            print(
                '{:s} | {:s} : Yes {:d} ({:.3f}%) | No {:d} ({:.3f}%)'.format(
                    domain, prv, pos, (pos / total) * 100, neg,
                    (neg / total) * 100))
    for tag, scores in [('all natural', all_natural),
                        ('all synthetic', all_synthetic)]:
        total = len(scores)
        pos = sum(scores)
        neg = total - pos
        print('{:s} : Yes {:d} ({:.3f}%) | No {:d} ({:.3f}%)'.format(
            tag, pos, (pos / total) * 100, neg, (neg / total) * 100))
    print('=' * 20)

    print('Naturalness scores:')
    all_natural = list()
    all_synthetic = list()
    for domain in is_natural.keys():
        for prv in is_natural[domain]:
            if prv == 'natural':
                all_natural += is_natural[domain][prv]
            else:
                all_synthetic += is_natural[domain][prv]
            print('{:s} | {:s} : {:.3f}'.format(
                domain, prv, np.mean(is_natural[domain][prv])))
    for tag, scores in [('all natural', all_natural),
                        ('all synthetic', all_synthetic)]:
        print('{:s} : {:.3f}'.format(tag, np.mean(scores)))
    print('=' * 20)

    print('Rater agreement - Cohen\'s (weighted) kappa:')
    all_1 = list()
    all_2 = list()
    print('Correct sense picked:')
    for domain in shared_correct_sense_pick_1.keys():
        for prv in shared_correct_sense_pick_1[domain]:
            all_1 += shared_correct_sense_pick_1[domain][prv]
            all_2 += shared_correct_sense_pick_2[domain][prv]
    ck_sns = cohen_kappa_score(all_1, all_2, labels=['1', '2', 'NONE', 'BOTH'])
    ck_sns = 1. if math.isnan(ck_sns) else ck_sns
    print(ck_sns)

    print('Homograph is NOT ambiguous:')
    all_1 = list()
    all_2 = list()
    for domain in shared_is_ambiguous_1.keys():
        for prv in shared_is_ambiguous_1[domain]:
            all_1 += shared_is_ambiguous_1[domain][prv]
            all_2 += shared_is_ambiguous_2[domain][prv]
    ck_amb = cohen_kappa_score(all_1, all_2, labels=['YES', 'NO', 'UNSURE'])
    ck_amb = 1. if math.isnan(ck_amb) else ck_amb
    print(ck_amb)

    print('Naturalness scores:')
    all_1 = list()
    all_2 = list()
    for domain in shared_is_natural_1.keys():
        for prv in shared_is_natural_1[domain]:
            all_1 += shared_is_natural_1[domain][prv]
            all_2 += shared_is_natural_2[domain][prv]
    ck_nat = cohen_kappa_score(all_1,
                               all_2,
                               labels=[1, 2, 3, 4, 5],
                               weights='linear')
    ck_nat = 1. if math.isnan(ck_nat) else ck_nat
    print(ck_nat)
    print(corr(all_1, all_2, method='pearson').round(3))

    print('Mean agreement: {:.3f}'.format((ck_sns + ck_amb + ck_nat) / 3))
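
# Illustrative sketch (hypothetical ratings, not from the annotation forms above):
# the same agreement metrics on two toy raters, assuming scikit-learn's
# cohen_kappa_score and pingouin's corr are imported as in the example above.
_rater_1 = [5, 4, 4, 3, 5, 2, 4, 1, 3, 5]
_rater_2 = [5, 4, 3, 3, 4, 2, 5, 1, 2, 5]
_kappa = cohen_kappa_score(_rater_1, _rater_2, labels=[1, 2, 3, 4, 5], weights='linear')
print('toy weighted kappa: {:.3f}'.format(_kappa))
print(corr(_rater_1, _rater_2, method='pearson').round(3))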
Example #21
0
                  str(math.pow(spearmanr(coef1, coef2)[0], 2)))
            print("full vs. scale/rate: " +
                  str(math.pow(spearmanr(coefFull, coef0)[0], 2)))
            print("full vs. freq/rate: " +
                  str(math.pow(spearmanr(coefFull, coef1)[0], 2)))
            print("full vs. freq/scale: " +
                  str(math.pow(spearmanr(coefFull, coef2)[0], 2)))

            tabCorr.append(math.pow(spearmanr(coef0, coef1)[0], 2))
            tabCorr.append(math.pow(spearmanr(coef0, coef2)[0], 2))
            tabCorr.append(math.pow(spearmanr(coef1, coef2)[0], 2))
            tabCorr.append(math.pow(spearmanr(coefFull, coef0)[0], 2))
            tabCorr.append(math.pow(spearmanr(coefFull, coef1)[0], 2))
            tabCorr.append(math.pow(spearmanr(coefFull, coef2)[0], 2))
            ##
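            # pg.corr returns a one-row DataFrame; pull the CI bounds, p-value, r,
            # statistical power and sample size out of it (the CI bounds are squared
            # here because the correlations above are reported as r^2).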
            pg_ = pg.corr(coef0, coef1, method='spearman')
            aa = pg_['CI95%'][0][0]**2
            bb = pg_['CI95%'][0][1]**2
            lower_ = np.min([aa, bb])
            upper_ = np.max([aa, bb])
            p_ = pg_['p-val'][0]
            r_ = pg_['r'][0]
            power_ = pg_['power'][0]
            df_ = pg_['n'][0] - 2
            r_withall_temp.append(r_)
            p_withall_temp.append(p_)
            temp___ = (upper_ - lower_)
            ci95range_withall_temp.append(temp___)
            power_withall_temp.append(power_)
            df_withall_temp.append(df_)
            ##
Example #22
0
########################################### Angle results
gt_lenke_prob_dir = "../data/PredictionsVsGroundTruth/LenkeCurveTypeProbabilities_GroundTruthEndplates.csv"
gt_lenke_prob_data = genfromtxt(gt_lenke_prob_dir, delimiter=',')

pred_lenke_prob_dir = "../data/PredictionsVsGroundTruth/LenkeCurveTypeProbabilities.csv"
pred_lenke_prob_data = genfromtxt(pred_lenke_prob_dir, delimiter=',')

MAD = np.mean(
    abs(gt_lenke_prob_data.reshape(-1) - pred_lenke_prob_data.reshape(-1)))

D = pred_lenke_prob_data - gt_lenke_prob_data
MD = np.mean(D)

SD = np.std(D)

corr = pg.corr(pred_lenke_prob_data.reshape(-1),
               gt_lenke_prob_data.reshape(-1))
print(corr.to_string())

plt.figure()
# sns.distplot(D[:,0], label="Proximal-thoracic")
# sns.distplot(D[:,1], label="Main thoracic")
# sns.distplot(D[:,2], label="Lumbar")
sns.distplot(D.reshape(-1))
plt.xlabel("Difference in Probability")
plt.ylabel("Density")
# plt.legend()
plt.title(
    "Difference between Predicted and Ground-truth Lenke Curve Type Probabilities"
)
plt.show()
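
# Optional follow-up (a sketch, not part of the original script): the mean difference
# (MD) and its standard deviation (SD) above are the ingredients of a Bland-Altman
# agreement plot, which pingouin can draw directly from the two probability arrays.
ax = pg.plot_blandaltman(pred_lenke_prob_data.reshape(-1),
                         gt_lenke_prob_data.reshape(-1))
ax.set_title("Bland-Altman: Predicted vs. Ground-truth Lenke Probabilities")
plt.show()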
Example #23
0
import numpy as np
import pandas as pd
import pingouin as pg


def part_corr(data=None,
              x=None,
              y=None,
              covar=None,
              x_covar=None,
              y_covar=None,
              tail='two-sided',
              method='pearson'):
    from pingouin.utils import _flatten_list
    assert isinstance(data, pd.DataFrame), 'data must be a pandas DataFrame.'
    assert data.shape[0] > 2, 'Data must have at least 3 samples.'
    assert isinstance(x, (str, tuple)), 'x must be a string.'
    assert isinstance(y, (str, tuple)), 'y must be a string.'
    assert isinstance(covar, (str, list, type(None)))
    assert isinstance(x_covar, (str, list, type(None)))
    assert isinstance(y_covar, (str, list, type(None)))
    if covar is not None and (x_covar is not None or y_covar is not None):
        raise ValueError('Cannot specify both covar and {x,y}_covar.')
    assert x != covar, 'x and covar must be independent'
    assert y != covar, 'y and covar must be independent'
    assert x != y, 'x and y must be independent'
    # Check that columns exist
    col = _flatten_list([x, y, covar, x_covar, y_covar])
    if isinstance(covar, str):
        covar = [covar]
    if isinstance(x_covar, str):
        x_covar = [x_covar]
    if isinstance(y_covar, str):
        y_covar = [y_covar]

    assert all([c in data for c in col]), 'columns are not in dataframe.'
    # Check that columns are numeric
    assert all([data[c].dtype.kind in 'bfiu' for c in col])

    # Drop rows with NaN
    data = data[col].dropna()
    assert data.shape[0] > 2, 'Data must have at least 3 non-NAN samples.'

    # Standardize (= no need for an intercept in least-square regression)
    # This does NOT work with dummy variables for plate covariates, so those are not
    # standardized; only standardize columns whose standard deviation is non-zero.
    for c in col:
        if (data[c].std(axis=0) != 0):
            data[c] = (data[c] - data[c].mean(axis=0)) / data[c].std(axis=0)
    if covar is not None:
        # PARTIAL CORRELATION
        cvar = np.atleast_2d(data[covar].to_numpy())
        beta_x = np.linalg.lstsq(cvar, data[x].to_numpy(), rcond=None)[0]
        beta_y = np.linalg.lstsq(cvar, data[y].to_numpy(), rcond=None)[0]
        res_x = data[x].to_numpy() - cvar @ beta_x
        res_y = data[y].to_numpy() - cvar @ beta_y
    else:
        # SEMI-PARTIAL CORRELATION
        # Initialize "fake" residuals
        res_x, res_y = data[x].to_numpy(), data[y].to_numpy()
        if x_covar is not None:
            cvar = np.atleast_2d(data[x_covar].to_numpy())
            beta_x = np.linalg.lstsq(cvar, data[x].to_numpy(), rcond=None)[0]
            res_x = data[x].to_numpy() - cvar @ beta_x
        if y_covar is not None:
            cvar = np.atleast_2d(data[y_covar].to_numpy())
            beta_y = np.linalg.lstsq(cvar, data[y].to_numpy(), rcond=None)[0]
            res_y = data[y].to_numpy() - cvar @ beta_y
    return pg.corr(res_x, res_y, method=method, tail=tail)
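
# Illustrative usage of part_corr (a sketch on hypothetical data, not from the
# original source; assumes numpy/pandas as imported above and a pingouin version
# that still accepts the `tail` keyword used inside part_corr).
_rng = np.random.default_rng(0)
_demo = pd.DataFrame({'x': _rng.normal(size=30), 'z': _rng.normal(size=30)})
_demo['y'] = 0.5 * _demo['x'] + 0.3 * _demo['z'] + _rng.normal(size=30)
print(part_corr(data=_demo, x='x', y='y', covar='z'))    # partial: control for z on both sides
print(part_corr(data=_demo, x='x', y='y', x_covar='z'))  # semi-partial: residualize x only
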
lf = [
    'dayofyear', 'lograin3T', 'lograin7T', 'wet3', 'wet7', 'upwelling',
    'spring_tide', 'days_since_full_moon'
]

FC = hf.groupby('event').mean()['logFC']
ENT = hf.groupby('event').mean()['logENT']
FCv = hf.groupby('event').var()['logFC']
ENTv = hf.groupby('event').var()['logENT']

for l in lf:
    print('\n' + l)
    # Means
    print('mean (ENT/FC):')
    print(pg.corr(ENT, hf.groupby('event').max()[l])[['r', 'p-val']])
    print(pg.corr(FC, hf.groupby('event').max()[l])[['r', 'p-val']])
    # Variances
    print('var (ENT/FC):')
    print(pg.corr(ENTv, hf.groupby('event').max()[l])[['r', 'p-val']])
    print(pg.corr(FCv, hf.groupby('event').max()[l])[['r', 'p-val']])
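
# A compact alternative (sketch, assuming pandas is available as pd): pingouin's
# pairwise_corr runs the same feature-by-feature correlations in one call and can
# correct the p-values for multiple comparisons.
event_max = hf.groupby('event').max()[lf]
summary = pd.concat([ENT, FC, event_max], axis=1)
print(pg.pairwise_corr(summary, columns=[['logENT', 'logFC'], lf],
                       padjust='fdr_bh')[['X', 'Y', 'r', 'p-unc', 'p-corr']])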

#%% 2
N = len(EV)
# Angle of each axis in the plot (divide the full circle by the number of variables)
angles = [n / float(N) * 2 * np.pi for n in range(N)]
angles += angles[:1]

for b in hf.beach.unique():

    ENT_corrs = list(hf[hf.beach == b].corr().loc['logENT'][EV])
# In[ ]:

import pandas as pd
import pingouin as pg

# In[ ]:

# Read in data file
df = pd.read_csv('../data/responses-processed.csv')

# In[ ]:

# Do people who use Signal consider security when choosing
# instant messaging tools?
# Simple correlation between the two response columns
pg.corr(x=df['Q3-17'], y=df['Q34-31'])

# In[ ]:

pg.corr(x=df['Q40-0'], y=df['Q3-16'])
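
# In[ ]:

# Sketch (not in the original notebook): survey responses like these are ordinal,
# so a rank-based coefficient may be preferable; pg.corr exposes this via `method`.
pg.corr(x=df['Q3-17'], y=df['Q34-31'], method='spearman')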

# In[ ]:

corr = pg.pairwise_corr(df,
                        columns=[['Q7-7'],
                                 [
                                     'Q3-0', 'Q3-1', 'Q3-2', 'Q3-3', 'Q3-4',
                                     'Q3-5', 'Q3-6', 'Q3-7', 'Q3-8', 'Q3-9',
                                     'Q3-10', 'Q3-11', 'Q3-12', 'Q3-13',
                                     'Q3-14', 'Q3-15', 'Q3-16', 'Q3-17',
                                     'Q3-18'
Example #26
0
def main(subject,
         session,
         smoothed,
         pca_confounds,
         n_voxels=1000,
         bids_folder='/data',
         mask='wang15_ips'):

    target_dir = op.join(bids_folder, 'derivatives', 'decoded_pdfs.volume')

    if smoothed:
        target_dir += '.smoothed'

    if pca_confounds:
        target_dir += '.pca_confounds'

    target_dir = op.join(target_dir, f'sub-{subject}', 'func')

    if not op.exists(target_dir):
        os.makedirs(target_dir)

    sub = Subject(subject, bids_folder)
    paradigm = sub.get_behavior(sessions=session, drop_no_responses=False)
    paradigm['log(n1)'] = np.log(paradigm['n1'])
    paradigm = paradigm.droplevel(['subject', 'session'])

    data = get_single_trial_volume(subject,
                                   session,
                                   bids_folder=bids_folder,
                                   mask=mask,
                                   smoothed=smoothed,
                                   pca_confounds=pca_confounds).astype(
                                       np.float32)
    data.index = paradigm.index
    print(data)

    pdfs = []
    runs = range(1, 9)

    for test_run in runs:

        test_data, test_paradigm = data.loc[test_run].copy(
        ), paradigm.loc[test_run].copy()
        train_data, train_paradigm = data.drop(
            test_run, level='run').copy(), paradigm.drop(test_run,
                                                         level='run').copy()

        pars = get_prf_parameters_volume(subject,
                                         session,
                                         cross_validated=True,
                                         smoothed=smoothed,
                                         pca_confounds=pca_confounds,
                                         run=test_run,
                                         mask=mask,
                                         bids_folder=bids_folder)
        # pars = get_prf_parameters_volume(subject, session, cross_validated=False,  mask=mask, bids_folder=bids_folder)
        print(pars)

        model = GaussianPRF(parameters=pars)
        pred = model.predict(
            paradigm=train_paradigm['log(n1)'].astype(np.float32))

        r2 = get_rsq(train_data, pred)
        print(r2.describe())
        r2_mask = r2.sort_values(ascending=False).index[:n_voxels]

        train_data = train_data[r2_mask]
        test_data = test_data[r2_mask]

        print(r2.loc[r2_mask])
        model.apply_mask(r2_mask)

        model.init_pseudoWWT(stimulus_range, model.parameters)
        residfit = ResidualFitter(model, train_data,
                                  train_paradigm['log(n1)'].astype(np.float32))

        omega, dof = residfit.fit(init_sigma2=10.0,
                                  method='t',
                                  max_n_iterations=10000)

        print('DOF', dof)

        bins = stimulus_range.astype(np.float32)

        pdf = model.get_stimulus_pdf(test_data,
                                     bins,
                                     model.parameters,
                                     omega=omega,
                                     dof=dof)

        print(pdf)
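        # Posterior mean of the decoded stimulus: weight each bin value (the pdf's
        # columns) by its posterior mass and normalize, then compare the decoded
        # values against the presented log(n1) with a correlation.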
        E = (pdf * pdf.columns).sum(1) / pdf.sum(1)

        print(pd.concat((E, test_paradigm['log(n1)']), axis=1))
        print(pingouin.corr(E, test_paradigm['log(n1)']))

        pdfs.append(pdf)

    pdfs = pd.concat(pdfs)

    target_fn = op.join(
        target_dir,
        f'sub-{subject}_ses-{session}_mask-{mask}_nvoxels-{n_voxels}_space-{space}_pars.tsv'
    )
    pdfs.to_csv(target_fn, sep='\t')
Example #27
0
                                      'rb'))  # load sigmas
            sigmasTab.append(sigmas['sigmas'].flatten())
            represenationsVariances = np.std(
                sigmas['representations'],
                axis=1)  # spread (standard deviation, despite the name) of the representations
            represenationsVariancesTensor = np.reshape(represenationsVariances,
                                                       (128, 11, 22))
            strf_scale_rateVar, strf_freq_rateVar, strf_freq_scaleVar = avgvec2strfavg(
                strf2avgvec(represenationsVariancesTensor)
            )  # compute representations variances projections

            sigmas_ = sigmas['sigmas']  # load sigmas
            sigmasTensor = np.reshape(sigmas_, (128, 11, 22))
            strf_scale_rateSig, strf_freq_rateSig, strf_freq_scaleSig = avgvec2strfavg(
                strf2avgvec(sigmasTensor))  # compute metrics projections
            pg_ = pg.corr(sigmas_.flatten(), represenationsVariances.flatten())
            aa = pg_['CI95%'][0][0]**2
            bb = pg_['CI95%'][0][1]**2
            lower = np.min([aa, bb])
            upper = np.max([aa, bb])
            p = pg_['p-val'][0]
            r = pg_['r'][0]
            power = pg_['power'][0]
            df = pg_['n'][0] - 2
            # print(file+' r^2='+str(r**2)+' p='+str(p)+' lower='+str(lower)+' upper='+str(upper)+' power='+str(power)+' df='+str(df))
            print(
                str(df) + ' ' + "%.2f" % (r**2) + ' ' + "%.3f" % p + ' [' +
                "%.3f" % (lower) + ';' + "%.3f" % (upper) + '] ' + "%.3f" %
                (power))

print()
Example #28
0
def main(subject,
         session,
         n_voxels=250,
         bids_folder='/data',
         mask='wang15_ips'):

    session1 = session[:2] + '1'
    session2 = session[:2] + '2'

    pars = get_prf_parameters_volume(subject,
                                     session1,
                                     cross_validated=False,
                                     mask=mask,
                                     bids_folder=bids_folder).astype(
                                         np.float32)

    behavior = get_task_behavior(subject, session2, bids_folder)
    data = get_single_trial_volume(subject,
                                   session2,
                                   bids_folder=bids_folder,
                                   mask=mask).astype(np.float32)
    print(data)

    paradigm = behavior[['log(n1)']].astype(np.float32)
    paradigm.index = data.index
    print(paradigm)

    pdfs = []
    runs = range(1, 9)

    for test_run in runs:

        test_data, test_paradigm = data.xs(
            test_run, level='run').copy(), paradigm.xs(test_run,
                                                       level='run').copy()
        train_data, train_paradigm = data.drop(
            test_run, level='run').copy(), paradigm.drop(test_run,
                                                         level='run').copy()

        model = GaussianPRF(parameters=pars, paradigm=train_paradigm)
        parfitter = ParameterFitter(model, train_data, train_paradigm)

        new_pars = parfitter.refine_baseline_and_amplitude(pars)
        new_pars = parfitter.fit(init_pars=new_pars, fixed_pars=['mu', 'sd'])
        print(new_pars)
        model.parameters = new_pars.astype(np.float32)

        pred = model.predict()
        r2 = get_rsq(train_data, pred)
        print(r2.describe())
        r2_mask = r2.sort_values(ascending=False).index[:n_voxels]

        train_data = train_data[r2_mask]
        test_data = test_data[r2_mask]

        print(r2.loc[r2_mask])
        model.apply_mask(r2_mask)

        model.init_pseudoWWT(stimulus_range, model.parameters)

        residfit = ResidualFitter(model, train_data,
                                  train_paradigm['log(n1)'].astype(np.float32))

        omega, dof = residfit.fit(init_sigma2=10.0,
                                  method='t',
                                  max_n_iterations=10000)

        print('DOF', dof)

        bins = np.linspace(np.log(5), np.log(80), 150,
                           endpoint=True).astype(np.float32)

        pdf = model.get_stimulus_pdf(test_data,
                                     bins,
                                     model.parameters,
                                     omega=omega,
                                     dof=dof)

        print(pdf)
        E = (pdf * pdf.columns).sum(1) / pdf.sum(1)

        print(pd.concat((E, test_paradigm['log(n1)']), axis=1))
        print(pingouin.corr(E, test_paradigm['log(n1)']))

        pdfs.append(pdf)

    pdfs = pd.concat(pdfs)

    target_dir = op.join(bids_folder, 'derivatives',
                         'decoded_pdfs.volume.across_session')
    target_dir = op.join(target_dir, f'sub-{subject}', 'func')

    if not op.exists(target_dir):
        os.makedirs(target_dir)

    target_fn = op.join(
        target_dir,
        f'sub-{subject}_ses-{session2}_mask-{mask}_nvoxels-{n_voxels}_space-T1w_pars.tsv'
    )
    pdfs.to_csv(target_fn, sep='\t')
Example #29
0
def main(subject,
         session,
         smoothed,
         n_verts=100,
         bids_folder='/data',
         mask='wang15_ips'):

    target_dir = op.join(bids_folder, 'derivatives', 'decoded_pdfs')

    if smoothed:
        target_dir += '.smoothed'

    target_dir = op.join(target_dir, f'sub-{subject}', 'func')

    if not op.exists(target_dir):
        os.makedirs(target_dir)

    paradigm = [
        pd.read_csv(op.join(
            bids_folder, f'sub-{subject}', f'ses-{session}', 'func',
            f'sub-{subject}_ses-{session}_task-task_run-{run}_events.tsv'),
                    sep='\t') for run in range(1, 9)
    ]
    paradigm = pd.concat(paradigm, keys=range(1, 9),
                         names=['run']).droplevel(1)
    paradigm = paradigm[paradigm.trial_type == 'stimulus 1'].set_index(
        'trial_nr', append=True)

    paradigm['log(n1)'] = np.log(paradigm['n1'])
    print(paradigm)

    data = get_single_trial_surf_data(subject,
                                      session,
                                      bids_folder,
                                      mask=mask,
                                      smoothed=smoothed,
                                      space=space)
    data.index = paradigm.index

    # np.random.seed(666)
    # resample_mask = np.random.choice(data.columns, n_verts)
    # data = data[resample_mask].astype(np.float32)

    pdfs = []
    runs = range(1, 9)

    for test_run in runs:

        test_data, test_paradigm = data.loc[test_run].copy(
        ), paradigm.loc[test_run].copy()
        train_data, train_paradigm = data.drop(
            test_run, level='run').copy(), paradigm.drop(test_run,
                                                         level='run').copy()

        pars = get_prf_parameters(subject,
                                  session,
                                  run=test_run,
                                  mask=mask,
                                  bids_folder=bids_folder,
                                  smoothed=smoothed,
                                  space=space)

        # pars = pars.loc[resample_mask]

        model = GaussianPRF(parameters=pars)
        pred = model.predict(
            paradigm=train_paradigm['log(n1)'].astype(np.float32))

        r2 = get_rsq(train_data, pred)
        print(r2.describe())
        print(r2.sort_values(ascending=False))
        r2_mask = r2.sort_values(ascending=False).index[:n_verts]
        model.apply_mask(r2_mask)

        train_data = train_data[r2_mask].astype(np.float32)
        test_data = test_data[r2_mask].astype(np.float32)

        print(model.parameters)
        print(train_data)

        model.init_pseudoWWT(stimulus_range, model.parameters)
        residfit = ResidualFitter(model, train_data,
                                  train_paradigm['log(n1)'].astype(np.float32))

        omega, dof = residfit.fit(init_sigma2=10.0,
                                  method='t',
                                  max_n_iterations=10000)

        print('DOF', dof)

        bins = np.linspace(np.log(5), np.log(80), 150,
                           endpoint=True).astype(np.float32)

        pdf = model.get_stimulus_pdf(test_data,
                                     bins,
                                     model.parameters,
                                     omega=omega,
                                     dof=dof)

        print(pdf)
        E = (pdf * pdf.columns).sum(1) / pdf.sum(1)

        print(pd.concat((E, test_paradigm['log(n1)']), axis=1))
        print(pingouin.corr(E, test_paradigm['log(n1)']))

        pdfs.append(pdf)

    pdfs = pd.concat(pdfs)

    target_fn = op.join(
        target_dir,
        f'sub-{subject}_ses-{session}_mask-{mask}_nverts-{n_verts}_space-{space}_pars.tsv'
    )
    pdfs.to_csv(target_fn, sep='\t')