def plot_histograms(within_batch_corrs, between_batch_corrs, ncomps, batch_column, hist_folder, verbosity):
    """Plot within- vs. between-batch correlation densities and compare them.

    A density histogram (40 equal-width bins spanning [-1, 1]) is drawn for
    each of the two correlation collections and saved to the path returned by
    ``get_batch_effect_histogram_filename(hist_folder, ncomps)``.  The two
    collections are then compared with a Mann-Whitney U test after dropping
    NaN entries.

    Parameters
    ----------
    within_batch_corrs, between_batch_corrs
        Correlation values; NaN entries are tolerated.  Both must support
        boolean-mask indexing (e.g. numpy arrays).
    ncomps
        Number of components removed; used in the plot title and passed to
        the filename helper.
    batch_column
        Label of the batch column; used in the plot title only.
    hist_folder
        Passed to the filename helper to determine where the plot is saved.
    verbosity
        When >= 2, the NaN-free correlation arrays are printed.

    Returns
    -------
    tuple
        ``(MW_AUC, MW_p)``: the U statistic divided by the product of the two
        NaN-free sample sizes, and the p-value reported by
        ``stats.mannwhitneyu``.
    """
    filename = get_batch_effect_histogram_filename(hist_folder, ncomps)

    hist_bins = np.linspace(-1, 1, 41)
    bin_centers = 0.5 * (hist_bins[1:] + hist_bins[:-1])

    # Due to a change in numpy, "range=(bins.min(), bins.max())" must be
    # passed explicitly:
    # https://github.com/numpy/numpy/issues/7503
    hist_range = (hist_bins.min(), hist_bins.max())
    y_within_batch, _ = np.histogram(within_batch_corrs, bins=hist_bins,
                                     range=hist_range, density=True)
    y_between_batch, _ = np.histogram(between_batch_corrs, bins=hist_bins,
                                      range=hist_range, density=True)

    mean_within_batch_cor = np.nanmean(within_batch_corrs)
    mean_between_batch_cor = np.nanmean(between_batch_corrs)

    colors = pt.color_cycle()
    fig = plt.figure(figsize=(7, 7))
    try:
        ax = fig.add_subplot(1, 1, 1)

        # next(colors) rather than colors.next(): the builtin works on both
        # Python 2 and Python 3 iterators.
        color = next(colors)
        ax.plot(bin_centers, y_within_batch, color + '-',
                label='Within batch (mean = {:.2f})'.format(mean_within_batch_cor))
        color = next(colors)
        ax.plot(bin_centers, y_between_batch, color + '-',
                label='Between batch (mean = {:.2f})'.format(mean_between_batch_cor))

        ax.set_xlim([-1, 1])
        ax.set_xlabel('Average correlation within/between batch(es)')
        ax.set_ylabel('Density')
        lgd = ax.legend(bbox_to_anchor=(1.05, 1), loc=2, title="Correlation origin")
        ax.set_title('{} batch correlations,\n{} components removed'.format(batch_column, ncomps))

        # The legend sits outside the axes, so it must be listed explicitly
        # for the tight bounding box to include it.
        fig.savefig(filename, bbox_extra_artists=[lgd], bbox_inches='tight')
    finally:
        # Release the figure; this function is typically called once per
        # value of ncomps and would otherwise accumulate open figures.
        plt.close(fig)

    # An earlier version of this function computed one-sample t-statistics
    # (H0: mean = 0, sample standard deviation with ddof = 1) and two-sided
    # p-values for each distribution and returned those.  That code has been
    # retired in favour of the Mann-Whitney comparison below.

    # Compute a Mann-Whitney U statistic and p-value comparing the
    # distributions of within-batch and between-batch correlations.
    # U divided by (n_within * n_between) is equivalent to the AUC of a ROC
    # curve comparing the two classes.
    within_batch_corrs_nanfree = within_batch_corrs[~np.isnan(within_batch_corrs)]
    between_batch_corrs_nanfree = between_batch_corrs[~np.isnan(between_batch_corrs)]

    # NOTE(review): which U statistic is returned and which alternative
    # hypothesis is used by default has changed between scipy releases --
    # confirm against the installed scipy version.
    MW_U, MW_p = stats.mannwhitneyu(within_batch_corrs_nanfree, between_batch_corrs_nanfree)

    if verbosity >= 2:
        # Single-argument print(...) behaves identically on Python 2 and 3.
        print(within_batch_corrs_nanfree)
        print(between_batch_corrs_nanfree)

    MW_AUC = MW_U / (np.size(within_batch_corrs_nanfree) * np.size(between_batch_corrs_nanfree))

    return MW_AUC, MW_p
def plot_t_stats(comps_removed, t_stats, batch_column, hist_folder):
    """Plot t-statistics against the number of components removed.

    This function plots the change in t-statistic for both within- and
    between-batch correlations as components (either LDA or SVD) are
    removed.  The plot is saved to the path returned by
    ``get_batch_effect_histogram_ttest_filename(hist_folder)``.

    Parameters
    ----------
    comps_removed
        X-axis values: the numbers of components removed.
    t_stats
        Two-dimensional array indexed as ``t_stats[:, 0]`` (plotted as
        "Within batch") and ``t_stats[:, 1]`` (plotted as "Between batch").
    batch_column
        Label of the batch column; used in the plot title only.
    hist_folder
        Passed to the filename helper to determine where the plot is saved.
    """
    filename = get_batch_effect_histogram_ttest_filename(hist_folder)

    colors = pt.color_cycle()
    fig = plt.figure(figsize=(7, 7))
    try:
        ax = fig.add_subplot(1, 1, 1)

        # next(colors) rather than colors.next(): the builtin works on both
        # Python 2 and Python 3 iterators.
        color = next(colors)
        ax.plot(comps_removed, t_stats[:, 0], color + '-', label='Within batch')
        color = next(colors)
        ax.plot(comps_removed, t_stats[:, 1], color + '-', label='Between batch')

        ax.set_xlabel('Number of components removed')
        ax.set_ylabel(r'$t$-statistic' '\n' r'($H_0: \bar \mu = 0$)')
        lgd = ax.legend(bbox_to_anchor=(1.05, 1), loc=2, title='Correlation origin')
        ax.set_title(r'$t$-statistics of {}' '\nbatch correlation distributions'.format(batch_column))

        # The legend sits outside the axes, so it must be listed explicitly
        # for the tight bounding box to include it.
        fig.savefig(filename, bbox_extra_artists=[lgd], bbox_inches='tight')
    finally:
        # Release the figure so repeated calls do not accumulate open figures.
        plt.close(fig)