def quantile_accuracy(gene_targets, gene_preds, gene_stat, out_pdf, numq=4):
    """ Plot accuracy (PearsonR) in quantile bins across targets.

    Rows of gene_targets / gene_preds are split into numq bins using
    stats.quantile_indexes(gene_stat, numq), and the Pearson correlation
    between targets and predictions is computed per bin and per target
    column.

    Args:
      gene_targets: 2D array, rows are the binned items (genes, going by
        the naming) and columns are targets.
      gene_preds: 2D array of predictions, indexed like gene_targets.
      gene_stat: per-row statistic handed to stats.quantile_indexes.
      out_pdf: path of the violin plot. Every other output name is derived
        from out_pdf[:-4], so a four-character extension such as '.pdf'
        is assumed.
      numq: number of quantile bins.

    Writes:
      <base>.csv              per-(quantile, target) PearsonR table
      <base>.txt              mean and median PearsonR per quantile
      out_pdf                 violin plot with per-target lines
      <base>_qt.txt           targets sampled for the scatter plots
      <base>_pq<i>_q<j>.pdf   scatter plot per sampled target and quantile
    """
    out_base = out_pdf[:-4]

    # plot PearsonR in variance statistic bins
    # NOTE(review): each quant_indexes[qi] is used as a row index below;
    # presumably a list/array of row positions - confirm against stats.
    quant_indexes = stats.quantile_indexes(gene_stat, numq)

    quantiles_series = []
    targets_series = []
    pcor_series = []

    for qi in range(numq):
        # slice quantile
        gene_targets_quant = gene_targets[quant_indexes[qi]].astype('float32')
        gene_preds_quant = gene_preds[quant_indexes[qi]].astype('float32')

        # compute target PearsonR
        for ti in range(gene_targets_quant.shape[1]):
            pcor, _ = pearsonr(gene_targets_quant[:, ti],
                               gene_preds_quant[:, ti])

            quantiles_series.append(qi)
            targets_series.append(ti)
            pcor_series.append(pcor)

    # construct DataFrame
    df_quant = pd.DataFrame({'Quantile': quantiles_series,
                             'Target': targets_series,
                             'PearsonR': pcor_series})
    df_quant.to_csv('%s.csv' % out_base)

    # print summary table; the context manager closes the file even if a
    # row fails to format
    with open('%s.txt' % out_base, 'w') as table_out:
        for qi in range(numq):
            quantile_cors = df_quant[df_quant.Quantile == qi].PearsonR
            print('%2d %.4f %.4f' %
                  (qi, np.mean(quantile_cors), np.median(quantile_cors)),
                  file=table_out)

    # construct figure
    plt.figure()

    # plot individual targets as light lines
    for ti in range(gene_targets.shape[1]):
        df_quant_target = df_quant[df_quant.Target == ti]
        plt.plot(df_quant_target.Quantile, df_quant_target.PearsonR,
                 alpha=0.1)

    # plot PearsonR distributions in quantiles
    sns.violinplot(x='Quantile', y='PearsonR', data=df_quant, color='tomato')

    plt.savefig(out_pdf)
    plt.close()

    # sort targets by their decrease: PearsonR in the last quantile divided
    # by PearsonR in the first
    target_ratios = []
    for ti in range(gene_targets.shape[1]):
        df_quant_target = df_quant[df_quant.Target == ti]
        # rows were appended quantile by quantile, so they must span 0..numq-1
        assert df_quant_target.Quantile.iloc[0] == 0
        assert df_quant_target.Quantile.iloc[-1] == numq - 1
        cor_ratio = (df_quant_target.PearsonR.iloc[-1] /
                     df_quant_target.PearsonR.iloc[0])
        target_ratios.append((cor_ratio, ti))
    target_ratios = sorted(target_ratios)

    # take 10 samples across (11 evenly spaced positions in the sorted list)
    pct_indexes = np.linspace(0, len(target_ratios) - 1, 10 + 1).astype('int')

    sns.set(font_scale=1.2, style='ticks')

    # write quantile targets
    with open('%s_qt.txt' % out_base, 'w') as table_out:
        # scatter plot each quantile
        for qi in range(numq):
            # slice quantile
            gene_targets_quant = gene_targets[quant_indexes[qi]].astype(
                'float32')
            gene_preds_quant = gene_preds[quant_indexes[qi]].astype('float32')

            for pqi, pct_i in enumerate(pct_indexes):
                ti = target_ratios[pct_i][1]

                # target_ratios is sorted by ratio, so it must be indexed by
                # the sampled position pct_i (not by the target id ti) to
                # report the (ratio, target) pair actually being plotted
                print(qi, pqi, ti, target_ratios[pct_i], file=table_out)

                qout_pdf = '%s_pq%d_q%d.pdf' % (out_base, pqi, qi)
                plots.jointplot(gene_targets_quant[:, ti],
                                gene_preds_quant[:, ti],
                                qout_pdf, alpha=0.8, point_size=8, kind='reg',
                                figsize=5, x_label='log2 Experiment',
                                y_label='log2 Prediction')
def variance_accuracy(gene_targets, gene_preds, out_prefix, log_pseudo=None):
    """ Relate per-gene prediction error to expression mean and spread.

    Assumes the targets and predictions have been normalized.

    Per-row mean, max, standard deviation and mean squared error are
    computed across the target axis. Rows with mean > 0.5 and max > 3 are
    kept, and the retained statistics are plotted against each other.
    MSE is then box-plotted in five coefficient-of-variation bins, and
    quantile_accuracy is run on CV and on standard deviation.

    Args:
      gene_targets: 2D array, rows reduced over axis 1 (genes x targets,
        going by the naming).
      gene_preds: 2D array of predictions, same layout as gene_targets.
      out_prefix: prefix for every output file name.
      log_pseudo: accepted but not used by this function.
    """
    print('gene_targets', gene_targets.shape)

    # per-row summaries, accumulated in float64
    mean_all = np.mean(gene_targets, axis=1, dtype='float64')
    max_all = np.max(gene_targets, axis=1)
    std_all = np.std(gene_targets, axis=1, dtype='float64')
    squared_error = np.power(gene_targets - gene_preds, 2)
    mse_all = squared_error.mean(axis=1, dtype='float64')

    # keep only sufficiently expressed rows
    expr_indexes = (mean_all > 0.5) & (max_all > 3)
    gene_mean = mean_all[expr_indexes]
    gene_std = std_all[expr_indexes]
    gene_mse = mse_all[expr_indexes]
    gene_targets = gene_targets[expr_indexes, :]
    gene_preds = gene_preds[expr_indexes, :]
    print('%d "expressed genes" considered in variance plots' %
          expr_indexes.sum())

    sns.set(style='ticks', font_scale=1.3)

    # plot at most 2000 randomly chosen rows
    num_expressed = len(gene_mse)
    if num_expressed >= 2000:
        ri = np.random.choice(np.arange(num_expressed), 2000, replace=False)
    else:
        ri = np.arange(num_expressed)

    # coefficient of variation
    gene_cv = gene_std / gene_mean

    # (file suffix, x values, y values, x label, y label)
    mean_label = 'Mean across experiments'
    std_label = 'Std Dev across experiments'
    mse_label = 'Mean squared prediction error'
    panels = (
        ('mean-std', gene_mean, gene_std, mean_label, std_label),
        ('mean', gene_mean, gene_mse, mean_label, mse_label),
        ('std', gene_std, gene_mse, std_label, mse_label),
        ('cv', gene_cv, gene_mse, 'Coef Var across experiments', mse_label),
    )
    for suffix, x_values, y_values, x_text, y_text in panels:
        plots.jointplot(
            x_values[ri],
            y_values[ri],
            '%s_%s.pdf' % (out_prefix, suffix),
            point_size=10,
            cor='spearmanr',
            x_label=x_text,
            y_label=y_text)

    # MSE distributions in CV bins
    numq = 5
    quant_indexes = stats.quantile_indexes(gene_cv, numq)
    quant_rows = [[qi, gene_mse[gi]]
                  for qi in range(numq)
                  for gi in quant_indexes[qi]]
    quant_mse = pd.DataFrame(quant_rows, columns=['Quantile', 'MSE'])
    quant_mse.to_csv('%s_quant.txt' % out_prefix, sep='\t')

    plt.figure()
    sns.boxplot(
        x='Quantile',
        y='MSE',
        data=quant_mse,
        palette=sns.cubehelix_palette(numq),
        showfliers=False)
    ax = plt.gca()
    ax.grid(True, linestyle=':')
    ax.set_ylabel('Mean squared prediction error')
    plt.savefig('%s_quant.pdf' % out_prefix)
    plt.close()

    # PearsonR by quantile of CV, then of standard deviation
    for stat_values, suffix in ((gene_cv, 'qcv'), (gene_std, 'qstd')):
        quantile_accuracy(gene_targets, gene_preds, stat_values,
                          '%s_%s.pdf' % (out_prefix, suffix), 4)
def replicate_correlations(replicate_lists,
                           gene_targets,
                           gene_preds,
                           target_indexes,
                           out_prefix,
                           scatter_plots=False):
    """ Study replicate correlations.

    For every label with at least two replicates, and at least one of them
    in target_indexes, the first two replicate columns are log2(x + 1)
    transformed. Their PearsonR with each other is compared to the mean
    PearsonR of each replicate against its own prediction.

    Args:
      replicate_lists: dict mapping a label to a sequence of target column
        indexes.
      gene_targets: 2D array; column ti holds the values for target ti.
      gene_preds: 2D array of predictions, same layout as gene_targets.
      target_indexes: iterable of target column indexes of interest.
      out_prefix: prefix for every output file name.
      scatter_plots: if True, also write per-label scatter plots of a
        random sample of at most 1000 rows.

    Writes:
      <out_prefix>.txt           one row per compared label
      <out_prefix>_scatter.pdf   replicate R versus prediction R
      <out_prefix>_s<i>*.pdf     optional per-label scatter plots
    """
    # for intersections
    target_set = set(target_indexes)

    rep_cors = []
    pred_cors = []

    sns.set(style='ticks', font_scale=1.3)

    num_genes = gene_targets.shape[0]
    # sampling without replacement cannot draw more rows than exist
    scatter_sample = min(1000, num_genes)

    # counter of compared labels; used in file names and the table
    li = 0

    # the context manager closes the table even if a correlation or plot
    # raises part way through
    with open('%s.txt' % out_prefix, 'w') as table_out:
        for label in sorted(replicate_lists.keys()):
            replicates = replicate_lists[label]
            if len(replicates) < 2 or not (target_set & set(replicates)):
                continue

            # NOTE(review): the filter only requires that some replicate is
            # in target_indexes, while the first two are always compared -
            # confirm that is intended.
            ti1 = replicates[0]
            ti2 = replicates[1]

            # retrieve targets
            gene_targets_rep1 = np.log2(
                gene_targets[:, ti1].astype('float32') + 1)
            gene_targets_rep2 = np.log2(
                gene_targets[:, ti2].astype('float32') + 1)

            # retrieve predictions
            gene_preds_rep1 = np.log2(gene_preds[:, ti1].astype('float32') + 1)
            gene_preds_rep2 = np.log2(gene_preds[:, ti2].astype('float32') + 1)

            #####################################
            # replicate

            # compute replicate correlation
            rcor, _ = pearsonr(gene_targets_rep1, gene_targets_rep2)
            rep_cors.append(rcor)

            #####################################
            # prediction

            # compute prediction correlation, averaged over both replicates
            pcor1, _ = pearsonr(gene_targets_rep1, gene_preds_rep1)
            pcor2, _ = pearsonr(gene_targets_rep2, gene_preds_rep2)
            pcor = 0.5 * pcor1 + 0.5 * pcor2
            pred_cors.append(pcor)

            if scatter_plots:
                # the same random rows are used for all three plots
                gene_indexes = np.random.choice(
                    np.arange(num_genes), scatter_sample, replace=False)

                # scatter plot rep vs rep
                out_pdf = '%s_s%d.pdf' % (out_prefix, li)
                plots.regplot(
                    gene_targets_rep1[gene_indexes],
                    gene_targets_rep2[gene_indexes],
                    out_pdf,
                    poly_order=3,
                    alpha=0.3,
                    x_label='log2 Replicate 1',
                    y_label='log2 Replicate 2')

                # scatter plot rep vs pred
                out_pdf = '%s_s%d_rep1.pdf' % (out_prefix, li)
                plots.regplot(
                    gene_targets_rep1[gene_indexes],
                    gene_preds_rep1[gene_indexes],
                    out_pdf,
                    poly_order=3,
                    alpha=0.3,
                    x_label='log2 Experiment',
                    y_label='log2 Prediction')

                # scatter plot rep vs pred
                out_pdf = '%s_s%d_rep2.pdf' % (out_prefix, li)
                plots.regplot(
                    gene_targets_rep2[gene_indexes],
                    gene_preds_rep2[gene_indexes],
                    out_pdf,
                    poly_order=3,
                    alpha=0.3,
                    x_label='log2 Experiment',
                    y_label='log2 Prediction')

            #####################################
            # table

            print(
                '%4d %4d %4d %7.4f %7.4f %s' %
                (li, ti1, ti2, rcor, pcor, label),
                file=table_out)

            # update counter
            li += 1

    #######################################################
    # scatter plot replicate versus prediction correlation

    rep_cors = np.array(rep_cors)
    pred_cors = np.array(pred_cors)

    out_pdf = '%s_scatter.pdf' % out_prefix
    plots.jointplot(
        rep_cors,
        pred_cors,
        out_pdf,
        square=True,
        x_label='Replicate R',
        y_label='Prediction R')
def cor_table(gene_targets,
              gene_preds,
              target_ids,
              target_labels,
              target_indexes,
              out_file,
              draw_plots=False):
    """ Print a table and plot the distribution of target correlations.

    For each target column, targets and predictions are converted to
    float32 and log2(x + 1) transformed. Spearman and Pearson correlations
    are then computed over all rows and over the rows whose transformed
    target is positive.

    Args:
      gene_targets: 2D array; column ti holds the values for target ti.
      gene_preds: 2D array of predictions, same layout as gene_targets.
      target_ids: sequence indexed by target index, written to the table.
      target_labels: sequence indexed by target index, written to the
        table.
      target_indexes: target column indexes to evaluate.
      out_file: path of the text table to write.
      draw_plots: if True, also plot the correlations against the summed
        log2 target signal, next to out_file.

    Returns:
      1D numpy array of all-row Pearson correlations, one per entry of
      target_indexes.
    """
    cors = []
    cors_nz = []

    # the context manager closes the table even if a correlation raises
    with open(out_file, 'w') as table_out:
        for ti in target_indexes:
            # convert targets and predictions to float32
            gti = np.array(gene_targets[:, ti], dtype='float32')
            gpi = np.array(gene_preds[:, ti], dtype='float32')

            # log transform
            gti = np.log2(gti + 1)
            gpi = np.log2(gpi + 1)

            # compute correlations
            scor, _ = spearmanr(gti, gpi)
            pcor, _ = pearsonr(gti, gpi)
            cors.append(pcor)

            # compute non-zero correlations
            nzi = (gti > 0)
            if nzi.sum() >= 2:
                scor_nz, _ = spearmanr(gti[nzi], gpi[nzi])
                pcor_nz, _ = pearsonr(gti[nzi], gpi[nzi])
            else:
                # a correlation needs at least two points; record NaN for
                # this target instead of letting pearsonr abort the table
                scor_nz = np.nan
                pcor_nz = np.nan
            cors_nz.append(pcor_nz)

            # print
            cols = (ti, scor, pcor, scor_nz, pcor_nz, target_ids[ti],
                    target_labels[ti])
            print('%-4d %7.3f %7.3f %7.3f %7.3f %s %s' % cols, file=table_out)

    cors = np.array(cors)
    cors_nz = np.array(cors_nz)

    if draw_plots:
        # plot correlation distribution
        out_base = os.path.splitext(out_file)[0]
        sns.set(style='ticks', font_scale=1.3)

        # plot correlations versus target signal
        gene_targets_log = np.log2(gene_targets[:, target_indexes] + 1)
        target_signal = gene_targets_log.sum(axis=0)
        plots.jointplot(
            target_signal,
            cors,
            '%s_sig.pdf' % out_base,
            x_label='Aligned TSS reads',
            y_label='Pearson R',
            cor=None,
            table=True)

        # plot nonzero correlations versus target signal
        plots.jointplot(
            target_signal,
            cors_nz,
            '%s_nz_sig.pdf' % out_base,
            x_label='Aligned TSS reads',
            y_label='Pearson R',
            cor=None,
            table=True)

    return cors
def main():
    """ Fit a Cauchy distribution to each SAD target and write QC output.

    Takes one positional argument: a directory searched for
    '<dir>/*/sad.h5' files. Existing fit and normalization datasets are
    removed from every file. A sample of SNPs is drawn across files with
    sample_sad, and one Cauchy (loc, scale) is fit per target and written
    back into every file. QC tables and plots go under the -o directory.

    Options:
      -o  output directory [default: sad_norm]
      -s  number of SNPs to sample for the fit [default: 100000]
    """
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    parser.add_option('-o', dest='out_dir', default='sad_norm')
    parser.add_option(
        '-s',
        dest='sample',
        default=100000,
        type='int',
        help='Number of SNPs to sample for fit [Default: %default]')
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error('Must provide SAD HDF5 path')
    sad_h5_path = args[0]

    # retrieve chromosome SAD HDF5 files
    chr_sad_h5_files = sorted(glob.glob('%s/*/sad.h5' % sad_h5_path))
    if not chr_sad_h5_files:
        # reported as a usage error; an assert would vanish under python -O
        parser.error('No sad.h5 files found under %s' % sad_h5_path)

    # clean out any existing fits
    # count SNPs across chromosomes
    stale_keys = ('target_cauchy_fit_loc', 'target_cauchy_fit_scale',
                  'target_cauchy_norm_loc', 'target_cauchy_norm_scale')
    num_snps = 0
    for chr_sad_h5_file in chr_sad_h5_files:
        with h5py.File(chr_sad_h5_file, 'r+') as chr_sad_h5:
            # each dataset is removed independently, so a file left half
            # written by an earlier run is still cleaned
            for key in stale_keys:
                if key in chr_sad_h5:
                    del chr_sad_h5[key]

            # count SNPs
            num_snps += chr_sad_h5['SAD'].shape[0]
            num_targets = chr_sad_h5['SAD'].shape[-1]

    # sample SNPs across chromosomes
    sad = sample_sad(chr_sad_h5_files, options.sample, num_snps, num_targets)

    # initialize fit parameters
    target_cauchy_fit_loc = np.zeros(num_targets)
    target_cauchy_fit_scale = np.zeros(num_targets)

    # fit parameters
    for ti in range(num_targets):
        print('Fitting t%d' % ti, flush=True)
        cp = cauchy.fit(sad[:, ti])
        target_cauchy_fit_loc[ti] = cp[0]
        target_cauchy_fit_scale[ti] = cp[1]
    del sad

    # write across chromosomes
    for chr_sad_h5_file in chr_sad_h5_files:
        with h5py.File(chr_sad_h5_file, 'r+') as chr_sad_h5:
            chr_sad_h5.create_dataset(
                'target_cauchy_fit_loc', data=target_cauchy_fit_loc)
            chr_sad_h5.create_dataset(
                'target_cauchy_fit_scale', data=target_cauchy_fit_scale)

    # compute normalization parameters
    # NOTE(review): SAD5 is constructed only for its side effect;
    # presumably its constructor computes and stores the
    # target_cauchy_norm_* datasets - confirm against the SAD5 class.
    for chr_sad_h5_file in chr_sad_h5_files:
        chr_sad5 = SAD5(chr_sad_h5_file)

    # QC fit table
    os.makedirs(options.out_dir, exist_ok=True)
    with open('%s/fits.txt' % options.out_dir, 'w') as fit_out:
        for ti in range(num_targets):
            print('%-4d %7.1e %7.1e' % (ti, target_cauchy_fit_loc[ti],
                                        target_cauchy_fit_scale[ti]),
                  file=fit_out)

    # QC quantiles
    quantile_dir = '%s/quantiles' % options.out_dir
    os.makedirs(quantile_dir, exist_ok=True)

    sad_qc = sample_sad(chr_sad_h5_files, 2048, num_snps, num_targets)
    for ti in np.linspace(0, num_targets - 1, 64, dtype='int'):
        # compute cauchy and argsort quantiles
        cauchy_q = cauchy.cdf(
            sad_qc[:, ti],
            loc=target_cauchy_fit_loc[ti],
            scale=target_cauchy_fit_scale[ti])
        sort_i = np.argsort(sad_qc[:, ti])

        # fitted CDF of the sorted values against evenly spaced quantiles
        quantile_pdf = '%s/t%d.pdf' % (quantile_dir, ti)
        jointplot(
            np.linspace(0, 1, len(sort_i)),
            cauchy_q[sort_i],
            quantile_pdf,
            square=True,
            cor=None,
            x_label='Empirical',
            y_label='Cauchy')

    # QC plots
    norm_dir = '%s/norm' % options.out_dir
    os.makedirs(norm_dir, exist_ok=True)

    # only the first file is used for the raw-versus-normalized plots
    chr_sad5 = SAD5(chr_sad_h5_files[0])
    qc_sample = 2048
    if qc_sample < chr_sad5.num_snps:
        # indexes are sorted before being used to index sad_matrix
        ri = sorted(
            np.random.choice(
                np.arange(chr_sad5.num_snps), size=qc_sample, replace=False))
    else:
        ri = np.arange(chr_sad5.num_snps)
    qc_sad_raw = chr_sad5.sad_matrix[ri]
    qc_sad_norm = chr_sad5[ri]
    for ti in np.linspace(0, num_targets - 1, 32, dtype='int'):
        # sns.jointplot opens its own figure, so no plt.figure() is made
        # here; x and y are passed by keyword because newer seaborn
        # releases no longer accept them positionally
        sns.jointplot(
            x=qc_sad_raw[:, ti],
            y=qc_sad_norm[:, ti],
            joint_kws={
                'alpha': 0.5,
                's': 10
            })
        plt.savefig('%s/t%d.pdf' % (norm_dir, ti))
        plt.close()
def quantile_accuracy(gene_targets, gene_preds, gene_stat, out_pdf, numq=4):
    """Plot accuracy (PearsonR) in quantile bins across targets.

    Rows of gene_targets / gene_preds are split into numq bins using
    quantile_indexes(gene_stat, numq), and the Pearson correlation between
    targets and predictions is computed per bin and per target column.

    Args:
      gene_targets: 2D array, rows are the binned items (genes, going by
        the naming) and columns are targets.
      gene_preds: 2D array of predictions, indexed like gene_targets.
      gene_stat: per-row statistic handed to quantile_indexes.
      out_pdf: path of the violin plot. Every other output name is derived
        from out_pdf[:-4], so a four-character extension such as ".pdf"
        is assumed.
      numq: number of quantile bins.

    Writes:
      <base>.csv              per-(quantile, target) PearsonR table
      <base>.txt              mean and median PearsonR per quantile
      out_pdf                 violin plot with per-target lines
      <base>_qt.txt           targets sampled for the scatter plots
      <base>_pq<i>_q<j>.pdf   scatter plot per sampled target and quantile
    """
    out_base = out_pdf[:-4]

    # plot PearsonR in variance statistic bins
    # NOTE(review): each quant_indexes[qi] is used as a row index below;
    # presumably a list/array of row positions - confirm against the helper.
    quant_indexes = quantile_indexes(gene_stat, numq)

    quantiles_series = []
    targets_series = []
    pcor_series = []

    for qi in range(numq):
        # slice quantile
        gene_targets_quant = gene_targets[quant_indexes[qi]].astype("float32")
        gene_preds_quant = gene_preds[quant_indexes[qi]].astype("float32")

        # compute target PearsonR
        for ti in range(gene_targets_quant.shape[1]):
            pcor, _ = pearsonr(gene_targets_quant[:, ti], gene_preds_quant[:, ti])

            quantiles_series.append(qi)
            targets_series.append(ti)
            pcor_series.append(pcor)

    # construct DataFrame
    df_quant = pd.DataFrame(
        {
            "Quantile": quantiles_series,
            "Target": targets_series,
            "PearsonR": pcor_series,
        }
    )
    df_quant.to_csv("%s.csv" % out_base)

    # print summary table; the context manager closes the file even if a
    # row fails to format
    with open("%s.txt" % out_base, "w") as table_out:
        for qi in range(numq):
            quantile_cors = df_quant[df_quant.Quantile == qi].PearsonR
            print(
                "%2d %.4f %.4f"
                % (qi, np.mean(quantile_cors), np.median(quantile_cors)),
                file=table_out,
            )

    # construct figure
    plt.figure()

    # plot individual targets as light lines
    for ti in range(gene_targets.shape[1]):
        df_quant_target = df_quant[df_quant.Target == ti]
        plt.plot(df_quant_target.Quantile, df_quant_target.PearsonR, alpha=0.1)

    # plot PearsonR distributions in quantiles
    sns.violinplot(x="Quantile", y="PearsonR", data=df_quant, color="tomato")

    plt.savefig(out_pdf)
    plt.close()

    # sort targets by their decrease: PearsonR in the last quantile divided
    # by PearsonR in the first
    target_ratios = []
    for ti in range(gene_targets.shape[1]):
        df_quant_target = df_quant[df_quant.Target == ti]
        # rows were appended quantile by quantile, so they must span 0..numq-1
        assert df_quant_target.Quantile.iloc[0] == 0
        assert df_quant_target.Quantile.iloc[-1] == numq - 1
        cor_ratio = df_quant_target.PearsonR.iloc[-1] / df_quant_target.PearsonR.iloc[0]
        target_ratios.append((cor_ratio, ti))
    target_ratios = sorted(target_ratios)

    # take 10 samples across (11 evenly spaced positions in the sorted list)
    pct_indexes = np.linspace(0, len(target_ratios) - 1, 10 + 1).astype("int")

    sns.set(font_scale=1.2, style="ticks")

    # write quantile targets
    with open("%s_qt.txt" % out_base, "w") as table_out:
        # scatter plot each quantile
        for qi in range(numq):
            # slice quantile
            gene_targets_quant = gene_targets[quant_indexes[qi]].astype("float32")
            gene_preds_quant = gene_preds[quant_indexes[qi]].astype("float32")

            for pqi, pct_i in enumerate(pct_indexes):
                ti = target_ratios[pct_i][1]

                # target_ratios is sorted by ratio, so it must be indexed by
                # the sampled position pct_i (not by the target id ti) to
                # report the (ratio, target) pair actually being plotted
                print(qi, pqi, ti, target_ratios[pct_i], file=table_out)

                qout_pdf = "%s_pq%d_q%d.pdf" % (out_base, pqi, qi)
                plots.jointplot(
                    gene_targets_quant[:, ti],
                    gene_preds_quant[:, ti],
                    qout_pdf,
                    alpha=0.8,
                    point_size=8,
                    kind="reg",
                    figsize=5,
                    x_label="log2 Experiment",
                    y_label="log2 Prediction",
                )
def main():
    """Fit a Cauchy distribution to each SAD target and write QC output.

    Takes one positional argument: a directory searched for
    "<dir>/*/sad.h5" files. Existing fit and normalization datasets are
    removed from every file. A sample of SNPs is drawn across files with
    sample_sad, and one Cauchy (loc, scale) is fit per target and written
    back into every file. QC tables and plots go under the -o directory.

    Options:
      -o  output directory [default: sad_norm]
      -s  number of SNPs to sample for the fit [default: 100000]
    """
    usage = "usage: %prog [options] arg"
    parser = OptionParser(usage)
    parser.add_option("-o", dest="out_dir", default="sad_norm")
    parser.add_option(
        "-s",
        dest="sample",
        default=100000,
        type="int",
        help="Number of SNPs to sample for fit [Default: %default]",
    )
    (options, args) = parser.parse_args()

    if len(args) != 1:
        parser.error("Must provide SAD HDF5 path")
    sad_h5_path = args[0]

    # retrieve chromosome SAD HDF5 files
    chr_sad_h5_files = sorted(glob.glob("%s/*/sad.h5" % sad_h5_path))
    if not chr_sad_h5_files:
        # reported as a usage error; an assert would vanish under python -O
        parser.error("No sad.h5 files found under %s" % sad_h5_path)

    # clean out any existing fits
    # count SNPs across chromosomes
    stale_keys = (
        "target_cauchy_fit_loc",
        "target_cauchy_fit_scale",
        "target_cauchy_norm_loc",
        "target_cauchy_norm_scale",
    )
    num_snps = 0
    for chr_sad_h5_file in chr_sad_h5_files:
        with h5py.File(chr_sad_h5_file, "r+") as chr_sad_h5:
            # each dataset is removed independently, so a file left half
            # written by an earlier run is still cleaned
            for key in stale_keys:
                if key in chr_sad_h5:
                    del chr_sad_h5[key]

            # count SNPs
            num_snps += chr_sad_h5["SAD"].shape[0]
            num_targets = chr_sad_h5["SAD"].shape[-1]

    # sample SNPs across chromosomes
    sad = sample_sad(chr_sad_h5_files, options.sample, num_snps, num_targets)

    # initialize fit parameters
    target_cauchy_fit_loc = np.zeros(num_targets)
    target_cauchy_fit_scale = np.zeros(num_targets)

    # fit parameters
    for ti in range(num_targets):
        print("Fitting t%d" % ti, flush=True)
        cp = cauchy.fit(sad[:, ti])
        target_cauchy_fit_loc[ti] = cp[0]
        target_cauchy_fit_scale[ti] = cp[1]
    del sad

    # write across chromosomes
    for chr_sad_h5_file in chr_sad_h5_files:
        with h5py.File(chr_sad_h5_file, "r+") as chr_sad_h5:
            chr_sad_h5.create_dataset(
                "target_cauchy_fit_loc", data=target_cauchy_fit_loc
            )
            chr_sad_h5.create_dataset(
                "target_cauchy_fit_scale", data=target_cauchy_fit_scale
            )

    # compute normalization parameters
    # NOTE(review): SAD5 is constructed only for its side effect;
    # presumably its constructor computes and stores the
    # target_cauchy_norm_* datasets - confirm against the SAD5 class.
    for chr_sad_h5_file in chr_sad_h5_files:
        chr_sad5 = SAD5(chr_sad_h5_file)

    # QC fit table
    os.makedirs(options.out_dir, exist_ok=True)
    with open("%s/fits.txt" % options.out_dir, "w") as fit_out:
        for ti in range(num_targets):
            print(
                "%-4d %7.1e %7.1e"
                % (ti, target_cauchy_fit_loc[ti], target_cauchy_fit_scale[ti]),
                file=fit_out,
            )

    # QC quantiles
    quantile_dir = "%s/quantiles" % options.out_dir
    os.makedirs(quantile_dir, exist_ok=True)

    sad_qc = sample_sad(chr_sad_h5_files, 2048, num_snps, num_targets)
    for ti in np.linspace(0, num_targets - 1, 64, dtype="int"):
        # compute cauchy and argsort quantiles
        cauchy_q = cauchy.cdf(
            sad_qc[:, ti],
            loc=target_cauchy_fit_loc[ti],
            scale=target_cauchy_fit_scale[ti],
        )
        sort_i = np.argsort(sad_qc[:, ti])

        # fitted CDF of the sorted values against evenly spaced quantiles
        quantile_pdf = "%s/t%d.pdf" % (quantile_dir, ti)
        jointplot(
            np.linspace(0, 1, len(sort_i)),
            cauchy_q[sort_i],
            quantile_pdf,
            square=True,
            cor=None,
            x_label="Empirical",
            y_label="Cauchy",
        )

    # QC plots
    norm_dir = "%s/norm" % options.out_dir
    os.makedirs(norm_dir, exist_ok=True)

    # only the first file is used for the raw-versus-normalized plots
    chr_sad5 = SAD5(chr_sad_h5_files[0])
    qc_sample = 2048
    if qc_sample < chr_sad5.num_snps:
        # indexes are sorted before being used to index sad_matrix
        ri = sorted(
            np.random.choice(
                np.arange(chr_sad5.num_snps), size=qc_sample, replace=False
            )
        )
    else:
        ri = np.arange(chr_sad5.num_snps)
    qc_sad_raw = chr_sad5.sad_matrix[ri]
    qc_sad_norm = chr_sad5[ri]
    for ti in np.linspace(0, num_targets - 1, 32, dtype="int"):
        # sns.jointplot opens its own figure, so no plt.figure() is made
        # here; x and y are passed by keyword because newer seaborn
        # releases no longer accept them positionally
        sns.jointplot(
            x=qc_sad_raw[:, ti],
            y=qc_sad_norm[:, ti],
            joint_kws={"alpha": 0.5, "s": 10},
        )
        plt.savefig("%s/t%d.pdf" % (norm_dir, ti))
        plt.close()