def aei_plot_single(self, cissnp, ax=None, focus_snp=None):
    """Scatter-plot -log10 AEI p-values for one column of ``self.pvalues``.

    Each SNP is drawn at ``self.snp_annot['pos'] / 1e6``, sized from
    ``self.maf`` and coloured by the values ``calculate_ld(self.geno, snp)``
    returns for the highlighted SNP.  The highlighted SNP is then passed to
    ``snp_arrow``.

    Arguments
    ---------
    cissnp - column label of ``self.pvalues`` whose p-values are plotted
    ax - axis to plot into; a new figure and axis are created when None
    focus_snp - SNP to highlight; when not given, the SNP with the largest
        -log10 p-value (NaNs ignored) is used

    Returns
    -------
    Whatever ``snp_arrow`` returns (presumably the axis -- confirm against
    its definition).  If that is None, the figure created here is returned,
    or None when the caller supplied ``ax``.
    """
    x_scale = 1e6
    size_maf = (200 * self.maf) + 20
    pos = np.asarray(self.snp_annot.loc[:, 'pos'], dtype=np.uint64) / x_scale

    # ``fig`` stays None when the caller supplies the axis, so the final
    # fallback can never hit an unbound local.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(nrows=1, ncols=1, sharey=False, sharex=True)
        # White background set on the patch directly: the ``axisbg`` keyword
        # is not accepted by current matplotlib releases.
        ax.patch.set_facecolor('#FFFFFF')

    adj_pvalue = -1 * np.log10(self.pvalues.loc[:, cissnp])
    if focus_snp:
        print('focus snp')
        snp = focus_snp
    else:
        # :TODO fix for both use cases
        snp = self.pvalues.index[np.nanargmax(adj_pvalue.values)]
        print(snp)

    # Integer position of the highlighted SNP; IndexError if it is absent.
    snp_iloc = [i for i, label in enumerate(adj_pvalue.index)
                if label == snp][0]
    color1 = calculate_ld(self.geno, snp)[adj_pvalue.index].values
    ax.scatter(pos, adj_pvalue, s=size_maf, c=color1)
    ax.set_ylabel(r'-$log_{10}$ AEI p-value')

    # Leave one sixth of headroom above the top point.  nanmax keeps the
    # limit finite when some p-values are NaN (built-in max is
    # order-dependent in that case).
    top = np.nanmax(adj_pvalue.values)
    ax.set_ylim((-0.01, top + top / 6.0))

    # ``pos`` is a numpy array, so the p-value must be read by position too.
    ax = snp_arrow(pos[snp_iloc], adj_pvalue.iloc[snp_iloc], snp, ax)
    if ax is not None:
        return ax
    return fig
def aei_plot_single(self, cissnp, ax=None, focus_snp=None):
    """Scatter-plot -log10 AEI p-values for one column of ``self.pvalues``.

    Each SNP is drawn at ``self.snp_annot['pos'] / 1e6``, sized from
    ``self.maf`` and coloured by the values ``calculate_ld(self.geno, snp)``
    returns for the highlighted SNP.  The highlighted SNP is then passed to
    ``snp_arrow``.

    Arguments
    ---------
    cissnp - column label of ``self.pvalues`` whose p-values are plotted
    ax - axis to plot into; a new figure and axis are created when None
    focus_snp - SNP to highlight; when not given, the SNP with the largest
        -log10 p-value (NaNs ignored) is used

    Returns
    -------
    Whatever ``snp_arrow`` returns (presumably the axis -- confirm against
    its definition).  If that is None, the figure created here is returned,
    or None when the caller supplied ``ax``.
    """
    x_scale = 1e6
    size_maf = (200 * self.maf) + 20
    pos = np.asarray(self.snp_annot.loc[:, 'pos'], dtype=np.uint64) / x_scale

    # ``fig`` stays None when the caller supplies the axis, so the final
    # fallback can never hit an unbound local.
    fig = None
    if ax is None:
        fig, ax = plt.subplots(nrows=1, ncols=1, sharey=False, sharex=True)
        # White background set on the patch directly: the ``axisbg`` keyword
        # is not accepted by current matplotlib releases.
        ax.patch.set_facecolor('#FFFFFF')

    adj_pvalue = -1 * np.log10(self.pvalues.loc[:, cissnp])
    if focus_snp:
        print('focus snp')
        snp = focus_snp
    else:
        # :TODO fix for both use cases
        snp = self.pvalues.index[np.nanargmax(adj_pvalue.values)]
        print(snp)

    # Integer position of the highlighted SNP; IndexError if it is absent.
    snp_iloc = [i for i, label in enumerate(adj_pvalue.index)
                if label == snp][0]
    color1 = calculate_ld(self.geno, snp)[adj_pvalue.index].values
    ax.scatter(pos, adj_pvalue, s=size_maf, c=color1)
    ax.set_ylabel(r'-$log_{10}$ AEI p-value')

    # Leave one sixth of headroom above the top point.  nanmax keeps the
    # limit finite when some p-values are NaN (built-in max is
    # order-dependent in that case).
    top = np.nanmax(adj_pvalue.values)
    ax.set_ylim((-0.01, top + top / 6.0))

    # ``pos`` is a numpy array, so the p-value must be read by position too.
    ax = snp_arrow(pos[snp_iloc], adj_pvalue.iloc[snp_iloc], snp, ax)
    if ax is not None:
        return ax
    return fig
def _all_eQTL_comp(chrom, base_path, debug=None, count_threshold=200):
    """Write a per-gene table comparing the best AEI SNP with the best eQTL SNP.

    Paths are read from 'test.cfg' (resolved against the current working
    directory) and prefixed with ``base_path`` by plain string concatenation.
    One tab-separated row per processed gene is written to
    ``<base_path>eQTL/tables/global_aei/<chrom>_aei_rep.txt``.

    Arguments
    ---------
    chrom - chromosome identifier, substituted into config values and
        file names
    base_path - prefix prepended to every input and output path
    debug - gene symbol or gene id; when the current gene matches, call
        ``embed()`` and stop without writing that gene's row
    count_threshold - minimum row sum of the AEI table for a row to be kept

    Row layout (16 columns): symbol, gene id, chrom, indicator column of the
    AEI p-value table, best AEI SNP, ratio of mean allelic ratios
    (genotype 1 / genotype 0 or 2) at that SNP, its AEI p-value, eQTL beta
    and eQTL p-value of the best AEI SNP, best eQTL SNP, the same ratio and
    the AEI p-value at the best eQTL SNP, eQTL beta and eQTL p-value of the
    best eQTL SNP, fraction of outliers, squared ``calculate_ld`` value
    between the two SNPs.  Genes without a usable result get 'NA' in every
    column after the first three.

    NOTE(review): ``.ix`` is kept as-is; it is unavailable in pandas >= 1.0
    and its label/position fallback cannot be replaced safely without
    knowing the index types of the loaded tables.
    """
    chrom_dict = {'chrom': str(chrom)}
    config = ConfigParser.ConfigParser()
    # NOTE(review): config file name is hard-coded and relative to the cwd.
    config.read('test.cfg')
    aei_base = base_path + config.get('data', 'aei_prefix')
    dos_path = base_path + config.get('data', 'dosage_prefix',
                                      raw=False, vars=chrom_dict)
    snp_annot = base_path + config.get('annotation', 'snp_annot_prefix',
                                       raw=False, vars=chrom_dict)
    gene_snps_path = base_path + config.get('annotation', 'gene_snps',
                                            raw=False, vars=chrom_dict)
    eqtl_path = base_path + config.get('data', 'eqtl_prefix',
                                       raw=False, vars=chrom_dict)

    ###### Loading the data ##########
    aei = get_aei(aei_base + 'chr{chrom!s}.pkl'.format(chrom=chrom))
    dos = pd.read_csv(dos_path, sep=" ", index_col=0, header=0)
    s_ann = pd.read_pickle(snp_annot)
    gene_snps = pd.read_pickle(gene_snps_path)
    eqtl_matrix = pd.read_pickle(eqtl_path)

    ##### Restrict to only Europeans
    vsd_counts = pd.read_csv(base_path + 'eQTL/gene_vsd_eQTL_april.txt',
                             sep=",", index_col=0)
    af_euro = calculate_minor_allele_frequency(dos.ix[:, vsd_counts.columns])
    maf_euro = af_euro.index[(af_euro >= 0.01) & (af_euro <= 0.99)]
    eqtl_matrix = eqtl_matrix.ix[
        eqtl_matrix.index.get_level_values('SNP').isin(maf_euro)]
    print(eqtl_matrix.shape)

    eqtl_matrix = eqtl_matrix.swaplevel(0, 1)
    eqtl = eqtl_matrix.groupby(level=0)
    # idxmin returns the index label on every pandas version; the label is
    # needed because idx[gene][1] is used as a SNP id below.
    idx = eqtl.apply(lambda x: x['p-value'].idxmin())
    print(list(eqtl.groups.keys())[0:10])

    out_path = (base_path +
                'eQTL/tables/global_aei/{0}_aei_rep.txt'.format(chrom))
    # Context manager so the table is flushed and closed even when a gene
    # raises or the debug branch breaks out of the loop.
    with open(out_path, 'w+') as outfile:
        for i, j in gene_snps.items():
            # NOTE(review): hard-coded single-gene filter, looks like
            # debugging residue -- every other gene is skipped.  Left in
            # place because removing it changes what the function produces.
            if i != 'ENSG00000054654':
                print('going')
                continue
            symbol = get_symbol_ensembl(i)
            print(i, symbol)
            # 3 identifiers + 13 'NA' = 16 fields, same width as a data row.
            empty_out = [str(symbol), str(i), str(chrom)] + ['NA'] * 13

            aei_t = aei.ix[j, :]
            aei_t = aei_t.ix[aei_t.sum(axis=1) >= count_threshold, :]
            if aei_t.shape[0] == 0:
                outfile.write("\t".join(empty_out) + "\n")
                # Nothing left to test for this gene.
                continue

            snps_cis = [eqtl_i[1] for eqtl_i in eqtl.groups[i]]
            gaei2 = AEI(aei_t, dos.ix[snps_cis, :], s_ann.ix[snps_cis, :], i)
            gaei2.calc_aei(num_threshold=20)
            bt = gaei2.pvalues
            # Drop indicator columns whose p-values are all missing.
            bt = bt.ix[:, np.logical_not(bt.min().isnull())]

            # Pick the indicator column with the smallest minimum p-value
            # among those with more than 15 samples of genotype 0 or 2 at
            # the column's best SNP.
            cur_best = None
            cur_best_pvalue = 1
            for ind in bt.columns:
                cissnp = bt.ix[:, ind].idxmin()
                outliers_g = gaei2.outliers.ix[gaei2.hets_dict[ind],
                                               ind].values
                tgeno = dosage_round(
                    gaei2.geno.ix[cissnp, gaei2.hets_dict[ind]][
                        np.logical_not(outliers_g)])
                sum_hets_homos = np.sum((tgeno == 0) | (tgeno == 2))
                if ((sum_hets_homos > 15) and
                        (bt.ix[cissnp, ind] < cur_best_pvalue)):
                    cur_best = ind
                    cur_best_pvalue = bt.ix[cissnp, ind]

            # ``is None`` rather than truthiness: a falsy column label
            # (e.g. 0) is still a valid choice.
            if cur_best is None:
                outfile.write("\t".join(empty_out) + "\n")
                continue
            indsnp = cur_best
            pvalue_good = cur_best_pvalue
            cissnp = gaei2.pvalues.ix[:, indsnp].idxmin()

            # Beta estimates
            hets = gaei2.hets_dict[indsnp]
            try:
                outliers_g = gaei2.outliers.ix[hets, indsnp].values
            except KeyError:
                continue
            keep = np.logical_not(outliers_g)
            tgeno = dosage_round(gaei2.geno.ix[cissnp, hets][keep])
            ar = gaei2.ratios.ix[hets, indsnp][keep]
            # Fold ratios above 1 onto (0, 1].
            ar[ar > 1] = 1 / ar[ar > 1]
            het_combined = ar[np.array(tgeno == 1)]
            homo_combined = ar[np.array((tgeno == 0) | (tgeno == 2))]
            beta_best = np.mean(het_combined) / np.mean(homo_combined)

            best_eqtl_snp = idx[i][1]
            try:
                aei_eqtl_best = gaei2.pvalues.ix[best_eqtl_snp, indsnp]
                tgeno_e = dosage_round(
                    gaei2.geno.ix[best_eqtl_snp, hets][keep])
                het_combined_e = ar[np.array(tgeno_e == 1)]
                homo_combined_e = ar[np.array((tgeno_e == 0) |
                                              (tgeno_e == 2))]
                eqtl_best_aei_beta = (np.mean(het_combined_e) /
                                      np.mean(homo_combined_e))
            except KeyError:
                # Best eQTL SNP is absent from the AEI tables.
                aei_eqtl_best = 'NA'
                eqtl_best_aei_beta = 'NA'

            if not cissnp == best_eqtl_snp:
                ldbest = calculate_ld(dos.ix[[cissnp, best_eqtl_snp], :],
                                      cissnp)[best_eqtl_snp] ** 2
            else:
                ldbest = 1

            out_l = [
                symbol, i, chrom, indsnp, cissnp, beta_best, pvalue_good,
                eqtl_matrix.ix[(i, cissnp), 'beta'],
                eqtl_matrix.ix[(i, cissnp), 'p-value'],
                best_eqtl_snp, eqtl_best_aei_beta, aei_eqtl_best,
                eqtl_matrix.ix[(i, best_eqtl_snp), 'beta'],
                eqtl_matrix.ix[(i, best_eqtl_snp), 'p-value'],
                float(np.sum(outliers_g)) / len(hets),
                ldbest,
            ]
            if (symbol == str(debug)) or (i == str(debug)):
                # Interactive inspection of the requested gene, then stop.
                embed()
                break
            outfile.write(
                "\t".join(str(out_column) for out_column in out_l) + "\n")
def plot_eQTL(meQTL, gene_name, annotation, dosage, ax=None, symbol=None,
              focus_snp=None, gene_annot=None, size_shift=0, **kwargs):
    """ Plot eQTL from a full_matrix object

    Arguments
    ---------
    meQTL - a matrix eQTL dataframe or a series of pvalues
    gene_name - gene name
    annotation - snp annotation dataframe, index is rsID
    dosage - a dosage dataframe
    ax - axis to plot into; a new 16x6 figure is created when None
    symbol - accepted for backward compatibility, currently unused
    focus_snp - SNP used as the colouring reference; defaults to the SNP
        with the largest -log10 p-value (NaNs ignored)
    gene_annot - accepted for backward compatibility, currently unused
    size_shift - constant added to every marker size
    **kwargs - forwarded to ``ax.scatter``

    Returns
    -------
    ``ax`` when the caller supplied one, otherwise the new figure (with a
    colorbar added).

    Raises
    ------
    IndexError - the reference SNP is not among the plotted SNPs

    NOTE(review): ``.ix`` is kept as-is; it is unavailable in pandas >= 1.0
    and its label/position fallback cannot be replaced safely here.
    """
    subset = subset_meQTL(meQTL, gene_name)
    # Public alias; ``pd.core.index`` is a private module path.
    if isinstance(subset.index, pd.MultiIndex):
        subset.index = subset.index.get_level_values('SNP')

    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    x_scale = 1e6
    try:
        adj_pv = -1 * np.log10(subset.ix[:, 'p-value'])
    except IndexError:
        adj_pv = -1 * np.log10(subset.iloc[:, 0])
    except pd.core.indexing.IndexingError:
        adj_pv = -1 * np.log10(subset.iloc[:])
    try:
        pos = np.asarray(annotation.ix[subset.index, 'pos'],
                         dtype=np.double) / x_scale
    except KeyError:
        pos = np.asarray(annotation.ix[subset.index, 1],
                         dtype=np.double) / x_scale

    dosage_sub = dosage.ix[subset.index, :]
    print('subset shape')
    print(subset.shape)
    print(kwargs)

    # Marker size: fold frequencies above 0.5 onto the minor allele, then
    # scale.
    dosage_maf = calculate_minor_allele_frequency(dosage_sub)
    over_half = dosage_maf > 0.5
    dosage_maf[over_half] = 1 - dosage_maf[over_half]
    dosage_maf = ((200 * dosage_maf) + 20) + size_shift

    if focus_snp:
        snp = focus_snp
    else:
        # :TODO fix for both use cases
        snp = subset.index[np.nanargmax(adj_pv)]

    # The reference SNP must be one of the plotted SNPs.  The position and
    # p-value that used to be looked up here were never used, so only the
    # membership check (and its IndexError) is kept.
    try:
        snp_labels = subset["SNP"]
    except KeyError:
        snp_labels = subset.index
    if not any(label == snp for label in snp_labels):
        raise IndexError(
            'SNP {0!r} is not among the plotted SNPs'.format(snp))

    color_ld = calculate_ld(dosage_sub, snp)[dosage_sub.index].values

    created_fig = ax is None
    if created_fig:
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 6),
                               sharey=False, sharex=True)
        # White background set on the patch directly: the ``axisbg`` keyword
        # is not accepted by current matplotlib releases.
        ax.patch.set_facecolor('#FFFFFF')
        fig.tight_layout()
        fig.subplots_adjust(right=0.8, bottom=0.2)

    # nan-aware limits: built-in min/max are order-dependent with NaNs.
    ax.set_xlim((np.nanmin(pos) - 0.01, np.nanmax(pos) + 0.01))
    top = np.nanmax(np.asarray(adj_pv))
    # One sixth of headroom above the top point.
    ax.set_ylim((-0.01, top + top / 6.0))
    ax.xaxis.set_major_formatter(x_formatter)

    ### Actual scatter #############################
    im = ax.scatter(pos, adj_pv, s=dosage_maf, c=color_ld, **kwargs)
    #:TODO make the arrow into a function
    ax.set_ylabel(r'$-log_{10}$ eQTL p-value')
    ax.set_xlabel(r'Position (Mb)')

    if not created_fig:
        return ax
    cbar_ax = fig.add_axes([0.87, 0.15, 0.05, 0.7])
    bar = fig.colorbar(im, cax=cbar_ax)
    bar.ax.tick_params(labelsize=18)
    bar.set_label('r$^{2}$')
    return fig
def _all_eQTL_comp(chrom, base_path, debug=None, count_threshold=200):
    """Write a per-gene table comparing the best AEI SNP with the best eQTL SNP.

    Paths are read from 'test.cfg' (resolved against the current working
    directory) and prefixed with ``base_path`` by plain string concatenation.
    One tab-separated row per processed gene is written to
    ``<base_path>eQTL/tables/global_aei/<chrom>_aei_rep.txt``.

    Arguments
    ---------
    chrom - chromosome identifier, substituted into config values and
        file names
    base_path - prefix prepended to every input and output path
    debug - gene symbol or gene id; when the current gene matches, call
        ``embed()`` and stop without writing that gene's row
    count_threshold - minimum row sum of the AEI table for a row to be kept

    Row layout (16 columns): symbol, gene id, chrom, indicator column of the
    AEI p-value table, best AEI SNP, ratio of mean allelic ratios
    (genotype 1 / genotype 0 or 2) at that SNP, its AEI p-value, eQTL beta
    and eQTL p-value of the best AEI SNP, best eQTL SNP, the same ratio and
    the AEI p-value at the best eQTL SNP, eQTL beta and eQTL p-value of the
    best eQTL SNP, fraction of outliers, squared ``calculate_ld`` value
    between the two SNPs.  Genes without a usable result get 'NA' in every
    column after the first three.

    NOTE(review): ``.ix`` is kept as-is; it is unavailable in pandas >= 1.0
    and its label/position fallback cannot be replaced safely without
    knowing the index types of the loaded tables.
    """
    chrom_dict = {'chrom': str(chrom)}
    config = ConfigParser.ConfigParser()
    # NOTE(review): config file name is hard-coded and relative to the cwd.
    config.read('test.cfg')
    aei_base = base_path + config.get('data', 'aei_prefix')
    dos_path = base_path + config.get('data', 'dosage_prefix',
                                      raw=False, vars=chrom_dict)
    snp_annot = base_path + config.get('annotation', 'snp_annot_prefix',
                                       raw=False, vars=chrom_dict)
    gene_snps_path = base_path + config.get('annotation', 'gene_snps',
                                            raw=False, vars=chrom_dict)
    eqtl_path = base_path + config.get('data', 'eqtl_prefix',
                                       raw=False, vars=chrom_dict)

    ###### Loading the data ##########
    aei = get_aei(aei_base + 'chr{chrom!s}.pkl'.format(chrom=chrom))
    dos = pd.read_csv(dos_path, sep=" ", index_col=0, header=0)
    s_ann = pd.read_pickle(snp_annot)
    gene_snps = pd.read_pickle(gene_snps_path)
    eqtl_matrix = pd.read_pickle(eqtl_path)

    ##### Restrict to only Europeans
    vsd_counts = pd.read_csv(base_path + 'eQTL/gene_vsd_eQTL_april.txt',
                             sep=",", index_col=0)
    af_euro = calculate_minor_allele_frequency(dos.ix[:, vsd_counts.columns])
    maf_euro = af_euro.index[(af_euro >= 0.01) & (af_euro <= 0.99)]
    eqtl_matrix = eqtl_matrix.ix[
        eqtl_matrix.index.get_level_values('SNP').isin(maf_euro)]
    print(eqtl_matrix.shape)

    eqtl_matrix = eqtl_matrix.swaplevel(0, 1)
    eqtl = eqtl_matrix.groupby(level=0)
    # idxmin returns the index label on every pandas version; the label is
    # needed because idx[gene][1] is used as a SNP id below.
    idx = eqtl.apply(lambda x: x['p-value'].idxmin())
    print(list(eqtl.groups.keys())[0:10])

    out_path = (base_path +
                'eQTL/tables/global_aei/{0}_aei_rep.txt'.format(chrom))
    # Context manager so the table is flushed and closed even when a gene
    # raises or the debug branch breaks out of the loop.
    with open(out_path, 'w+') as outfile:
        for i, j in gene_snps.items():
            # NOTE(review): hard-coded single-gene filter, looks like
            # debugging residue -- every other gene is skipped.  Left in
            # place because removing it changes what the function produces.
            if i != 'ENSG00000054654':
                print('going')
                continue
            symbol = get_symbol_ensembl(i)
            print(i, symbol)
            # 3 identifiers + 13 'NA' = 16 fields, same width as a data row.
            empty_out = [str(symbol), str(i), str(chrom)] + ['NA'] * 13

            aei_t = aei.ix[j, :]
            aei_t = aei_t.ix[aei_t.sum(axis=1) >= count_threshold, :]
            if aei_t.shape[0] == 0:
                outfile.write("\t".join(empty_out) + "\n")
                # Nothing left to test for this gene.
                continue

            snps_cis = [eqtl_i[1] for eqtl_i in eqtl.groups[i]]
            gaei2 = AEI(aei_t, dos.ix[snps_cis, :], s_ann.ix[snps_cis, :], i)
            gaei2.calc_aei(num_threshold=20)
            bt = gaei2.pvalues
            # Drop indicator columns whose p-values are all missing.
            bt = bt.ix[:, np.logical_not(bt.min().isnull())]

            # Pick the indicator column with the smallest minimum p-value
            # among those with more than 15 samples of genotype 0 or 2 at
            # the column's best SNP.
            cur_best = None
            cur_best_pvalue = 1
            for ind in bt.columns:
                cissnp = bt.ix[:, ind].idxmin()
                outliers_g = gaei2.outliers.ix[gaei2.hets_dict[ind],
                                               ind].values
                tgeno = dosage_round(
                    gaei2.geno.ix[cissnp, gaei2.hets_dict[ind]][
                        np.logical_not(outliers_g)])
                sum_hets_homos = np.sum((tgeno == 0) | (tgeno == 2))
                if ((sum_hets_homos > 15) and
                        (bt.ix[cissnp, ind] < cur_best_pvalue)):
                    cur_best = ind
                    cur_best_pvalue = bt.ix[cissnp, ind]

            # ``is None`` rather than truthiness: a falsy column label
            # (e.g. 0) is still a valid choice.
            if cur_best is None:
                outfile.write("\t".join(empty_out) + "\n")
                continue
            indsnp = cur_best
            pvalue_good = cur_best_pvalue
            cissnp = gaei2.pvalues.ix[:, indsnp].idxmin()

            # Beta estimates
            hets = gaei2.hets_dict[indsnp]
            try:
                outliers_g = gaei2.outliers.ix[hets, indsnp].values
            except KeyError:
                continue
            keep = np.logical_not(outliers_g)
            tgeno = dosage_round(gaei2.geno.ix[cissnp, hets][keep])
            ar = gaei2.ratios.ix[hets, indsnp][keep]
            # Fold ratios above 1 onto (0, 1].
            ar[ar > 1] = 1 / ar[ar > 1]
            het_combined = ar[np.array(tgeno == 1)]
            homo_combined = ar[np.array((tgeno == 0) | (tgeno == 2))]
            beta_best = np.mean(het_combined) / np.mean(homo_combined)

            best_eqtl_snp = idx[i][1]
            try:
                aei_eqtl_best = gaei2.pvalues.ix[best_eqtl_snp, indsnp]
                tgeno_e = dosage_round(
                    gaei2.geno.ix[best_eqtl_snp, hets][keep])
                het_combined_e = ar[np.array(tgeno_e == 1)]
                homo_combined_e = ar[np.array((tgeno_e == 0) |
                                              (tgeno_e == 2))]
                eqtl_best_aei_beta = (np.mean(het_combined_e) /
                                      np.mean(homo_combined_e))
            except KeyError:
                # Best eQTL SNP is absent from the AEI tables.
                aei_eqtl_best = 'NA'
                eqtl_best_aei_beta = 'NA'

            if not cissnp == best_eqtl_snp:
                ldbest = calculate_ld(dos.ix[[cissnp, best_eqtl_snp], :],
                                      cissnp)[best_eqtl_snp] ** 2
            else:
                ldbest = 1

            out_l = [
                symbol, i, chrom, indsnp, cissnp, beta_best, pvalue_good,
                eqtl_matrix.ix[(i, cissnp), 'beta'],
                eqtl_matrix.ix[(i, cissnp), 'p-value'],
                best_eqtl_snp, eqtl_best_aei_beta, aei_eqtl_best,
                eqtl_matrix.ix[(i, best_eqtl_snp), 'beta'],
                eqtl_matrix.ix[(i, best_eqtl_snp), 'p-value'],
                float(np.sum(outliers_g)) / len(hets),
                ldbest,
            ]
            if (symbol == str(debug)) or (i == str(debug)):
                # Interactive inspection of the requested gene, then stop.
                embed()
                break
            outfile.write(
                "\t".join(str(out_column) for out_column in out_l) + "\n")
def plot_eQTL(meQTL, gene_name, annotation, dosage, ax=None, symbol=None,
              focus_snp=None, gene_annot=None, size_shift=0, **kwargs):
    """ Plot eQTL from a full_matrix object

    Arguments
    ---------
    meQTL - a matrix eQTL dataframe or a series of pvalues
    gene_name - gene name
    annotation - snp annotation dataframe, index is rsID
    dosage - a dosage dataframe
    ax - axis to plot into; a new 16x6 figure is created when None
    symbol - accepted for backward compatibility, currently unused
    focus_snp - SNP used as the colouring reference; defaults to the SNP
        with the largest -log10 p-value (NaNs ignored)
    gene_annot - accepted for backward compatibility, currently unused
    size_shift - constant added to every marker size
    **kwargs - forwarded to ``ax.scatter``

    Returns
    -------
    ``ax`` when the caller supplied one, otherwise the new figure (with a
    colorbar added).

    Raises
    ------
    IndexError - the reference SNP is not among the plotted SNPs

    NOTE(review): ``.ix`` is kept as-is; it is unavailable in pandas >= 1.0
    and its label/position fallback cannot be replaced safely here.
    """
    subset = subset_meQTL(meQTL, gene_name)
    # Public alias; ``pd.core.index`` is a private module path.
    if isinstance(subset.index, pd.MultiIndex):
        subset.index = subset.index.get_level_values('SNP')

    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    x_scale = 1e6
    try:
        adj_pv = -1 * np.log10(subset.ix[:, 'p-value'])
    except IndexError:
        adj_pv = -1 * np.log10(subset.iloc[:, 0])
    except pd.core.indexing.IndexingError:
        adj_pv = -1 * np.log10(subset.iloc[:])
    try:
        pos = np.asarray(annotation.ix[subset.index, 'pos'],
                         dtype=np.double) / x_scale
    except KeyError:
        pos = np.asarray(annotation.ix[subset.index, 1],
                         dtype=np.double) / x_scale

    dosage_sub = dosage.ix[subset.index, :]
    print('subset shape')
    print(subset.shape)

    # Marker size: fold frequencies above 0.5 onto the minor allele, then
    # scale.
    dosage_maf = calculate_minor_allele_frequency(dosage_sub)
    over_half = dosage_maf > 0.5
    dosage_maf[over_half] = 1 - dosage_maf[over_half]
    dosage_maf = ((200 * dosage_maf) + 20) + size_shift

    if focus_snp:
        snp = focus_snp
    else:
        # :TODO fix for both use cases
        snp = subset.index[np.nanargmax(adj_pv)]

    # The reference SNP must be one of the plotted SNPs.  The position and
    # p-value that used to be looked up here were never used, so only the
    # membership check (and its IndexError) is kept.
    try:
        snp_labels = subset["SNP"]
    except KeyError:
        snp_labels = subset.index
    if not any(label == snp for label in snp_labels):
        raise IndexError(
            'SNP {0!r} is not among the plotted SNPs'.format(snp))

    color1 = calculate_ld(dosage_sub, snp)[dosage_sub.index].values

    created_fig = ax is None
    if created_fig:
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 6),
                               sharey=False, sharex=True)
        # White background set on the patch directly: the ``axisbg`` keyword
        # is not accepted by current matplotlib releases.
        ax.patch.set_facecolor('#FFFFFF')
        fig.tight_layout()
        fig.subplots_adjust(right=0.8, bottom=0.2)

    # nan-aware limits: built-in min/max are order-dependent with NaNs.
    ax.set_xlim((np.nanmin(pos) - 0.01, np.nanmax(pos) + 0.01))
    top = np.nanmax(np.asarray(adj_pv))
    # One sixth of headroom above the top point.
    ax.set_ylim((-0.01, top + top / 6.0))
    ax.xaxis.set_major_formatter(x_formatter)

    ### Actual scatter #############################
    # Extra keyword arguments are passed through instead of being dropped.
    im = ax.scatter(pos, adj_pv, s=dosage_maf, c=color1, **kwargs)
    #:TODO make the arrow into a function
    ax.set_ylabel(r'$-log_{10}$ eQTL p-value')
    ax.set_xlabel(r'Position (Mb)')

    if not created_fig:
        return ax
    cbar_ax = fig.add_axes([0.87, 0.15, 0.05, 0.7])
    bar = fig.colorbar(im, cax=cbar_ax)
    bar.ax.tick_params(labelsize=18)
    bar.set_label('r$^{2}$')
    return fig