def calculate_D_prime(geno1, geno2):
    """
    Pairwise statistic for two genotype vectors of equal length.

    geno1 - an array or series
    geno2 - an array or series

    Returns the product of the values that
    calculate_minor_allele_frequency gives for each input.

    NOTE(review): despite the name, no D' normalisation is applied here and
    the two genotype masks computed below are never used - confirm whether
    this function is unfinished.
    """
    assert len(geno1) == len(geno2)
    freq_first = calculate_minor_allele_frequency(geno1)
    freq_second = calculate_minor_allele_frequency(geno2)
    # Positions where both inputs equal 2 / both equal 0 (currently unused).
    both_two = (geno1 == 2) & (geno2 == 2)
    both_zero = (geno1 == 0) & (geno2 == 0)
    product = freq_first * freq_second
    return product
def calculate_D_prime(geno1, geno2):
    """
    Pairwise statistic for two genotype vectors of equal length.

    geno1 - an array or series
    geno2 - an array or series

    Returns the product of the values that
    calculate_minor_allele_frequency gives for each input.

    NOTE(review): despite the name, no D' normalisation is applied here and
    the two genotype masks computed below are never used - confirm whether
    this function is unfinished.
    """
    assert len(geno1) == len(geno2)
    freq_a = calculate_minor_allele_frequency(geno1)
    freq_b = calculate_minor_allele_frequency(geno2)
    # Positions where both inputs equal 2 / both equal 0 (currently unused).
    mask_both_two = (geno1 == 2) & (geno2 == 2)
    mask_both_zero = (geno1 == 0) & (geno2 == 0)
    return freq_a * freq_b
def __init__(self, aei_dataframe, geno, snp_annot, gene_name,
        maf_threshold=0.05, ld_prune=False):
    """ Align AEI counts with genotypes and keep SNPs passing the
    frequency filter.

    Arguments
    ---------
    aei_dataframe - AEI count dataframe (a series is also handled, see
        the TypeError branch below). Must be subsetted for SNPs within
        gene.
        :TODO list script to obtain aei from bam file in genda/scripts.
    geno - genotype dataframe to do testing on surrounding SNP. Its
        columns are intersected with the sample ids read from
        aei_dataframe.
    snp_annot - dataframe of SNP annotations; rows are selected with the
        index of the filtered genotype frame.
    gene_name - stored unchanged as self.gene_name.
    maf_threshold - genotype rows are kept when the value returned by
        calculate_minor_allele_frequency lies within
        [maf_threshold, 1 - maf_threshold].
    ld_prune - accepted but not referenced in this method.
    """
    self.aei = aei_dataframe
    # :TODO assert self.geno and self.snp_annot are same shape
    # Sample ids are the first element of each label; every fourth label is
    # kept (presumably four entries per sample - TODO confirm).
    try:
        self.sample_ids = [i[0] for i in self.aei.columns][::4]
    except AttributeError:
        # No .columns attribute: read the labels from the index instead.
        self.sample_ids = [i[0] for i in self.aei.index][::4]
    ids = (pd.Index(self.sample_ids).intersection(geno.columns))
    idx = pd.IndexSlice
    try:
        # NOTE: self.aei is still the caller's object here, so this in-place
        # sort also reorders the aei_dataframe that was passed in.
        self.aei.sort_index(axis=1, inplace=True)
        self.aei = self.aei.loc[:, idx[ids, :]]
        self.aei.sort_index(axis=1, inplace=True)
        self.geno = geno.ix[:, self.aei.columns.get_level_values(0)[::4]]
    except TypeError:
        # Case where aei_dataframe is just a series
        self.aei = self.aei.sort_index()
        self.aei = self.aei.loc[idx[ids, :]]
        self.aei = self.aei.sort_index()
        self.geno = geno.ix[:, self.aei.index.get_level_values(0)[::4]]
    self.ids = self.geno.columns
    self.maf = calculate_minor_allele_frequency(self.geno.ix[:, ids])
    # Restrict to > 5% (with the default maf_threshold); bounds are inclusive
    self.geno = self.geno.ix[(self.maf >= maf_threshold) &\
            (self.maf <= 1 - maf_threshold), :]
    # Keep annotation and frequencies in step with the filtered genotypes.
    self.snp_annot = snp_annot.ix[self.geno.index, :]
    self.maf = self.maf[self.geno.index]
    self.gene_name = gene_name
    # Keep only AEI rows whose label also appears in the filtered genotypes.
    self.aei = self.aei.ix[self.geno.index.intersection(self.aei.index), :]
def __init__(self, aei_dataframe, geno, snp_annot, gene_name,
        maf_threshold = 0.05, ld_prune=False):
    """ Align AEI counts with genotypes and keep SNPs passing the
    frequency filter.

    Arguments
    ---------
    aei_dataframe - AEI count dataframe (a series is also handled, see
        the TypeError branch below). Must be subsetted for SNPs within
        gene.
        :TODO list script to obtain aei from bam file in genda/scripts.
    geno - genotype dataframe to do testing on surrounding SNP. Its
        columns are intersected with the sample ids read from
        aei_dataframe.
    snp_annot - dataframe of SNP annotations; rows are selected with the
        index of the filtered genotype frame.
    gene_name - stored unchanged as self.gene_name.
    maf_threshold - genotype rows are kept when the value returned by
        calculate_minor_allele_frequency lies within
        [maf_threshold, 1 - maf_threshold].
    ld_prune - accepted but not referenced in this method.
    """
    self.aei = aei_dataframe
    # :TODO assert self.geno and self.snp_annot are same shape
    # Sample ids are the first element of each label; every fourth label is
    # kept (presumably four entries per sample - TODO confirm).
    try:
        self.sample_ids = [i[0] for i in self.aei.columns][::4]
    except AttributeError:
        # No .columns attribute: read the labels from the index instead.
        self.sample_ids = [i[0] for i in self.aei.index][::4]
    ids = (pd.Index(self.sample_ids)
            .intersection(geno.columns))
    idx = pd.IndexSlice
    try:
        # NOTE: self.aei is still the caller's object here, so this in-place
        # sort also reorders the aei_dataframe that was passed in.
        self.aei.sort_index(axis=1, inplace=True)
        self.aei = self.aei.loc[:, idx[ids, :]]
        self.aei.sort_index(axis=1, inplace=True)
        self.geno = geno.ix[:, self.aei.columns.get_level_values(0)[::4]]
    except TypeError:
        # Case where aei_dataframe is just a series
        self.aei = self.aei.sort_index()
        self.aei = self.aei.loc[idx[ids, :]]
        self.aei = self.aei.sort_index()
        self.geno = geno.ix[:, self.aei.index.get_level_values(0)[::4]]
    self.ids = self.geno.columns
    self.maf = calculate_minor_allele_frequency(self.geno.ix[:, ids])
    # Restrict to > 5% (with the default maf_threshold); bounds are inclusive
    self.geno = self.geno.ix[(self.maf >= maf_threshold) &\
            (self.maf <= 1 - maf_threshold), :]
    # Keep annotation and frequencies in step with the filtered genotypes.
    self.snp_annot = snp_annot.ix[self.geno.index, :]
    self.maf = self.maf[self.geno.index]
    self.gene_name = gene_name
    # Keep only AEI rows whose label also appears in the filtered genotypes.
    self.aei = self.aei.ix[self.geno.index.intersection(self.aei.index),:]
def main(gene, de, rsid, expr, cov=None):
    """Count junction reads for two transcripts of a splicing event, fit
    the inclusion proportion against expression and genotype, and plot.

    rsid is simply a SNP within the region
    # Refactor transcript order shouldn't matter

    Arguments
    ---------
    gene : genda.transcripts.gene object; .transcripts, .chrom and .symbol
        are read here
    de : event object; .exon_num, .transcript_ids and .cigar1/.cigar2 are
        read here
    rsid : passed to get_genotype together with the chromosome
    expr : only its columns are used, to select genotype columns
    cov : not referenced in this function body

    Returns
    -------
    propi : per-sample proportion computed from the two junction counts

    NOTE(review): to_plot1, to_plot2, sample_mappings and test are used
    below but never assigned in the visible body; sample_mappings is also
    assigned later, which makes it a local name. Lines appear to be missing
    from this copy of the function - confirm against version control.
    """
    ''' pheno = pd.read_csv('/home/hsuj/Afib/eQTL/pheno_for_miso.txt', sep="\t", header=None, skiprows=1, index_col=1) '''
    pheno2 = pd.read_csv('gene_pheno_eQTL_april.txt', sep=",", index_col=0)
    # Column labels use '.' where '-' is wanted; then put samples on rows.
    new_col = [i.replace('.', '-') for i in pheno2.columns]
    pheno2.columns = new_col
    pheno2 = pheno2.T
    srs = 'ENST00000453840'
    # Normalize
    ## PCA covariates
    base_path = '/home/hsuj/Afib/'
    # Need at least 2 transcripts to compare.
    path_dict = gene.transcripts
    chrom = gene.chrom
    sann = pd.read_pickle(base_path + 'ref/snp_annot/{0!s}.pkl'.format(chrom))
    sann['pos'] = sann['pos'].astype(int)
    i = get_genotype(chrom, rsid)
    geno = i.ix[:, expr.columns]
    # Keep genotype rows whose frequency is within [0.10, 0.90].
    gaf = calculate_minor_allele_frequency(geno)
    gaf = ((gaf >= 0.10) & (gaf <= 0.90)).values
    geno = geno.ix[gaf, :]
    plot_dict = {}
    fig2, ax2 = plt.subplots(figsize=(10, 10), nrows=4, sharex=True)
    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    # sea indexes whichever entry of de.exon_num is truthy first.
    if de.exon_num[0]:
        sea = 0
    else:
        sea = 1
    eoi = [de.exon_num[sea]]
    cigar_skipped = getattr(de, 'cigar{0!s}'.format(sea * 1 + 1))
    if len(cigar_skipped) > 3:
        eoi.append(eoi[0] + 1)
    else:
        pass
    cpath = path_dict[de.transcript_ids[sea]]
    # Cigar operation code 3 entries are treated as introns throughout.
    eoi_intron_lengths = [i[1] for i in cigar_skipped if i[0] == 3]
    eoi2 = getattr(de, 'cigar{0!s}'.format(2 - 1 * sea))[0][1]
    # Move generation of to plot into diffevents?
    # rough size normalization factor
    norm_fact1 = (float(sum([i[1] - i[0] for i in to_plot1])) /
                  sum([i[1] - i[0] for i in to_plot2]))
    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    xmin = min(to_plot1[0][0], to_plot2[0][0])
    xmax = max(to_plot1[2][1], to_plot2[1][1])
    # Drop repeated index labels, keeping the first occurrence.
    sample_mappings = sample_mappings.ix[
        np.logical_not(sample_mappings.index.duplicated()), :]
    nsamples = sample_mappings.shape[0]
    out_frame = pd.DataFrame(
        {
            de.transcript_ids[0]: np.zeros(nsamples),
            de.transcript_ids[1]: np.zeros(sample_mappings.shape[0])
        },
        index=sample_mappings.index)
    buffer_bp = 0
    t1 = de.transcript_ids[0]
    t2 = de.transcript_ids[1]
    for bami in sample_mappings.index:
        # :TODO Make this more generalizable
        fname = sample_mappings.ix[bami, 2].split("/")[-1]
        bi = '/home/hsuj/lustre/AFdata/{0!s}'.format(fname)
        bamf = pysam.Samfile(bi)
        bamiter = bamf.fetch('chr' + chrom, xmin - buffer_bp,
                             xmax + buffer_bp)
        # Convert this to cython
        c0 = 0
        c1 = 0
        cnot = 0
        for i in bamiter:
            #start = i.pos
            exons = [j[1] for j in i.cigar if j[0] == 0]
            introns = [j[1] for j in i.cigar if j[0] == 3]
            # Probably need to grab exact positions even though we are fetching
            # in small region
            # A read is matched to a transcript by intron length only.
            matches1 = [zi for zi in eoi_intron_lengths if zi in introns]
            try:
                matches2 = [zi for zi in eoi2 if zi in introns]
            except TypeError:
                # eoi2 is not iterable (a single length): wrap it in a list.
                matches2 = [zi for zi in [eoi2] if zi in introns]
            if (len(matches1) > 0):
                c0 += 1
            elif len(matches2) > 0:
                c1 += 1
            else:
                cnot += 1
        out_frame.ix[bami, t1] = c0
        out_frame.ix[bami, t2] = c1
    # Filter low counts
    count_threshold = 100
    out_frame = out_frame.ix[out_frame.sum(axis=1) > count_threshold, :]
    read_length = 100
    # NOTE(review): this compares to_plot2 itself with 2, not its length -
    # confirm the intent.
    if to_plot2 > 2:
        intron_factor = 3
    else:
        intron_factor = 1
    propi = (out_frame.ix[:, t1] + 1) / intron_factor / (
        out_frame.ix[:, t1] / intron_factor + out_frame.ix[:, t2])
    ''' bii = (sann['pos'] > xmin) &\ (sann['pos'] < xmax) bii = sann.index[bii.values] goi = geno.ix[bii, :] '''
    X = test.ix[srs, out_frame.index]
    X = sm.add_constant(X)
    X2 = X.copy()
    X2 = X2.join(pheno2.ix[:, 0:5], how='inner')
    X2['prop'] = propi[X2.index]
    X2['fullsum'] = out_frame.ix[X2.index].sum(axis=1)
    try:
        prop_model =\
            ols('prop~sexFemale+ENST00000453840+MDS4+MDS1+MDS3+MDS2+fullsum',
                data=X2, missing='drop').fit()
    except ValueError:
        # Opens an interactive shell; prop_model stays unassigned in this
        # branch, so the uses below would fail afterwards.
        embed()
    fig, ax = plt.subplots(figsize=(6, 6))
    fig = sm.graphics.plot_partregress("prop", "ENST00000453840",
                                       ['sexFemale', 'MDS4', 'fullsum'],
                                       data=X2, ax=ax, obs_labels=False)
    ax.text(0.5, ax.get_ylim()[1] - 0.02,
            'p-value: %.2E' % Decimal(prop_model.pvalues['ENST00000453840']),
            size=12)
    ax.set_xlabel('SRSF10 expression')
    ax.set_ylabel('{0!s} included exon / skipped exon proportion'.format(
        gene.symbol))
    ax.set_title('')
    plt.tight_layout()
    fig.savefig(base_path +
                'eQTL/graphs/{0!s}_srsf10_fit.png'.format(gene.symbol))
    print(prop_model.pvalues)
    from lin_test import test_transcript
    # Let's get all the SNPs that fall within a certain region
    # NOTE(review): the extent of this `if` body was reconstructed from a
    # flattened copy of the file - confirm which statements belong inside.
    if gene.symbol == 'CAST' or gene.symbol == 'GDAP1L1':
        for i, j in enumerate(de.transcript_ids):
            ax2[0] = plot_transcript(j, plot_dict, ax2[0], y=i * 2.5,
                                     height=2.)
            ax2[0].hlines((i * 2.5 + 2) - 1, xmin, xmax, colors='darkgrey',
                          lw=2)
    ax2[0].xaxis.set_major_formatter(x_formatter)
    ax2[0] = remove_tr_spines(ax2[0])
    ax2[0].set_xlim((xmin, xmax))
    ax2[0].set_ylim((-0.5, 2 * 2.5 + 0.5))
    goi = geno
    goi = goi.ix[:, out_frame.index]
    # One fit per genotype row; the 'geno' p-value of each fit is collected.
    gfits = goi.apply(test_transcript, axis=1, args=(X, propi))
    pvalues = [i.pvalues['geno'] for i in gfits]
    best_snp = geno.index[np.nanargmin(pvalues)]
    pvalues = pd.Series(pvalues, index=geno.index)
    print(gfits[np.nanargmin(pvalues)].pvalues)
    color = plt.rcParams['axes.color_cycle'][0]
    embed()
    # One coverage panel per genotype value 0, 1, 2 at best_snp.
    for i in range(3):
        if i == 0:
            geno_string = sann.ix[best_snp, 'a0'] * 2
        elif i == 1:
            geno_string = sann.ix[best_snp, 'a0'] +\
                    sann.ix[best_snp, 'a1']
        elif i == 2:
            geno_string = sann.ix[best_snp, 'a1'] * 2
        hist = np.zeros(xmax - xmin, dtype=np.uint64)
        c_geno = (goi.ix[best_snp, :] == i)
        # Random from out_Frame
        try:
            b_example = np.random.choice(goi.columns[c_geno.values],
                                         size=1)[0]
        except ValueError:
            # No sample has this genotype value: leave the panel empty.
            continue
        het = pysam.Samfile(sample_mappings.ix[b_example, 2])
        het_bamf = het.fetch('chr' + str(chrom), xmin, xmax)
        color = plt.rcParams['axes.color_cycle'][i]
        for read in het_bamf:
            coverage_hist(read, hist, xmin)
        ax2[i + 1].plot(np.linspace(xmin, xmax, num=len(hist)), hist, color)
        ax2[i + 1].fill_between(np.arange(xmin, xmax), 0, hist,
                                facecolor=color)
        ax2[i + 1].set_ylim(
            (np.min(hist), np.max(hist) + 0.2 * np.max(hist)))
        try:
            ax2[i + 1].text((xmax + xmin) / 2, np.max(hist),
                            str(out_frame.ix[b_example, 1]))
        except KeyError:
            pass
        ax2[i + 1].set_ylabel('{0} Genotype'.format(geno_string))
    #from lin_test import _temp_plot
    # Resave the pickled file with correct int type
    ax2[0].text((xmax - xmin) / 2, ax2[0].get_ylim()[1] - 1,
                str(min(pvalues)))
    ax2[0].axvline(sann.ix[best_snp, 'pos'], color='r', linestyle='solid')
    ax2[0].set_title('{0!s}'.format(gene.symbol))
    ax2[-1].set_xlabel('Position')
    fig2.savefig(base_path +
                 'eQTL/graphs/{0!s}_transcript.png'.format(gene.symbol))
    out_frame.columns = [
        '{0} IE'.format(gene.symbol), '{0} SE'.format(gene.symbol)
    ]
    return (propi)
def _all_eQTL_comp(chrom, base_path, debug=None, count_threshold=200):
    """Compare the best AEI SNP with the best eQTL SNP per gene and write a
    tab-separated report for one chromosome.

    Arguments
    ---------
    chrom - chromosome; interpolated into config values and file names
    base_path - prefix prepended to every configured or hard-coded path
    debug - when str(debug) equals a gene symbol or gene id, an interactive
        shell is opened (embed) for that gene and the loop stops
    count_threshold - rows of a gene's AEI table are kept when their row
        sum is at least this value (previously overwritten by a literal
        200 inside the body, so the argument had no effect)

    Path suffixes are read from 'test.cfg'. The report is written to
    base_path + 'eQTL/tables/global_aei/<chrom>_aei_rep.txt'. Returns None.

    NOTE(review): the loop still skips every gene except
    'ENSG00000054654' (debugging leftover?) - confirm before relying on
    the output.
    NOTE(review): 'NA' rows carry 15 fields while complete rows carry 16.
    """
    chrom_dict = {'chrom': str(chrom)}
    config = ConfigParser.ConfigParser()
    config.read('test.cfg')
    aei_base = base_path + config.get('data', 'aei_prefix')
    dos_path = base_path + config.get('data', 'dosage_prefix', 0, chrom_dict)
    snp_annot = base_path + config.get('annotation', 'snp_annot_prefix', 0,
                                       chrom_dict)
    gene_snps_path = base_path + config.get('annotation', 'gene_snps', 0,
                                            chrom_dict)
    eqtl_path = base_path + config.get('data', 'eqtl_prefix', 0, chrom_dict)
    ###### Loading the data ##########
    aei = get_aei((aei_base + 'chr{chrom!s}.pkl'.format(chrom=chrom)))
    dos = pd.read_csv(dos_path, sep=" ", index_col=0, header=0)
    s_ann = pd.read_pickle(snp_annot)
    gene_snps = pd.read_pickle(gene_snps_path)
    eqtl_matrix = pd.read_pickle(eqtl_path)
    ##### Restrict to only Europeans
    vsd_counts = pd.read_csv(base_path + 'eQTL/gene_vsd_eQTL_april.txt',
                             sep=",", index_col=0)
    af_euro = calculate_minor_allele_frequency(dos.ix[:, vsd_counts.columns])
    maf_euro = af_euro.index[(af_euro >= 0.01) & (af_euro <= 0.99)]
    eqtl_matrix = eqtl_matrix.ix[
        eqtl_matrix.index.get_level_values('SNP').isin(maf_euro)]
    print(eqtl_matrix.shape)
    out_path = (base_path +
                'eQTL/tables/global_aei/{0}_aei_rep.txt'.format(chrom))
    # The context manager closes the report even if a gene raises.
    with open(out_path, 'w+') as outfile:
        eqtl_matrix = eqtl_matrix.swaplevel(0, 1)
        eqtl = eqtl_matrix.groupby(level=0)
        # Collected for a later FDR calculation; not written anywhere yet.
        pvalues_fdr_calc = []
        # Per-gene label of the row with the smallest eQTL p-value.
        idx = eqtl.apply(lambda x: x['p-value'].argmin())
        print(eqtl.groups.keys()[0:10])
        for i, j in gene_snps.iteritems():
            if i != 'ENSG00000054654':
                print('going')
                continue
            symbol = get_symbol_ensembl(i)
            print(i, symbol)
            empty_out = [str(symbol), str(i), str(chrom)] + ['NA'] * 12
            aei_t = aei.ix[j, :]
            aei_t = aei_t.ix[aei_t.sum(axis=1) >= count_threshold, :]
            if aei_t.shape[0] == 0:
                outfile.write("\t".join(empty_out) + "\n")
                # Nothing left to test; matches the other 'NA' branch below
                # and avoids building an AEI object from an empty table.
                continue
            # Or grab from eQTL.matrix?
            snps_cis = [eqtl_i[1] for eqtl_i in eqtl.groups[i]]
            gaei2 = AEI(aei_t, dos.ix[snps_cis, :], s_ann.ix[snps_cis, :], i)
            gaei2.calc_aei(num_threshold=20)
            bt = gaei2.pvalues
            bt = bt.ix[:, np.logical_not(bt.min().isnull())]
            # Pick the indicator column with the smallest minimum p-value
            # among those with more than 15 samples at genotype 0 or 2.
            cur_best = None
            cur_best_pvalue = 1
            for ind in bt.columns:
                cissnp = bt.ix[:, ind].idxmin()
                outliers_g = gaei2.outliers.ix[gaei2.hets_dict[ind],
                                               ind].values
                tgeno = dosage_round(
                    gaei2.geno.ix[cissnp, gaei2.hets_dict[ind]][
                        np.logical_not(outliers_g)])
                sum_hets_homos = np.sum((tgeno == 0) | (tgeno == 2))
                if ((sum_hets_homos > 15) and
                        (bt.ix[cissnp, ind] < cur_best_pvalue)):
                    cur_best = ind
                    cur_best_pvalue = bt.ix[cissnp, ind]
            if cur_best:
                good = cur_best
                pvalue_good = cur_best_pvalue
                cissnp = gaei2.pvalues.ix[:, good].idxmin()
            else:
                outfile.write("\t".join(empty_out) + "\n")
                continue
            # :TODO get beta estimate
            pvalues_fdr_calc.extend(gaei2.pvalues.ix[:, good].values)
            indsnp = good
            # Beta estimates
            try:
                outliers_g = gaei2.outliers.ix[gaei2.hets_dict[indsnp],
                                               indsnp].values
            except KeyError:
                continue
            tgeno = dosage_round(
                gaei2.geno.ix[cissnp, gaei2.hets_dict[indsnp]][
                    np.logical_not(outliers_g)])
            ar = gaei2.ratios.ix[gaei2.hets_dict[indsnp], indsnp][
                np.logical_not(outliers_g)]
            # Fold ratios above 1 onto their reciprocal.
            ar[ar > 1] = 1 / ar[ar > 1]
            het_combined = ar[np.array(tgeno == 1)]
            homo_combined = ar[np.array((tgeno == 0) | (tgeno == 2))]
            beta_best = np.mean(het_combined) / np.mean(homo_combined)
            try:
                # Same estimate, evaluated at the gene's best eQTL SNP.
                aei_eqtl_best = gaei2.pvalues.ix[idx[i][1], good]
                tgeno = dosage_round(
                    gaei2.geno.ix[idx[i][1], gaei2.hets_dict[indsnp]][
                        np.logical_not(outliers_g)])
                het_combined_e = ar[np.array(tgeno == 1)]
                homo_combined_e = ar[np.array((tgeno == 0) | (tgeno == 2))]
                eqtl_best_aei_beta = (np.mean(het_combined_e) /
                                      np.mean(homo_combined_e))
            except KeyError:
                aei_eqtl_best = 'NA'
                eqtl_best_aei_beta = 'NA'
            if not cissnp == idx[i][1]:
                ldbest = calculate_ld(dos.ix[[cissnp, idx[i][1]], :],
                                      cissnp)[idx[i][1]] ** 2
            else:
                ldbest = 1
            out_l = [
                symbol, i, chrom, indsnp, cissnp, beta_best, pvalue_good,
                eqtl_matrix.ix[(i, cissnp), 'beta'],
                eqtl_matrix.ix[(i, cissnp), 'p-value'],
                idx[i][1], eqtl_best_aei_beta, aei_eqtl_best,
                eqtl_matrix.ix[(i, idx[i][1]), 'beta'],
                eqtl_matrix.ix[(i, idx[i][1]), 'p-value'],
                float(np.sum(outliers_g)) / len(gaei2.hets_dict[indsnp]),
                ldbest,
            ]
            if (symbol == str(debug)) or (i == str(debug)):
                embed()
                break
            out_l = [str(out_column) for out_column in out_l]
            outfile.write("\t".join(out_l) + "\n")
def plot_eQTL(meQTL, gene_name, annotation, dosage, ax=None,
              symbol=None, focus_snp=None, gene_annot=None, size_shift=0,
              **kwargs):
    """ Scatter -log10 eQTL p-values against SNP position for one gene

    Arguments
    ---------
    meQTL - a matrix eQTL dataframe or a series of pvalues
    gene_name - gene name, passed to subset_meQTL
    annotation - snp annotation dataframe, index is rsID
    dosage - a dosage dataframe
    ax - axis to plot into; a new figure is created when None
    symbol, gene_annot - accepted; they do not affect the plot here
    focus_snp - SNP used as the LD reference; defaults to the SNP with the
        largest -log10 p-value
    size_shift - constant added to every marker size
    kwargs - forwarded to ax.scatter

    Returns the axis when one was supplied, otherwise the new figure.
    """
    subset = subset_meQTL(meQTL, gene_name)
    if isinstance(subset.index, pd.core.index.MultiIndex):
        subset.index = subset.index.get_level_values('SNP')
    tick_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    bases_per_unit = 1e6

    # The p-values may come as a named column, a first column or a series.
    try:
        neg_log_p = -1 * np.log10(subset.ix[:, 'p-value'])
    except IndexError:
        neg_log_p = -1 * np.log10(subset.iloc[:, 0])
    except pd.core.indexing.IndexingError:
        neg_log_p = -1 * np.log10(subset.iloc[:])

    try:
        pos = np.asarray(annotation.ix[subset.index, 'pos'],
                         dtype=np.double) / bases_per_unit
    except KeyError:
        pos = np.asarray(annotation.ix[subset.index, 1],
                         dtype=np.double) / bases_per_unit

    dosage_sub = dosage.ix[subset.index, :]
    print('subset shape')
    print(subset.shape)
    print(kwargs)

    # Marker size grows with the frequency folded to at most 0.5.
    folded = calculate_minor_allele_frequency(dosage_sub)
    folded[folded > 0.5] = 1 - folded[folded > 0.5]
    marker_sizes = ((200 * folded) + 20) + size_shift

    # :TODO fix for both use cases
    snp = focus_snp if focus_snp else subset.index[np.nanargmax(neg_log_p)]
    try:
        hits = [n for n, label in enumerate(subset["SNP"]) if label == snp]
    except KeyError:
        hits = [n for n, label in enumerate(subset.index) if label == snp]
    # Need this since pos is a numpy array not pandas series
    first_hit = hits[0]
    snpx = pos[first_hit]
    snp_pv = neg_log_p.iloc[first_hit]
    ld_colors = calculate_ld(dosage_sub, snp)[dosage_sub.index].values

    caller_axis = ax is not None
    if not caller_axis:
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 6),
                               sharey=False, sharex=True,
                               subplot_kw=dict(axisbg='#FFFFFF'))
        fig.tight_layout()
        fig.subplots_adjust(right=0.8, bottom=0.2)

    ax.set_xlim((min(pos) - 0.01, max(pos) + 0.01))
    upper = (max(neg_log_p) + max(neg_log_p / 6.0))
    ax.set_ylim((-0.01, upper))
    ax.xaxis.set_major_formatter(tick_formatter)
    ### Actual scatter #############################
    im = ax.scatter(pos, neg_log_p, s=marker_sizes, c=ld_colors, **kwargs)
    #:TODO make the arrow into a function
    ax.set_ylabel(r'$-log_{10}$ eQTL p-value')
    ax.set_xlabel(r'Position (Mb)')
    if symbol:
        gene_name = symbol
    if caller_axis:
        return ax
    # Own figure: add a colorbar for the LD values on the right.
    cbar_ax = fig.add_axes([0.87, 0.15, 0.05, 0.7])
    bar = fig.colorbar(im, cax=cbar_ax)
    bar.ax.tick_params(labelsize=18)
    bar.set_label('r$^{2}$')
    return fig
def _all_eQTL_comp(chrom, base_path, debug=None, count_threshold=200):
    """Compare the best AEI SNP with the best eQTL SNP per gene and write a
    tab-separated report for one chromosome.

    Arguments
    ---------
    chrom - chromosome; interpolated into config values and file names
    base_path - prefix prepended to every configured or hard-coded path
    debug - when str(debug) equals a gene symbol or gene id, an interactive
        shell is opened (embed) for that gene and the loop stops
    count_threshold - rows of a gene's AEI table are kept when their row
        sum is at least this value (previously overwritten by a literal
        200 inside the body, so the argument had no effect)

    Path suffixes are read from 'test.cfg'. The report is written to
    base_path + 'eQTL/tables/global_aei/<chrom>_aei_rep.txt'. Returns None.

    NOTE(review): the loop still skips every gene except
    'ENSG00000054654' (debugging leftover?) - confirm before relying on
    the output.
    NOTE(review): 'NA' rows carry 15 fields while complete rows carry 16.
    """
    chrom_dict = {'chrom': str(chrom)}
    config = ConfigParser.ConfigParser()
    config.read('test.cfg')
    aei_base = base_path + config.get('data', 'aei_prefix')
    dos_path = base_path + config.get('data', 'dosage_prefix', 0, chrom_dict)
    snp_annot = base_path + config.get('annotation', 'snp_annot_prefix', 0,
                                       chrom_dict)
    gene_snps_path = base_path + config.get('annotation', 'gene_snps', 0,
                                            chrom_dict)
    eqtl_path = base_path + config.get('data', 'eqtl_prefix', 0, chrom_dict)
    ###### Loading the data ##########
    aei = get_aei((aei_base + 'chr{chrom!s}.pkl'.format(chrom=chrom)))
    dos = pd.read_csv(dos_path, sep=" ", index_col=0, header=0)
    s_ann = pd.read_pickle(snp_annot)
    gene_snps = pd.read_pickle(gene_snps_path)
    eqtl_matrix = pd.read_pickle(eqtl_path)
    ##### Restrict to only Europeans
    vsd_counts = pd.read_csv(base_path + 'eQTL/gene_vsd_eQTL_april.txt',
                             sep=",", index_col=0)
    af_euro = calculate_minor_allele_frequency(dos.ix[:, vsd_counts.columns])
    maf_euro = af_euro.index[(af_euro >= 0.01) & (af_euro <= 0.99)]
    eqtl_matrix = eqtl_matrix.ix[
        eqtl_matrix.index.get_level_values('SNP').isin(maf_euro)]
    print(eqtl_matrix.shape)
    out_path = (base_path +
                'eQTL/tables/global_aei/{0}_aei_rep.txt'.format(chrom))
    # The context manager closes the report even if a gene raises.
    with open(out_path, 'w+') as outfile:
        eqtl_matrix = eqtl_matrix.swaplevel(0, 1)
        eqtl = eqtl_matrix.groupby(level=0)
        # Collected for a later FDR calculation; not written anywhere yet.
        pvalues_fdr_calc = []
        # Per-gene label of the row with the smallest eQTL p-value.
        idx = eqtl.apply(lambda x: x['p-value'].argmin())
        print(eqtl.groups.keys()[0:10])
        for i, j in gene_snps.iteritems():
            if i != 'ENSG00000054654':
                print('going')
                continue
            symbol = get_symbol_ensembl(i)
            print(i, symbol)
            empty_out = [str(symbol), str(i), str(chrom)] + ['NA'] * 12
            aei_t = aei.ix[j, :]
            aei_t = aei_t.ix[aei_t.sum(axis=1) >= count_threshold, :]
            if aei_t.shape[0] == 0:
                outfile.write("\t".join(empty_out) + "\n")
                # Nothing left to test; matches the other 'NA' branch below
                # and avoids building an AEI object from an empty table.
                continue
            # Or grab from eQTL.matrix?
            snps_cis = [eqtl_i[1] for eqtl_i in eqtl.groups[i]]
            gaei2 = AEI(aei_t, dos.ix[snps_cis, :], s_ann.ix[snps_cis, :], i)
            gaei2.calc_aei(num_threshold=20)
            bt = gaei2.pvalues
            bt = bt.ix[:, np.logical_not(bt.min().isnull())]
            # Pick the indicator column with the smallest minimum p-value
            # among those with more than 15 samples at genotype 0 or 2.
            cur_best = None
            cur_best_pvalue = 1
            for ind in bt.columns:
                cissnp = bt.ix[:, ind].idxmin()
                outliers_g = gaei2.outliers.ix[gaei2.hets_dict[ind],
                                               ind].values
                tgeno = dosage_round(
                    gaei2.geno.ix[cissnp, gaei2.hets_dict[ind]][
                        np.logical_not(outliers_g)])
                sum_hets_homos = np.sum((tgeno == 0) | (tgeno == 2))
                if ((sum_hets_homos > 15) and
                        (bt.ix[cissnp, ind] < cur_best_pvalue)):
                    cur_best = ind
                    cur_best_pvalue = bt.ix[cissnp, ind]
            if cur_best:
                good = cur_best
                pvalue_good = cur_best_pvalue
                cissnp = gaei2.pvalues.ix[:, good].idxmin()
            else:
                outfile.write("\t".join(empty_out) + "\n")
                continue
            # :TODO get beta estimate
            pvalues_fdr_calc.extend(gaei2.pvalues.ix[:, good].values)
            indsnp = good
            # Beta estimates
            try:
                outliers_g = gaei2.outliers.ix[gaei2.hets_dict[indsnp],
                                               indsnp].values
            except KeyError:
                continue
            tgeno = dosage_round(
                gaei2.geno.ix[cissnp, gaei2.hets_dict[indsnp]][
                    np.logical_not(outliers_g)])
            ar = gaei2.ratios.ix[gaei2.hets_dict[indsnp], indsnp][
                np.logical_not(outliers_g)]
            # Fold ratios above 1 onto their reciprocal.
            ar[ar > 1] = 1 / ar[ar > 1]
            het_combined = ar[np.array(tgeno == 1)]
            homo_combined = ar[np.array((tgeno == 0) | (tgeno == 2))]
            beta_best = np.mean(het_combined) / np.mean(homo_combined)
            try:
                # Same estimate, evaluated at the gene's best eQTL SNP.
                aei_eqtl_best = gaei2.pvalues.ix[idx[i][1], good]
                tgeno = dosage_round(
                    gaei2.geno.ix[idx[i][1], gaei2.hets_dict[indsnp]][
                        np.logical_not(outliers_g)])
                het_combined_e = ar[np.array(tgeno == 1)]
                homo_combined_e = ar[np.array((tgeno == 0) | (tgeno == 2))]
                eqtl_best_aei_beta = (np.mean(het_combined_e) /
                                      np.mean(homo_combined_e))
            except KeyError:
                aei_eqtl_best = 'NA'
                eqtl_best_aei_beta = 'NA'
            if not cissnp == idx[i][1]:
                ldbest = calculate_ld(dos.ix[[cissnp, idx[i][1]], :],
                                      cissnp)[idx[i][1]] ** 2
            else:
                ldbest = 1
            out_l = [
                symbol, i, chrom, indsnp, cissnp, beta_best, pvalue_good,
                eqtl_matrix.ix[(i, cissnp), 'beta'],
                eqtl_matrix.ix[(i, cissnp), 'p-value'],
                idx[i][1], eqtl_best_aei_beta, aei_eqtl_best,
                eqtl_matrix.ix[(i, idx[i][1]), 'beta'],
                eqtl_matrix.ix[(i, idx[i][1]), 'p-value'],
                float(np.sum(outliers_g)) / len(gaei2.hets_dict[indsnp]),
                ldbest,
            ]
            if (symbol == str(debug)) or (i == str(debug)):
                embed()
                break
            out_l = [str(out_column) for out_column in out_l]
            outfile.write("\t".join(out_l) + "\n")
def plot_eQTL(meQTL, gene_name, annotation, dosage, ax=None,
              symbol=None, focus_snp=None, gene_annot=None, size_shift=0,
              **kwargs):
    """ Scatter -log10 eQTL p-values against SNP position for one gene

    Arguments
    ---------
    meQTL - a matrix eQTL dataframe or a series of pvalues
    gene_name - gene name, passed to subset_meQTL
    annotation - snp annotation dataframe, index is rsID
    dosage - a dosage dataframe
    ax - axis to plot into; a new figure is created when None
    symbol, gene_annot - accepted; they do not affect the plot here
    focus_snp - SNP used as the LD reference; defaults to the SNP with the
        largest -log10 p-value
    size_shift - constant added to every marker size
    kwargs - forwarded to ax.scatter (they were previously accepted but
        silently dropped)

    Returns the axis when one was supplied, otherwise the new figure.
    """
    subset = subset_meQTL(meQTL, gene_name)
    if isinstance(subset.index, pd.core.index.MultiIndex):
        subset.index = subset.index.get_level_values('SNP')
    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    x_scale = 1e6
    # The p-values may come as a named column, a first column or a series.
    try:
        adj_pv = -1 * np.log10(subset.ix[:, 'p-value'])
    except IndexError:
        adj_pv = -1 * np.log10(subset.iloc[:, 0])
    except pd.core.indexing.IndexingError:
        adj_pv = -1 * np.log10(subset.iloc[:])
    try:
        pos = np.asarray(annotation.ix[subset.index, 'pos'],
                         dtype=np.double) / x_scale
    except KeyError:
        pos = np.asarray(annotation.ix[subset.index, 1],
                         dtype=np.double) / x_scale
    dosage_sub = dosage.ix[subset.index, :]
    print('subset shape')
    print(subset.shape)
    # Marker size grows with the frequency folded to at most 0.5.
    dosage_maf = calculate_minor_allele_frequency(dosage_sub)
    dosage_maf[dosage_maf > 0.5] = 1 - dosage_maf[dosage_maf > 0.5]
    dosage_maf = ((200 * dosage_maf) + 20) + size_shift
    if focus_snp:
        snp = focus_snp
    else:
        # :TODO fix for both use cases
        snp = subset.index[np.nanargmax(adj_pv)]
    try:
        iix = [i for i, j in enumerate(subset["SNP"]) if j == snp]
    except KeyError:
        iix = [i for i, j in enumerate(subset.index) if j == snp]
    # Need this since pos is a numpy array not pandas series.
    # Kept although unused afterwards: indexing iix[0] raises IndexError
    # early when snp is not present in subset.
    snpx = pos[iix[0]]
    snp_pv = adj_pv.iloc[iix[0]]
    color1 = calculate_ld(dosage_sub, snp)[dosage_sub.index].values
    if ax is None:
        ax_orig = False
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 6),
                               sharey=False, sharex=True,
                               subplot_kw=dict(axisbg='#FFFFFF'))
        fig.tight_layout()
        fig.subplots_adjust(right=0.8, bottom=0.2)
    else:
        ax_orig = True
    ax.set_xlim((min(pos) - 0.01, max(pos) + 0.01))
    ylim = (max(adj_pv) + max(adj_pv / 6.0))
    ax.set_ylim((-0.01, ylim))
    ax.xaxis.set_major_formatter(x_formatter)
    ### Actual scatter #############################
    # Pass caller styling (e.g. cmap, alpha) through to matplotlib.
    im = ax.scatter(pos, adj_pv, s=dosage_maf, c=color1, **kwargs)
    #:TODO make the arrow into a function
    ax.set_ylabel(r'$-log_{10}$ eQTL p-value')
    ax.set_xlabel(r'Position (Mb)')
    if ax_orig:
        return ax
    # Own figure: add a colorbar for the LD values on the right.
    cbar_ax = fig.add_axes([0.87, 0.15, 0.05, 0.7])
    bar = fig.colorbar(im, cax=cbar_ax)
    bar.ax.tick_params(labelsize=18)
    bar.set_label('r$^{2}$')
    return fig
def main(gene, de, rsid, expr, cov=None):
    """Count junction reads for two transcripts of a splicing event, test
    genotype association with their proportion, and draw the figures.

    rsid is simply a SNP within the region
    # Refactor transcript order shouldn't matter
    # :TODO refactor so that it can handle multiple transcripts

    Arguments
    ---------
    gene : genda.transcripts.gene object; .transcripts, .chrom and .symbol
        are read here
    de : diffevent; .exon_num, .tid, .exon2 and .cigar1/.cigar2 are read,
        and .tid is reassigned
    rsid : passed to get_genotype together with the chromosome
    expr : only its columns are used, to select genotype columns
    cov : not referenced in this function body

    Returns
    -------
    propi : per-sample proportion computed from the two junction counts

    NOTE(review): `gi` is read below but never assigned in this function;
    presumably a module-level object - confirm. Several identifiers, a
    chromosome number and output names are hard-coded for one analysis.
    """
    # :TODO add this to configure parser
    chrom = gene.chrom
    base_path = '/home/hsuj/Afib/'
    sann = pd.read_pickle(base_path + 'ref/snp_annot/{0!s}.pkl'.format(chrom))
    sample_mappings = pd.read_csv(
        '/home/hsuj/lustre/sample_mapping.txt',
        sep="\t",
        index_col=0, )
    # NOTE(review): rstrip(".bam") strips any trailing '.', 'b', 'a', 'm'
    # characters rather than the literal suffix - confirm names are safe.
    sample_mappings['new_name'] = [i.split("/")[-1].rstrip(".bam") \
            for i in sample_mappings.ix[:, 2]]
    coverages = pd.read_csv(('/home/hsuj/Afib/eQTL/gene_data.txt'),
                            sep=",",
                            index_col=0)
    # Column sums; used below to scale coverage and junction counts.
    coverages = coverages.sum(axis=0)
    # Normalize
    ## PCA covariates
    cov_mat = pd.read_csv('/home/hsuj/Afib/eQTL/Exons/pca_final.csv',
                          sep=",",
                          index_col=0)
    pheno2 = cov_mat
    # Need at least 2 transcripts to compare.
    #graf = gr.genotype_reader_h5py('/home/hsuj/lustre/AF_miso_AFE.hdf')
    path_dict = gene.transcripts
    ## Handle genotypes ############################################
    sann['pos'] = sann['pos'].astype(int)
    i = get_genotype(chrom, rsid)
    geno = i.ix[:, expr.columns]
    # Keep genotype rows whose frequency is within [0.10, 0.90].
    gaf = calculate_minor_allele_frequency(geno)
    gaf = ((gaf >= 0.10) & (gaf <= 0.90)).values
    geno = geno.ix[gaf, :]
    ################################################################
    plot_dict = {}
    fig2, ax2 = plt.subplots(figsize=(10, 10), nrows=4, sharex=True)
    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    # sea indexes whichever entry of de.exon_num is truthy first.
    if de.exon_num[0]:
        sea = 0
    else:
        sea = 1
    eoi = [de.exon_num[sea]]
    cigar_skipped = getattr(de, 'cigar{0!s}'.format(sea * 1 + 1))
    if len(cigar_skipped) > 3:
        eoi.append(eoi[0] + 1)
    else:
        pass
    cpath = path_dict[de.tid[sea]]
    # Cigar operation code 3 entries are treated as introns throughout.
    eoi_intron_lengths = [i[1] for i in cigar_skipped if i[0] == 3]
    eoi2 = getattr(de, 'cigar{0!s}'.format(2 - 1 * sea))
    eoi2 = [abs(i[1]) for i in eoi2 if i[0] == 3]
    # Move generation of to plot into diffevents?
    # Exons of the first transcript from one before to one after eoi.
    to_plot1 = [
        i for i in cpath
        if (i[2] >= min(eoi) - 1) and (i[2] <= max(eoi) + 1)
    ]
    plot_dict[de.tid[sea]] = to_plot1
    try:
        to_plot2 = [
            i for i in path_dict[de.tid[1 - sea]]
            if i[2] in [de.exon2[0], de.exon2[1]]
        ]
    except TypeError:
        # For alternate first exons
        #to_plot2 = [i for i in path_dict[de.tid[1-sea]]]
        to_plot2 = [[i[1], i[2], 2]
                    for i in getattr(de, 'cigar{0!s}'.format(2 - 1 * sea))
                    if i[0] == 0]
    plot_dict[de.tid[1 - sea]] = to_plot2
    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    xmin = min(to_plot1[0][0], to_plot2[0][0])
    try:
        xmax = max(to_plot1[2][1], to_plot2[1][1])
    except IndexError:
        # AFE exception
        xmax = to_plot1[1][1]
    # Drop repeated index labels, keeping the first occurrence.
    sample_mappings = sample_mappings.ix[
        np.logical_not(sample_mappings.index.duplicated()), :]
    nsamples = sample_mappings.shape[0]
    buffer_bp = 0
    t1 = de.tid[0]
    t2 = de.tid[1]
    all_juncs = [eoi_intron_lengths, eoi2]
    SCN5A = 'ENSG00000183873'
    transcript_min = min([i[0] for i in gi.transcripts[de.tid[0]]])
    transcript_max = max([i[1] for i in gi.transcripts[de.tid[1]]])
    # Junction counts per sample, one column per compared transcript.
    out_frame = pd.DataFrame(
        {
            de.tid[sea]: np.zeros(nsamples),
            de.tid[sea - 1]: np.zeros(sample_mappings.shape[0]),
        },
        index=sample_mappings.index)
    # Read counts per sample for every transcript in gi.transcripts.
    ofc = pd.DataFrame(np.zeros(
        (sample_mappings.shape[0], len(gi.transcripts))),
                       index=sample_mappings.index,
                       columns=gi.transcripts.keys())
    # Move this to function
    for bami in sample_mappings.index:
        # :TODO Make this more generalizable
        fname = sample_mappings.ix[bami, 2].split("/")[-1]
        bi = '/home/hsuj/lustre/AFdata/{0!s}'.format(fname)
        bamf = pysam.Samfile(bi)
        bamiter = bamf.fetch('chr' + chrom, xmin - buffer_bp,
                             xmax + buffer_bp)
        #################### Getting intron junctions counts
        # :TODO convert to cython
        # Convert this to series
        intron_counts = np.zeros(2, dtype=np.int32)
        for i in bamiter:
            #exons = [j[1] for j in i.cigar if j[0] == 0]
            introns = [j[1] for j in i.cigar if j[0] == 3]
            # This depends on there not being other exact intron sizes
            for knum, ijunc in enumerate(all_juncs):
                try:
                    matches = [zi for zi in ijunc if zi in introns]
                except TypeError:
                    # ijunc is not iterable: treat it as a single length.
                    matches = [zi for zi in [ijunc] if zi in introns]
                if len(matches) > 0:
                    intron_counts[knum] += 1
        out_frame.ix[bami, 0:3] = intron_counts
        # First transcript without its last three entries.
        bleh = gi.transcripts[de.tid[0]][:-3]
        hmm = [bleh, gi.transcripts[de.tid[1]]]
        for i, j in zip(hmm, de.tid):
            # Fetch again for each transcript: a fresh iterator per count.
            bamiter = bamf.fetch('chr' + chrom, transcript_min,
                                 transcript_max)
            ofc.ix[bami, j] = count_reads(i, bamiter)
    # Filter low counts
    count_threshold = 0
    out_frame = out_frame.ix[out_frame.sum(axis=1) > count_threshold, :]
    read_length = 100
    intron_factor = 1
    # Refactor this to a function
    propi = (out_frame.ix[:, t1] + 1) / intron_factor / (
        out_frame.ix[:, t1] / intron_factor + out_frame.ix[:, t2])
    X = cov_mat.T
    X = sm.add_constant(X)
    from lin_test import test_transcript
    # Let's get all the SNPs that fall within a certain region
    # Adding cav1_beta1 for plotting purposes
    de.tid = [de.tid[0], de.tid[1]]
    # NOTE(review): column labels are replaced with two fixed transcript
    # ids, while de.tid is still used to index out_frame further down.
    out_frame.columns = [
        'ENST00000502471',
        'ENST00000033079',
    ]
    for i, j in enumerate(de.tid):
        ax2[0] = plot_transcript(j, plot_dict, ax2[0], y=i * 2.5, height=2.)
        t_xmin = min([k[0] for k in plot_dict[j]])
        t_xmax = max([k[1] for k in plot_dict[j]])
        ax2[0].hlines((i * 2.5 + 2) - 1,
                      t_xmin,
                      t_xmax,
                      colors='darkgrey',
                      lw=2)
    ax2[0].xaxis.set_major_formatter(x_formatter)
    ax2[0].get_yaxis().set_ticks([])
    ax2[0] = remove_tr_spines(ax2[0])
    goi = geno
    goi = goi.ix[:, out_frame.index]
    # One fit per genotype row; the 'geno' p-value of each fit is collected.
    gfits = goi.apply(test_transcript, axis=1, args=(X, propi[X.index]))
    pvalues = [i.pvalues['geno'] for i in gfits]
    # Fixed SNP: it is not derived from the p-values computed above.
    best_snp = 'rs17171731'
    pvalues = pd.Series(pvalues, index=geno.index)
    color = plt.rcParams['axes.color_cycle'][0]
    example_ylims = []
    # One coverage panel per genotype value 0, 1, 2 at best_snp.
    for i in range(3):
        if i == 0:
            geno_string = sann.ix[best_snp, 'a0'] * 2
        elif i == 1:
            geno_string = sann.ix[best_snp, 'a0'] +\
                    sann.ix[best_snp, 'a1']
        elif i == 2:
            geno_string = sann.ix[best_snp, 'a1'] * 2
        hist = np.zeros(xmax - xmin, dtype=np.uint64)
        c_geno = (goi.ix[best_snp, :] == i)
        # Random from out_Frame
        try:
            b_example = np.random.choice(goi.columns[c_geno.values],
                                         size=1)[0]
        except ValueError:
            # No sample has this genotype value: leave the panel empty.
            continue
        het = pysam.Samfile(sample_mappings.ix[b_example, 2])
        het_bamf = het.fetch('chr' + str(chrom), xmin, xmax)
        color = plt.rcParams['axes.color_cycle'][i]
        for read in het_bamf:
            coverage_hist(read, hist, xmin)
        # These two iterators are created but never consumed below.
        het_bamf = het.fetch('chr' + str(chrom), xmin, xmax)
        het_bamf = het.fetch('chr' + str(chrom), xmin, xmax)
        # Scale coverage by the sample's summed count (times 1e3).
        hist = 1e3 * hist / coverages[b_example]
        ax2[i + 1].plot(np.linspace(xmin, xmax, num=len(hist)), hist, color)
        ax2[i + 1].fill_between(np.arange(xmin, xmax),
                                0,
                                hist,
                                facecolor=color)
        example_ylims.append(np.max(hist))
        # Need this to draw between every single one
        for tran_i in de.tid:
            junc_norm = 1e3 * out_frame.ix[b_example, tran_i] / coverages[
                b_example]
            ax2[i + 1] = draw_junction_arcs(plot_dict[tran_i],
                                            hist,
                                            xmin,
                                            ax2[i + 1],
                                            color=color,
                                            text=junc_norm,
                                            y_buffer=np.max(hist) * 0.20)
        ax2[i + 1].set_ylabel('{0} Genotype\n RPKM'.format(geno_string))
    #from lin_test import _temp_plot
    # Resave the pickled file with correct int type
    # Common y-range for the three panels: 1.2 x the tallest coverage seen.
    example_ylim = max(example_ylims) * 1.2
    for i in range(3):
        ax2[i + 1].set_ylim((0, example_ylim))
    #dfmean = (out_frame - out_frame.mean())/(out_frame.max() - out_frame.min())
    #pcafit = pca.fit(dfmean)
    ax2[0].text((xmax - xmin) / 2,
                ax2[0].get_ylim()[1] - 1, str(min(pvalues)))
    ax2[0].axvline(sann.ix[best_snp, 'pos'], color='r', linestyle='solid')
    ax2[0].set_title('{0!s}'.format(gene.symbol))
    ax2[-1].set_xlabel('Position')
    # Opens an interactive shell before the remaining figures are drawn.
    embed()
    fig, ax = plt.subplots(nrows=3, sharex=True)
    ax[0] = plot_eQTL(pvalues,
                      'FAM13B',
                      sann,
                      goi,
                      ax=ax[0],
                      focus_snp='rs17171731')
    plt.tight_layout()
    fig.savefig(base_path +\
            'eQTL/graphs/{0!s}_cis_eqtl_transcript.png'.format(gene.symbol))
    gr = gene_reference(chrom=5, gene=de.tid[0], rsID=best_snp)
    fig, ax = plt.subplots(figsize=(6, 10), nrows=2)
    gr = gene_reference(chrom=5, gene=de.tid[0], rsID=best_snp)
    ax[0], pv_1 = plot_dosage_by_rsID(gr,
                                      goi,
                                      X,
                                      out_frame.ix[X.index, :].T,
                                      ax=ax[0])
    ax[0].set_title(de.tid[0])
    gr = gene_reference(chrom=5, gene=de.tid[1], rsID=best_snp)
    ax[1], pv_2 = plot_dosage_by_rsID(gr,
                                      goi,
                                      X,
                                      out_frame.ix[X.index, :].T,
                                      ax=ax[1])
    ax[1].set_title(de.tid[1])
    plt.tight_layout()
    fig.savefig(base_path +\
            'eQTL/graphs/CAV1_bestfig.png')
    fig2.savefig(base_path +
                 'eQTL/graphs/{0!s}_transcript.png'.format(gene.symbol))
    ''' fig, ax = plt.subplots() ax = plot_dosage_by_rsID(gr, goi, X2, prop3[X2.index].T, ax=ax) fig.savefig(base_path +\ 'eQTL/graphs/prop3.png') '''
    embed()
    return (propi)
def main(gene, de, rsid, expr, cov=None):
    """Test SNP genotypes against the junction-read proportion of two transcripts.

    Reads supporting the junctions of the two transcripts in ``de.tid`` are
    counted in every sample BAM file, turned into a per-sample proportion,
    and each retained SNP genotype is tested against that proportion with
    ``lin_test.test_transcript``.  Coverage/junction examples and dosage
    plots are written below ``base_path + 'eQTL/graphs/'``.

    rsid is simply a SNP within the region.

    # Refactor transcript order shouldn't matter
    # :TODO refactor so that it can handle multiple transcripts

    NOTE(review): a later ``def main`` in this file rebinds the name, so this
    definition is only reachable if that one is removed or renamed.

    Arguments
    ---------
    gene : genda.transcripts.gene object
        ``chrom``, ``transcripts`` and ``symbol`` are read.
    de : diffevent
        ``exon_num``, ``tid``, ``cigar1``/``cigar2`` and ``exon2`` are read;
        ``de.tid`` is reassigned to a list of its first two entries.
    rsid : SNP identifier passed to ``get_genotype`` with the chromosome.
    expr : object whose ``columns`` select the genotype samples.
    cov : currently unused; covariates are always read from ``pca_final.csv``.

    Returns
    -------
    propi : per-sample proportion computed from the two junction counts,
        ``(n1 + 1) / intron_factor / (n1 / intron_factor + n2)``.
    """
    # :TODO add this to configure parser
    chrom = gene.chrom
    # str() keeps the contig name valid whether gene.chrom is a str or an int.
    contig = 'chr' + str(chrom)
    base_path = '/home/hsuj/Afib/'
    sann = pd.read_pickle(base_path + 'ref/snp_annot/{0!s}.pkl'.format(chrom))
    sample_mappings = pd.read_csv('/home/hsuj/lustre/sample_mapping.txt',
                                  sep="\t", index_col=0)
    # str.rstrip(".bam") removes any trailing run of the characters
    # '.', 'b', 'a', 'm' (e.g. "sam.bam" -> "s"), so cut the exact suffix.
    bam_suffix = ".bam"
    bam_names = [path.split("/")[-1] for path in sample_mappings.ix[:, 2]]
    sample_mappings['new_name'] = [
        name[:-len(bam_suffix)] if name.endswith(bam_suffix) else name
        for name in bam_names]
    coverages = pd.read_csv('/home/hsuj/Afib/eQTL/gene_data.txt',
                            sep=",", index_col=0)
    coverages = coverages.sum(axis=0)
    # Normalize
    ## PCA covariates
    cov_mat = pd.read_csv('/home/hsuj/Afib/eQTL/Exons/pca_final.csv',
                          sep=",", index_col=0)
    # Need at least 2 transcripts to compare.
    path_dict = gene.transcripts

    ## Handle genotypes ############################################
    sann['pos'] = sann['pos'].astype(int)
    geno = get_genotype(chrom, rsid).ix[:, expr.columns]
    gaf = calculate_minor_allele_frequency(geno)
    # Keep SNPs whose allele frequency lies in [0.10, 0.90].
    gaf = ((gaf >= 0.10) & (gaf <= 0.90)).values
    geno = geno.ix[gaf, :]
    ################################################################

    plot_dict = {}
    fig2, ax2 = plt.subplots(figsize=(10, 10), nrows=4, sharex=True)
    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)

    # `sea` indexes the transcript whose exon_num entry is truthy.
    sea = 0 if de.exon_num[0] else 1
    eoi = [de.exon_num[sea]]
    cigar_skipped = getattr(de, 'cigar{0!s}'.format(sea*1 + 1))
    if len(cigar_skipped) > 3:
        eoi.append(eoi[0] + 1)
    cpath = path_dict[de.tid[sea]]
    # Lengths of the cigar entries with operation code 3, which this
    # function treats as introns (see `introns` in the read loop below).
    eoi_intron_lengths = [op[1] for op in cigar_skipped if op[0] == 3]
    other_cigar = getattr(de, 'cigar{0!s}'.format(2 - 1*sea))
    eoi2 = [abs(op[1]) for op in other_cigar if op[0] == 3]

    # Move generation of to plot into diffevents?
    to_plot1 = [ex for ex in cpath
                if (ex[2] >= min(eoi) - 1) and (ex[2] <= max(eoi) + 1)]
    plot_dict[de.tid[sea]] = to_plot1
    try:
        to_plot2 = [ex for ex in path_dict[de.tid[1-sea]]
                    if ex[2] in [de.exon2[0], de.exon2[1]]]
    except TypeError:
        # For alternate first exons
        to_plot2 = [[op[1], op[2], 2] for op in other_cigar if op[0] == 0]
    plot_dict[de.tid[1-sea]] = to_plot2

    xmin = min(to_plot1[0][0], to_plot2[0][0])
    try:
        xmax = max(to_plot1[2][1], to_plot2[1][1])
    except IndexError:
        # AFE exception
        xmax = to_plot1[1][1]

    # Keep only the first row for every duplicated sample index.
    sample_mappings = sample_mappings.ix[
        np.logical_not(sample_mappings.index.duplicated()), :]
    nsamples = sample_mappings.shape[0]
    buffer_bp = 0
    t1 = de.tid[0]
    t2 = de.tid[1]
    all_juncs = [eoi_intron_lengths, eoi2]
    # NOTE(review): `gi` is not bound anywhere in this function (the local
    # equivalent appears to be `gene`); presumably a module-level object --
    # confirm before relying on it.
    transcript_min = min([ex[0] for ex in gi.transcripts[de.tid[0]]])
    transcript_max = max([ex[1] for ex in gi.transcripts[de.tid[1]]])
    out_frame = pd.DataFrame({de.tid[sea]: np.zeros(nsamples),
                              de.tid[sea-1]: np.zeros(nsamples)},
                             index=sample_mappings.index)
    # NOTE(review): `ofc` is filled below but never read afterwards.
    ofc = pd.DataFrame(np.zeros((nsamples, len(gi.transcripts))),
                       index=sample_mappings.index,
                       columns=gi.transcripts.keys())

    # Move this to function
    for bami in sample_mappings.index:
        # :TODO Make this more generalizable
        fname = sample_mappings.ix[bami, 2].split("/")[-1]
        bamf = pysam.Samfile('/home/hsuj/lustre/AFdata/{0!s}'.format(fname))
        try:
            #################### Getting intron junctions counts
            # :TODO convert to cython
            # Convert this to series
            intron_counts = np.zeros(2, dtype=np.int32)
            for read in bamf.fetch(contig, xmin - buffer_bp, xmax + buffer_bp):
                introns = [op[1] for op in read.cigar if op[0] == 3]
                # This depends on there not being other exact intron sizes
                for knum, ijunc in enumerate(all_juncs):
                    try:
                        matches = [zi for zi in ijunc if zi in introns]
                    except TypeError:
                        # ijunc is a single length rather than a list
                        matches = [zi for zi in [ijunc] if zi in introns]
                    if matches:
                        intron_counts[knum] += 1
            out_frame.ix[bami, 0:3] = intron_counts
            bleh = gi.transcripts[de.tid[0]][:-3]
            hmm = [bleh, gi.transcripts[de.tid[1]]]
            for exons, tid in zip(hmm, de.tid):
                bamiter = bamf.fetch(contig, transcript_min, transcript_max)
                ofc.ix[bami, tid] = count_reads(exons, bamiter)
        finally:
            # One handle per sample: close it so descriptors are not leaked.
            bamf.close()

    # Filter low counts
    count_threshold = 0
    out_frame = out_frame.ix[out_frame.sum(axis=1) > count_threshold, :]
    intron_factor = 1
    # Refactor this to a function
    propi = ((out_frame.ix[:, t1] + 1) / intron_factor /
             (out_frame.ix[:, t1] / intron_factor + out_frame.ix[:, t2]))
    X = cov_mat.T
    X = sm.add_constant(X)
    from lin_test import test_transcript
    # Let's get all the SNPs that fall within a certain region
    # Adding cav1_beta1 for plotting purposes
    de.tid = [de.tid[0], de.tid[1]]
    # NOTE(review): hard-coded transcript IDs; the junction labels below index
    # out_frame by de.tid, so this only works when de.tid holds these two IDs.
    out_frame.columns = ['ENST00000502471', 'ENST00000033079']
    for row, tid in enumerate(de.tid):
        ax2[0] = plot_transcript(tid, plot_dict, ax2[0], y=row*2.5, height=2.)
        t_xmin = min([ex[0] for ex in plot_dict[tid]])
        t_xmax = max([ex[1] for ex in plot_dict[tid]])
        ax2[0].hlines((row*2.5 + 2) - 1, t_xmin, t_xmax,
                      colors='darkgrey', lw=2)
    ax2[0].xaxis.set_major_formatter(x_formatter)
    ax2[0].get_yaxis().set_ticks([])
    ax2[0] = remove_tr_spines(ax2[0])

    goi = geno.ix[:, out_frame.index]
    gfits = goi.apply(test_transcript, axis=1, args=(X, propi[X.index]))
    pvalues = [fit.pvalues['geno'] for fit in gfits]
    # NOTE(review): the plotted SNP is hard-coded rather than chosen from
    # the p-values.
    best_snp = 'rs17171731'
    pvalues = pd.Series(pvalues, index=geno.index)
    palette = plt.rcParams['axes.color_cycle']
    allele0 = sann.ix[best_snp, 'a0']
    allele1 = sann.ix[best_snp, 'a1']
    # Labels for genotype codes 0, 1 and 2.
    geno_labels = [allele0 * 2, allele0 + allele1, allele1 * 2]
    example_ylims = []
    for i in range(3):
        geno_string = geno_labels[i]
        hist = np.zeros(xmax - xmin, dtype=np.uint64)
        c_geno = (goi.ix[best_snp, :] == i)
        # Random from out_frame
        try:
            b_example = np.random.choice(goi.columns[c_geno.values],
                                         size=1)[0]
        except ValueError:
            # Nothing to choose from: no sample has genotype code i.
            continue
        color = palette[i]
        het = pysam.Samfile(sample_mappings.ix[b_example, 2])
        try:
            for read in het.fetch(contig, xmin, xmax):
                coverage_hist(read, hist, xmin)
        finally:
            het.close()
        hist = 1e3 * hist / coverages[b_example]
        ax2[i + 1].plot(np.linspace(xmin, xmax, num=len(hist)), hist, color)
        ax2[i + 1].fill_between(np.arange(xmin, xmax), 0, hist,
                                facecolor=color)
        example_ylims.append(np.max(hist))
        # Need this to draw between every single one
        for tran_i in de.tid:
            junc_norm = (1e3 * out_frame.ix[b_example, tran_i] /
                         coverages[b_example])
            ax2[i + 1] = draw_junction_arcs(plot_dict[tran_i], hist, xmin,
                                            ax2[i + 1], color=color,
                                            text=junc_norm,
                                            y_buffer=np.max(hist) * 0.20)
        ax2[i + 1].set_ylabel('{0} Genotype\n RPKM'.format(geno_string))

    # Every genotype class may have been skipped above; max([]) would raise.
    if example_ylims:
        example_ylim = max(example_ylims) * 1.2
        for i in range(3):
            ax2[i + 1].set_ylim((0, example_ylim))
    # The x axis is in genomic coordinates, so centre the label on the
    # midpoint of the region rather than on half of its width.
    ax2[0].text((xmin + xmax) / 2, ax2[0].get_ylim()[1] - 1,
                str(min(pvalues)))
    ax2[0].axvline(sann.ix[best_snp, 'pos'], color='r', linestyle='solid')
    ax2[0].set_title('{0!s}'.format(gene.symbol))
    ax2[-1].set_xlabel('Position')
    # NOTE(review): debugging hook left in place; `embed` is presumably
    # IPython's interactive shell and would block unattended runs -- confirm.
    embed()

    fig, ax = plt.subplots(nrows=3, sharex=True)
    ax[0] = plot_eQTL(pvalues, 'FAM13B', sann, goi, ax=ax[0],
                      focus_snp=best_snp)
    plt.tight_layout()
    fig.savefig(base_path +
                'eQTL/graphs/{0!s}_cis_eqtl_transcript.png'.format(gene.symbol))

    fig, ax = plt.subplots(figsize=(6, 10), nrows=2)
    for k in range(2):
        # NOTE(review): chromosome 5 is hard-coded here instead of `chrom`.
        gr = gene_reference(chrom=5, gene=de.tid[k], rsID=best_snp)
        ax[k], _ = plot_dosage_by_rsID(gr, goi, X,
                                       out_frame.ix[X.index, :].T, ax=ax[k])
        ax[k].set_title(de.tid[k])
    plt.tight_layout()
    fig.savefig(base_path + 'eQTL/graphs/CAV1_bestfig.png')
    fig2.savefig(base_path +
                 'eQTL/graphs/{0!s}_transcript.png'.format(gene.symbol))
    embed()
    return propi
def main(gene, de, rsid, expr, cov=None):
    """Fit a junction-read proportion against covariates and SNP genotypes.

    Reads matching the junction lengths of the two transcripts in
    ``de.transcript_ids`` are counted per sample BAM file, converted to a
    proportion, modelled with ``ols`` and then tested per SNP with
    ``lin_test.test_transcript``.  Two figures are saved below
    ``base_path + 'eQTL/graphs/'``.

    rsid is simply a SNP within the region.

    # Refactor transcript order shouldn't matter

    NOTE(review): this definition reuses the name of an earlier ``main`` in
    the file and therefore replaces it at import time.
    NOTE(review): ``to_plot1``, ``to_plot2``, ``sample_mappings`` and ``test``
    are read below without being assigned in this function; presumably they
    are module-level names or their assignments were lost -- confirm.

    Arguments
    ---------
    gene : genda.transcripts.gene object
        ``transcripts``, ``chrom`` and ``symbol`` are read.
    de : object providing ``exon_num``, ``transcript_ids`` and
        ``cigar1``/``cigar2``.
    rsid : SNP identifier passed to ``get_genotype`` with the chromosome.
    expr : object whose ``columns`` select the genotype samples.
    cov : not used by this function (the original docstring called it
        ``covariates``).

    Returns
    -------
    propi : the per-sample proportion computed from the two junction counts.
    """
    # Commented-out alternative phenotype source, kept as a bare string.
    ''' pheno = pd.read_csv('/home/hsuj/Afib/eQTL/pheno_for_miso.txt', sep="\t", header=None, skiprows=1, index_col=1) '''
    pheno2 = pd.read_csv('gene_pheno_eQTL_april.txt', sep=",", index_col=0)
    # Column labels use '.' where the rest of the data uses '-'.
    new_col = [i.replace('.', '-') for i in pheno2.columns]
    pheno2.columns = new_col
    pheno2 = pheno2.T
    # Row label looked up in `test` below and used as a term in the ols formula.
    srs = 'ENST00000453840'
    # Normalize
    ## PCA covariates
    base_path = '/home/hsuj/Afib/'
    # Need at least 2 transcripts to compare.
    path_dict = gene.transcripts
    chrom = gene.chrom
    sann = pd.read_pickle(base_path + 'ref/snp_annot/{0!s}.pkl'.format(chrom))
    sann['pos'] = sann['pos'].astype(int)
    i = get_genotype(chrom, rsid)
    geno = i.ix[:, expr.columns]
    gaf = calculate_minor_allele_frequency(geno)
    # Keep SNPs whose allele frequency lies in [0.10, 0.90].
    gaf = ((gaf >= 0.10) & (gaf <= 0.90)).values
    geno = geno.ix[gaf,:]
    # NOTE(review): plot_dict is never filled in the visible body although it
    # is handed to plot_transcript further down.
    plot_dict = {}
    fig2, ax2 = plt.subplots(figsize=(10, 10), nrows=4, sharex=True)
    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    # `sea` indexes the transcript whose exon_num entry is truthy.
    if de.exon_num[0]:
        sea = 0
    else:
        sea = 1
    eoi = [de.exon_num[sea]]
    cigar_skipped = getattr(de, 'cigar{0!s}'.format(sea*1 + 1))
    if len(cigar_skipped) > 3:
        eoi.append(eoi[0] + 1)
    else:
        pass
    cpath = path_dict[de.transcript_ids[sea]]
    # Lengths of cigar entries with operation code 3 (treated as introns).
    eoi_intron_lengths = [i[1] for i in cigar_skipped if i[0] == 3]
    # Second field of the first entry of the other cigar; the TypeError
    # fallback in the read loop handles it being a single value.
    eoi2 = getattr(de,'cigar{0!s}'.format(2 - 1*sea))[0][1]
    # Move generation of to plot into diffevents?
    # rough size normalization factor
    # NOTE(review): norm_fact1 is not used again in this function.
    norm_fact1 = (float(sum([i[1] - i[0] for i in to_plot1])) /
            sum([i[1] - i[0] for i in to_plot2]))
    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    xmin = min(to_plot1[0][0], to_plot2[0][0])
    xmax = max(to_plot1[2][1], to_plot2[1][1])
    # Drop repeated sample index entries, keeping the first occurrence.
    sample_mappings = sample_mappings.ix[
            np.logical_not(sample_mappings.index.duplicated()),:]
    nsamples = sample_mappings.shape[0]
    out_frame = pd.DataFrame({de.transcript_ids[0]:np.zeros(nsamples),
        de.transcript_ids[1]: np.zeros(sample_mappings.shape[0])},
        index = sample_mappings.index)
    buffer_bp = 0
    t1 = de.transcript_ids[0]
    t2 = de.transcript_ids[1]
    for bami in sample_mappings.index:
        # :TODO Make this more generalizable
        fname = sample_mappings.ix[bami,2].split("/")[-1]
        bi = '/home/hsuj/lustre/AFdata/{0!s}'.format(fname)
        # NOTE(review): the handle is never closed.
        bamf = pysam.Samfile(bi)
        # NOTE(review): concatenates chrom directly, while the example plot
        # below uses str(chrom) -- fails here if chrom is not a str.
        bamiter = bamf.fetch('chr' + chrom, xmin-buffer_bp, xmax + buffer_bp)
        # Convert this to cython
        # c0/c1 count reads matching the first/second junction set;
        # cnot counts the remaining reads and is not used afterwards.
        c0 = 0
        c1 = 0
        cnot = 0
        for i in bamiter:
            #start = i.pos
            exons = [j[1] for j in i.cigar if j[0] == 0]
            introns = [j[1] for j in i.cigar if j[0] == 3]
            # Probably need to grab exact positions even though we are fetching
            # in small region
            matches1 = [zi for zi in eoi_intron_lengths if zi in introns]
            try:
                matches2 = [zi for zi in eoi2 if zi in introns]
            except TypeError:
                # eoi2 is a single value, not an iterable
                matches2 = [zi for zi in [eoi2] if zi in introns]
            # A read matching both junction sets is credited to the first.
            if (len(matches1) > 0):
                c0 += 1
            elif len(matches2) > 0:
                c1 += 1
            else:
                cnot +=1
        out_frame.ix[bami, t1] = c0
        out_frame.ix[bami, t2] = c1
    # Filter low counts
    count_threshold = 100
    out_frame = out_frame.ix[out_frame.sum(axis=1) > count_threshold, :]
    read_length = 100
    # NOTE(review): compares to_plot2 itself (indexed like a sequence above)
    # with an int; `len(to_plot2) > 2` may have been intended -- confirm.
    if to_plot2 > 2:
        intron_factor = 3
    else:
        intron_factor = 1
    propi = (out_frame.ix[:, t1]+1)/intron_factor/(out_frame.ix[:,t1]/intron_factor +
            out_frame.ix[:, t2])
    # Commented-out positional SNP filter, kept as a bare string.
    ''' bii = (sann['pos'] > xmin) &\ (sann['pos'] < xmax) bii = sann.index[bii.values] goi = geno.ix[bii, :] '''
    X = test.ix[srs, out_frame.index]
    X = sm.add_constant(X)
    X2 = X.copy()
    X2 = X2.join(pheno2.ix[:,0:5], how='inner')
    X2['prop'] = propi[X2.index]
    X2['fullsum'] = out_frame.ix[X2.index].sum(axis=1)
    try:
        prop_model =\
            ols('prop~sexFemale+ENST00000453840+MDS4+MDS1+MDS3+MDS2+fullsum',
                    data=X2, missing='drop').fit()
    except ValueError:
        # NOTE(review): debugging hook; prop_model stays unbound on this path,
        # so the uses below would fail with NameError once embed() returns.
        embed()
    fig, ax = plt.subplots(figsize=(6,6))
    fig = sm.graphics.plot_partregress("prop", "ENST00000453840",
            ['sexFemale', 'MDS4', 'fullsum'], data=X2, ax=ax,
            obs_labels=False)
    ax.text(0.5, ax.get_ylim()[1] - 0.02,
            'p-value: %.2E' % Decimal(prop_model.pvalues['ENST00000453840']),
            size=12)
    ax.set_xlabel('SRSF10 expression')
    ax.set_ylabel('{0!s} included exon / skipped exon proportion'.format(gene.symbol))
    ax.set_title('')
    plt.tight_layout()
    fig.savefig(base_path + 'eQTL/graphs/{0!s}_srsf10_fit.png'.format(gene.symbol))
    print(prop_model.pvalues)
    from lin_test import test_transcript
    # Let's get all the SNPs that fall within a certain region
    # NOTE(review): the extent of this `if` body could not be recovered from
    # the whitespace-collapsed source; the narrowest reading (only the
    # transcript-drawing loop is conditional) is used here -- confirm.
    if gene.symbol == 'CAST' or gene.symbol == 'GDAP1L1':
        for i, j in enumerate(de.transcript_ids):
            ax2[0] = plot_transcript(j, plot_dict, ax2[0], y=i*2.5, height=2.)
            ax2[0].hlines((i*2.5 + 2) - 1, xmin, xmax, colors='darkgrey', lw=2)
    ax2[0].xaxis.set_major_formatter(x_formatter)
    ax2[0] = remove_tr_spines(ax2[0])
    ax2[0].set_xlim((xmin, xmax))
    ax2[0].set_ylim((-0.5, 2*2.5 + 0.5))
    goi = geno
    goi = goi.ix[:, out_frame.index]
    # One fit per SNP (row); the 'geno' term's p-value is collected.
    gfits = goi.apply(test_transcript, axis=1, args=(X, propi))
    pvalues = [i.pvalues['geno'] for i in gfits]
    # SNP with the smallest non-NaN p-value.
    best_snp = geno.index[np.nanargmin(pvalues)]
    pvalues = pd.Series(pvalues, index=geno.index)
    print(gfits[np.nanargmin(pvalues)].pvalues)
    color = plt.rcParams['axes.color_cycle'][0]
    # NOTE(review): debugging hook left in the main path.
    embed()
    # One coverage panel per genotype code 0, 1, 2 at best_snp.
    for i in range(3):
        if i == 0:
            geno_string = sann.ix[best_snp, 'a0'] * 2
        elif i ==1:
            geno_string = sann.ix[best_snp, 'a0'] +\
                    sann.ix[best_snp, 'a1']
        elif i == 2:
            geno_string = sann.ix[best_snp, 'a1'] * 2
        hist = np.zeros(xmax - xmin, dtype=np.uint64)
        c_geno = (goi.ix[best_snp, :] == i)
        # Random from out_frame
        try:
            b_example = np.random.choice(goi.columns[c_geno.values], size=1)[0]
        except ValueError:
            # nothing to choose from: no sample has genotype code i
            continue
        # NOTE(review): the handle is never closed.
        het = pysam.Samfile(sample_mappings.ix[b_example,2])
        het_bamf = het.fetch('chr' + str(chrom), xmin, xmax)
        color = plt.rcParams['axes.color_cycle'][i]
        for read in het_bamf:
            coverage_hist(read, hist, xmin)
        ax2[i + 1].plot(np.linspace(xmin, xmax, num=len(hist)), hist, color)
        ax2[i + 1].fill_between(np.arange(xmin, xmax),0, hist, facecolor=color)
        ax2[i + 1].set_ylim((np.min(hist), np.max(hist) + 0.2 * np.max(hist)))
        try:
            ax2[i + 1].text((xmax + xmin)/2, np.max(hist),
                    str(out_frame.ix[b_example, 1]))
        except KeyError:
            pass
        ax2[i + 1].set_ylabel('{0} Genotype'.format(geno_string))
    #from lin_test import _temp_plot
    # Resave the pickled file with correct int type
    # NOTE(review): x is half the region width here, whereas the per-genotype
    # label above uses the midpoint (xmax + xmin)/2 -- confirm which is meant.
    ax2[0].text((xmax-xmin)/2, ax2[0].get_ylim()[1]- 1, str(min(pvalues)))
    ax2[0].axvline(sann.ix[best_snp, 'pos'],color='r', linestyle='solid')
    ax2[0].set_title('{0!s}'.format(gene.symbol))
    ax2[-1].set_xlabel('Position')
    fig2.savefig(base_path + 'eQTL/graphs/{0!s}_transcript.png'.format(gene.symbol))
    # Relabel the two count columns; propi was computed before this rename.
    out_frame.columns = ['{0} IE'.format(gene.symbol), '{0} SE'.format(gene.symbol)]
    return(propi)