Esempio n. 1
0
def calculate_D_prime(geno1, geno2):
    """Return the product of the allele frequencies of two genotypes.

    NOTE(review): despite the name, no D or D' statistic is computed here.
    Only the product of the two per-genotype frequencies is returned; the
    joint (two-locus) term that a D' calculation would presumably need is
    not implemented yet -- TODO confirm the intended formula.

    Arguments
    ---------
    geno1 - an array or series
    geno2 - an array or series, same length as geno1

    Returns
    -------
    p1 * p2, where p1 and p2 are whatever
    calculate_minor_allele_frequency returns for geno1 and geno2.

    Raises
    ------
    AssertionError when the two inputs differ in length.
    """
    assert len(geno1) == len(geno2), \
        'geno1 and geno2 must have the same length'
    p1 = calculate_minor_allele_frequency(geno1)
    p2 = calculate_minor_allele_frequency(geno2)
    return p1 * p2
Esempio n. 2
0
def calculate_D_prime(geno1, geno2):
    """Return the product of the allele frequencies of two genotypes.

    NOTE(review): despite the name, no D or D' statistic is computed here.
    Only the product of the two per-genotype frequencies is returned; the
    joint (two-locus) term that a D' calculation would presumably need is
    not implemented yet -- TODO confirm the intended formula.

    Arguments
    ---------
    geno1 - an array or series
    geno2 - an array or series, same length as geno1

    Returns
    -------
    p1 * p2, where p1 and p2 are whatever
    calculate_minor_allele_frequency returns for geno1 and geno2.

    Raises
    ------
    AssertionError when the two inputs differ in length.
    """
    assert len(geno1) == len(geno2), \
        'geno1 and geno2 must have the same length'
    p1 = calculate_minor_allele_frequency(geno1)
    p2 = calculate_minor_allele_frequency(geno2)
    return p1 * p2
Esempio n. 3
0
    def __init__(self,
                 aei_dataframe,
                 geno,
                 snp_annot,
                 gene_name,
                 maf_threshold=0.05,
                 ld_prune=False):
        """Align AEI counts with genotypes and filter SNPs by frequency.

        Arguments
        ---------
        aei_dataframe - AEI count dataframe gotten from func :TODO list script
        to obtain aei from bam file in genda/scripts.  Must be subsetted for
        SNPs within gene.  A series is accepted as well.  The object passed
        in is not reordered; sorting is done on a copy.
        geno - genotype dataframe to do testing on surrounding SNP.  Its
        columns are matched against the sample IDs found in aei_dataframe.
        snp_annot - dataframe of SNP annotations, looked up by geno's index
        gene_name - stored unchanged as self.gene_name
        maf_threshold - SNPs whose frequency lies outside
        [maf_threshold, 1 - maf_threshold] are dropped
        ld_prune - currently unused
        """
        self.aei = aei_dataframe
        # :TODO assert self.geno and self.snp_annot are same shape
        # Sample IDs are the first element of each label.  The [::4] stride
        # keeps every fourth label; presumably each sample contributes four
        # consecutive entries -- TODO confirm.
        try:
            self.sample_ids = [i[0] for i in self.aei.columns][::4]
        except AttributeError:
            # No .columns attribute: fall back to the index labels.
            self.sample_ids = [i[0] for i in self.aei.index][::4]

        ids = pd.Index(self.sample_ids).intersection(geno.columns)
        idx = pd.IndexSlice
        try:
            # Sort copies rather than sorting in place, so the caller's
            # dataframe is left untouched (the series branch below already
            # works this way).
            self.aei = self.aei.sort_index(axis=1)
            self.aei = self.aei.loc[:, idx[ids, :]]
            self.aei = self.aei.sort_index(axis=1)
            self.geno = geno.ix[:, self.aei.columns.get_level_values(0)[::4]]
        except TypeError:
            # Case where aei_dataframe is just a series
            self.aei = self.aei.sort_index()
            self.aei = self.aei.loc[idx[ids, :]]
            self.aei = self.aei.sort_index()
            self.geno = geno.ix[:, self.aei.index.get_level_values(0)[::4]]
        self.ids = self.geno.columns
        self.maf = calculate_minor_allele_frequency(self.geno.ix[:, ids])
        # Restrict to the frequency window (> 5% with the default threshold)
        in_window = ((self.maf >= maf_threshold) &
                     (self.maf <= 1 - maf_threshold))
        self.geno = self.geno.ix[in_window, :]
        self.snp_annot = snp_annot.ix[self.geno.index, :]
        self.maf = self.maf[self.geno.index]

        self.gene_name = gene_name
        # Keep only AEI rows whose label survived the genotype filter.
        self.aei = self.aei.ix[self.geno.index.intersection(self.aei.index), :]
Esempio n. 4
0
    def __init__(self, aei_dataframe, geno, snp_annot, gene_name,
            maf_threshold = 0.05, ld_prune=False):
        """Align AEI counts with genotypes and filter SNPs by frequency.

        Arguments
        ---------
        aei_dataframe - AEI count dataframe gotten from func :TODO list script
        to obtain aei from bam file in genda/scripts.  Must be subsetted for
        SNPs within gene.  A series is accepted as well.  The object passed
        in is not reordered; sorting is done on a copy.
        geno - genotype dataframe to do testing on surrounding SNP.  Its
        columns are matched against the sample IDs found in aei_dataframe.
        snp_annot - dataframe of SNP annotations, looked up by geno's index
        gene_name - stored unchanged as self.gene_name
        maf_threshold - SNPs whose frequency lies outside
        [maf_threshold, 1 - maf_threshold] are dropped
        ld_prune - currently unused
        """
        self.aei = aei_dataframe
        # :TODO assert self.geno and self.snp_annot are same shape
        # Sample IDs are the first element of each label.  The [::4] stride
        # keeps every fourth label; presumably each sample contributes four
        # consecutive entries -- TODO confirm.
        try:
            self.sample_ids = [i[0] for i in self.aei.columns][::4]
        except AttributeError:
            # No .columns attribute: fall back to the index labels.
            self.sample_ids = [i[0] for i in self.aei.index][::4]

        ids = pd.Index(self.sample_ids).intersection(geno.columns)
        idx = pd.IndexSlice
        try:
            # Sort copies rather than sorting in place, so the caller's
            # dataframe is left untouched (the series branch below already
            # works this way).
            self.aei = self.aei.sort_index(axis=1)
            self.aei = self.aei.loc[:, idx[ids, :]]
            self.aei = self.aei.sort_index(axis=1)
            self.geno = geno.ix[:,
                    self.aei.columns.get_level_values(0)[::4]]
        except TypeError:
            # Case where aei_dataframe is just a series
            self.aei = self.aei.sort_index()
            self.aei = self.aei.loc[idx[ids, :]]
            self.aei = self.aei.sort_index()
            self.geno = geno.ix[:,
                    self.aei.index.get_level_values(0)[::4]]
        self.ids = self.geno.columns
        self.maf = calculate_minor_allele_frequency(self.geno.ix[:, ids])
        # Restrict to the frequency window (> 5% with the default threshold)
        in_window = ((self.maf >= maf_threshold) &
                     (self.maf <= 1 - maf_threshold))
        self.geno = self.geno.ix[in_window, :]
        self.snp_annot = snp_annot.ix[self.geno.index, :]
        self.maf = self.maf[self.geno.index]

        self.gene_name = gene_name
        # Keep only AEI rows whose label survived the genotype filter.
        self.aei = self.aei.ix[self.geno.index.intersection(self.aei.index),:]
Esempio n. 5
0
def main(gene, de, rsid, expr, cov=None):
    """Count exon-including vs. exon-skipping reads per sample and model them.

    rsid is simply a SNP within the region.

    For every BAM listed in ``sample_mappings`` the reads in the fetched
    region are classified by the intron (CIGAR op 3) lengths they contain.
    The per-sample counts are turned into a proportion, which is regressed
    on ``ENST00000453840`` expression plus covariates and plotted.  For the
    genes CAST and GDAP1L1 every SNP is additionally tested with
    ``test_transcript`` and transcript / coverage panels are drawn.
    Figures are saved under ``base_path``.

    # Refactor transcript order shouldn't matter

    NOTE(review): ``to_plot1``, ``to_plot2`` and ``test`` are read but never
    assigned in this function, and ``sample_mappings`` is read before its
    only assignment here (an UnboundLocalError in Python).  The function
    cannot run as written until those names are restored.

    Arguments
    ---------
    gene : genda.transcripts.gene object
        Its ``transcripts``, ``chrom`` and ``symbol`` attributes are used.
    de : object describing the splicing event
        ``exon_num``, ``cigar1`` / ``cigar2`` and ``transcript_ids`` are read.
    rsid : SNP identifier passed to ``get_genotype``
    expr : only ``expr.columns`` is used, to select genotype columns
    cov : unused

    Returns
    -------
    The per-sample proportion ``propi``.
    """
    '''
    pheno = pd.read_csv('/home/hsuj/Afib/eQTL/pheno_for_miso.txt',
            sep="\t", header=None, skiprows=1, index_col=1)
    '''
    # Covariate table; column names have '.' replaced by '-' and the table
    # is transposed before use.
    pheno2 = pd.read_csv('gene_pheno_eQTL_april.txt', sep=",", index_col=0)
    new_col = [i.replace('.', '-') for i in pheno2.columns]
    pheno2.columns = new_col
    pheno2 = pheno2.T
    # Transcript whose expression is used as the regressor of interest
    # (labelled 'SRSF10 expression' on the plot below).
    srs = 'ENST00000453840'
    # Normalize
    ## PCA covariates
    base_path = '/home/hsuj/Afib/'
    # Need at least 2 transcripts to compare.
    path_dict = gene.transcripts
    chrom = gene.chrom
    sann = pd.read_pickle(base_path + 'ref/snp_annot/{0!s}.pkl'.format(chrom))
    sann['pos'] = sann['pos'].astype(int)
    i = get_genotype(chrom, rsid)
    geno = i.ix[:, expr.columns]
    # Keep SNPs whose allele frequency lies within [0.10, 0.90].
    gaf = calculate_minor_allele_frequency(geno)
    gaf = ((gaf >= 0.10) & (gaf <= 0.90)).values
    geno = geno.ix[gaf, :]
    plot_dict = {}
    fig2, ax2 = plt.subplots(figsize=(10, 10), nrows=4, sharex=True)
    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    # sea is the index (0 or 1) of the transcript whose exon_num entry is
    # truthy; presumably the transcript carrying the exon of interest --
    # TODO confirm.
    if de.exon_num[0]:
        sea = 0
    else:
        sea = 1
    eoi = [de.exon_num[sea]]
    # cigar1 when sea == 0, cigar2 when sea == 1.
    cigar_skipped = getattr(de, 'cigar{0!s}'.format(sea * 1 + 1))
    if len(cigar_skipped) > 3:
        eoi.append(eoi[0] + 1)
    else:
        pass
    cpath = path_dict[de.transcript_ids[sea]]
    # Lengths of the op-3 (intron) entries of the selected cigar.
    eoi_intron_lengths = [i[1] for i in cigar_skipped if i[0] == 3]
    # Length stored in the first entry of the *other* transcript's cigar.
    eoi2 = getattr(de, 'cigar{0!s}'.format(2 - 1 * sea))[0][1]
    # Move generation of to plot into diffevents?
    # rough size normalization factor
    # NOTE(review): to_plot1 / to_plot2 are not defined in this function.
    norm_fact1 = (float(sum([i[1] - i[0] for i in to_plot1])) /
                  sum([i[1] - i[0] for i in to_plot2]))
    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    xmin = min(to_plot1[0][0], to_plot2[0][0])
    xmax = max(to_plot1[2][1], to_plot2[1][1])
    # Drop rows with a duplicated index label.
    # NOTE(review): sample_mappings is read here before it has been assigned
    # in this scope.
    sample_mappings = sample_mappings.ix[
        np.logical_not(sample_mappings.index.duplicated()), :]
    nsamples = sample_mappings.shape[0]
    # One row per sample, one count column per transcript.
    out_frame = pd.DataFrame(
        {
            de.transcript_ids[0]: np.zeros(nsamples),
            de.transcript_ids[1]: np.zeros(sample_mappings.shape[0])
        },
        index=sample_mappings.index)
    buffer_bp = 0
    t1 = de.transcript_ids[0]
    t2 = de.transcript_ids[1]
    for bami in sample_mappings.index:
        # :TODO Make this more generalizable
        fname = sample_mappings.ix[bami, 2].split("/")[-1]
        bi = '/home/hsuj/lustre/AFdata/{0!s}'.format(fname)
        # NOTE(review): the Samfile handle is never closed explicitly.
        bamf = pysam.Samfile(bi)
        bamiter = bamf.fetch('chr' + chrom, xmin - buffer_bp, xmax + buffer_bp)
        # Convert this to cython
        # c0: reads matching the selected cigar's intron lengths,
        # c1: reads matching eoi2, cnot: everything else (never reported).
        c0 = 0
        c1 = 0
        cnot = 0
        for i in bamiter:
            #start = i.pos
            exons = [j[1] for j in i.cigar if j[0] == 0]
            introns = [j[1] for j in i.cigar if j[0] == 3]
            # Probably need to grab exact positions even though we are fetching
            # in small region
            matches1 = [zi for zi in eoi_intron_lengths if zi in introns]
            # eoi2 may be a single number rather than an iterable.
            try:
                matches2 = [zi for zi in eoi2 if zi in introns]
            except TypeError:
                matches2 = [zi for zi in [eoi2] if zi in introns]
            if (len(matches1) > 0): c0 += 1
            elif len(matches2) > 0: c1 += 1
            else: cnot += 1
        out_frame.ix[bami, t1] = c0
        out_frame.ix[bami, t2] = c1
    # Filter low counts
    count_threshold = 100
    out_frame = out_frame.ix[out_frame.sum(axis=1) > count_threshold, :]
    read_length = 100
    # NOTE(review): to_plot2 is indexed like a sequence above, so comparing
    # it to an int is suspicious; len(to_plot2) may have been intended --
    # TODO confirm.
    if to_plot2 > 2:
        intron_factor = 3
    else:
        intron_factor = 1
    # Proportion of t1 counts (with a +1 pseudocount in the numerator,
    # scaled by intron_factor) relative to scaled t1 plus t2 counts.
    propi = (out_frame.ix[:, t1] + 1) / intron_factor / (
        out_frame.ix[:, t1] / intron_factor + out_frame.ix[:, t2])
    '''
    bii = (sann['pos'] > xmin) &\
            (sann['pos'] < xmax) 
    bii = sann.index[bii.values]
    goi = geno.ix[bii, :]
    '''
    # NOTE(review): `test` is not defined in this function.
    X = test.ix[srs, out_frame.index]
    X = sm.add_constant(X)
    X2 = X.copy()
    X2 = X2.join(pheno2.ix[:, 0:5], how='inner')
    X2['prop'] = propi[X2.index]
    X2['fullsum'] = out_frame.ix[X2.index].sum(axis=1)
    try:
        prop_model =\
                ols('prop~sexFemale+ENST00000453840+MDS4+MDS1+MDS3+MDS2+fullsum',
                data=X2,
                missing='drop').fit()
    except ValueError:
        # NOTE(review): drops into an interactive shell on failure;
        # prop_model stays undefined afterwards.
        embed()
    fig, ax = plt.subplots(figsize=(6, 6))
    fig = sm.graphics.plot_partregress("prop",
                                       "ENST00000453840",
                                       ['sexFemale', 'MDS4', 'fullsum'],
                                       data=X2,
                                       ax=ax,
                                       obs_labels=False)
    ax.text(0.5,
            ax.get_ylim()[1] - 0.02,
            'p-value: %.2E' % Decimal(prop_model.pvalues['ENST00000453840']),
            size=12)
    ax.set_xlabel('SRSF10 expression')
    ax.set_ylabel('{0!s} included exon / skipped exon proportion'.format(
        gene.symbol))
    ax.set_title('')
    plt.tight_layout()
    fig.savefig(base_path +
                'eQTL/graphs/{0!s}_srsf10_fit.png'.format(gene.symbol))
    print(prop_model.pvalues)
    from lin_test import test_transcript
    # Let's get all the SNPs that fall within a certain region

    # The per-SNP test and the four-panel figure are only produced for
    # these two genes.
    if gene.symbol == 'CAST' or gene.symbol == 'GDAP1L1':
        # Panel 0: one transcript track per transcript id.
        for i, j in enumerate(de.transcript_ids):
            ax2[0] = plot_transcript(j,
                                     plot_dict,
                                     ax2[0],
                                     y=i * 2.5,
                                     height=2.)
            ax2[0].hlines((i * 2.5 + 2) - 1,
                          xmin,
                          xmax,
                          colors='darkgrey',
                          lw=2)
            ax2[0].xaxis.set_major_formatter(x_formatter)
        ax2[0] = remove_tr_spines(ax2[0])
        ax2[0].set_xlim((xmin, xmax))
        ax2[0].set_ylim((-0.5, 2 * 2.5 + 0.5))
        goi = geno
        goi = goi.ix[:, out_frame.index]
        # Fit every SNP (row) and pick the one with the smallest 'geno'
        # p-value.
        gfits = goi.apply(test_transcript, axis=1, args=(X, propi))
        pvalues = [i.pvalues['geno'] for i in gfits]
        best_snp = geno.index[np.nanargmin(pvalues)]
        pvalues = pd.Series(pvalues, index=geno.index)
        print(gfits[np.nanargmin(pvalues)].pvalues)
        color = plt.rcParams['axes.color_cycle'][0]
        # NOTE(review): unconditional interactive breakpoint.
        embed()
        # Panels 1-3: read coverage of one randomly chosen sample for each
        # genotype value 0, 1 and 2 at the best SNP.
        for i in range(3):
            if i == 0: geno_string = sann.ix[best_snp, 'a0'] * 2
            elif i == 1:
                geno_string = sann.ix[best_snp, 'a0'] +\
                        sann.ix[best_snp, 'a1']
            elif i == 2:
                geno_string = sann.ix[best_snp, 'a1'] * 2
            hist = np.zeros(xmax - xmin, dtype=np.uint64)
            c_geno = (goi.ix[best_snp, :] == i)
            # Random from out_Frame
            try:
                b_example = np.random.choice(goi.columns[c_geno.values],
                                             size=1)[0]
            except ValueError:
                # No sample carries this genotype; leave the panel empty.
                continue
            het = pysam.Samfile(sample_mappings.ix[b_example, 2])
            het_bamf = het.fetch('chr' + str(chrom), xmin, xmax)
            color = plt.rcParams['axes.color_cycle'][i]
            for read in het_bamf:
                coverage_hist(read, hist, xmin)

            ax2[i + 1].plot(np.linspace(xmin, xmax, num=len(hist)), hist,
                            color)
            ax2[i + 1].fill_between(np.arange(xmin, xmax),
                                    0,
                                    hist,
                                    facecolor=color)
            ax2[i + 1].set_ylim(
                (np.min(hist), np.max(hist) + 0.2 * np.max(hist)))
            try:
                ax2[i + 1].text((xmax + xmin) / 2, np.max(hist),
                                str(out_frame.ix[b_example, 1]))
            except KeyError:
                # Sample was removed by the low-count filter; skip the label.
                pass
            ax2[i + 1].set_ylabel('{0} Genotype'.format(geno_string))
            #from lin_test import _temp_plot
            # Resave the pickled file with correct int type
        ax2[0].text((xmax - xmin) / 2, ax2[0].get_ylim()[1] - 1,
                    str(min(pvalues)))
        ax2[0].axvline(sann.ix[best_snp, 'pos'], color='r', linestyle='solid')
        ax2[0].set_title('{0!s}'.format(gene.symbol))
        ax2[-1].set_xlabel('Position')
        fig2.savefig(base_path +
                     'eQTL/graphs/{0!s}_transcript.png'.format(gene.symbol))
        # Relabel the count columns; out_frame itself is not returned.
        out_frame.columns = [
            '{0} IE'.format(gene.symbol), '{0} SE'.format(gene.symbol)
        ]
    return (propi)
Esempio n. 6
0
def _all_eQTL_comp(chrom, base_path, debug=None, count_threshold=200):
    """Write a per-gene table comparing the best AEI SNP with the best eQTL SNP.

    Input paths are built from ``base_path`` and the entries of ``test.cfg``.
    The table is written, tab separated and without a header line, to
    ``<base_path>eQTL/tables/global_aei/<chrom>_aei_rep.txt``.

    Arguments
    ---------
    chrom - chromosome identifier, substituted into the configured paths
    base_path - prefix prepended to every configured path
    debug - gene symbol or Ensembl ID; when the loop reaches that gene it
    drops into ``embed()`` and stops before writing the gene's row
    count_threshold - minimum summed count for a row of the AEI table to be
    kept

    NOTE(review): the loop currently skips every gene except
    'ENSG00000054654'; this looks like a debugging leftover -- confirm
    before relying on the output.
    NOTE(review): the 'NA' placeholder row has 15 fields while a regular row
    has 16 (the LD column has no placeholder) -- confirm which is intended.
    """
    chrom_dict = {'chrom': str(chrom)}
    config = ConfigParser.ConfigParser()
    config.read('test.cfg')
    aei_base = base_path + config.get('data', 'aei_prefix')
    dos_path = base_path + config.get('data', 'dosage_prefix', 0, chrom_dict)
    snp_annot = base_path + config.get('annotation', 'snp_annot_prefix', 0,
                                       chrom_dict)
    gene_snps_path = base_path + config.get('annotation', 'gene_snps', 0,
                                            chrom_dict)
    eqtl_path = base_path + config.get('data', 'eqtl_prefix', 0, chrom_dict)

    ###### Loading the data ##########
    aei = get_aei(aei_base + 'chr{chrom!s}.pkl'.format(chrom=chrom))
    dos = pd.read_csv(dos_path, sep=" ", index_col=0, header=0)
    s_ann = pd.read_pickle(snp_annot)
    gene_snps = pd.read_pickle(gene_snps_path)
    eqtl_matrix = pd.read_pickle(eqtl_path)

    ##### Restrict to only Europeans
    vsd_counts = pd.read_csv(base_path + 'eQTL/gene_vsd_eQTL_april.txt',
                             sep=",",
                             index_col=0)
    af_euro = calculate_minor_allele_frequency(dos.ix[:, vsd_counts.columns])
    maf_euro = af_euro.index[(af_euro >= 0.01) & (af_euro <= 0.99)]
    snp_level = eqtl_matrix.index.get_level_values('SNP')
    eqtl_matrix = eqtl_matrix.ix[snp_level.isin(maf_euro)]
    print(eqtl_matrix.shape)
    # count_threshold comes from the caller; it is deliberately not
    # overwritten here so a non-default value is honoured.
    outfile = open(base_path +
                   'eQTL/tables/global_aei/{0}_aei_rep.txt'.format(chrom),
                   'w+')
    # try/finally so the table is closed even when a gene raises.
    try:
        eqtl_matrix = eqtl_matrix.swaplevel(0, 1)
        eqtl = eqtl_matrix.groupby(level=0)

        # :TODO these p-values are collected but not written anywhere yet.
        pvalues_fdr_calc = []
        # Per gene: label of the row with the smallest eQTL p-value; element
        # [1] of that label is used below as the SNP.
        idx = eqtl.apply(lambda x: x['p-value'].argmin())
        print(list(eqtl.groups.keys())[0:10])

        for i, j in gene_snps.iteritems():
            # NOTE(review): hard-coded single-gene filter.
            if i != 'ENSG00000054654':
                print('going')
                continue
            symbol = get_symbol_ensembl(i)
            print(i, symbol)
            empty_out = [str(symbol), str(i), str(chrom)] + ['NA'] * 12
            aei_t = aei.ix[j, :]
            aei_t = aei_t.ix[aei_t.sum(axis=1) >= count_threshold, :]
            if aei_t.shape[0] == 0:
                outfile.write("\t".join(empty_out) + "\n")
                # Nothing left to test.  Without this the gene falls through
                # and can receive a second placeholder row further down.
                continue
            # Or grab from eQTL.matrix?
            snps_cis = [eqtl_i[1] for eqtl_i in eqtl.groups[i]]
            gaei2 = AEI(aei_t, dos.ix[snps_cis, :], s_ann.ix[snps_cis, :], i)
            gaei2.calc_aei(num_threshold=20)

            # Drop indicator columns that have no p-value at all.
            bt = gaei2.pvalues
            bt = bt.ix[:, np.logical_not(bt.min().isnull())]

            # Pick the indicator column whose best SNP has the smallest
            # p-value, requiring more than 15 non-outlier samples with
            # genotype 0 or 2 at that SNP.
            cur_best = None
            cur_best_pvalue = 1
            for ind in bt.columns:
                cand_snp = bt.ix[:, ind].idxmin()
                cand_outliers = gaei2.outliers.ix[gaei2.hets_dict[ind],
                                                  ind].values
                cand_geno = dosage_round(
                    gaei2.geno.ix[cand_snp, gaei2.hets_dict[ind]][
                        np.logical_not(cand_outliers)])
                n_homozygous = np.sum((cand_geno == 0) | (cand_geno == 2))
                if (n_homozygous > 15) and (bt.ix[cand_snp, ind] <
                                            cur_best_pvalue):
                    cur_best = ind
                    cur_best_pvalue = bt.ix[cand_snp, ind]

            # Compare against None so a falsy column label still counts.
            if cur_best is None:
                outfile.write("\t".join(empty_out) + "\n")
                continue
            indsnp = cur_best
            pvalue_good = cur_best_pvalue
            cissnp = gaei2.pvalues.ix[:, indsnp].idxmin()

            # :TODO get beta estimate
            pvalues_fdr_calc.extend(gaei2.pvalues.ix[:, indsnp].values)

            # Beta estimates
            try:
                outliers_g = gaei2.outliers.ix[gaei2.hets_dict[indsnp],
                                               indsnp].values
            except KeyError:
                continue
            not_outlier = np.logical_not(outliers_g)
            tgeno = dosage_round(
                gaei2.geno.ix[cissnp, gaei2.hets_dict[indsnp]][not_outlier])
            ar = gaei2.ratios.ix[gaei2.hets_dict[indsnp], indsnp][not_outlier]
            # Fold ratios above 1 to their reciprocal.
            ar[ar > 1] = 1 / ar[ar > 1]
            het_combined = ar[np.array(tgeno == 1)]
            homo_combined = ar[np.array((tgeno == 0) | (tgeno == 2))]
            beta_best = np.mean(het_combined) / np.mean(homo_combined)

            # Same estimate, evaluated at the best eQTL SNP of this gene.
            best_eqtl_snp = idx[i][1]
            try:
                aei_eqtl_best = gaei2.pvalues.ix[best_eqtl_snp, indsnp]
                tgeno = dosage_round(
                    gaei2.geno.ix[best_eqtl_snp,
                                  gaei2.hets_dict[indsnp]][not_outlier])
                het_combined_e = ar[np.array(tgeno == 1)]
                homo_combined_e = ar[np.array((tgeno == 0) | (tgeno == 2))]
                eqtl_best_aei_beta = (np.mean(het_combined_e) /
                                      np.mean(homo_combined_e))
            except KeyError:
                aei_eqtl_best = 'NA'
                eqtl_best_aei_beta = 'NA'
            if cissnp != best_eqtl_snp:
                ldbest = calculate_ld(dos.ix[[cissnp, best_eqtl_snp], :],
                                      cissnp)[best_eqtl_snp] ** 2
            else:
                ldbest = 1

            out_l = [
                symbol,
                i,
                chrom,
                indsnp,
                cissnp,
                beta_best,
                pvalue_good,
                eqtl_matrix.ix[(i, cissnp), 'beta'],
                eqtl_matrix.ix[(i, cissnp), 'p-value'],
                best_eqtl_snp,
                eqtl_best_aei_beta,
                aei_eqtl_best,
                eqtl_matrix.ix[(i, best_eqtl_snp), 'beta'],
                eqtl_matrix.ix[(i, best_eqtl_snp), 'p-value'],
                float(np.sum(outliers_g)) / len(gaei2.hets_dict[indsnp]),
                ldbest,
            ]
            if (symbol == str(debug)) or (i == str(debug)):
                embed()
                break
            outfile.write(
                "\t".join([str(out_column) for out_column in out_l]) + "\n")
    finally:
        outfile.close()
Esempio n. 7
0
def plot_eQTL(meQTL,
              gene_name,
              annotation,
              dosage,
              ax=None,
              symbol=None,
              focus_snp=None,
              gene_annot=None,
              size_shift=0,
              **kwargs):
    """Scatter -log10 eQTL p-values of one gene against SNP position.

    Marker size follows the folded allele frequency of each SNP; marker
    colour is the value returned by ``calculate_ld`` relative to the
    highlighted SNP.

    Arguments
    ---------
    meQTL - a matrix eQTL dataframe or a series of pvalues
    gene_name - gene name, used to subset meQTL
    annotation - snp annotation dataframe, index is rsID
    dosage - a dosage dataframe
    ax - axis to plot into; a new figure is created when it is None
    symbol - only rebinds the local gene_name, which is not used afterwards
    focus_snp - SNP used as the LD reference; when falsy, the SNP with the
    largest -log10 p-value is used
    gene_annot - unused
    size_shift - constant added to every marker size
    kwargs - forwarded to ax.scatter

    Returns
    -------
    ax when an axis was passed in, otherwise the newly created figure.
    """
    gene_hits = subset_meQTL(meQTL, gene_name)

    # Flatten a MultiIndex down to its SNP level.
    if isinstance(gene_hits.index, pd.core.index.MultiIndex):
        gene_hits.index = gene_hits.index.get_level_values('SNP')

    tick_format = matplotlib.ticker.ScalarFormatter(useOffset=False)
    bases_per_mb = 1e6

    # The p-values may sit in a named column, in the first column, or be
    # the series itself.
    try:
        raw_p = gene_hits.ix[:, 'p-value']
    except IndexError:
        raw_p = gene_hits.iloc[:, 0]
    except pd.core.indexing.IndexingError:
        raw_p = gene_hits.iloc[:]
    adj_pv = -1 * np.log10(raw_p)

    # Positions come from the 'pos' column, falling back to column 1.
    try:
        raw_pos = annotation.ix[gene_hits.index, 'pos']
    except KeyError:
        raw_pos = annotation.ix[gene_hits.index, 1]
    pos = np.asarray(raw_pos, dtype=np.double) / bases_per_mb

    dosage_sub = dosage.ix[gene_hits.index, :]
    print('subset shape')
    print(gene_hits.shape)
    print(kwargs)

    # Fold frequencies above 0.5, then scale them to marker sizes.
    marker_size = calculate_minor_allele_frequency(dosage_sub)
    above_half = marker_size > 0.5
    marker_size[above_half] = 1 - marker_size[above_half]
    marker_size = ((200 * marker_size) + 20) + size_shift

    # :TODO fix for both use cases
    snp = focus_snp if focus_snp else gene_hits.index[np.nanargmax(adj_pv)]
    try:
        hit_rows = [n for n, name in enumerate(gene_hits["SNP"])
                    if name == snp]
    except KeyError:
        hit_rows = [n for n, name in enumerate(gene_hits.index)
                    if name == snp]
    # Need this since pos is a numpy array not pandas series.  Neither value
    # is used further, but the lookups raise IndexError when snp is absent.
    snpx = pos[hit_rows[0]]
    snp_pv = adj_pv.iloc[hit_rows[0]]
    color_ld = calculate_ld(dosage_sub, snp)[dosage_sub.index].values

    made_figure = ax is None
    if made_figure:
        fig, ax = plt.subplots(nrows=1,
                               ncols=1,
                               figsize=(16, 6),
                               sharey=False,
                               sharex=True,
                               subplot_kw=dict(axisbg='#FFFFFF'))
        fig.tight_layout()
        fig.subplots_adjust(right=0.8, bottom=0.2)

    ax.set_xlim((min(pos) - 0.01, max(pos) + 0.01))
    y_top = (max(adj_pv) + max(adj_pv / 6.0))
    ax.set_ylim((-0.01, y_top))
    ax.xaxis.set_major_formatter(tick_format)
    ### Actual scatter #############################
    im = ax.scatter(pos, adj_pv, s=marker_size, c=color_ld, **kwargs)
    #:TODO make the arrow into a function
    ax.set_ylabel(r'$-log_{10}$ eQTL p-value')
    ax.set_xlabel(r'Position (Mb)')
    if symbol:
        gene_name = symbol

    if not made_figure:
        return ax
    # Only a figure created here gets a colour bar.
    cbar_ax = fig.add_axes([0.87, 0.15, 0.05, 0.7])
    bar = fig.colorbar(im, cax=cbar_ax)
    bar.ax.tick_params(labelsize=18)
    bar.set_label('r$^{2}$')
    return fig
Esempio n. 8
0
def _all_eQTL_comp(chrom, base_path, debug=None, count_threshold=200):
    """Write a per-gene table comparing the best AEI SNP with the best eQTL SNP.

    Input paths are built from ``base_path`` and the entries of ``test.cfg``.
    The table is written, tab separated and without a header line, to
    ``<base_path>eQTL/tables/global_aei/<chrom>_aei_rep.txt``.

    Arguments
    ---------
    chrom - chromosome identifier, substituted into the configured paths
    base_path - prefix prepended to every configured path
    debug - gene symbol or Ensembl ID; when the loop reaches that gene it
    drops into ``embed()`` and stops before writing the gene's row
    count_threshold - minimum summed count for a row of the AEI table to be
    kept

    NOTE(review): the loop currently skips every gene except
    'ENSG00000054654'; this looks like a debugging leftover -- confirm
    before relying on the output.
    NOTE(review): the 'NA' placeholder row has 15 fields while a regular row
    has 16 (the LD column has no placeholder) -- confirm which is intended.
    """
    chrom_dict = {'chrom': str(chrom)}
    config = ConfigParser.ConfigParser()
    config.read('test.cfg')
    aei_base = base_path + config.get('data', 'aei_prefix')
    dos_path = base_path + config.get('data', 'dosage_prefix', 0, chrom_dict)
    snp_annot = base_path + config.get('annotation', 'snp_annot_prefix', 0,
                                       chrom_dict)
    gene_snps_path = base_path + config.get('annotation', 'gene_snps', 0,
                                            chrom_dict)
    eqtl_path = base_path + config.get('data', 'eqtl_prefix', 0, chrom_dict)

    ###### Loading the data ##########
    aei = get_aei(aei_base + 'chr{chrom!s}.pkl'.format(chrom=chrom))
    dos = pd.read_csv(dos_path, sep=" ", index_col=0, header=0)
    s_ann = pd.read_pickle(snp_annot)
    gene_snps = pd.read_pickle(gene_snps_path)
    eqtl_matrix = pd.read_pickle(eqtl_path)

    ##### Restrict to only Europeans
    vsd_counts = pd.read_csv(base_path + 'eQTL/gene_vsd_eQTL_april.txt',
                             sep=",",
                             index_col=0)
    af_euro = calculate_minor_allele_frequency(dos.ix[:, vsd_counts.columns])
    maf_euro = af_euro.index[(af_euro >= 0.01) & (af_euro <= 0.99)]
    snp_level = eqtl_matrix.index.get_level_values('SNP')
    eqtl_matrix = eqtl_matrix.ix[snp_level.isin(maf_euro)]
    print(eqtl_matrix.shape)
    # count_threshold comes from the caller; it is deliberately not
    # overwritten here so a non-default value is honoured.
    outfile = open(base_path +
                   'eQTL/tables/global_aei/{0}_aei_rep.txt'.format(chrom),
                   'w+')
    # try/finally so the table is closed even when a gene raises.
    try:
        eqtl_matrix = eqtl_matrix.swaplevel(0, 1)
        eqtl = eqtl_matrix.groupby(level=0)

        # :TODO these p-values are collected but not written anywhere yet.
        pvalues_fdr_calc = []
        # Per gene: label of the row with the smallest eQTL p-value; element
        # [1] of that label is used below as the SNP.
        idx = eqtl.apply(lambda x: x['p-value'].argmin())
        print(list(eqtl.groups.keys())[0:10])

        for i, j in gene_snps.iteritems():
            # NOTE(review): hard-coded single-gene filter.
            if i != 'ENSG00000054654':
                print('going')
                continue
            symbol = get_symbol_ensembl(i)
            print(i, symbol)
            empty_out = [str(symbol), str(i), str(chrom)] + ['NA'] * 12
            aei_t = aei.ix[j, :]
            aei_t = aei_t.ix[aei_t.sum(axis=1) >= count_threshold, :]
            if aei_t.shape[0] == 0:
                outfile.write("\t".join(empty_out) + "\n")
                # Nothing left to test.  Without this the gene falls through
                # and can receive a second placeholder row further down.
                continue
            # Or grab from eQTL.matrix?
            snps_cis = [eqtl_i[1] for eqtl_i in eqtl.groups[i]]
            gaei2 = AEI(aei_t, dos.ix[snps_cis, :], s_ann.ix[snps_cis, :], i)
            gaei2.calc_aei(num_threshold=20)

            # Drop indicator columns that have no p-value at all.
            bt = gaei2.pvalues
            bt = bt.ix[:, np.logical_not(bt.min().isnull())]

            # Pick the indicator column whose best SNP has the smallest
            # p-value, requiring more than 15 non-outlier samples with
            # genotype 0 or 2 at that SNP.
            cur_best = None
            cur_best_pvalue = 1
            for ind in bt.columns:
                cand_snp = bt.ix[:, ind].idxmin()
                cand_outliers = gaei2.outliers.ix[gaei2.hets_dict[ind],
                                                  ind].values
                cand_geno = dosage_round(
                    gaei2.geno.ix[cand_snp, gaei2.hets_dict[ind]][
                        np.logical_not(cand_outliers)])
                n_homozygous = np.sum((cand_geno == 0) | (cand_geno == 2))
                if (n_homozygous > 15) and (bt.ix[cand_snp, ind] <
                                            cur_best_pvalue):
                    cur_best = ind
                    cur_best_pvalue = bt.ix[cand_snp, ind]

            # Compare against None so a falsy column label still counts.
            if cur_best is None:
                outfile.write("\t".join(empty_out) + "\n")
                continue
            indsnp = cur_best
            pvalue_good = cur_best_pvalue
            cissnp = gaei2.pvalues.ix[:, indsnp].idxmin()

            # :TODO get beta estimate
            pvalues_fdr_calc.extend(gaei2.pvalues.ix[:, indsnp].values)

            # Beta estimates
            try:
                outliers_g = gaei2.outliers.ix[gaei2.hets_dict[indsnp],
                                               indsnp].values
            except KeyError:
                continue
            not_outlier = np.logical_not(outliers_g)
            tgeno = dosage_round(
                gaei2.geno.ix[cissnp, gaei2.hets_dict[indsnp]][not_outlier])
            ar = gaei2.ratios.ix[gaei2.hets_dict[indsnp], indsnp][not_outlier]
            # Fold ratios above 1 to their reciprocal.
            ar[ar > 1] = 1 / ar[ar > 1]
            het_combined = ar[np.array(tgeno == 1)]
            homo_combined = ar[np.array((tgeno == 0) | (tgeno == 2))]
            beta_best = np.mean(het_combined) / np.mean(homo_combined)

            # Same estimate, evaluated at the best eQTL SNP of this gene.
            best_eqtl_snp = idx[i][1]
            try:
                aei_eqtl_best = gaei2.pvalues.ix[best_eqtl_snp, indsnp]
                tgeno = dosage_round(
                    gaei2.geno.ix[best_eqtl_snp,
                                  gaei2.hets_dict[indsnp]][not_outlier])
                het_combined_e = ar[np.array(tgeno == 1)]
                homo_combined_e = ar[np.array((tgeno == 0) | (tgeno == 2))]
                eqtl_best_aei_beta = (np.mean(het_combined_e) /
                                      np.mean(homo_combined_e))
            except KeyError:
                aei_eqtl_best = 'NA'
                eqtl_best_aei_beta = 'NA'
            if cissnp != best_eqtl_snp:
                ldbest = calculate_ld(dos.ix[[cissnp, best_eqtl_snp], :],
                                      cissnp)[best_eqtl_snp] ** 2
            else:
                ldbest = 1

            out_l = [
                symbol,
                i,
                chrom,
                indsnp,
                cissnp,
                beta_best,
                pvalue_good,
                eqtl_matrix.ix[(i, cissnp), 'beta'],
                eqtl_matrix.ix[(i, cissnp), 'p-value'],
                best_eqtl_snp,
                eqtl_best_aei_beta,
                aei_eqtl_best,
                eqtl_matrix.ix[(i, best_eqtl_snp), 'beta'],
                eqtl_matrix.ix[(i, best_eqtl_snp), 'p-value'],
                float(np.sum(outliers_g)) / len(gaei2.hets_dict[indsnp]),
                ldbest,
            ]
            if (symbol == str(debug)) or (i == str(debug)):
                embed()
                break
            outfile.write(
                "\t".join([str(out_column) for out_column in out_l]) + "\n")
    finally:
        outfile.close()
Esempio n. 9
0
def plot_eQTL(meQTL, gene_name, annotation, dosage, ax=None,
              symbol=None, focus_snp=None, gene_annot=None, size_shift=0,
              **kwargs):
    """Scatter -log10 eQTL p-values of one gene against SNP position.

    Marker size follows the folded allele frequency of each SNP; marker
    colour is the value returned by ``calculate_ld`` relative to the
    highlighted SNP.

    Arguments
    ---------
    meQTL - a matrix eQTL dataframe or a series of pvalues
    gene_name - gene name, used to subset meQTL
    annotation - snp annotation dataframe, index is rsID
    dosage - a dosage dataframe
    ax - axis to plot into; a new figure is created when it is None
    symbol - accepted for compatibility; currently has no effect
    focus_snp - SNP used as the LD reference; when falsy, the SNP with the
    largest -log10 p-value is used
    gene_annot - unused
    size_shift - constant added to every marker size
    kwargs - extra keyword arguments forwarded to ax.scatter

    Returns
    -------
    ax when an axis was passed in, otherwise the newly created figure.

    Raises
    ------
    IndexError when the highlighted SNP is not among the gene's SNPs.
    """
    subset = subset_meQTL(meQTL, gene_name)

    # Flatten a MultiIndex down to its SNP level.
    if isinstance(subset.index, pd.MultiIndex):
        subset.index = subset.index.get_level_values('SNP')

    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    x_scale = 1e6
    # The p-values may sit in a named column, in the first column, or be
    # the series itself.
    try:
        adj_pv = -1 * np.log10(subset.ix[:, 'p-value'])
    except IndexError:
        adj_pv = -1 * np.log10(subset.iloc[:, 0])
    except pd.core.indexing.IndexingError:
        adj_pv = -1 * np.log10(subset.iloc[:])
    # Positions come from the 'pos' column, falling back to column 1.
    try:
        pos = np.asarray(annotation.ix[subset.index, 'pos'],
                         dtype=np.double) / x_scale
    except KeyError:
        pos = np.asarray(annotation.ix[subset.index, 1],
                         dtype=np.double) / x_scale
    dosage_sub = dosage.ix[subset.index, :]
    print('subset shape')
    print(subset.shape)

    # Fold frequencies above 0.5, then scale them to marker sizes.
    dosage_maf = calculate_minor_allele_frequency(dosage_sub)
    above_half = dosage_maf > 0.5
    dosage_maf[above_half] = 1 - dosage_maf[above_half]
    dosage_maf = ((200 * dosage_maf) + 20) + size_shift
    if focus_snp:
        snp = focus_snp
    else:
        # :TODO fix for both use cases
        snp = subset.index[np.nanargmax(adj_pv)]
    try:
        iix = [i for i, j in enumerate(subset["SNP"]) if j == snp]
    except KeyError:
        iix = [i for i, j in enumerate(subset.index) if j == snp]
    # Fail early, with a readable message, when the SNP is not plotted.
    if not iix:
        raise IndexError('SNP {0!s} is not among the SNPs of {1!s}'.format(
            snp, gene_name))
    color1 = calculate_ld(dosage_sub, snp)[dosage_sub.index].values
    ax_orig = ax is not None
    if not ax_orig:
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 6),
                               sharey=False, sharex=True,
                               subplot_kw=dict(axisbg='#FFFFFF'))
        fig.tight_layout()
        fig.subplots_adjust(right=0.8, bottom=0.2)
    ax.set_xlim((min(pos) - 0.01, max(pos) + 0.01))
    ylim = (max(adj_pv) + max(adj_pv / 6.0))
    ax.set_ylim((-0.01, ylim))
    ax.xaxis.set_major_formatter(x_formatter)
    ### Actual scatter #############################
    # Forward the caller's keyword arguments instead of dropping them.
    im = ax.scatter(pos, adj_pv, s=dosage_maf, c=color1, **kwargs)
    #:TODO make the arrow into a function
    ax.set_ylabel(r'$-log_{10}$ eQTL p-value')
    ax.set_xlabel(r'Position (Mb)')
    if ax_orig:
        return ax
    # Only a figure created here gets a colour bar.
    cbar_ax = fig.add_axes([0.87, 0.15, 0.05, 0.7])
    bar = fig.colorbar(im, cax=cbar_ax)
    bar.ax.tick_params(labelsize=18)
    bar.set_label('r$^{2}$')
    return fig
Esempio n. 10
0
def main(gene, de, rsid, expr, cov=None):
    """Test a differential splicing event against nearby SNPs and plot it.

    For every sample BAM the reads spanning the junctions of the two
    transcripts in ``de`` are counted, turned into a per-sample proportion,
    and that proportion is fitted against each genotype with
    ``lin_test.test_transcript``.  Three figures are written below
    ``base_path + 'eQTL/graphs/'``.

    # Refactor transcript order shouldn't matter
    # :TODO refactor so that it can handle multiple transcripts

    Arguments
    ---------
    gene : genda.transcripts.gene object; ``chrom``, ``transcripts`` and
        ``symbol`` are read from it.
    de : diffevent; ``exon_num``, ``tid``, ``cigar1``/``cigar2`` and
        ``exon2`` are read from it, and ``tid`` is reassigned to its first
        two entries.
    rsid : a SNP within the region, handed to ``get_genotype``.
    expr : only ``expr.columns`` is used, to select genotype columns.
    cov : accepted but not used.

    Returns
    -------
    propi : ``(n_t1 + 1) / intron_factor / (n_t1 / intron_factor + n_t2)``
        per sample, where ``n_t1``/``n_t2`` are the ``out_frame`` columns
        named ``de.tid[0]``/``de.tid[1]``.

    Notes
    -----
    ``gi`` is not defined in this function; it has to exist at module level.
    All input paths, the focus SNP and the output column names are
    hard-coded.
    """
    # :TODO add this to configure parser
    chrom = gene.chrom
    base_path = '/home/hsuj/Afib/'
    sann = pd.read_pickle(base_path + 'ref/snp_annot/{0!s}.pkl'.format(chrom))
    sample_mappings = pd.read_csv(
        '/home/hsuj/lustre/sample_mapping.txt',
        sep="\t",
        index_col=0,
    )
    # str.rstrip('.bam') strips any trailing '.', 'b', 'a' or 'm' characters
    # (e.g. 'samba.bam' -> 's'), so the suffix is removed explicitly instead.
    bam_suffix = '.bam'
    bam_names = [i.split("/")[-1] for i in sample_mappings.ix[:, 2]]
    sample_mappings['new_name'] = [
        i[:-len(bam_suffix)] if i.endswith(bam_suffix) else i
        for i in bam_names
    ]
    # Column totals of the gene table; used below to scale coverage and
    # junction counts of the example samples.
    coverages = pd.read_csv('/home/hsuj/Afib/eQTL/gene_data.txt',
                            sep=",",
                            index_col=0)
    coverages = coverages.sum(axis=0)
    ## PCA covariates
    cov_mat = pd.read_csv('/home/hsuj/Afib/eQTL/Exons/pca_final.csv',
                          sep=",",
                          index_col=0)
    # Need at least 2 transcripts to compare.
    path_dict = gene.transcripts
    ## Handle genotypes ############################################
    sann['pos'] = sann['pos'].astype(int)
    geno = get_genotype(chrom, rsid).ix[:, expr.columns]
    gaf = calculate_minor_allele_frequency(geno)
    gaf = ((gaf >= 0.10) & (gaf <= 0.90)).values
    geno = geno.ix[gaf, :]
    ################################################################

    plot_dict = {}
    fig2, ax2 = plt.subplots(figsize=(10, 10), nrows=4, sharex=True)
    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    # ``sea`` indexes the transcript whose exon_num entry is truthy.
    sea = 0 if de.exon_num[0] else 1
    eoi = [de.exon_num[sea]]
    cigar_skipped = getattr(de, 'cigar{0!s}'.format(sea * 1 + 1))
    cigar_other = getattr(de, 'cigar{0!s}'.format(2 - 1 * sea))
    if len(cigar_skipped) > 3:
        eoi.append(eoi[0] + 1)
    cpath = path_dict[de.tid[sea]]
    # CIGAR operation 3 entries are the intron (skip) lengths matched
    # against the reads further down.
    eoi_intron_lengths = [i[1] for i in cigar_skipped if i[0] == 3]
    eoi2 = [abs(i[1]) for i in cigar_other if i[0] == 3]
    # Move generation of to plot into diffevents?
    to_plot1 = [
        i for i in cpath if (i[2] >= min(eoi) - 1) and (i[2] <= max(eoi) + 1)
    ]
    plot_dict[de.tid[sea]] = to_plot1

    try:
        to_plot2 = [
            i for i in path_dict[de.tid[1 - sea]]
            if i[2] in [de.exon2[0], de.exon2[1]]
        ]
    except TypeError:
        # For alternate first exons
        to_plot2 = [[i[1], i[2], 2] for i in cigar_other if i[0] == 0]
    plot_dict[de.tid[1 - sea]] = to_plot2
    xmin = min(to_plot1[0][0], to_plot2[0][0])
    try:
        xmax = max(to_plot1[2][1], to_plot2[1][1])
    except IndexError:
        # AFE exception
        xmax = to_plot1[1][1]
    sample_mappings = sample_mappings.ix[
        np.logical_not(sample_mappings.index.duplicated()), :]
    nsamples = sample_mappings.shape[0]
    buffer_bp = 0
    t1 = de.tid[0]
    t2 = de.tid[1]
    all_juncs = [eoi_intron_lengths, eoi2]
    transcript_min = min([i[0] for i in gi.transcripts[de.tid[0]]])
    transcript_max = max([i[1] for i in gi.transcripts[de.tid[1]]])
    out_frame = pd.DataFrame(
        {
            de.tid[sea]: np.zeros(nsamples),
            de.tid[sea - 1]: np.zeros(nsamples),
        },
        index=sample_mappings.index)
    ofc = pd.DataFrame(np.zeros((nsamples, len(gi.transcripts))),
                       index=sample_mappings.index,
                       columns=gi.transcripts.keys())
    # Exon sets handed to count_reads; identical for every sample.
    exon_sets = [gi.transcripts[de.tid[0]][:-3], gi.transcripts[de.tid[1]]]
    # Move this to function
    for bami in sample_mappings.index:
        # :TODO Make this more generalizable
        fname = sample_mappings.ix[bami, 2].split("/")[-1]
        bamf = pysam.Samfile('/home/hsuj/lustre/AFdata/{0!s}'.format(fname))
        try:
            #################### Getting intron junctions counts
            # :TODO convert to cython
            intron_counts = np.zeros(2, dtype=np.int32)
            for read in bamf.fetch('chr' + chrom, xmin - buffer_bp,
                                   xmax + buffer_bp):
                introns = [j[1] for j in read.cigar if j[0] == 3]
                # This depends on there not being other exact intron sizes
                for knum, ijunc in enumerate(all_juncs):
                    if any(zi in introns for zi in ijunc):
                        intron_counts[knum] += 1
            # Written by position, not by column label.
            out_frame.ix[bami, 0:3] = intron_counts
            for exons, tid in zip(exon_sets, de.tid):
                bamiter = bamf.fetch('chr' + chrom, transcript_min,
                                     transcript_max)
                ofc.ix[bami, tid] = count_reads(exons, bamiter)
        finally:
            # One handle per sample; close it so descriptors do not pile up.
            bamf.close()
    # Filter low counts
    count_threshold = 0
    out_frame = out_frame.ix[out_frame.sum(axis=1) > count_threshold, :]
    intron_factor = 1
    # Refactor this to a function
    propi = (out_frame.ix[:, t1] + 1) / intron_factor / (
        out_frame.ix[:, t1] / intron_factor + out_frame.ix[:, t2])
    X = cov_mat.T
    X = sm.add_constant(X)
    from lin_test import test_transcript
    # Let's get all the SNPs that fall within a certain region
    # Adding cav1_beta1 for plotting purposes
    de.tid = [de.tid[0], de.tid[1]]
    out_frame.columns = [
        'ENST00000502471',
        'ENST00000033079',
    ]
    for i, j in enumerate(de.tid):
        ax2[0] = plot_transcript(j, plot_dict, ax2[0], y=i * 2.5, height=2.)
        t_xmin = min([k[0] for k in plot_dict[j]])
        t_xmax = max([k[1] for k in plot_dict[j]])
        ax2[0].hlines((i * 2.5 + 2) - 1,
                      t_xmin,
                      t_xmax,
                      colors='darkgrey',
                      lw=2)
        ax2[0].xaxis.set_major_formatter(x_formatter)
    ax2[0].get_yaxis().set_ticks([])
    ax2[0] = remove_tr_spines(ax2[0])
    goi = geno.ix[:, out_frame.index]
    gfits = goi.apply(test_transcript, axis=1, args=(X, propi[X.index]))
    pvalues = [i.pvalues['geno'] for i in gfits]
    best_snp = 'rs17171731'
    pvalues = pd.Series(pvalues, index=geno.index)
    example_ylims = []
    # One coverage panel per genotype value (0, 1, 2) of the focus SNP,
    # drawn from one randomly chosen sample carrying that genotype.
    for i in range(3):
        if i == 0:
            geno_string = sann.ix[best_snp, 'a0'] * 2
        elif i == 1:
            geno_string = sann.ix[best_snp, 'a0'] +\
                    sann.ix[best_snp, 'a1']
        elif i == 2:
            geno_string = sann.ix[best_snp, 'a1'] * 2
        hist = np.zeros(xmax - xmin, dtype=np.uint64)
        c_geno = (goi.ix[best_snp, :] == i)
        # Random from out_Frame
        try:
            b_example = np.random.choice(goi.columns[c_geno.values], size=1)[0]
        except ValueError:
            # No sample carries this genotype; leave the panel empty.
            continue
        het = pysam.Samfile(sample_mappings.ix[b_example, 2])
        try:
            for read in het.fetch('chr' + str(chrom), xmin, xmax):
                coverage_hist(read, hist, xmin)
        finally:
            het.close()
        color = plt.rcParams['axes.color_cycle'][i]
        hist = 1e3 * hist / coverages[b_example]
        ax2[i + 1].plot(np.linspace(xmin, xmax, num=len(hist)), hist, color)
        ax2[i + 1].fill_between(np.arange(xmin, xmax),
                                0,
                                hist,
                                facecolor=color)
        example_ylims.append(np.max(hist))
        # Need this to draw between every single one
        for tran_i in de.tid:
            junc_norm = 1e3 * out_frame.ix[b_example,
                                           tran_i] / coverages[b_example]
            ax2[i + 1] = draw_junction_arcs(plot_dict[tran_i],
                                            hist,
                                            xmin,
                                            ax2[i + 1],
                                            color=color,
                                            text=junc_norm,
                                            y_buffer=np.max(hist) * 0.20)
        ax2[i + 1].set_ylabel('{0} Genotype\n RPKM'.format(geno_string))
        # Resave the pickeld file with correct int type
    # max() of an empty list raises; only rescale when a panel was drawn.
    if example_ylims:
        example_ylim = max(example_ylims) * 1.2
        for i in range(3):
            ax2[i + 1].set_ylim((0, example_ylim))
    ax2[0].text((xmax - xmin) / 2, ax2[0].get_ylim()[1] - 1, str(min(pvalues)))
    ax2[0].axvline(sann.ix[best_snp, 'pos'], color='r', linestyle='solid')
    ax2[0].set_title('{0!s}'.format(gene.symbol))
    ax2[-1].set_xlabel('Position')
    # NOTE(review): embed() is not defined in this function; presumably an
    # interactive debugging shell that pauses the run -- confirm before
    # using this non-interactively.
    embed()
    fig, ax = plt.subplots(nrows=3, sharex=True)
    ax[0] = plot_eQTL(pvalues,
                      'FAM13B',
                      sann,
                      goi,
                      ax=ax[0],
                      focus_snp=best_snp)
    plt.tight_layout()
    fig.savefig(base_path +\
            'eQTL/graphs/{0!s}_cis_eqtl_transcript.png'.format(gene.symbol))
    fig, ax = plt.subplots(figsize=(6, 10), nrows=2)
    # One dosage panel per transcript of the event.
    for panel, tid in enumerate(de.tid):
        gr = gene_reference(chrom=5, gene=tid, rsID=best_snp)
        ax[panel], _ = plot_dosage_by_rsID(gr,
                                           goi,
                                           X,
                                           out_frame.ix[X.index, :].T,
                                           ax=ax[panel])
        ax[panel].set_title(tid)
    plt.tight_layout()
    fig.savefig(base_path +\
            'eQTL/graphs/CAV1_bestfig.png')
    fig2.savefig(base_path +
                 'eQTL/graphs/{0!s}_transcript.png'.format(gene.symbol))
    embed()
    return (propi)
Esempio n. 11
0
def main(gene, de, rsid, expr, cov=None):
    """Test a differential splicing event against nearby SNPs and plot it.

    Junction-spanning reads of the two transcripts in ``de`` are counted in
    every sample BAM, converted to a per-sample proportion, and fitted
    against each genotype with ``lin_test.test_transcript``.  Three figures
    are saved below ``base_path + 'eQTL/graphs/'``.

    # Refactor transcript order shouldn't matter
    # :TODO refactor so that it can handle multiple transcripts

    Arguments
    ---------
    gene : genda.transcripts.gene object; ``chrom``, ``transcripts`` and
        ``symbol`` are read from it.
    de : diffevent; ``exon_num``, ``tid``, ``cigar1``/``cigar2`` and
        ``exon2`` are read from it, and ``tid`` is reassigned to its first
        two entries.
    rsid : a SNP within the region, handed to ``get_genotype``.
    expr : only ``expr.columns`` is used, to select genotype columns.
    cov : accepted but not used.

    Returns
    -------
    propi : ``(n_t1 + 1) / intron_factor / (n_t1 / intron_factor + n_t2)``
        per sample, where ``n_t1``/``n_t2`` are the ``out_frame`` columns
        named ``de.tid[0]``/``de.tid[1]``.

    Notes
    -----
    ``gi`` is not defined in this function; it has to exist at module level.
    All input paths, the focus SNP and the output column names are
    hard-coded.
    """
    # :TODO add this to configure parser
    chrom = gene.chrom
    base_path = '/home/hsuj/Afib/'
    sann = pd.read_pickle(base_path + 'ref/snp_annot/{0!s}.pkl'.format(chrom))
    sample_mappings = pd.read_csv('/home/hsuj/lustre/sample_mapping.txt',
            sep="\t", index_col=0,)
    # str.rstrip('.bam') removes any run of trailing '.', 'b', 'a', 'm'
    # characters rather than the suffix, so cut the suffix off explicitly.
    suffix = '.bam'
    basenames = [i.split("/")[-1] for i in sample_mappings.ix[:, 2]]
    sample_mappings['new_name'] = [
            i[:-len(suffix)] if i.endswith(suffix) else i for i in basenames]
    # Column totals of the gene table; used below to scale coverage and
    # junction counts of the example samples.
    coverages = pd.read_csv('/home/hsuj/Afib/eQTL/gene_data.txt',
            sep=",", index_col=0)
    coverages = coverages.sum(axis=0)
    ## PCA covariates
    cov_mat = pd.read_csv('/home/hsuj/Afib/eQTL/Exons/pca_final.csv', sep=",",
            index_col=0)
    # Need at least 2 transcripts to compare.
    path_dict = gene.transcripts
    ## Handle genotypes ############################################
    sann['pos'] = sann['pos'].astype(int)
    geno = get_genotype(chrom, rsid).ix[:, expr.columns]
    gaf = calculate_minor_allele_frequency(geno)
    gaf = ((gaf >= 0.10) & (gaf <= 0.90)).values
    geno = geno.ix[gaf, :]
    ################################################################

    plot_dict = {}
    fig2, ax2 = plt.subplots(figsize=(10, 10), nrows=4,
            sharex=True)
    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    # ``sea`` indexes the transcript whose exon_num entry is truthy.
    sea = 0 if de.exon_num[0] else 1
    eoi = [de.exon_num[sea]]
    cigar_skipped = getattr(de, 'cigar{0!s}'.format(sea*1 + 1))
    cigar_other = getattr(de, 'cigar{0!s}'.format(2 - 1*sea))
    if len(cigar_skipped) > 3:
        eoi.append(eoi[0] + 1)
    cpath = path_dict[de.tid[sea]]
    # CIGAR operation 3 entries are the intron (skip) lengths matched
    # against the reads further down.
    eoi_intron_lengths = [i[1] for i in cigar_skipped if i[0] == 3]
    eoi2 = [abs(i[1]) for i in cigar_other if i[0] == 3]
    # Move generation of to plot into diffevents?
    to_plot1 = [i for i in cpath
            if (i[2] >= min(eoi) - 1) and (i[2] <= max(eoi) + 1)]
    plot_dict[de.tid[sea]] = to_plot1

    try:
        to_plot2 = [i for i in path_dict[de.tid[1-sea]] if i[2] in
                [de.exon2[0], de.exon2[1]]]
    except TypeError:
        # For alternate first exons
        to_plot2 = [[i[1], i[2], 2] for i in cigar_other if i[0] == 0]
    plot_dict[de.tid[1-sea]] = to_plot2
    xmin = min(to_plot1[0][0], to_plot2[0][0])
    try:
        xmax = max(to_plot1[2][1], to_plot2[1][1])
    except IndexError:
        # AFE exception
        xmax = to_plot1[1][1]
    sample_mappings = sample_mappings.ix[
            np.logical_not(sample_mappings.index.duplicated()), :]
    nsamples = sample_mappings.shape[0]
    buffer_bp = 0
    t1 = de.tid[0]
    t2 = de.tid[1]
    all_juncs = [eoi_intron_lengths, eoi2]
    transcript_min = min([i[0] for i in gi.transcripts[de.tid[0]]])
    transcript_max = max([i[1] for i in gi.transcripts[de.tid[1]]])
    out_frame = pd.DataFrame({de.tid[sea]: np.zeros(nsamples),
        de.tid[sea-1]: np.zeros(nsamples),
        },
        index=sample_mappings.index)
    ofc = pd.DataFrame(
            np.zeros((nsamples, len(gi.transcripts))),
            index=sample_mappings.index, columns=gi.transcripts.keys())
    # Exon sets handed to count_reads; identical for every sample.
    exon_sets = [gi.transcripts[de.tid[0]][:-3], gi.transcripts[de.tid[1]]]
    # Move this to function
    for bami in sample_mappings.index:
        # :TODO Make this more generalizable
        fname = sample_mappings.ix[bami, 2].split("/")[-1]
        bamf = pysam.Samfile('/home/hsuj/lustre/AFdata/{0!s}'.format(fname))
        try:
            #################### Getting intron junctions counts
            # :TODO convert to cython
            intron_counts = np.zeros(2, dtype=np.int32)
            for read in bamf.fetch('chr' + chrom, xmin - buffer_bp,
                    xmax + buffer_bp):
                introns = [j[1] for j in read.cigar if j[0] == 3]
                # This depends on there not being other exact intron sizes
                for knum, ijunc in enumerate(all_juncs):
                    if any(zi in introns for zi in ijunc):
                        intron_counts[knum] += 1
            # Written by position, not by column label.
            out_frame.ix[bami, 0:3] = intron_counts
            for exons, tid in zip(exon_sets, de.tid):
                bamiter = bamf.fetch('chr' + chrom, transcript_min,
                        transcript_max)
                ofc.ix[bami, tid] = count_reads(exons, bamiter)
        finally:
            # One handle per sample; close it so descriptors do not pile up.
            bamf.close()
    # Filter low counts
    count_threshold = 0
    out_frame = out_frame.ix[out_frame.sum(axis=1) > count_threshold, :]
    intron_factor = 1
    # Refactor this to a function
    propi = (out_frame.ix[:, t1] + 1) / intron_factor / (
            out_frame.ix[:, t1] / intron_factor + out_frame.ix[:, t2])
    X = cov_mat.T
    X = sm.add_constant(X)
    from lin_test import test_transcript
    # Let's get all the SNPs that fall within a certain region
    # Adding cav1_beta1 for plotting purposes
    de.tid = [de.tid[0], de.tid[1]]
    out_frame.columns = ['ENST00000502471', 'ENST00000033079',]
    for i, j in enumerate(de.tid):
        ax2[0] = plot_transcript(j, plot_dict, ax2[0], y=i*2.5,
                height=2.)
        t_xmin = min([k[0] for k in plot_dict[j]])
        t_xmax = max([k[1] for k in plot_dict[j]])
        ax2[0].hlines((i*2.5 + 2) - 1, t_xmin, t_xmax, colors='darkgrey', lw=2)
        ax2[0].xaxis.set_major_formatter(x_formatter)
    ax2[0].get_yaxis().set_ticks([])
    ax2[0] = remove_tr_spines(ax2[0])
    goi = geno.ix[:, out_frame.index]
    gfits = goi.apply(test_transcript, axis=1, args=(X, propi[X.index]))
    pvalues = [i.pvalues['geno'] for i in gfits]
    best_snp = 'rs17171731'
    pvalues = pd.Series(pvalues, index=geno.index)
    example_ylims = []
    # One coverage panel per genotype value (0, 1, 2) of the focus SNP,
    # drawn from one randomly chosen sample carrying that genotype.
    for i in range(3):
        if i == 0:
            geno_string = sann.ix[best_snp, 'a0'] * 2
        elif i == 1:
            geno_string = sann.ix[best_snp, 'a0'] +\
                    sann.ix[best_snp, 'a1']
        elif i == 2:
            geno_string = sann.ix[best_snp, 'a1'] * 2
        hist = np.zeros(xmax - xmin, dtype=np.uint64)
        c_geno = (goi.ix[best_snp, :] == i)
        # Random from out_Frame
        try:
            b_example = np.random.choice(goi.columns[c_geno.values], size=1)[0]
        except ValueError:
            # No sample carries this genotype; leave the panel empty.
            continue
        het = pysam.Samfile(sample_mappings.ix[b_example, 2])
        try:
            for read in het.fetch('chr' + str(chrom), xmin, xmax):
                coverage_hist(read, hist, xmin)
        finally:
            het.close()
        color = plt.rcParams['axes.color_cycle'][i]
        hist = 1e3 * hist/coverages[b_example]
        ax2[i + 1].plot(np.linspace(xmin, xmax, num=len(hist)),
                    hist, color)
        ax2[i + 1].fill_between(np.arange(xmin, xmax), 0, hist, facecolor=color)
        example_ylims.append(np.max(hist))
        # Need this to draw between every single one
        for tran_i in de.tid:
            junc_norm = 1e3 * out_frame.ix[b_example, tran_i]/coverages[b_example]
            ax2[i + 1] = draw_junction_arcs(plot_dict[tran_i], hist, xmin,
                    ax2[i + 1], color=color, text=junc_norm,
                    y_buffer=np.max(hist) * 0.20)
        ax2[i + 1].set_ylabel('{0} Genotype\n RPKM'.format(geno_string))
        # Resave the pickeld file with correct int type
    # max() of an empty list raises; only rescale when a panel was drawn.
    if example_ylims:
        example_ylim = max(example_ylims) * 1.2
        for i in range(3):
            ax2[i + 1].set_ylim((0, example_ylim))
    ax2[0].text((xmax-xmin)/2, ax2[0].get_ylim()[1] - 1, str(min(pvalues)))
    ax2[0].axvline(sann.ix[best_snp, 'pos'], color='r', linestyle='solid')
    ax2[0].set_title('{0!s}'.format(gene.symbol))
    ax2[-1].set_xlabel('Position')
    # NOTE(review): embed() is not defined in this function; presumably an
    # interactive debugging shell that pauses the run -- confirm before
    # using this non-interactively.
    embed()
    fig, ax = plt.subplots(nrows=3, sharex=True)
    ax[0] = plot_eQTL(pvalues, 'FAM13B', sann, goi, ax=ax[0],
            focus_snp=best_snp)
    plt.tight_layout()
    fig.savefig(base_path +\
            'eQTL/graphs/{0!s}_cis_eqtl_transcript.png'.format(gene.symbol))
    fig, ax = plt.subplots(figsize=(6, 10), nrows=2)
    # One dosage panel per transcript of the event.
    for panel, tid in enumerate(de.tid):
        gr = gene_reference(chrom=5, gene=tid,
                rsID=best_snp)
        ax[panel], _ = plot_dosage_by_rsID(gr, goi, X,
                out_frame.ix[X.index, :].T, ax=ax[panel])
        ax[panel].set_title(tid)
    plt.tight_layout()
    fig.savefig(base_path +\
            'eQTL/graphs/CAV1_bestfig.png')
    fig2.savefig(base_path + 'eQTL/graphs/{0!s}_transcript.png'.format(gene.symbol))
    embed()
    return(propi)
Esempio n. 12
0
def main(gene, de, rsid, expr, cov=None):
    """Relate an exon-inclusion proportion to SRSF10 expression and SNPs.

    Reads supporting the included / skipped form of the event ``de`` are
    counted in every sample BAM and turned into a per-sample proportion.
    The proportion is regressed on the expression row ``ENST00000453840``
    plus covariates and the partial-regression plot is saved.  For the genes
    ``CAST`` and ``GDAP1L1`` every genotype is additionally tested with
    ``lin_test.test_transcript`` and a coverage figure is saved.

    # Refactor transcript order shouldn't matter

    Arguments
    ---------
    gene : genda.transcripts.gene object; ``chrom`` and ``symbol`` are read.
    de : event object; ``exon_num``, ``transcript_ids`` and
        ``cigar1``/``cigar2`` are read.
    rsid : a SNP within the region, handed to ``get_genotype``.
    expr : only ``expr.columns`` is used, to select genotype columns.
    cov : accepted but not used.

    Returns
    -------
    propi : ``(n_t1 + 1) / intron_factor / (n_t1 / intron_factor + n_t2)``
        per sample that passed the count filter.

    Notes
    -----
    ``sample_mappings``, ``to_plot1``, ``to_plot2`` and ``test`` are not
    built in this function; they have to exist at module level.
    NOTE(review): ``plot_dict`` is passed to ``plot_transcript`` but never
    filled here -- confirm where its entries are supposed to come from.
    """
    pheno2 = pd.read_csv('gene_pheno_eQTL_april.txt', sep=",", index_col=0)
    pheno2.columns = [i.replace('.', '-') for i in pheno2.columns]
    pheno2 = pheno2.T
    srs = 'ENST00000453840'
    base_path = '/home/hsuj/Afib/'
    # Need at least 2 transcripts to compare.
    chrom = gene.chrom
    sann = pd.read_pickle(base_path + 'ref/snp_annot/{0!s}.pkl'.format(chrom))
    sann['pos'] = sann['pos'].astype(int)
    geno = get_genotype(chrom, rsid).ix[:, expr.columns]
    gaf = calculate_minor_allele_frequency(geno)
    gaf = ((gaf >= 0.10) & (gaf <= 0.90)).values
    geno = geno.ix[gaf, :]
    plot_dict = {}
    fig2, ax2 = plt.subplots(figsize=(10, 10), nrows=4,
            sharex=True)
    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    # ``sea`` indexes the transcript whose exon_num entry is truthy.
    sea = 0 if de.exon_num[0] else 1
    cigar_skipped = getattr(de, 'cigar{0!s}'.format(sea*1 + 1))
    # CIGAR operation 3 entries are the intron (skip) lengths matched
    # against the reads below.
    eoi_intron_lengths = [i[1] for i in cigar_skipped if i[0] == 3]
    eoi2 = getattr(de, 'cigar{0!s}'.format(2 - 1*sea))[0][1]
    # eoi2 may be a single length or a collection of lengths; normalise it
    # once here instead of raising and catching TypeError for every read.
    try:
        eoi2_lengths = list(eoi2)
    except TypeError:
        eoi2_lengths = [eoi2]
    # Move generation of to plot into diffevents?
    xmin = min(to_plot1[0][0], to_plot2[0][0])
    xmax = max(to_plot1[2][1], to_plot2[1][1])
    # A separate local name is required: assigning to ``sample_mappings``
    # here would make that name local to the function and the read on the
    # right-hand side would raise UnboundLocalError.
    samples = sample_mappings.ix[
            np.logical_not(sample_mappings.index.duplicated()), :]
    nsamples = samples.shape[0]
    out_frame = pd.DataFrame({de.transcript_ids[0]: np.zeros(nsamples),
        de.transcript_ids[1]: np.zeros(nsamples)},
        index=samples.index)
    buffer_bp = 0
    t1 = de.transcript_ids[0]
    t2 = de.transcript_ids[1]
    for bami in samples.index:
        # :TODO Make this more generalizable
        fname = samples.ix[bami, 2].split("/")[-1]
        bamf = pysam.Samfile('/home/hsuj/lustre/AFdata/{0!s}'.format(fname))
        # Conver this to cython
        c0 = 0
        c1 = 0
        try:
            for read in bamf.fetch('chr' + chrom, xmin - buffer_bp,
                    xmax + buffer_bp):
                introns = [j[1] for j in read.cigar if j[0] == 3]
                # Probably need to grab exact positions even though we are
                # fetching in small region
                if any(zi in introns for zi in eoi_intron_lengths):
                    c0 += 1
                elif any(zi in introns for zi in eoi2_lengths):
                    c1 += 1
        finally:
            # One handle per sample; close it so descriptors do not pile up.
            bamf.close()
        out_frame.ix[bami, t1] = c0
        out_frame.ix[bami, t2] = c1
    # Filter low counts
    count_threshold = 100
    out_frame = out_frame.ix[out_frame.sum(axis=1) > count_threshold, :]
    # The original compared ``to_plot2`` itself with 2, which raises
    # TypeError on Python 3 and is always true for a list on Python 2.
    # NOTE(review): the number of entries is presumably what was meant --
    # confirm.
    intron_factor = 3 if len(to_plot2) > 2 else 1
    propi = (out_frame.ix[:, t1] + 1) / intron_factor / (
            out_frame.ix[:, t1] / intron_factor + out_frame.ix[:, t2])
    X = test.ix[srs, out_frame.index]
    X = sm.add_constant(X)
    X2 = X.copy()
    X2 = X2.join(pheno2.ix[:, 0:5], how='inner')
    X2['prop'] = propi[X2.index]
    X2['fullsum'] = out_frame.ix[X2.index].sum(axis=1)
    try:
        prop_model =\
                ols('prop~sexFemale+ENST00000453840+MDS4+MDS1+MDS3+MDS2+fullsum',
                data=X2,
                missing='drop').fit()
    except ValueError:
        # NOTE(review): embed() is not defined in this function; presumably
        # an interactive debugging shell -- confirm.
        embed()
        # Without a fitted model the code below cannot run; surface the
        # original error instead of a later NameError on ``prop_model``.
        raise
    fig, ax = plt.subplots(figsize=(6, 6))
    fig = sm.graphics.plot_partregress("prop", "ENST00000453840", ['sexFemale',
        'MDS4', 'fullsum'], data=X2, ax=ax, obs_labels=False)
    ax.text(0.5, ax.get_ylim()[1] - 0.02,
            'p-value: %.2E' % Decimal(prop_model.pvalues['ENST00000453840']),
            size=12)
    ax.set_xlabel('SRSF10 expression')
    ax.set_ylabel('{0!s} included exon / skipped exon proportion'.format(gene.symbol))
    ax.set_title('')
    plt.tight_layout()
    fig.savefig(base_path + 'eQTL/graphs/{0!s}_srsf10_fit.png'.format(gene.symbol))
    print(prop_model.pvalues)
    from lin_test import test_transcript
    # Let's get all the SNPs that fall within a certain region

    if gene.symbol == 'CAST' or gene.symbol == 'GDAP1L1':
        for i, j in enumerate(de.transcript_ids):
            ax2[0] = plot_transcript(j, plot_dict, ax2[0], y=i*2.5,
                    height=2.)
            ax2[0].hlines((i*2.5 + 2) - 1, xmin, xmax, colors='darkgrey', lw=2)
            ax2[0].xaxis.set_major_formatter(x_formatter)
        ax2[0] = remove_tr_spines(ax2[0])
        ax2[0].set_xlim((xmin, xmax))
        ax2[0].set_ylim((-0.5, 2*2.5 + 0.5))
        goi = geno.ix[:, out_frame.index]
        gfits = goi.apply(test_transcript, axis=1, args=(X, propi))
        pvalues = [i.pvalues['geno'] for i in gfits]
        # The genotype with the smallest p-value becomes the focus SNP.
        best_snp = geno.index[np.nanargmin(pvalues)]
        pvalues = pd.Series(pvalues, index=geno.index)
        print(gfits[np.nanargmin(pvalues)].pvalues)
        embed()
        # One coverage panel per genotype value (0, 1, 2) of the focus SNP,
        # drawn from one randomly chosen sample carrying that genotype.
        for i in range(3):
            if i == 0:
                geno_string = sann.ix[best_snp, 'a0'] * 2
            elif i == 1:
                geno_string = sann.ix[best_snp, 'a0'] +\
                        sann.ix[best_snp, 'a1']
            elif i == 2:
                geno_string = sann.ix[best_snp, 'a1'] * 2
            hist = np.zeros(xmax - xmin, dtype=np.uint64)
            c_geno = (goi.ix[best_snp, :] == i)
            # Random from out_Frame
            try:
                b_example = np.random.choice(goi.columns[c_geno.values], size=1)[0]
            except ValueError:
                # No sample carries this genotype; leave the panel empty.
                continue
            het = pysam.Samfile(samples.ix[b_example, 2])
            try:
                for read in het.fetch('chr' + str(chrom), xmin, xmax):
                    coverage_hist(read, hist, xmin)
            finally:
                het.close()
            color = plt.rcParams['axes.color_cycle'][i]

            ax2[i + 1].plot(np.linspace(xmin, xmax, num=len(hist)),
                        hist, color)
            ax2[i + 1].fill_between(np.arange(xmin, xmax), 0, hist, facecolor=color)
            ax2[i + 1].set_ylim((np.min(hist), np.max(hist) + 0.2 * np.max(hist)))
            try:
                ax2[i + 1].text((xmax + xmin)/2, np.max(hist),
                        str(out_frame.ix[b_example, 1]))
            except KeyError:
                # The example sample was removed by the count filter.
                pass
            ax2[i + 1].set_ylabel('{0} Genotype'.format(geno_string))
            # Resave the pickeld file with correct int type
        ax2[0].text((xmax-xmin)/2, ax2[0].get_ylim()[1] - 1, str(min(pvalues)))
        ax2[0].axvline(sann.ix[best_snp, 'pos'], color='r', linestyle='solid')
        ax2[0].set_title('{0!s}'.format(gene.symbol))
        ax2[-1].set_xlabel('Position')
        fig2.savefig(base_path + 'eQTL/graphs/{0!s}_transcript.png'.format(gene.symbol))
        out_frame.columns = ['{0} IE'.format(gene.symbol),
                            '{0} SE'.format(gene.symbol)]
    return(propi)