Exemple #1
0
 def aei_plot_single(self, cissnp, ax=None, focus_snp=None):
     """Scatter-plot -log10 AEI p-values for one column of ``self.pvalues``.

     Every SNP is drawn at its ``self.snp_annot`` position (``pos`` divided
     by 1e6), sized from ``self.maf`` and coloured by the values that
     ``calculate_ld(self.geno, snp)`` returns for the highlighted SNP.

     Arguments
     ---------
     cissnp - column label of ``self.pvalues`` whose p-values are plotted
     ax - axes to draw into; a new figure/axes pair is created when omitted
     focus_snp - label of the SNP to highlight; when omitted, the SNP with
         the largest -log10 p-value (NaNs ignored) is highlighted

     Returns
     -------
     The value returned by ``snp_arrow`` when it is truthy, otherwise the
     figure created by this call.
     """
     x_scale = 1e6
     size_maf = (200 * self.maf) + 20
     pos = np.asarray(self.snp_annot.loc[:, 'pos'], dtype=np.uint64) / x_scale
     if ax is None:
         # NOTE(review): ``axisbg`` is the old matplotlib keyword; newer
         # releases spell it ``facecolor`` - confirm the target version.
         fig, ax = plt.subplots(nrows=1,
                                ncols=1,
                                sharey=False,
                                sharex=True,
                                subplot_kw=dict(axisbg='#FFFFFF'))
     adj_pvalue = -1 * np.log10(self.pvalues.loc[:, cissnp])
     if focus_snp:
         print('focus snp')
         snp = focus_snp
     else:
         # :TODO fix for both use cases
         snp = self.pvalues.index[np.nanargmax(adj_pvalue)]
     print(snp)
     # Position of the first row labelled ``snp``; raises IndexError when
     # the label is not present in the p-value index.
     snp_iloc = [i for i, j in enumerate(adj_pvalue.index) if j == snp][0]
     color1 = calculate_ld(self.geno, snp)[adj_pvalue.index].values
     ax.scatter(pos, adj_pvalue, s=size_maf, c=color1)
     ax.set_ylabel(r'-$log_{10}$ AEI p-value')
     # NaN-aware peak: builtin max() gives order-dependent results when the
     # series contains NaN, which nanargmax above already anticipates.
     peak = np.nanmax(adj_pvalue)
     ax.set_ylim((-0.01, peak + peak / 6.0))
     # snp_iloc is a position, so index the Series positionally with .iloc
     # rather than relying on the integer-key fallback of Series[...].
     ax = snp_arrow(pos[snp_iloc], adj_pvalue.iloc[snp_iloc], snp, ax)
     # NOTE(review): if snp_arrow returns the axes, this always returns the
     # axes and the figure branch is never taken, even when the figure was
     # created here - confirm which object callers expect.
     if ax:
         return ax
     else:
         return fig
Exemple #2
0
 def aei_plot_single(self, cissnp, ax=None, focus_snp=None):
     """Scatter-plot -log10 AEI p-values for one column of ``self.pvalues``.

     Every SNP is drawn at its ``self.snp_annot`` position (``pos`` divided
     by 1e6), sized from ``self.maf`` and coloured by the values that
     ``calculate_ld(self.geno, snp)`` returns for the highlighted SNP.

     Arguments
     ---------
     cissnp - column label of ``self.pvalues`` whose p-values are plotted
     ax - axes to draw into; a new figure/axes pair is created when omitted
     focus_snp - label of the SNP to highlight; when omitted, the SNP with
         the largest -log10 p-value (NaNs ignored) is highlighted

     Returns
     -------
     The value returned by ``snp_arrow`` when it is truthy, otherwise the
     figure created by this call.
     """
     x_scale = 1e6
     size_maf = (200 * self.maf) + 20
     pos = np.asarray(self.snp_annot.loc[:, 'pos'], dtype=np.uint64) / x_scale
     if ax is None:
         # NOTE(review): ``axisbg`` is the old matplotlib keyword; newer
         # releases spell it ``facecolor`` - confirm the target version.
         fig, ax = plt.subplots(nrows=1, ncols=1,
                                sharey=False, sharex=True,
                                subplot_kw=dict(axisbg='#FFFFFF'))
     adj_pvalue = -1 * np.log10(self.pvalues.loc[:, cissnp])
     if focus_snp:
         print('focus snp')
         snp = focus_snp
     else:
         # :TODO fix for both use cases
         snp = self.pvalues.index[np.nanargmax(adj_pvalue)]
     print(snp)
     # Position of the first row labelled ``snp``; raises IndexError when
     # the label is not present in the p-value index.
     snp_iloc = [i for i, j in enumerate(adj_pvalue.index) if j == snp][0]
     color1 = calculate_ld(self.geno, snp)[adj_pvalue.index].values
     ax.scatter(pos, adj_pvalue, s=size_maf, c=color1)
     ax.set_ylabel(r'-$log_{10}$ AEI p-value')
     # NaN-aware peak: builtin max() gives order-dependent results when the
     # series contains NaN, which nanargmax above already anticipates.
     peak = np.nanmax(adj_pvalue)
     ax.set_ylim((-0.01, peak + peak / 6.0))
     # snp_iloc is a position, so index the Series positionally with .iloc
     # rather than relying on the integer-key fallback of Series[...].
     ax = snp_arrow(pos[snp_iloc], adj_pvalue.iloc[snp_iloc], snp, ax)
     # NOTE(review): if snp_arrow returns the axes, this always returns the
     # axes and the figure branch is never taken, even when the figure was
     # created here - confirm which object callers expect.
     if ax:
         return ax
     else:
         return fig
Exemple #3
0
def _all_eQTL_comp(chrom, base_path, debug=None, count_threshold=200):
    """Write a per-gene AEI versus eQTL comparison table for one chromosome.

    Input locations are ``base_path`` plus values read from ``test.cfg``;
    the tab-separated table is written to
    ``<base_path>eQTL/tables/global_aei/<chrom>_aei_rep.txt``.

    Arguments
    ---------
    chrom - chromosome identifier, interpolated into config values and
        file names
    base_path - prefix prepended to every input and output path
    debug - gene symbol or gene id; when the gene being processed matches,
        ``embed()`` is called and the loop stops
    count_threshold - minimum row sum an AEI row needs in order to be kept

    Output columns (16, in order): symbol, gene id, chrom, indicator SNP,
    best AEI cis SNP, AEI beta, AEI p-value, eQTL beta and p-value at that
    SNP, best eQTL SNP, AEI beta and AEI p-value at the best eQTL SNP, eQTL
    beta and p-value at the best eQTL SNP, outlier fraction, squared LD
    between the two SNPs.  Genes without a usable result get 'NA' in every
    column after the first three.
    """
    chrom_dict = {'chrom': str(chrom)}
    config = ConfigParser.ConfigParser()
    config.read('test.cfg')
    aei_base = base_path + config.get('data', 'aei_prefix')
    dos_path = base_path + config.get('data', 'dosage_prefix', 0, chrom_dict)
    snp_annot = base_path + config.get('annotation', 'snp_annot_prefix', 0,
                                       chrom_dict)
    gene_snps_path = base_path + config.get('annotation', 'gene_snps', 0,
                                            chrom_dict)
    eqtl_path = base_path + config.get('data', 'eqtl_prefix', 0, chrom_dict)

    ###### Loading the data ##########
    # NOTE(review): unpickling can execute arbitrary code - these files
    # must come from a trusted source.
    aei = get_aei((aei_base + 'chr{chrom!s}.pkl'.format(chrom=chrom)))
    dos = pd.read_csv(dos_path, sep=" ", index_col=0, header=0)
    s_ann = pd.read_pickle(snp_annot)
    gene_snps = pd.read_pickle(gene_snps_path)
    eqtl_matrix = pd.read_pickle(eqtl_path)

    ##### Restrict to only Europeans
    # Only the dosage columns that also appear in the expression table are
    # used for the allele frequency; SNPs outside [0.01, 0.99] are dropped.
    vsd_counts = pd.read_csv(base_path + 'eQTL/gene_vsd_eQTL_april.txt',
                             sep=",", index_col=0)
    af_euro = calculate_minor_allele_frequency(dos.ix[:, vsd_counts.columns])
    maf_euro = af_euro.index[(af_euro >= 0.01) & (af_euro <= 0.99)]
    eqtl_matrix = eqtl_matrix.ix[
        eqtl_matrix.index.get_level_values('SNP').isin(maf_euro)]
    print(eqtl_matrix.shape)
    eqtl_matrix = eqtl_matrix.swaplevel(0, 1)
    eqtl = eqtl_matrix.groupby(level=0)

    # Collected for a later FDR step; nothing in this function writes it out.
    pvalues_fdr_calc = []
    # Index label of the smallest eQTL p-value per group.  idxmin is used
    # because a label is required here; argmin only returns a label on old
    # pandas releases.
    idx = eqtl.apply(lambda x: x['p-value'].idxmin())
    print(eqtl.groups.keys()[0:10])

    out_path = base_path + \
        'eQTL/tables/global_aei/{0}_aei_rep.txt'.format(chrom)
    # The context manager closes the table even if a gene raises.
    with open(out_path, 'w+') as outfile:
        for i, j in gene_snps.iteritems():
            # NOTE(review): hard-coded single-gene filter, apparently left
            # over from debugging - every other gene is skipped.
            if i != 'ENSG00000054654':
                print('going')
                continue
            symbol = get_symbol_ensembl(i)
            print(i, symbol)
            # 3 identifier columns + 13 'NA' = 16 fields, the same width as
            # a populated row, so the table stays rectangular.
            empty_row = "\t".join(
                [str(symbol), str(i), str(chrom)] + ['NA'] * 13) + "\n"

            aei_t = aei.ix[j, :]
            aei_t = aei_t.ix[aei_t.sum(axis=1) >= count_threshold, :]
            if aei_t.shape[0] == 0:
                # Nothing passed the count filter: emit the NA row once and
                # move on instead of running the analysis on empty data.
                outfile.write(empty_row)
                continue

            # Or grab from eQTL.matrix?
            snps_cis = [eqtl_i[1] for eqtl_i in eqtl.groups[i]]
            gaei2 = AEI(aei_t, dos.ix[snps_cis, :], s_ann.ix[snps_cis, :], i)
            gaei2.calc_aei(num_threshold=20)

            # Drop columns whose p-values are all missing.
            bt = gaei2.pvalues
            bt = bt.ix[:, np.logical_not(bt.min().isnull())]

            # Pick the column with the smallest minimum p-value among those
            # with more than 15 samples of rounded dosage 0 or 2.
            cur_best = None
            cur_best_pvalue = 1
            for ind in bt.columns:
                cissnp = bt.ix[:, ind].idxmin()
                outliers_g = gaei2.outliers.ix[gaei2.hets_dict[ind],
                                               ind].values
                tgeno = dosage_round(
                    gaei2.geno.ix[cissnp, gaei2.hets_dict[ind]][
                        np.logical_not(outliers_g)])
                n_dosage_0_or_2 = np.sum((tgeno == 0) | (tgeno == 2))
                if (n_dosage_0_or_2 > 15) and \
                        (bt.ix[cissnp, ind] < cur_best_pvalue):
                    cur_best = ind
                    cur_best_pvalue = bt.ix[cissnp, ind]

            # Compare against None so a falsy column label is not discarded.
            if cur_best is None:
                outfile.write(empty_row)
                continue
            good = cur_best
            pvalue_good = cur_best_pvalue
            cissnp = gaei2.pvalues.ix[:, good].idxmin()

            # :TODO get beta estimate
            pvalues_fdr_calc.extend(gaei2.pvalues.ix[:, good].values)
            indsnp = good

            # Beta estimates
            try:
                outliers_g = gaei2.outliers.ix[gaei2.hets_dict[indsnp],
                                               indsnp].values
            except KeyError:
                # NOTE(review): the gene is skipped without writing a row.
                continue
            keep = np.logical_not(outliers_g)
            tgeno = dosage_round(
                gaei2.geno.ix[cissnp, gaei2.hets_dict[indsnp]][keep])
            ar = gaei2.ratios.ix[gaei2.hets_dict[indsnp], indsnp][keep]
            # Fold ratios above 1 onto their reciprocal.
            ar[ar > 1] = 1 / ar[ar > 1]
            het_combined = ar[np.array(tgeno == 1)]
            homo_combined = ar[np.array((tgeno == 0) | (tgeno == 2))]
            beta_best = np.mean(het_combined) / np.mean(homo_combined)

            # Second element of the per-gene index label selected above
            # (presumably the SNP level of the (gene, SNP) index).
            eqtl_best_snp = idx[i][1]
            try:
                aei_eqtl_best = gaei2.pvalues.ix[eqtl_best_snp, good]
                tgeno = dosage_round(
                    gaei2.geno.ix[eqtl_best_snp,
                                  gaei2.hets_dict[indsnp]][keep])
                het_combined_e = ar[np.array(tgeno == 1)]
                homo_combined_e = ar[np.array((tgeno == 0) | (tgeno == 2))]
                eqtl_best_aei_beta = \
                    np.mean(het_combined_e) / np.mean(homo_combined_e)
            except KeyError:
                aei_eqtl_best = 'NA'
                eqtl_best_aei_beta = 'NA'

            if not cissnp == eqtl_best_snp:
                ldbest = calculate_ld(dos.ix[[cissnp, eqtl_best_snp], :],
                                      cissnp)[eqtl_best_snp] ** 2
            else:
                ldbest = 1

            out_l = [
                symbol,
                i,
                chrom,
                indsnp,
                cissnp,
                beta_best,
                pvalue_good,
                eqtl_matrix.ix[(i, cissnp), 'beta'],
                eqtl_matrix.ix[(i, cissnp), 'p-value'],
                eqtl_best_snp,
                eqtl_best_aei_beta,
                aei_eqtl_best,
                eqtl_matrix.ix[(i, eqtl_best_snp), 'beta'],
                eqtl_matrix.ix[(i, eqtl_best_snp), 'p-value'],
                float(np.sum(outliers_g)) / len(gaei2.hets_dict[indsnp]),
                ldbest,
            ]
            if (symbol == str(debug)) or (i == str(debug)):
                # Interactive inspection of the requested gene; its row is
                # not written.
                embed()
                break
            outfile.write("\t".join(str(column) for column in out_l) + "\n")
Exemple #4
0
def plot_eQTL(meQTL,
              gene_name,
              annotation,
              dosage,
              ax=None,
              symbol=None,
              focus_snp=None,
              gene_annot=None,
              size_shift=0,
              **kwargs):
    """ Plot eQTL from a full_matrix object

    SNPs are drawn at their annotated position (divided by 1e6) against
    -log10 p-value, sized by folded allele frequency and coloured by the
    values ``calculate_ld`` returns for the focus SNP.

    Arguments
    ---------
    meQTL - a matrix eQTL dataframe or a series of pvalues
    gene_name - gene name, passed to ``subset_meQTL``
    annotation - snp annotation dataframe, index is rsID
    dosage - a dosage dataframe
    ax - axis to plot into; a new figure is created when None
    symbol - accepted for API compatibility; not used by the plot
    focus_snp - SNP used as the LD reference; defaults to the SNP with the
        largest -log10 p-value (NaNs ignored)
    gene_annot - accepted for API compatibility; not used by the plot
    size_shift - constant added to every marker size
    **kwargs - forwarded to ``ax.scatter``

    Returns
    -------
    ``ax`` when the caller supplied one, otherwise the new figure (with a
    colorbar added).
    """
    subset = subset_meQTL(meQTL, gene_name)

    # pd.MultiIndex is the public name of the class; the pd.core.index path
    # is private and not available in every pandas release.
    if isinstance(subset.index, pd.MultiIndex):
        subset.index = subset.index.get_level_values('SNP')

    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    x_scale = 1e6
    # The input may be a frame with a 'p-value' column, a frame whose first
    # column holds the p-values, or a plain series.
    try:
        adj_pv = -1 * np.log10(subset.ix[:, 'p-value'])
    except IndexError:
        adj_pv = -1 * np.log10(subset.iloc[:, 0])
    except pd.core.indexing.IndexingError:
        adj_pv = -1 * np.log10(subset.iloc[:])
    try:
        pos = np.asarray(annotation.ix[subset.index, 'pos'],
                         dtype=np.double) / x_scale
    except KeyError:
        pos = np.asarray(annotation.ix[subset.index, 1],
                         dtype=np.double) / x_scale
    dosage_sub = dosage.ix[subset.index, :]
    print('subset shape')
    print(subset.shape)
    print(kwargs)
    dosage_maf = calculate_minor_allele_frequency(dosage_sub)
    # Fold frequencies above 0.5 so the marker size tracks the minor allele.
    dosage_maf[dosage_maf > 0.5] = 1 - dosage_maf[dosage_maf > 0.5]
    dosage_maf = ((200 * dosage_maf) + 20) + size_shift
    if focus_snp:
        snp = focus_snp
    else:
        # :TODO fix for both use cases
        snp = subset.index[np.nanargmax(adj_pv)]
    try:
        iix = [i for i, j in enumerate(subset["SNP"]) if j == snp]
    except KeyError:
        iix = [i for i, j in enumerate(subset.index) if j == snp]
    # Need this since pos is a numpy array not pandas series.  The two
    # values are not drawn yet (see the arrow TODO below), but the lookup
    # raises IndexError early when the focus SNP is not in the subset.
    snpx = pos[iix[0]]
    snp_pv = adj_pv.iloc[iix[0]]
    color_ld = calculate_ld(dosage_sub, snp)[dosage_sub.index].values
    if ax is None:
        ax_orig = False
        # NOTE(review): ``axisbg`` is the old matplotlib keyword; newer
        # releases spell it ``facecolor`` - confirm the target version.
        fig, ax = plt.subplots(nrows=1,
                               ncols=1,
                               figsize=(16, 6),
                               sharey=False,
                               sharex=True,
                               subplot_kw=dict(axisbg='#FFFFFF'))
        fig.tight_layout()
        fig.subplots_adjust(right=0.8, bottom=0.2)
    else:
        ax_orig = True
    # NaN-aware limits: builtin min()/max() give order-dependent results
    # when a position or p-value is missing.
    ax.set_xlim((np.nanmin(pos) - 0.01, np.nanmax(pos) + 0.01))
    peak = np.nanmax(adj_pv)
    ax.set_ylim((-0.01, peak + peak / 6.0))
    ax.xaxis.set_major_formatter(x_formatter)
    ### Actual scatter #############################
    im = ax.scatter(pos, adj_pv, s=dosage_maf, c=color_ld, **kwargs)
    #:TODO make the arrow into a function
    ax.set_ylabel(r'$-log_{10}$ eQTL p-value')
    ax.set_xlabel(r'Position (Mb)')
    if ax_orig:
        return ax
    # The colorbar is only added when the figure was created here.
    cbar_ax = fig.add_axes([0.87, 0.15, 0.05, 0.7])
    bar = fig.colorbar(im, cax=cbar_ax)
    bar.ax.tick_params(labelsize=18)
    bar.set_label('r$^{2}$')
    return fig
Exemple #5
0
def _all_eQTL_comp(chrom, base_path, debug=None, count_threshold=200):
    """Write a per-gene AEI versus eQTL comparison table for one chromosome.

    Input locations are ``base_path`` plus values read from ``test.cfg``;
    the tab-separated table is written to
    ``<base_path>eQTL/tables/global_aei/<chrom>_aei_rep.txt``.

    Arguments
    ---------
    chrom - chromosome identifier, interpolated into config values and
        file names
    base_path - prefix prepended to every input and output path
    debug - gene symbol or gene id; when the gene being processed matches,
        ``embed()`` is called and the loop stops
    count_threshold - minimum row sum an AEI row needs in order to be kept

    Output columns (16, in order): symbol, gene id, chrom, indicator SNP,
    best AEI cis SNP, AEI beta, AEI p-value, eQTL beta and p-value at that
    SNP, best eQTL SNP, AEI beta and AEI p-value at the best eQTL SNP, eQTL
    beta and p-value at the best eQTL SNP, outlier fraction, squared LD
    between the two SNPs.  Genes without a usable result get 'NA' in every
    column after the first three.
    """
    chrom_dict = {'chrom': str(chrom)}
    config = ConfigParser.ConfigParser()
    config.read('test.cfg')
    aei_base = base_path + config.get('data', 'aei_prefix')
    dos_path = base_path + config.get('data', 'dosage_prefix', 0, chrom_dict)
    snp_annot = base_path + config.get('annotation', 'snp_annot_prefix', 0,
                                       chrom_dict)
    gene_snps_path = base_path + config.get('annotation', 'gene_snps', 0,
                                            chrom_dict)
    eqtl_path = base_path + config.get('data', 'eqtl_prefix', 0, chrom_dict)

    ###### Loading the data ##########
    # NOTE(review): unpickling can execute arbitrary code - these files
    # must come from a trusted source.
    aei = get_aei((aei_base + 'chr{chrom!s}.pkl'.format(chrom=chrom)))
    dos = pd.read_csv(dos_path, sep=" ", index_col=0, header=0)
    s_ann = pd.read_pickle(snp_annot)
    gene_snps = pd.read_pickle(gene_snps_path)
    eqtl_matrix = pd.read_pickle(eqtl_path)

    ##### Restrict to only Europeans
    # Only the dosage columns that also appear in the expression table are
    # used for the allele frequency; SNPs outside [0.01, 0.99] are dropped.
    vsd_counts = pd.read_csv(base_path + 'eQTL/gene_vsd_eQTL_april.txt',
                             sep=",",
                             index_col=0)
    af_euro = calculate_minor_allele_frequency(dos.ix[:, vsd_counts.columns])
    maf_euro = af_euro.index[(af_euro >= 0.01) & (af_euro <= 0.99)]
    eqtl_matrix = eqtl_matrix.ix[
        eqtl_matrix.index.get_level_values('SNP').isin(maf_euro)]
    print(eqtl_matrix.shape)
    eqtl_matrix = eqtl_matrix.swaplevel(0, 1)
    eqtl = eqtl_matrix.groupby(level=0)

    # Collected for a later FDR step; nothing in this function writes it out.
    pvalues_fdr_calc = []
    # Index label of the smallest eQTL p-value per group.  idxmin is used
    # because a label is required here; argmin only returns a label on old
    # pandas releases.
    idx = eqtl.apply(lambda x: x['p-value'].idxmin())
    print(eqtl.groups.keys()[0:10])

    out_path = base_path + \
        'eQTL/tables/global_aei/{0}_aei_rep.txt'.format(chrom)
    # The context manager closes the table even if a gene raises.
    with open(out_path, 'w+') as outfile:
        for i, j in gene_snps.iteritems():
            # NOTE(review): hard-coded single-gene filter, apparently left
            # over from debugging - every other gene is skipped.
            if i != 'ENSG00000054654':
                print('going')
                continue
            symbol = get_symbol_ensembl(i)
            print(i, symbol)
            # 3 identifier columns + 13 'NA' = 16 fields, the same width as
            # a populated row, so the table stays rectangular.
            empty_row = "\t".join(
                [str(symbol), str(i), str(chrom)] + ['NA'] * 13) + "\n"

            aei_t = aei.ix[j, :]
            aei_t = aei_t.ix[aei_t.sum(axis=1) >= count_threshold, :]
            if aei_t.shape[0] == 0:
                # Nothing passed the count filter: emit the NA row once and
                # move on instead of running the analysis on empty data.
                outfile.write(empty_row)
                continue

            # Or grab from eQTL.matrix?
            snps_cis = [eqtl_i[1] for eqtl_i in eqtl.groups[i]]
            gaei2 = AEI(aei_t, dos.ix[snps_cis, :], s_ann.ix[snps_cis, :], i)
            gaei2.calc_aei(num_threshold=20)

            # Drop columns whose p-values are all missing.
            bt = gaei2.pvalues
            bt = bt.ix[:, np.logical_not(bt.min().isnull())]

            # Pick the column with the smallest minimum p-value among those
            # with more than 15 samples of rounded dosage 0 or 2.
            cur_best = None
            cur_best_pvalue = 1
            for ind in bt.columns:
                cissnp = bt.ix[:, ind].idxmin()
                outliers_g = gaei2.outliers.ix[gaei2.hets_dict[ind],
                                               ind].values
                tgeno = dosage_round(
                    gaei2.geno.ix[cissnp, gaei2.hets_dict[ind]][
                        np.logical_not(outliers_g)])
                n_dosage_0_or_2 = np.sum((tgeno == 0) | (tgeno == 2))
                if (n_dosage_0_or_2 > 15) and \
                        (bt.ix[cissnp, ind] < cur_best_pvalue):
                    cur_best = ind
                    cur_best_pvalue = bt.ix[cissnp, ind]

            # Compare against None so a falsy column label is not discarded.
            if cur_best is None:
                outfile.write(empty_row)
                continue
            good = cur_best
            pvalue_good = cur_best_pvalue
            cissnp = gaei2.pvalues.ix[:, good].idxmin()

            # :TODO get beta estimate
            pvalues_fdr_calc.extend(gaei2.pvalues.ix[:, good].values)
            indsnp = good

            # Beta estimates
            try:
                outliers_g = gaei2.outliers.ix[gaei2.hets_dict[indsnp],
                                               indsnp].values
            except KeyError:
                # NOTE(review): the gene is skipped without writing a row.
                continue
            keep = np.logical_not(outliers_g)
            tgeno = dosage_round(
                gaei2.geno.ix[cissnp, gaei2.hets_dict[indsnp]][keep])
            ar = gaei2.ratios.ix[gaei2.hets_dict[indsnp], indsnp][keep]
            # Fold ratios above 1 onto their reciprocal.
            ar[ar > 1] = 1 / ar[ar > 1]
            het_combined = ar[np.array(tgeno == 1)]
            homo_combined = ar[np.array((tgeno == 0) | (tgeno == 2))]
            beta_best = np.mean(het_combined) / np.mean(homo_combined)

            # Second element of the per-gene index label selected above
            # (presumably the SNP level of the (gene, SNP) index).
            eqtl_best_snp = idx[i][1]
            try:
                aei_eqtl_best = gaei2.pvalues.ix[eqtl_best_snp, good]
                tgeno = dosage_round(
                    gaei2.geno.ix[eqtl_best_snp,
                                  gaei2.hets_dict[indsnp]][keep])
                het_combined_e = ar[np.array(tgeno == 1)]
                homo_combined_e = ar[np.array((tgeno == 0) | (tgeno == 2))]
                eqtl_best_aei_beta = \
                    np.mean(het_combined_e) / np.mean(homo_combined_e)
            except KeyError:
                aei_eqtl_best = 'NA'
                eqtl_best_aei_beta = 'NA'

            if not cissnp == eqtl_best_snp:
                ldbest = calculate_ld(dos.ix[[cissnp, eqtl_best_snp], :],
                                      cissnp)[eqtl_best_snp] ** 2
            else:
                ldbest = 1

            out_l = [
                symbol,
                i,
                chrom,
                indsnp,
                cissnp,
                beta_best,
                pvalue_good,
                eqtl_matrix.ix[(i, cissnp), 'beta'],
                eqtl_matrix.ix[(i, cissnp), 'p-value'],
                eqtl_best_snp,
                eqtl_best_aei_beta,
                aei_eqtl_best,
                eqtl_matrix.ix[(i, eqtl_best_snp), 'beta'],
                eqtl_matrix.ix[(i, eqtl_best_snp), 'p-value'],
                float(np.sum(outliers_g)) / len(gaei2.hets_dict[indsnp]),
                ldbest,
            ]
            if (symbol == str(debug)) or (i == str(debug)):
                # Interactive inspection of the requested gene; its row is
                # not written.
                embed()
                break
            outfile.write("\t".join(str(column) for column in out_l) + "\n")
Exemple #6
0
def plot_eQTL(meQTL, gene_name, annotation, dosage, ax=None,
              symbol=None, focus_snp=None, gene_annot=None, size_shift=0,
              **kwargs):
    """ Plot eQTL from a full_matrix object

    SNPs are drawn at their annotated position (divided by 1e6) against
    -log10 p-value, sized by folded allele frequency and coloured by the
    values ``calculate_ld`` returns for the focus SNP.

    Arguments
    ---------
    meQTL - a matrix eQTL dataframe or a series of pvalues
    gene_name - gene name, passed to ``subset_meQTL``
    annotation - snp annotation dataframe, index is rsID
    dosage - a dosage dataframe
    ax - axis to plot into; a new figure is created when None
    symbol - accepted for API compatibility; not used by the plot
    focus_snp - SNP used as the LD reference; defaults to the SNP with the
        largest -log10 p-value (NaNs ignored)
    gene_annot - accepted for API compatibility; not used by the plot
    size_shift - constant added to every marker size
    **kwargs - forwarded to ``ax.scatter``

    Returns
    -------
    ``ax`` when the caller supplied one, otherwise the new figure (with a
    colorbar added).
    """
    subset = subset_meQTL(meQTL, gene_name)

    # pd.MultiIndex is the public name of the class; the pd.core.index path
    # is private and not available in every pandas release.
    if isinstance(subset.index, pd.MultiIndex):
        subset.index = subset.index.get_level_values('SNP')

    x_formatter = matplotlib.ticker.ScalarFormatter(useOffset=False)
    x_scale = 1e6
    # The input may be a frame with a 'p-value' column, a frame whose first
    # column holds the p-values, or a plain series.
    try:
        adj_pv = -1 * np.log10(subset.ix[:, 'p-value'])
    except IndexError:
        adj_pv = -1 * np.log10(subset.iloc[:, 0])
    except pd.core.indexing.IndexingError:
        adj_pv = -1 * np.log10(subset.iloc[:])
    try:
        pos = np.asarray(annotation.ix[subset.index, 'pos'],
                         dtype=np.double) / x_scale
    except KeyError:
        pos = np.asarray(annotation.ix[subset.index, 1],
                         dtype=np.double) / x_scale
    dosage_sub = dosage.ix[subset.index, :]
    print('subset shape')
    print(subset.shape)

    dosage_maf = calculate_minor_allele_frequency(dosage_sub)
    # Fold frequencies above 0.5 so the marker size tracks the minor allele.
    dosage_maf[dosage_maf > 0.5] = 1 - dosage_maf[dosage_maf > 0.5]
    dosage_maf = ((200 * dosage_maf) + 20) + size_shift
    if focus_snp:
        snp = focus_snp
    else:
        # :TODO fix for both use cases
        snp = subset.index[np.nanargmax(adj_pv)]
    try:
        iix = [i for i, j in enumerate(subset["SNP"]) if j == snp]
    except KeyError:
        iix = [i for i, j in enumerate(subset.index) if j == snp]
    # Need this since pos is a numpy array not pandas series.  The two
    # values are not drawn yet (see the arrow TODO below), but the lookup
    # raises IndexError early when the focus SNP is not in the subset.
    snpx = pos[iix[0]]
    snp_pv = adj_pv.iloc[iix[0]]
    color1 = calculate_ld(dosage_sub, snp)[dosage_sub.index].values
    if ax is None:
        ax_orig = False
        # NOTE(review): ``axisbg`` is the old matplotlib keyword; newer
        # releases spell it ``facecolor`` - confirm the target version.
        fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 6),
                               sharey=False, sharex=True,
                               subplot_kw=dict(axisbg='#FFFFFF'))
        fig.tight_layout()
        fig.subplots_adjust(right=0.8, bottom=0.2)
    else:
        ax_orig = True
    # NaN-aware limits: builtin min()/max() give order-dependent results
    # when a position or p-value is missing.
    ax.set_xlim((np.nanmin(pos) - 0.01, np.nanmax(pos) + 0.01))
    peak = np.nanmax(adj_pv)
    ax.set_ylim((-0.01, peak + peak / 6.0))
    ax.xaxis.set_major_formatter(x_formatter)
    ### Actual scatter #############################
    # Extra keyword arguments are part of the signature, so hand them to
    # scatter instead of silently dropping them.
    im = ax.scatter(pos, adj_pv, s=dosage_maf, c=color1, **kwargs)
    #:TODO make the arrow into a function
    ax.set_ylabel(r'$-log_{10}$ eQTL p-value')
    ax.set_xlabel(r'Position (Mb)')
    if ax_orig:
        return ax
    # The colorbar is only added when the figure was created here.
    cbar_ax = fig.add_axes([0.87, 0.15, 0.05, 0.7])
    bar = fig.colorbar(im, cax=cbar_ax)
    bar.ax.tick_params(labelsize=18)
    bar.set_label('r$^{2}$')
    return fig