コード例 #1
0
ファイル: GeneAnalysis.py プロジェクト: airanmehr/bio
def gowinda():
    """Run a Fisher test between the Gowinda candidate table and a fixed GO list.

    Reads the Gowinda output table and the pickled GO data frame from below
    ``utl.outpath`` and hands them, together with the hard-coded GO IDs, to
    ``utl.getPvalFisher``. Nothing is returned; the results only live in
    local variables.
    """
    reference_text = """GO:0004046
    GO:0015101
    GO:0007501
    GO:0004601
    GO:0006979
    GO:0009312
    GO:0004653
    GO:0040014
    GO:0016485
    GO:0006030
    GO:0020037
    GO:0008061
    GO:0004702"""

    gowinda_table_path = utl.outpath + 'real/gowinda/cand.q99.out'
    gow = pd.read_csv(gowinda_table_path, sep='\t', header=None)
    reported_ids = gow[0]

    # split() drops the newlines and the indentation inside the literal.
    arya = np.array(reference_text.split())

    # GO IDs present in both lists; the shape is evaluated but not kept.
    shared_ids = np.intersect1d(reported_ids.unique().astype(str), arya)
    shared_ids.shape

    Genes = pd.read_pickle(utl.outpath + 'real/GO.df')
    every_go_id = Genes.reset_index().GO.unique()
    pval, cont = utl.getPvalFisher(every_go_id, reported_ids, arya)
コード例 #2
0
ファイル: GeneAnalysis.py プロジェクト: airanmehr/bio
def Final():
    """Produce the figure, tables and data files of the candidate-interval analysis.

    Steps, in order:

    1. Load the GO annotation (keeping GO terms with more than two distinct
       ``FBgn`` ids), the scores and the variant annotation.
    2. Scan the genome (``winSize=30000``), take the local outliers of ``H``
       at ``q=0.99`` and save the Manhattan plot.
    3. Turn the outliers into intervals (``padding=30000``) and collect the
       genes that intersect them.
    4. Run ``utl.getPvalFisher`` for every GO term, keep the terms whose score
       and hit count are both at least 3 and write one table per ontology.
    5. Export the intervals, the candidate variants and all variants as input
       files for Gowinda.
    6. Read the Gowinda result table, write it out, and compare its
       biological-process terms with the ones found in step 4.
    7. Write a table of summary statistics.

    The function takes no arguments and returns ``None``; all results are
    written to files or printed. It depends on the module-level names ``pd``,
    ``np``, ``plt``, ``pplt``, ``utl``, ``rutl``, ``loadGeneData``, ``loadANN``
    and ``loadGeneCoordinates``. The body is written so that it parses and
    prints the same text under both Python 2 and Python 3.
    """
    ############ preparing data
    def saveGOTex(df):
        """Rank the GO table of one ontology and write it as CSV and LaTeX."""
        name = np.unique(df.index)[0]
        # A single pre-formatted argument prints identically on Python 2 and 3.
        print('{} {}'.format('*' * 80, name))
        df = df.sort_values('-log($p$-value)', ascending=False)
        df['Rank'] = range(1, df.shape[0] + 1)
        # 'Rank' was just appended as column 6 (assumes six columns before it);
        # move it to the front.
        df = df.iloc[:, [6] + list(range(6))]
        path = utl.paperPath + '/tables/{}.tex'.format(name)
        df.to_csv(path.replace('.tex', '.csv').replace('/tables/', '/data/'))
        # The last column is written to the CSV only, not to the LaTeX table.
        utl.DataframetolaTexTable(df.iloc[:, :-1], alignment=['c', 'c', 'p{3in}', 'c', 'c', 'c'], fname=path)

    def goPvalue(x):
        # allVariantGenes and g are assigned further down, before the first call.
        return utl.getPvalFisher(AllGenes=allVariantGenes.values, putativeList=x.values,
                                 myList=g.index.values)

    def unpackp(x):
        # x is the pair returned by goPvalue: the score is rounded to one
        # decimal and capped at 6, the hit count is read from the table.
        return [min(6, np.round(x[0], 1)), x[1].loc['Putative', 'myList']]

    def sortByChrom(df):
        # Fixed chromosome order for plotting, with display names for columns.
        ordered = pd.concat(
            [df[df.index.get_level_values('CHROM') == ch] for ch in ['X', '2L', '2R', '3L', '3R']])
        return ordered.rename(columns={'H': r'$\mathcal{H}^+$', 'M': 'Num. of Variants'})

    Genes = loadGeneData().reset_index().set_index('GO')
    # Keep only GO terms annotated with more than two distinct FBgn ids.
    Genes = Genes.loc[
        (Genes['FBgn'].groupby(level=0).apply(lambda x: len(x.unique())) > 2).replace({False: None}).dropna().index]
    scores = utl.getEuChromatin(rutl.loadScores(skipHetChroms=True))
    ann = pd.DataFrame(scores).join(loadANN(), how='inner')
    allVariantGenes = ann['Gene_ID'].drop_duplicates()

    ############ computing candidate regions
    scan = utl.scanGenome(scores.abs(), {'H': lambda x: x.abs().mean(), 'M': lambda x: x.size}, winSize=30000)
    o = utl.localOutliers(scan.H, q=0.99)
    o = scan.loc[o.index]
    fig = plt.figure(figsize=(7, 2.5), dpi=300)
    pplt.Manhattan(data=sortByChrom(scan), Outliers=sortByChrom(o), fig=fig, markerSize=2, ticksize=8,
                   sortedAlready=True)
    for ax in fig.get_axes():
        pplt.setSize(ax, 6)

    pplt.annotate('(A)', ax=fig.axes[0], fontsize=8)
    pplt.annotate('(B)', ax=fig.axes[1], fontsize=8)
    plt.gcf().subplots_adjust(bottom=0.15)
    pplt.savefig('manhattan', 300)
    plt.savefig(utl.paperFiguresPath + 'manhattan.pdf')
    regions = utl.BED.getIntervals(o.H, padding=30000)
    print(regions.shape)
    intervalGenes = utl.BED.intersection(ann, regions).name.drop_duplicates().reset_index().set_index('name')
    print(intervalGenes.size)
    g = intervalGenes
    print(g.size)

    def goRow(x):
        """Build the one-row summary for the GO term whose gene rows are ``x``."""
        score, hits = unpackp(goPvalue(x.FBgn.drop_duplicates()))
        row = [x.name, x.term.iloc[0], score, hits, x.ontology.iloc[0], x.FBgn.unique().size,
               np.intersect1d(x.values, g.index.values)]
        return pd.DataFrame(
            row, index=['GO ID', 'GO Term', '-log($p$-value)', 'Hits', 'Ontology', 'Num of Genes', 'Genes']).T

    df = Genes.groupby(level=0).apply(goRow)
    df = df[(df['-log($p$-value)'] >= 3) & (df.Hits >= 3)]
    df['-log($p$-value)'] = df['-log($p$-value)'].astype(str)
    df = df.set_index('Ontology')
    df.groupby(level=0).apply(saveGOTex)
    print(df)

    # Gene table for two specific GO terms.
    byGO = df.set_index('GO ID')
    tempGenes = Genes.reset_index().set_index('FBgn').loc[
        np.append(byGO.loc['GO:0009631'].Genes, byGO.loc['GO:0009408'].Genes)][
        ['term', 'name', 'GO']].reset_index().set_index('GO').loc[['GO:0009631', 'GO:0009408']].drop_duplicates()
    tempGenes.columns = ['FlyBase ID', 'GO Term', 'Gene Name']
    utl.DataframetolaTexTable(tempGenes, fname=utl.paperPath + '/tables/{}.tex'.format('tempGenes'),
                              alignment=['l', 'l', 'l'])

    regions.to_csv(utl.paperPath + 'data/intervals.csv')

    # NOTE(review): snps is not read again below; kept because its removal
    # could not be validated from this function alone.
    snps = utl.BED.intersection(scores.reset_index(), regions, 0)
    snps['POS'] = snps.start
    snps.set_index('POS', append=True, inplace=True)
    snps = snps['name'].astype(float).reset_index().drop_duplicates().set_index(['CHROM', 'POS']).name

    def ff(x):
        """Return the positive scores of one interval at or above their 0.9 quantile."""
        y = utl.BED.intersection(scores.reset_index(), x, 0).rename(columns={'start': 'POS'}).set_index(
            'POS', append=True).name.astype(float)
        y = y[y > 0]
        y = y[y >= y.quantile(0.9)]
        print('{} {}'.format(x['len'].iloc[0], y.size))
        return y

    cands = regions.reset_index().groupby(level=0).apply(ff).reset_index(level=0).name
    cands.sort_index().reset_index().drop_duplicates().dropna().to_csv(utl.outpath + 'real/gowinda/cands.final.txt',
                                                                       sep='\t', header=None, index=False)
    scores.sort_index().reset_index().drop_duplicates().dropna().to_csv(utl.outpath + 'real/gowinda/allsnps.txt',
                                                                        sep='\t', header=None, index=False)

    # Presumably this file is produced by running Gowinda on the two exports
    # above - confirm.
    # NOTE(review): the path is absolute and machine-specific, while the
    # exports above go below utl.outpath; verify whether they should agree.
    name = 'cands.final.out.tsv'
    gowindaRes = pd.read_csv('/home/arya/out/real/gowinda/{}'.format(name), sep='\t', header=None)[
        [0, 4, 5, 6, 7, 8, 9]]
    gowindaRes.columns = ['GO ID', '-log($p$-value)', 'Hits', 'Num of Genes', 'Total Genes', 'GO Term', 'Genes']
    gowindaRes = gowindaRes[gowindaRes.Hits >= 3]
    # The column holds the raw value on input; store -log10 of it, one decimal.
    gowindaRes['-log($p$-value)'] = -gowindaRes['-log($p$-value)'].apply(np.log10).round(1)
    gowindaRes.to_csv(utl.paperPath + 'data/gowinda.all.tsv', sep='\t')
    bpGO = Genes[Genes.ontology == 'biological_process'].index.unique()
    bp = gowindaRes.set_index('GO ID').loc[bpGO.rename('GO ID')].dropna()
    bp.to_csv(utl.paperPath + 'data/gowinda.bp.tsv', sep='\t')
    utl.DataframetolaTexTable(bp.reset_index()[['GO ID', 'GO Term', '-log($p$-value)']], alignment=['c', 'p{4in}', 'c'],
                              fname=utl.paperPath + 'tables/gowinda.tex')

    # Overlap between the Gowinda biological-process terms and the terms
    # found by the per-GO Fisher tests above.
    pval = utl.getPvalFisher(bpGO, bp.index.unique(), df.loc['biological_process']['GO ID'].unique())
    print(pval)

    ############ summary statistics
    stats = pd.Series(None, name='Value')

    stats['Num. of Variants'] = scores.size
    stats['Num. of Candidate Intervals'] = regions.shape[0]
    stats['Total Num. of Genes'] = loadGeneCoordinates().shape[0]
    stats['Num. of Variant Genes'] = ann['Gene_ID'].unique().shape[0]
    stats['Num. of Genes within Candidate Intervals'] = intervalGenes.shape[0]
    stats['Total Num. of GO'] = len(loadGeneData().index.unique())
    stats['Num. of GO with 3 or More Genes'] = len(Genes.index.unique())
    stats['Num. of Candidate Variants for Gowinda'] = cands.size
    # Thousands separators, no decimals.
    stats = stats.apply(lambda x: '{:,.0f}'.format(x))
    stats.index.name = 'Statistic'
    print(stats)
    utl.DataframetolaTexTable(stats.reset_index(), fname=utl.paperPath + 'tables/stats.tex', alignment=['l', 'r'])
コード例 #3
0
ファイル: GeneAnalysis.py プロジェクト: airanmehr/bio
def IntervalAnalysis():
    """Test every GO term for enrichment among genes inside the candidate intervals.

    Steps, in order:

    1. Load the annotation and the pickled scores, and build the signed
       statistic ``H`` (``lrh`` carrying the sign of ``sh``).
    2. Get the intervals from ``computeIntervals()`` and write them as a
       numbered LaTeX table.
    3. Collect the genes whose annotated variants intersect the intervals.
    4. For each GO term call ``utl.getPvalFisher`` with the genes of the term
       against the interval genes; terms for which the call fails are skipped.
    5. Keep terms with ``p > 3`` and at least 3 hits, rank them by ``p``
       (descending) and write the LaTeX table.

    The function takes no arguments and returns ``None``. It depends on the
    module-level names ``pd``, ``np``, ``utl``, ``loadANN``,
    ``computeIntervals`` and ``loadGeneData``.
    """
    ann = loadANN()
    scores = pd.read_pickle(utl.outpath + 'real/scores.df')
    scores = (scores.lrh * scores.sh.apply(np.sign)).rename('H')
    a = pd.DataFrame(scores).join(ann, how='inner')

    regions = computeIntervals()
    table = regions.reset_index().iloc[:, :3]
    # Running interval number, shown as the first (unnamed) column.
    table[' '] = range(1, table.shape[0] + 1)
    utl.DataframetolaTexTable(table.iloc[:, [3, 0, 1, 2]], fname=utl.paperPath + 'new/intervals.tex')

    intervalGenes = utl.BED.intersection(a, regions).name.drop_duplicates().reset_index().set_index('name')
    intervalGenes.columns = ['chr']
    intervalGenes.index.name = 'FBgn'
    gscores = pd.read_pickle(utl.outpath + 'real/geneScores.df')
    Genes = loadGeneData().reset_index().set_index('GO')

    res = []
    # sort=False keeps the GO terms in order of first appearance, and rows
    # keep their original order within each group.
    for go, rows in Genes.groupby(level=0, sort=False):
        gogenes = rows['FBgn'].drop_duplicates()
        try:
            p, cont = utl.getPvalFisher(AllGenes=gscores.index.values, putativeList=gogenes.values,
                                        myList=intervalGenes.index.values)
            record = (go, rows.term.iloc[0], p, cont.loc['Putative', 'myList'], gogenes.shape[0])
        except Exception:
            # Best effort: a GO term whose test cannot be computed is left out
            # of the table instead of aborting the whole analysis.
            continue
        res.append(record)

    df = pd.DataFrame(res, columns=['GO', 'term', 'p', 'hit', 'numGenes']).set_index('GO')
    df = df[(df.p > 3) & (df.hit >= 3)]

    df = df.reset_index().sort_values('p', ascending=False)
    df['p'] = df['p'].round(1)
    df.columns = ['GO ID', 'GO Term', '-log($p$-value)', 'Hits', 'Num of Genes']
    df['Rank'] = range(1, df.shape[0] + 1)

    # Move the freshly appended 'Rank' column to the front.
    df = df.iloc[:, [-1] + list(range(df.shape[1] - 1))]

    path = utl.paperFiguresPath + '../new/{}Fisher.tex'.format('transcript')
    utl.DataframetolaTexTable(df, alignment=['c', 'c', 'p{3in}', 'c', 'c', 'c'], fname=path)