Esempio n. 1
0
def _saveAndPlotOutliers(scores, outliers, tag):
    """Pickle one outlier set and save the matching Manhattan plot.

    :param scores: series passed to ``pplt.Manhattan`` as ``data``.
    :param outliers: subset of ``scores`` to persist and to pass (wrapped in a
        DataFrame) as ``Outliers``.
    :param tag: short label used in both output file names; the pickle goes to
        ``utl.outpath + 'real/outliers.<tag>.df'`` and the figure to
        ``utl.paperPath + 'new/<tag>.pdf'``.
    """
    outliers.to_pickle(utl.outpath + 'real/outliers.{}.df'.format(tag))
    fig = plt.figure(figsize=(7, 1.5), dpi=300)
    pplt.Manhattan(data=scores, Outliers=pd.DataFrame(outliers), fig=fig, markerSize=2, ticksize=8,
                   sortedAlready=True)
    for ax in fig.get_axes():
        pplt.setSize(ax, 5)
    # Extra bottom margin; applied to the current figure exactly as before.
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.savefig(utl.paperPath + 'new/{}.pdf'.format(tag))


def outlier():
    """Compute global, per-group and local outliers, then pickle and plot each.

    The scores (after ``rutl.removeHeteroChromatin``) are summarised with
    ``utl.scanGenome`` using the mean absolute value under the ``comale``
    column and the element count under ``'Num. of SNPs'``.  The first column
    of the selected result is then used for three outlier definitions:

    * global: values above the 0.99 quantile of the whole series;
    * chrom: values above the 0.99 quantile within each group of the first
      index level (presumably one group per chromosome -- confirm);
    * local: whatever ``localOutliers`` returns for the series.

    Each set is written with ``_saveAndPlotOutliers`` under the tags
    ``'global'``, ``'chrom'`` and ``'local'``.  Nothing is returned.
    """
    scores = rutl.removeHeteroChromatin(rutl.loadScores())
    field = comale
    windowStats = {field: lambda x: x.abs().mean(), 'Num. of SNPs': lambda x: x.size}
    df = sort(utl.scanGenome(scores.abs(), windowStats))[[field, 'Num. of SNPs']]
    a = df.iloc[:, 0]

    # The series is renamed before each plot, as in the original code.
    a = a.rename('Global Outliers')
    _saveAndPlotOutliers(a, a[a > a.quantile(0.99)], 'global')

    a = a.rename('Chrom Outliers')
    # .loc[x.name] strips the group key again so apply() does not duplicate
    # the first index level in its result.
    chromOutliers = a.groupby(level=0).apply(lambda x: x[x > x.quantile(0.99)].loc[x.name])
    _saveAndPlotOutliers(a, chromOutliers, 'chrom')

    a = a.rename('Local Outliers')
    _saveAndPlotOutliers(a, localOutliers(a), 'local')
Esempio n. 2
0
def findCandidateSNPs(outlierMode='local'):
    """Select annotated candidate SNPs inside the outlier intervals of one mode.

    Reads ``utl.outpath + 'real/outliers.<outlierMode>.df'``, turns it into
    intervals with ``utl.BED.getIntervals(..., padding=25000)`` and, for every
    interval, keeps the SNPs whose score ``H`` exceeds the 0.99 quantile of
    the scores intersecting that interval.  The pooled SNPs are inner-joined
    with four annotation columns from ``loadANN()``, rows whose
    ``Annotation_Impact`` is ``'LOW'`` are dropped, and the result is pickled
    to ``utl.outpath + 'real/cand.<outlierMode>.df'``.

    :param outlierMode: label of the outlier pickle to read and of the
        candidate pickle to write (the visible code uses ``'local'``,
        ``'global'`` and ``'chrom'``).
    :raises ValueError: from ``pd.concat`` when there are no intervals.
    """
    # Single-argument parenthesised print: same output on Python 2, and the
    # module stays importable on Python 3.
    print('running  {}'.format(outlierMode))
    ann = loadANN()[['Annotation', 'Annotation_Impact', 'Gene_Name', 'Gene_ID']]
    outliers = pd.read_pickle(utl.outpath + 'real/outliers.{}.df'.format(outlierMode))
    intervals = utl.BED.getIntervals(outliers, padding=25000)
    scores = rutl.removeHeteroChromatin(rutl.loadScores()).rename('H')

    candidates = []
    for _, row in intervals.iterrows():
        # utl.BED.intersection is given a one-row frame whose index is named CHROM.
        interval = pd.DataFrame(row).T
        interval.index.name = 'CHROM'
        hits = utl.BED.intersection(scores.reset_index(), interval, 'H')
        hits = hits.rename(columns={'name': 'H', 'start': 'POS'})[['POS', 'H']]
        snp = hits.set_index('POS', append=True)['H'].astype(float)
        # Keep only the top 1% of SNPs within this interval.
        snp = snp[snp > snp.quantile(0.99)]
        candidates.append(snp)
        print('{} {}'.format(snp.shape, interval.iloc[0].loc['len']))

    candidates = pd.DataFrame(pd.concat(candidates)).join(ann, how='inner')
    candidates = candidates[candidates['Annotation_Impact'] != 'LOW']
    candidates.to_pickle(utl.outpath + 'real/{}.df'.format('cand.' + outlierMode))
Esempio n. 3
0
def createGwandaDataNew():
    """Write the SNP lists and GO association tables under ``real/gowinda/``.

    Files produced (all tab-separated, without a header row):

    * ``allsnps.txt``: every score after ``rutl.removeHeteroChromatin``;
    * ``cand.<mode>.damped.0.txt``: the de-duplicated candidate SNPs of each
      outlier mode (``local``, ``global``, ``chrom``), read from
      ``real/cand.<mode>.df``;
    * ``cand.<mode>.damped.<n>.txt`` for n in 100, 500, 1000, 2000: the same
      candidates merged with those of the ``n`` highest-scoring SNPs that have
      an annotation whose ``Annotation_Impact`` is not ``'LOW'``;
    * ``goassociation.CG`` and ``goassociation.FBgn``: per GO id, the term and
      the space-joined ``AnnID`` / ``FBgn`` values from ``real/GO.df``,
      restricted to rows whose ``AnnID`` starts with ``'CG'``.

    The shape of every candidate list is printed as it is written.
    """
    outlierModes = ['local', 'global', 'chrom']

    def save(df, name='candidatesnps'):
        # '.txt' is appended here, so ``name`` must not carry an extension.
        path = utl.outpath + 'real/gowinda/{}.txt'.format(name)
        df.sort_index().reset_index().to_csv(path, sep='\t', header=None, index=False)

    def loadCandidates(outlierMode):
        # One H value per (CHROM, POS); the first occurrence is kept.
        cand = pd.read_pickle(utl.outpath + 'real/{}.df'.format('cand.' + outlierMode))['H']
        return cand.reset_index().drop_duplicates(subset=['CHROM', 'POS']).set_index(['CHROM', 'POS']).H

    scores = rutl.removeHeteroChromatin(rutl.loadScores()).rename('H')
    save(scores, 'allsnps')

    # Load each candidate pickle once and reuse it for every damping size.
    candidates = {}
    for outlierMode in outlierModes:
        a = loadCandidates(outlierMode)
        candidates[outlierMode] = a
        save(a, 'cand.' + outlierMode + '.damped.0')
        print(a.shape)

    ann = loadANN()[['Annotation', 'Annotation_Impact', 'Gene_Name', 'Gene_ID']]

    for dampn in [100, 500, 1000, 2000]:
        # Top-dampn scores that have a non-LOW annotation.
        damp = scores.sort_values(ascending=False).iloc[:dampn]
        damp = pd.DataFrame(damp).join(ann, how='inner')
        damp = damp[damp['Annotation_Impact'] != 'LOW']['H']
        damp = damp.reset_index().drop_duplicates().set_index(['CHROM', 'POS'])['H']

        for outlierMode in outlierModes:
            merged = pd.concat([candidates[outlierMode], damp]).reset_index()
            a = merged.drop_duplicates(subset=['CHROM', 'POS']).set_index(['CHROM', 'POS']).H
            save(a, 'cand.' + outlierMode + '.damped.{}'.format(dampn))
            print(a.shape)

    Genes = pd.read_pickle(utl.outpath + 'real/GO.df').set_index('GO')
    Genes = Genes[Genes.AnnID.apply(lambda x: x[:2] == 'CG')]
    terms = Genes.term.drop_duplicates()
    # Same table twice, keyed by a different gene identifier column.
    for idColumn, suffix in [('AnnID', 'CG'), ('FBgn', 'FBgn')]:
        members = Genes[idColumn].groupby(level=0).apply(lambda x: ' '.join(x.tolist()))
        df = pd.concat([terms, members], axis=1)
        df.to_csv(utl.outpath + 'real/gowinda/goassociation.' + suffix, sep='\t', header=None)