def outlier():
    """Detect genome-wide, per-chromosome, and local score outliers.

    Loads heterochromatin-filtered scores, scans the genome for the mean
    absolute score per window, then extracts three outlier sets:

    * global  -- top 1% of the genome-wide distribution,
    * chrom   -- top 1% within each chromosome (index level 0),
    * local   -- delegated to ``localOutliers()``.

    Side effects: pickles each outlier set to
    ``real/outliers.{global,chrom,local}.df`` and saves a Manhattan plot to
    ``new/{global,chrom,local}.pdf``.
    """
    def _manhattanSave(signal, outliers, name):
        # Render a Manhattan plot of `signal` with `outliers` highlighted
        # and save it as new/<name>.pdf.
        fig = plt.figure(figsize=(7, 1.5), dpi=300)
        pplt.Manhattan(data=signal, Outliers=pd.DataFrame(outliers), fig=fig,
                       markerSize=2, ticksize=8, sortedAlready=True)
        [pplt.setSize(ax, 5) for ax in fig.get_axes()]
        plt.gcf().subplots_adjust(bottom=0.15)
        plt.savefig(utl.paperPath + 'new/{}.pdf'.format(name))

    scores = rutl.removeHeteroChromatin(rutl.loadScores())
    field = comale
    df = sort(utl.scanGenome(scores.abs(),
                             {field: lambda x: x.abs().mean(),
                              'Num. of SNPs': lambda x: x.size}))[[field, 'Num. of SNPs']]
    a = df.iloc[:, 0]

    # Global outliers: top 1% of the genome-wide score distribution.
    a = a.rename('Global Outliers')
    o = a[a > a.quantile(0.99)]
    o.to_pickle(utl.outpath + 'real/outliers.global.df')
    _manhattanSave(a, o, 'global')

    # Chromosome outliers: top 1% within each chromosome.
    a = a.rename('Chrom Outliers')
    o = a.groupby(level=0).apply(lambda x: x[x > x.quantile(0.99)].loc[x.name])
    o.to_pickle(utl.outpath + 'real/outliers.chrom.df')
    _manhattanSave(a, o, 'chrom')

    # Local outliers: detection delegated to localOutliers().
    a = a.rename('Local Outliers')
    o = localOutliers(a)
    o.to_pickle(utl.outpath + 'real/outliers.local.df')
    _manhattanSave(a, o, 'local')
def findCandidateSNPs(outlierMode='local'): print 'running ', outlierMode ann = loadANN()["Annotation Annotation_Impact Gene_Name Gene_ID".split()] intervals = utl.BED.getIntervals(pd.read_pickle(utl.outpath + 'real/outliers.{}.df'.format(outlierMode)), padding=25000) scores = rutl.removeHeteroChromatin(rutl.loadScores()).rename('H') scores.shape candidates = [] for _, row in intervals.iterrows(): row = pd.DataFrame(row).T row.index.name = 'CHROM' snp = utl.BED.intersection(scores.reset_index(), row, 'H').rename(columns={'name': 'H', 'start': 'POS'})[ ['POS', 'H']].set_index('POS', append=True)['H'].astype(float) snp = snp[snp > snp.quantile(0.99)] candidates += [snp] print snp.shape, row.iloc[0].loc['len'] candidates = pd.DataFrame(pd.concat(candidates)).join(ann, how='inner') candidates = candidates[candidates['Annotation_Impact'] != 'LOW'] candidates.to_pickle(utl.outpath + 'real/{}.df'.format('cand.' + outlierMode))
def createGwandaDataNew(): def save(df, name='candidatesnps.txt'): df.sort_index().reset_index().to_csv(utl.outpath + 'real/gowinda/{}.txt'.format(name), sep='\t', header=None, index=False) scores = rutl.removeHeteroChromatin(rutl.loadScores()).rename('H') save(scores, 'allsnps') for outlierMode in ['local', 'global', 'chrom']: a = pd.read_pickle(utl.outpath + 'real/{}.df'.format('cand.' + outlierMode))['H'].reset_index().drop_duplicates( subset=['CHROM', 'POS']).set_index(['CHROM', 'POS']).H save(a, 'cand.' + outlierMode + '.damped.0') print a.shape ann = loadANN()["Annotation Annotation_Impact Gene_Name Gene_ID".split()] for dampn in [100, 500, 1000, 2000]: damp = scores.sort_values(ascending=False).iloc[:dampn] damp = pd.DataFrame(damp).join(ann, how='inner') damp = \ damp[damp['Annotation_Impact'] != 'LOW']['H'].reset_index().drop_duplicates().set_index(['CHROM', 'POS'])['H'] damp.shape for outlierMode in ['local', 'global', 'chrom']: a = pd.read_pickle(utl.outpath + 'real/{}.df'.format('cand.' + outlierMode))[ 'H'].reset_index().drop_duplicates(subset=['CHROM', 'POS']).set_index(['CHROM', 'POS']).H a = pd.concat([a, damp]).reset_index().drop_duplicates(subset=['CHROM', 'POS']).set_index( ['CHROM', 'POS']).H save(a, 'cand.' + outlierMode + '.damped.{}'.format(dampn)) print a.shape Genes = pd.read_pickle(utl.outpath + 'real/GO.df').set_index('GO') Genes = Genes[Genes.AnnID.apply(lambda x: x[:2] == 'CG')] Genes.AnnID.value_counts() df = pd.concat([Genes.term.drop_duplicates(), Genes.AnnID.groupby(level=0).apply(lambda x: ' '.join(x.tolist()))], axis=1) df.to_csv(utl.outpath + 'real/gowinda/goassociation.CG', sep='\t', header=None) df = pd.concat([Genes.term.drop_duplicates(), Genes.FBgn.groupby(level=0).apply(lambda x: ' '.join(x.tolist()))], axis=1) df.to_csv(utl.outpath + 'real/gowinda/goassociation.FBgn', sep='\t', header=None)