def closestAnnotation(bedFile, RANGE=1000, ANNOTATION_FILE=None, GSIZE=None,
                      silent=True):
    '''Use bedtools to find the feature closest to the regions contained
    in the given bed file.

    The annotation is expanded by {RANGE} bp on both sides before querying
    (``bedtools slop``), then each region of ``bedFile`` is matched to its
    closest feature (``bedtools closest -d``).  Only exact overlaps
    (distance == 0 after expansion) are kept in the output.

    Params:
        bedFile: query .bed file.
        RANGE: bp to expand the annotation on each side before matching.
        ANNOTATION_FILE: .bed/.gff-like annotation to match against (required).
        GSIZE: chrom.sizes file; must be supplied to make bedtools happy.
        silent: passed through to pyutil.shellexec.

    Returns:
        str: name of the output .tsv written in the working directory.

    Raises:
        AssertionError: if the bedtools pipeline produced no output.
    '''
    # NOTE: output lands in the current working directory, named after the
    # basenames of the two inputs.
    FOUT = 'type=closest_bed=%s_feat=%s.tsv' % (
        pyutil.basename(bedFile),
        pyutil.basename(ANNOTATION_FILE))
    cmd = '''
bedtools slop -b {RANGE} -i {ANNO} -g {GSIZE} |bedtools sort > {ANNOBASE}.{RANGE}
bedtools sort -i {bedFile} |\
bedtools closest -d -a - -b {ANNOBASE}.{RANGE} | tee {FOUT}.tmp
'''.format(
        GSIZE=GSIZE,
        ANNO=ANNOTATION_FILE,
        ANNOBASE=ANNOTATION_FILE.split('/')[-1],
        bedFile=bedFile,
        RANGE=RANGE,
        FOUT=FOUT,
    ).strip()

    buf = StringIO.StringIO(pyutil.shellexec(cmd, silent=silent))
    # Guard clause: an empty buffer means the shell pipeline failed.
    if not buf.len:
        assert 0, ' Buffer is empty, check error msg'
    buf.seek(0)

    # Column names: guessed header of the query bed, then the annotation bed
    # (prefixed 'feat'), then the distance column appended by `closest -d`.
    header = sum([
        guessBedHeader(x, prefix=k)
        for k, x in [('', bedFile), ('feat', ANNOTATION_FILE)]
    ], [])
    header += ['distance']

    df = pyutil.readData(buf, header=None, ext='tsv', guess_index=False)
    df.columns = header

    # Keep only features overlapping the (expanded) annotation.
    df = df[df['distance'] == 0]
    df.to_csv(FOUT, sep='\t', index=0)
    return FOUT
def bed_randomise(infile, GSIZE=None, silent=1):
    '''Create a randomly distributed bed file.

    Uses ``bedtools random`` to emit as many random 2-bp intervals as there
    are lines in *infile*, writing them next to the current directory.

    Params:
        infile: template .bed file (only its line count is used).
        GSIZE: chrom.sizes file, required by bedtools.
        silent: passed through to pyutil.shellexec.

    Returns:
        str: name of the randomised .bed file.
    '''
    assert GSIZE is not None
    ofile = pyutil.basename(infile) + '_type=random.bed'
    n_intervals = pyutil.lineCount(infile)
    cmd = "bedtools random -g {GSIZE} -l 2 -n {LC} | tee {ofile}".format(
        GSIZE=GSIZE,
        LC=n_intervals,
        ofile=ofile,
    )
    pyutil.shellexec(cmd, silent=silent)
    return ofile
def job__nearAUG(peakFile=None,
                 featFile=None,
                 peakSummit=None,
                 featSummit=None,
                 CUTOFF=6000,
                 peakWid=None,
                 GSIZE=None):
    '''Associate peak summits with the nearest feature start (AUG).

    Computes summit files for peaks and features if not supplied, measures
    inter-summit distances, and keeps pairs closer than CUTOFF.

    Params:
        peakFile: .narrowPeak-like file (required if peakSummit is None).
        featFile: feature bed file (required if featSummit is None).
        peakSummit / featSummit: precomputed summit files (optional).
        CUTOFF: maximum allowed peak-to-feature distance in bp.
        peakWid: half-width correction; guessed from peakFile if None.
        GSIZE: chrom.sizes file for bedtools.

    Returns:
        str: name of the output .tsv.
    '''
    JOB = 'nearAUG'
    if peakSummit is None:
        assert peakFile is not None
        peakSummit = sdio.bed__summit(peakFile, GSIZE=GSIZE, inplace=0)
    if featSummit is None:
        assert featFile is not None
        featSummit = sdio.bed__leftSummit(featFile, GSIZE=GSIZE, inplace=0)
    if peakWid is None:
        peakWid = sdio.bed__guessWidth(peakFile) // 2

    # BUGFIX: previously hard-coded `6000 - peakWid`, which silently ignored
    # a caller-supplied CUTOFF during the distance computation.
    res = sdio.summitDist(peakSummit,
                          featSummit,
                          CUTOFF=CUTOFF - peakWid,
                          GSIZE=GSIZE)

    out = sdio.extract_peak(peakSummit).merge(
        res[['acc', 'feat_acc', 'distance']],
        how='right',
        left_on='acc',
        right_on='acc').query("distance < %d" % CUTOFF)

    # Underscores in basenames would collide with the '__' field separator
    # of the output filename, so replace them with dashes.
    featSummitBase = pyutil.basename(featSummit).replace('_', '-')
    peakSummitBase = pyutil.basename(peakSummit).replace('_', '-')
    ofname = '\
job_{JOB}__\
peak_{peakSummitBase}__\
cutoff_{CUTOFF}__\
feat_{featSummitBase}.tsv'.format(**locals())
    out.to_csv(ofname, sep='\t')
    return ofname
def npk_expandSummit(fname=None, df=None, radius=200, clip=1, center_summit=0):
    '''Expand the summit regions of a .narrowPeak dataFrame.

    Each region is replaced by a window of +-radius bp around its summit.

    Params:
        fname: .narrowPeak file to load (ignored if df is given); when
            supplied, the result is also written to '<base>_radius=<r>.tsv'.
        df: DataFrame with at least 'start', 'end' and one of
            'summit'/'abs_summit' columns.  NOTE: mutated in place.
        radius: half-width of the expanded window in bp.
        clip: if truthy, clip negative start coordinates to 0.
        center_summit: if truthy and no summit column exists, use the
            midpoint of [start, end) as the summit.

    Returns:
        str: output filename when fname was given, otherwise the DataFrame.
    '''
    if df is None:
        df = extract_peak(fname)
    if 'abs_summit' not in df.columns:
        if center_summit:
            df['summit'] = (df.start + df.end) // 2
        assert 'summit' in df.columns
        # narrowPeak summits are offsets relative to `start`; if every summit
        # already sits at/after its start, treat them as absolute positions.
        if (df.summit >= df.start).all():
            pass
        else:
            df['summit'] = df.start + df.summit
        df.rename(columns={'summit': 'abs_summit'}, inplace=True)
    df.abs_summit = df.abs_summit.astype('int')
    df.start = (df.abs_summit - radius)
    df.end = df.abs_summit + radius
    if clip:
        # BUGFIX: Series.clip_lower was removed in pandas 1.0;
        # clip(lower=...) is the supported equivalent.
        df.start = df.start.clip(lower=0)
    if fname is not None:
        base = pyutil.basename(fname)
        ofname = '%s_radius=%d.tsv' % (base, radius)
        df.to_csv(ofname, sep='\t', index=None, header=None)
        return ofname
    else:
        return df
def qc_summitDist(
        peak1,
        peak2,
        GSIZE,
        query=None,
        query1=None,
        query2=None,
        xlab=None,
        ylab=None,
        CUTOFF=600,
        axs=None,
        density=1,
        # ax = None,
):
    '''Plot the distribution of inter-summit distances between two peak files.

    Draws (1) overlaid histograms of summit distances for peak1-vs-peak2,
    each file against itself, and each file against a randomised control,
    and (2) a Venn-style overlap summary of peaks within CUTOFF of each
    other.

    Params:
        peak1, peak2: peak files to compare.
        GSIZE: chrom.sizes file for bedtools.
        query: pandas query applied to both files (shorthand for
            query1 == query2 == query).
        query1, query2: per-file pandas queries (both must be set to apply).
        xlab, ylab: labels; default to the basenames of peak1/peak2.
        CUTOFF: maximum distance (bp) considered; also the histogram range.
        axs: at least 2 matplotlib axes; a 1x3 figure is created if None.
        density: passed to ax.hist.

    Returns:
        (df_inter, indVenn, axs): the distance table filtered to
        distance > 1, the Venn summary object, and the axes used.
    '''
    xbin = np.linspace(2, CUTOFF, 50)
    if axs is None:
        fig, axs = plt.subplots(1, 3, figsize=[16, 4])
    # ax = axs[0]
    # `i` is a cursor over `axs`; incremented before each panel is drawn.
    i = -1
    infiles = [peak1, peak2]
    if query is not None:
        query1 = query2 = query
    if query1 is not None and query2 is not None:
        # Filter each input through its query into a temporary copy.
        querys = [query1, query2]
        infiles = [
            pyutil.queryCopy(
                reader=sdio.extract_peak,
                infile=infile,
                query=query,
                inplace=True,
            ) for query, infile in zip(querys, infiles)
        ]
    if xlab is None:
        xlab = pyutil.basename(peak1)
    if ylab is None:
        ylab = pyutil.basename(peak2)
    # Randomised controls are built from the (possibly query-filtered) files.
    randFiles = [sdio.bed_randomise(fname, GSIZE=GSIZE) for fname in infiles]
    # Rebind peak1/peak2 to the filtered copies for all plots below.
    peak1, peak2 = infiles

    # --- panel 1: distance histograms ---
    i += 1
    ax = axs[i]
    plt.sca(ax)
    plotter = ax.hist
    # plotter = pyvis.histoLine
    common = {
        'bins': xbin,
        'density': density,
        'alpha': 1.0,
        'histtype': 'step',
    }

    # peak1 vs peak2 (kept as df__inter for the overlap panel below)
    df = df__inter = sdio.summitDist(peak1, peak2, GSIZE=GSIZE, CUTOFF=CUTOFF)
    lab = '%s__%s' % (xlab, ylab)
    lab += ',N=%d' % len(df)
    plotter(df.distance, label=lab, **common)

    # peak1 self-distance
    df = sdio.summitDist(peak1, peak1, GSIZE=GSIZE, CUTOFF=CUTOFF)
    lab = '%s__%s' % (xlab, xlab)
    lab += ',N=%d' % len(df)
    plotter(df.distance, label=lab, **common)

    # peak2 self-distance
    df = sdio.summitDist(peak2, peak2, GSIZE=GSIZE, CUTOFF=CUTOFF)
    lab = '%s__%s' % (ylab, ylab)
    lab += ',N=%d' % len(df)
    plotter(df.distance, label=lab, **common)

    # peak1 vs randomised peak2 (null model)
    df = sdio.summitDist(peak1, randFiles[-1], GSIZE=GSIZE, CUTOFF=CUTOFF)
    lab = '%s__randomise-%s' % (xlab, ylab)
    lab += ',N=%d' % len(df)
    plotter(df.distance, label=lab, **common)

    # peak2 vs randomised peak1 (null model)
    df = sdio.summitDist(peak2, randFiles[0], GSIZE=GSIZE, CUTOFF=CUTOFF)
    lab = 'randomise-%s__%s' % (xlab, ylab)
    lab += ',N=%d' % len(df)
    plotter(df.distance, label=lab, **common)

    title = 'query1={query1},query2={query2}'.format(**locals())
    ax.set_title(title)
    ax.set_xlabel('inter-summit distance (bp)')
    ax.legend()
    ax.grid(1)

    # --- panel 2: overlap summary ---
    # Keep only genuinely distinct summit pairs (distance > 1).
    df = df_inter = df__inter.query('distance>1')
    L1and2 = len(df.acc.unique())
    L1not2 = pyutil.lineCount(peak1) - L1and2
    L2and1 = len(df.feat_acc.unique())
    L2not1 = pyutil.lineCount(peak2) - L2and1
    # Average of the two one-sided overlap counts serves as the shared set
    # size for the Venn summary.
    Lall = (L1and2 + L2and1) // 2
    i += 1
    ax = axs[i]
    plt.sca(ax)
    indVenn = pyvis.qc_index(subsets=(L1not2, L2not1, Lall),
                             silent=0,
                             xlab=xlab,
                             ylab=ylab,
                             ax=ax)[0]
    return df_inter, indVenn, axs
def qc_narrowPeak(
        qfile,
        cutoff=0.98,
        ax=None,
        silent=1,
        keyFile=None,
        ofname=None,
        cutoff_key='per_FC',
        # cutoff = {'per_FC':0.98}
):
    '''Visualise the fold-change distribution and do cutoff.

    Loads a peak table, ranks peaks by percentile of fold-change and score,
    keeps those above `cutoff` on `cutoff_key`, and writes them to a .tsv.

    Params:
        qfile: peak file; read as a headered table when its first column
            name is 'chrom', otherwise parsed as .narrowPeak.
        cutoff: percentile threshold on `cutoff_key`.
        ax: axis to plot into (only used when not silent).
        silent: if falsy, plot percentile-vs-raw-value with the cutoff line.
        keyFile: optional table joined (on 'feature_acc') and displayed.
        ofname: output filename; defaults to '<base>_chipTarg.tsv'.
        cutoff_key: column used for thresholding ('per_FC' or 'per_score').

    Returns:
        (ofname, ax): the output filename and the axis used (may be None).
    '''
    # Peek at the header line to decide how to parse the file.
    with open(qfile) as f:
        fline = f.readline()
    if fline.split('\t')[0] == 'chrom':
        qres = pyutil.readData(qfile, guess_index=0)
    else:
        qres = sdio.extract_peak(qfile)

    # Percentile ranks of fold-change and score.
    qres['per_FC'] = pyutil.dist2ppf(qres.FC)
    qres['per_score'] = pyutil.dist2ppf(qres.score)
    dfc = qres.query('%s > %.3f' % (cutoff_key, cutoff))

    # BUGFIX: `ofname` was previously overwritten unconditionally, so the
    # caller-supplied value was ignored.
    if ofname is None:
        ofname = '%s_chipTarg.tsv' % pyutil.basename(qfile)
    dfc.reset_index(drop=1).to_csv(ofname, sep='\t', index=None, header=None)
    print(qres.shape, dfc.shape)

    if keyFile is not None:
        keyDF = pyutil.readData(keyFile)
        dfcc = dfc.set_index('feature_acc', drop=0)
        # Drop duplicated feature accessions before joining.
        dfcc = dfcc.loc[~dfcc.index.duplicated(), ]
        keyTarg = pd.concat([dfcc[['FC']], keyDF], axis=1, join='inner')
        pyutil.ipd.display(keyTarg)

    if not silent:
        if ax is None:
            fig, axs = plt.subplots(1, 2, figsize=[12, 4])
            ax = axs[0]
        plt.sca(ax)
        # e.g. 'per_FC' -> raw column 'FC'
        raw_key = cutoff_key.split('_')[-1]
        ax.plot(qres[cutoff_key], qres[raw_key], 'x')
        ax.set_xlim(0.5, 1.1)
        ax.grid(1)
        ax.vlines(cutoff, *ax.get_ylim())
        ax.set_xlabel('percentile')
        ax.set_ylabel(raw_key)
        title = 'All=%d' % len(qres) + ', keep=%d' % len(dfc)
        ax.set_title(title)
    return ofname, ax