Ejemplo n.º 1
0
def closestAnnotation(
    bedFile,
    RANGE=1000,
    ANNOTATION_FILE=None,
    GSIZE=None,
    silent=True,
):
    '''
    Use bedtools to find the feature closest to the regions contained
    in the given bed file.

    The annotation intervals are expanded by {RANGE} bp on both sides
    before being queried.  A chrom.sizes file must be supplied as
    {GSIZE} to make `bedtools slop` happy.

    Only hits with distance == 0 (i.e. overlapping after expansion)
    are kept.  Returns the name of the output TSV file.
    '''
    # Output filename encodes both the query bed and the annotation used.
    FOUT = 'type=closest_bed=%s_feat=%s.tsv' % (
        pyutil.basename(bedFile), pyutil.basename(ANNOTATION_FILE))
    cmd = '''
bedtools slop -b {RANGE} -i {ANNO} -g {GSIZE} |bedtools sort > {ANNOBASE}.{RANGE}
bedtools sort -i {bedFile} |\
bedtools closest -d -a - -b {ANNOBASE}.{RANGE} | tee {FOUT}.tmp
'''.format(
        GSIZE=GSIZE,
        ANNO=ANNOTATION_FILE,
        ANNOBASE=ANNOTATION_FILE.split('/')[-1],
        bedFile=bedFile,
        RANGE=RANGE,
        FOUT=FOUT,
    ).strip()
    buf = StringIO.StringIO(pyutil.shellexec(cmd, silent=silent))
    if not buf.len:
        # Raise instead of `assert 0` so the check survives python -O.
        raise RuntimeError(' Buffer is empty, check error msg')
    buf.seek(0)
    # Header = columns of the query bed, then the annotation bed's columns
    # prefixed with 'feat', then the distance column appended by
    # `bedtools closest -d`.
    header = sum([
        guessBedHeader(x, prefix=k)
        for k, x in [('', bedFile), ('feat', ANNOTATION_FILE)]
    ], [])
    header += [
        'distance',
    ]
    df = pyutil.readData(buf, header=None, ext='tsv', guess_index=False)
    df.columns = header
    # Keep only overlapping features (distance zero after expansion).
    df = df[df['distance'] == 0]
    df.to_csv(FOUT, sep='\t', index=False)
    return FOUT
Ejemplo n.º 2
0
def bed_randomise(infile, GSIZE=None, silent=1):
    '''Create a randomly distributed bed file with as many intervals
    as the input file contains lines.
'''
    ofile = pyutil.basename(infile) + '_type=random.bed'
    assert GSIZE is not None
    n_intervals = pyutil.lineCount(infile)
    # `bedtools random` draws n 2-bp intervals uniformly over the genome.
    cmd = "bedtools random -g {0} -l 2 -n {1}  | tee {2}".format(
        GSIZE, n_intervals, ofile)
    pyutil.shellexec(cmd, silent=silent)
    return ofile
Ejemplo n.º 3
0
def job__nearAUG(peakFile=None,
                 featFile=None,
                 peakSummit=None,
                 featSummit=None,
                 CUTOFF=6000,
                 peakWid=None,
                 GSIZE=None):
    '''Pair peak summits with the nearest feature (AUG) summits.

    Either peakSummit or peakFile must be given (likewise featSummit /
    featFile); missing summit files are derived from the bed files.
    Pairs further apart than CUTOFF bp are dropped.

    Returns the name of the output TSV file.
    '''
    JOB = 'nearAUG'
    if peakSummit is None:
        assert peakFile is not None
        peakSummit = sdio.bed__summit(peakFile, GSIZE=GSIZE, inplace=0)
    if featSummit is None:
        assert featFile is not None
        featSummit = sdio.bed__leftSummit(featFile, GSIZE=GSIZE, inplace=0)

    if peakWid is None:
        # Guessing the half-width needs the original peak file.
        assert peakFile is not None, 'supply peakWid when peakFile is None'
        peakWid = sdio.bed__guessWidth(peakFile) // 2

    # BUGFIX: use the CUTOFF parameter here instead of the hard-coded
    # 6000 (the literal silently ignored any caller-supplied CUTOFF).
    res = sdio.summitDist(peakSummit,
                          featSummit,
                          CUTOFF=CUTOFF - peakWid,
                          GSIZE=GSIZE)

    out = sdio.extract_peak(peakSummit).merge(
        res[['acc', 'feat_acc', 'distance']],
        #         res.drop(columns=['chrom','start','end']),
        how='right',
        left_on='acc',
        right_on='acc').query("distance < %d" % CUTOFF)

    # '_' delimits key=value pairs in the output filename, so strip it
    # from the basenames.
    featSummitBase = pyutil.basename(featSummit).replace('_', '-')
    peakSummitBase = pyutil.basename(peakSummit).replace('_', '-')
    ofname = '\
job_{JOB}__\
peak_{peakSummitBase}__\
cutoff_{CUTOFF}__\
feat_{featSummitBase}.tsv'.format(**locals())
    out.to_csv(ofname, sep='\t')
    return ofname
Ejemplo n.º 4
0
def npk_expandSummit(fname=None, df=None, radius=200, clip=1, center_summit=0):
    '''
    Expand the summit regions of a .narrowPeak dataFrame.

    Parameters
    ----------
    fname : path of a .narrowPeak file, loaded with extract_peak() when
        df is not given.  When supplied, the expanded regions are also
        written to '<base>_radius=<radius>.tsv' and that name returned.
    df : pre-loaded peak dataFrame (NOTE: mutated in place).
    radius : half-width (bp) of the expanded interval around the summit.
    clip : clip negative start coordinates to 0.
    center_summit : force the summit to the interval midpoint.

    Returns the output filename when fname is given, else the dataFrame.
    '''
    if df is None:
        df = extract_peak(fname)

    if 'abs_summit' not in df.columns:
        if center_summit:
            df['summit'] = (df.start + df.end) // 2

        assert 'summit' in df.columns
        # 'summit' may already be absolute (>= start) or an offset
        # relative to 'start' (narrowPeak convention); normalise to an
        # absolute coordinate.
        if (df.summit >= df.start).all():
            pass
        else:
            df['summit'] = df.start + df.summit
        df.rename(columns={'summit': 'abs_summit'}, inplace=True)
        df.abs_summit = df.abs_summit.astype('int')

    df.start = (df.abs_summit - radius)
    df.end = df.abs_summit + radius
    if clip:
        # BUGFIX: Series.clip_lower() was removed in pandas 1.0;
        # clip(lower=...) is the drop-in equivalent.
        df.start = df.start.clip(lower=0)

    if fname is not None:
        base = pyutil.basename(fname)
        ofname = '%s_radius=%d.tsv' % (base, radius)
        df.to_csv(ofname, sep='\t', index=None, header=None)
        return ofname
    else:
        return df
Ejemplo n.º 5
0
def qc_summitDist(
    peak1,
    peak2,
    GSIZE,
    query=None,
    query1=None,
    query2=None,
    xlab=None,
    ylab=None,
    CUTOFF=600,
    axs=None,
    density=1,
    #                  ax = None,
):
    '''Plot the distribution of inter-summit distance between two peak
    files.

    On axs[0], draws step histograms of summit-to-summit distances for:
    peak1 vs peak2, each file against itself, and each file against a
    randomised version of the other (null background).  On the next
    axis, shows a venn-style index comparison of peaks that do / do not
    have a partner within CUTOFF bp.

    query / query1 / query2 : optional query strings used to filter the
        peak files before comparison (`query` sets both sides).
    GSIZE : chrom.sizes file forwarded to the bedtools helpers.
    axs : sequence of (at least 2) axes; a 1x3 figure is created when
        None.

    Returns (df_inter, indVenn, axs).
    '''
    xbin = np.linspace(2, CUTOFF, 50)
    if axs is None:
        fig, axs = plt.subplots(1, 3, figsize=[16, 4])


#         ax = axs[0]
    # i indexes the current axis; incremented before each panel.
    i = -1

    infiles = [peak1, peak2]

    # A single `query` applies to both files.
    if query is not None:
        query1 = query2 = query
    if query1 is not None and query2 is not None:
        querys = [query1, query2]
        # Filter each peak file and write the filtered copy back to disk.
        infiles = [
            pyutil.queryCopy(
                reader=sdio.extract_peak,
                infile=infile,
                query=query,
                inplace=True,
            ) for query, infile in zip(querys, infiles)
        ]

    if xlab is None:
        xlab = pyutil.basename(peak1)
    if ylab is None:
        ylab = pyutil.basename(peak2)

    # Randomised versions of each (possibly filtered) file serve as the
    # null background.
    randFiles = [sdio.bed_randomise(fname, GSIZE=GSIZE) for fname in infiles]
    peak1, peak2 = infiles

    i += 1
    ax = axs[i]
    plt.sca(ax)
    plotter = ax.hist
    # plotter = pyvis.histoLine
    # Shared histogram kwargs for all five traces.
    common = {
        'bins': xbin,
        'density': density,
        'alpha': 1.0,
        'histtype': 'step',
    }

    # peak1 vs peak2 — the comparison of interest (kept for the venn).
    df = df__inter = sdio.summitDist(peak1, peak2, GSIZE=GSIZE, CUTOFF=CUTOFF)
    lab = '%s__%s' % (xlab, ylab)
    lab += ',N=%d' % len(df)
    plotter(df.distance, label=lab, **common)

    # Self-self distance distribution of peak1.
    df = sdio.summitDist(peak1, peak1, GSIZE=GSIZE, CUTOFF=CUTOFF)
    lab = '%s__%s' % (xlab, xlab)
    lab += ',N=%d' % len(df)
    plotter(df.distance, label=lab, **common)

    # Self-self distance distribution of peak2.
    df = sdio.summitDist(peak2, peak2, GSIZE=GSIZE, CUTOFF=CUTOFF)
    lab = '%s__%s' % (ylab, ylab)
    lab += ',N=%d' % len(df)
    plotter(df.distance, label=lab, **common)

    # peak1 against randomised peak2 (background).
    df = sdio.summitDist(peak1, randFiles[-1], GSIZE=GSIZE, CUTOFF=CUTOFF)
    lab = '%s__randomise-%s' % (xlab, ylab)
    lab += ',N=%d' % len(df)
    plotter(df.distance, label=lab, **common)

    # peak2 against randomised peak1 (background).
    df = sdio.summitDist(peak2, randFiles[0], GSIZE=GSIZE, CUTOFF=CUTOFF)
    lab = 'randomise-%s__%s' % (xlab, ylab)
    lab += ',N=%d' % len(df)
    plotter(df.distance, label=lab, **common)

    title = 'query1={query1},query2={query2}'.format(**locals())
    ax.set_title(title)
    ax.set_xlabel('inter-summit distance (bp)')
    ax.legend()
    ax.grid(1)

    # distance>1 discards self-hits / coincident summits before the
    # overlap counting below.
    df = df_inter = df__inter.query('distance>1')
    L1and2 = len(df.acc.unique())
    L1not2 = pyutil.lineCount(peak1) - L1and2

    L2and1 = len(df.feat_acc.unique())
    L2not1 = pyutil.lineCount(peak2) - L2and1

    # Symmetrised count of matched peaks for the venn diagram.
    Lall = (L1and2 + L2and1) // 2
    i += 1
    ax = axs[i]
    plt.sca(ax)
    indVenn = pyvis.qc_index(subsets=(L1not2, L2not1, Lall),
                             silent=0,
                             xlab=xlab,
                             ylab=ylab,
                             ax=ax)[0]
    return df_inter, indVenn, axs
Ejemplo n.º 6
0
def qc_narrowPeak(
    qfile,
    cutoff=0.98,
    ax=None,
    silent=1,
    keyFile=None,
    ofname=None,
    cutoff_key='per_FC',
    #                   cutoff = {'per_FC':0.98}
):
    '''
    Visualise the fold-change distribution and apply a percentile cutoff.

    Peaks whose percentile of `cutoff_key` exceeds `cutoff` are written
    to `ofname` (default '<base>_chipTarg.tsv').  When keyFile is given,
    the surviving peaks are inner-joined with it and displayed.  Unless
    silent, the percentile-vs-value scatter is plotted on `ax`.

    Returns (ofname, ax).
    '''
    # Peek at the first line to tell a headered TSV from raw narrowPeak.
    with open(qfile) as f:
        fline = f.readline()

    if fline.split('\t')[0] == 'chrom':
        qres = pyutil.readData(qfile, guess_index=0)
    else:
        qres = sdio.extract_peak(qfile)

    # Percentile (ppf) transforms of fold-change and score.
    qres['per_FC'] = pyutil.dist2ppf(qres.FC)
    qres['per_score'] = pyutil.dist2ppf(qres.score)

    dfc = qres.query('%s > %.3f' % (cutoff_key, cutoff))

    # BUGFIX: honour a caller-supplied ofname; it used to be
    # unconditionally overwritten.
    if ofname is None:
        ofname = '%s_chipTarg.tsv' % pyutil.basename(qfile)

    dfc.reset_index(drop=1).to_csv(ofname, sep='\t', index=None, header=None)
    print(qres.shape, dfc.shape)

    if keyFile is not None:
        keyDF = pyutil.readData(keyFile)

        # De-duplicate on feature accession before joining with the keys.
        dfcc = dfc.set_index('feature_acc', drop=0)
        dfcc = dfcc.loc[~dfcc.index.duplicated(), ]

        keyTarg = pd.concat([dfcc[['FC']], keyDF], axis=1, join='inner')

        pyutil.ipd.display(keyTarg)

    if not silent:
        if ax is None:
            fig, axs = plt.subplots(1, 2, figsize=[12, 4])
            ax = axs[0]
        plt.sca(ax)
        # e.g. 'per_FC' -> raw column 'FC'
        raw_key = cutoff_key.split('_')[-1]
        ax.plot(qres[cutoff_key], qres[raw_key], 'x')

        ax.set_xlim(0.5, 1.1)
        ax.grid(1)
        ax.vlines(cutoff, *ax.get_ylim())
        ax.set_xlabel('percentile')
        ax.set_ylabel(raw_key)
        title = 'All=%d' % len(qres) + ', keep=%d' % len(dfc)
        ax.set_title(title)

    return ofname, ax