Example #1
0
def qc_Sort(df=None,
            fname=None,
            cname='test',
            vlim=[-2, 2],
            title=None,
            xlim=None,
            ylim=None,
            figsize2=[14, 6],
            nMax=5000,
            **heatargs):
    '''QC a matrix by drawing heatmaps of its rows sorted three ways:
    by variance, by coefficient of variation, and by average.

    Either an in-memory `df` (DataFrame or ndarray) or a `fname` loadable
    via pyutil.readData must be supplied.

    Returns:
        ((M, V, CV), figs): per-row average/variance/CV arrays from
        qcAvg(), and an OrderedDict of the generated matplotlib figures
        keyed 'qcAvg' and 'qcSort'.
    '''
    figs = collections.OrderedDict()
    vmin, vmax = vlim
    if df is None:
        df = pyutil.readData(fname)
        if title is None:
            title = '[file]%s' % fname
    heatargs.update({
        'vmin': vmin,
        'vmax': vmax,
        'cname': cname,
        'vlim': vlim,
    })
    if isinstance(df, pd.DataFrame):
        C = df.values
    else:
        C = df
    (M, V, CV), axsLst = qcAvg(C, silent=0, xlim=xlim, ylim=ylim, nMax=nMax)
    figs['qcAvg'] = plt.gcf()

    plt.suptitle(title)
    #### negative step both reverses the sort order and subsamples the
    #### rows to roughly 1000 for display
    inter = -len(C) // 1000

    fig, axs = plt.subplots(3,
                            1,
                            figsize=figsize2,
                            gridspec_kw={'hspace': 0.3})
    axs = axs.flat
    #### [FIX] corrected title typo: 'Varaince' -> 'Variance'
    pyvis.heatmap(C[V.argsort()][::inter],
                  transpose=1,
                  main='sorted by Variance',
                  ax=axs[0],
                  **heatargs)

    pyvis.heatmap(C[CV.argsort()][::inter],
                  transpose=1,
                  main='sorted by CV',
                  ax=axs[1],
                  **heatargs)

    pyvis.heatmap(C[M.argsort()][::inter],
                  transpose=1,
                  main='sorted by Average',
                  ax=axs[2],
                  **heatargs)

    axsLst = np.hstack([axsLst, axs])
    figs['qcSort'] = plt.gcf()

    return (M, V, CV), figs
Example #2
0
def extract_peak(fname, ext='tsv', header=None, guess_index=0, **kwargs):
    '''Load a peak file and label its leading columns with the BED header.

    Columns beyond len(bedHeader) keep whatever names readData assigned.
    '''
    peaks = pyutil.readData(fname,
                            ext=ext,
                            header=header,
                            guess_index=guess_index,
                            **kwargs)
    ncol = len(peaks.columns)
    named = bedHeader[:ncol]
    extra = list(peaks.columns)[len(bedHeader):]
    peaks.columns = named + extra
    return peaks
Example #3
0
def closestAnnotation(
    bedFile,
    RANGE=1000,
    ANNOTATION_FILE=None,
    GSIZE=None,
    silent=True,
):
    '''
    Use bedtools to find the feature closest to the regions contained
    in the given bed file, then keep only features that directly
    overlap (distance == 0).
    The annotation will be expanded by {RANGE} bp before being queried.
    chrom.sizes must be supplied as {GSIZE} to make bedtools happy.

    Returns the name of the TSV file written with the overlap table.
'''

    FOUT = bedFile.split('/')[-1]
    # NOTE(review): the assignment above is immediately overwritten below
    FOUT = 'type=closest_bed=%s_feat=%s.tsv' % (
        pyutil.basename(bedFile), pyutil.basename(ANNOTATION_FILE))
    #### slop-expand the annotation, sort both inputs, then
    #### `bedtools closest -d` appends the distance to the nearest feature
    cmd = '''
bedtools slop -b {RANGE} -i {ANNO} -g {GSIZE} |bedtools sort > {ANNOBASE}.{RANGE}
bedtools sort -i {bedFile} |\
bedtools closest -d -a - -b {ANNOBASE}.{RANGE} | tee {FOUT}.tmp
'''.format(
        GSIZE=GSIZE,
        ANNO=ANNOTATION_FILE,
        ANNOBASE=ANNOTATION_FILE.split('/')[-1],
        bedFile=bedFile,
        RANGE=RANGE,
        FOUT=FOUT,
    ).strip()
    #### Python-2 StringIO exposes `.len`; non-empty means bedtools produced output
    buf = StringIO.StringIO(pyutil.shellexec(cmd, silent=silent))
    if buf.len:
        buf.seek(0)
        #### column names: bed fields of the query, then 'feat'-prefixed
        #### fields of the annotation, then the reported distance
        header = sum([
            guessBedHeader(x, prefix=k)
            for k, x in [('', bedFile), ('feat', ANNOTATION_FILE)]
        ], [])
        header += [
            'distance',
        ]
        df = pyutil.readData(buf, header=None, ext='tsv', guess_index=False)
        df.columns = header
#         df = parseBedClosest(fname = buf)
#         os.system('rm %s.tmp' % FOUT)
    else:
        assert 0, ' Buffer is empty, check error msg'
    #### keep only directly overlapping features
    df = df[df['distance'] == 0]
    df.to_csv(FOUT, sep='\t', index=0)
    return FOUT
Example #4
0
def parseBedmap(
    fname=None,
    df=None,
):
    '''Parse the output of bedMap.

    Reads `fname` when `df` is not supplied, drops rows with missing
    values, names the columns bedHeader + ['hit'], then explodes the
    ';'-separated hit list into one row per accession and merges the
    original bed columns back on 'acc'.
    '''
    if df is None:
        df = pyutil.readData(fname, header=None, ext='tsv', guess_index=False)
    df = df.dropna()
    df.columns = bedHeader + ['hit']
    exploded = pyutil.explode(df, 'hit', 'acc', ';')
    merged = exploded.merge(df.drop('hit', 1), on='acc')
    return merged
Example #5
0
def extract_closest(
    fname=None,
    df=None,
):
    '''Parse the output of 'bedtools closest'.

    Keeps the first 18 columns: the query's BED fields, the matched
    feature's BED fields (prefixed 'feature_'), and the distance.
    '''
    if df is None:
        df = pyutil.readData(fname, header=None, ext='tsv', guess_index=False)

    # query fields followed by 'feature_'-prefixed fields of the hit
    names = bedHeader + pyutil.paste0([['feature_'], bedHeader]).tolist()
    res = df.iloc[:, :18]
    res.columns = names[:17] + ['distance']
    res['hit'] = res['feature_acc']
    return res
Example #6
0
def bed__checkValid(bed, GSIZE, force=0):
    '''Check that every interval in `bed` lies within its chromosome.

    `bed` may be a DataFrame or a filename (loaded via sdio.extract_peak).
    With force=0 an AssertionError is raised if any interval is invalid;
    with force=1 invalid rows are dropped and the result is returned —
    written to '<stem>__valid.bed' when a filename was given, otherwise
    returned as a DataFrame.
    '''
    fname = None
    if not isinstance(bed, pd.DataFrame):
        fname = bed
        bed = sdio.extract_peak(bed)
    sizes = pyutil.readData(GSIZE, ext='tsv', header=None, guess_index=0)
    sizes.columns = ['chrom', 'length']
    merged = sizes.merge(bed)
    merged['valid'] = merged.eval('start > 0 and end <= length')
    if not force:
        assert merged.valid.all()
        return
    kept = merged.query('valid').drop(columns=['valid', 'length'])
    if fname is None:
        return kept
    ofname = '%s__valid.bed' % fname.rsplit('.', 1)[0]
    pyutil.to_tsv(kept, ofname)
    return ofname
Example #7
0
def guessBedHeader(fname,
                   silent=True,
                   ext='tsv',
                   guess_index=0,
                   prefix='',
                   **kwargs):
    '''Infer column names for a BED-like file from its first five lines.

    Returns bedHeader truncated/extended to the observed column count,
    each name optionally prefixed '<prefix>_', coerced to str.
    '''
    buf = StringIO.StringIO(
        pyutil.shellexec('head -n5 %s' % fname, silent=silent))
    sample = pyutil.readData(buf,
                             ext=ext,
                             header=None,
                             guess_index=guess_index,
                             **kwargs)
    ncol = len(sample.columns)
    if ncol > len(bedHeader):
        header = bedHeader + list(sample.columns)[len(bedHeader):]
    else:
        header = bedHeader[:ncol]
    if prefix:
        header = ['%s_%s' % (prefix, col) for col in header]
    return map(str, header)
Example #8
0
def main(
    bedFile,
    #          = None,
    bwFiles=None,
    bwTrackFile=None,
    relistByGene=0,  #### potentially takes a long time
    stepSize=50,
    radius=None,
    center_summit=0,
    NCORE=1,
):
    '''Classify peaks into two groups with a Gaussian HMM over per-row
    standard deviation of the track matrix, and write the segmentation
    out as a BED file.

    The track matrix comes either from `bwTrackFile` (pre-extracted) or
    is extracted from `bwFiles` over `bedFile`.

    Returns the output BED filename produced by sdio.clu2bed.
    '''
    if bedFile is not None:
        refBed = sdio.extract_peak(bedFile)
    if bwTrackFile is not None:
        dfc = pyutil.readData(bwTrackFile, )
    else:
        #### [FIX] the extracted matrix was previously discarded, leaving
        #### `dfc` undefined on this branch (NameError below); capture it.
        dfc = sdio.extract_bigwig_multiple(
            bedFile=bedFile,
            bwFiles=bwFiles,
            center_summit=center_summit,
            NCORE=NCORE,
            stepSize=stepSize,
            radius=radius,
        )
        relistByGene = 1
    if relistByGene:
        dfc = sdio.listByGene(dfc)
    dfc0 = dfc

    #### per-row standard deviation is the single observable fed to the HMM
    tdf = np.std(dfc.values, axis=1, keepdims=1)
    lr = hlhmm.GaussianHMM(n_components=2,
                           covariance_type="diag",
                           init_params="cmt",
                           params="cmt")
    lr.fit(tdf)
    seg = lr.predict(tdf)

    segDF = pd.DataFrame(seg, index=dfc0.index, columns=['clu'])
    ofname = pyutil.getBname(bedFile) + '__HVPeak.bed'
    ofname = sdio.clu2bed(segDF, ofname)
    return ofname
Example #9
0
    def from_DataFrame(cls,
                       df=None,
                       fname=None,
                       name=None,
                       index_col=None,
                       **kwargs):
        '''Build an instance from a DataFrame, or load one from `fname`.

        When loading from a file, `name` is derived from the file's
        basename (extension stripped). A Series argument is promoted to
        a one-column frame. When `index_col` is given, that column
        becomes the index while also being kept as a column.
        '''
        if df is None:
            assert fname is not None, '[ERR] must specify one of "df" or "fname" '
            df = pyutil.readData(fname, **kwargs)
            name = pyutil.os.path.basename(fname).rsplit('.', 1)[0]
        elif isinstance(df, pd.Series):
            df = df.to_frame()
        if index_col is not None:
            assert index_col in df
            df.set_index(index_col, drop=0, inplace=1)
        return cls(
            C=df.values,
            colName=df.columns,
            rowName=df.index,
            name=name,
            fname=fname,
        )
Example #10
0
def job__render__panelPlot(tracks=None,
                           clu=None,
                           order=None,
                           index=None,
                           aliasFmt='{alias}',
                           alias=None,
                           baseFile=0,
                           figsize=None,
                           panel_kw=panel_kw_dft,
                           how='left',
                           debug=0,
                           extra={},
                           **kwargs):
    '''Render a panel heatmap for the given tracks/cluster/order spec.

    String-valued arguments (clu, order, tracks, panel_kw) are treated
    as file paths, loaded via pyutil/pyext, and their basenames are
    folded into the auto-generated alias.

    Returns (alias, fig), or the panelPlot object when `debug` is truthy.
    '''
    if figsize is not None:
        #### [FIX] `panel_kw` defaults to the shared module-level dict
        #### `panel_kw_dft`; copy before mutating so a figsize from one
        #### call does not leak into every later call. (A string
        #### panel_kw still fails on item assignment, as before.)
        if not isinstance(panel_kw, basestring):
            panel_kw = dict(panel_kw)
        panel_kw['figsize'] = figsize
    autoAli = alias is None
    if autoAli:
        alias = ''
    if isinstance(clu, basestring):
        alias += pyext.getBname(clu)
        clu = pyutil.readData(clu, baseFile=baseFile).get(['clu'])

    if isinstance(order, basestring):
        alias += pyext.getBname(order)
        order = pyutil.readData(order, baseFile=baseFile)
    if isinstance(tracks, basestring):
        alias += pyext.getBname(tracks)
        tracks = pyutil.readData(tracks, baseFile=baseFile)
        tracks = list(tracks)
    if isinstance(panel_kw, basestring):
        alias += pyext.getBname(panel_kw)
        panel_kw = pyutil.read__buffer(panel_kw,
                                       ext='json',
                                       typ='rec',
                                       guess_index=0).to_dict()
    if order is not None:
        clu = order.get(['clu'])
    else:
        assert clu is not None
        order = pd.DataFrame(clu)

    if isinstance(index, basestring):
        alias += pyutil.sanitise_query(index)
        #### NOTE(review): `eval` on a caller-supplied string — never feed
        #### untrusted input here; `locals().update` is CPython-fragile.
        locals().update(extra)
        index = eval(index)

    cluTrack = spanel.fixCluster(clu.get(['clu']))
    alias = aliasFmt.format(**locals())
    tracks = pyext.list__realise(tracks, locals())
    ##### Output heatmap
    pp = spanel.panelPlot(tracks, **panel_kw)
    pp.compile(how=how, index=index, **kwargs)
    pp.compile(order=order)
    if debug:
        return pp
    fig = pp.render()
    return (alias, fig)
Example #11
0
def count__getGeneHeader(fname, ext='tsv', pipeline=None, silent=1, **kwargs):
    '''Return the gene_id column values from a count file's header sample.'''
    ext = 'tsv'  ### hard set
    sample = pyutil.readData(file__header(fname, silent=silent),
                             ext=ext,
                             guess_index=0)
    return list(sample.gene_id)
Example #12
0
def tsv__getColumns(fname, ext='tsv', silent=1):
    '''Return the column names of a delimited file by parsing its header.

    [FIX] `silent` was referenced but never defined, so every call
    raised NameError; it is now a keyword argument defaulting to 1,
    matching the sibling helper count__getGeneHeader.
    '''
    res = file__header(fname, silent=silent)
    df = pyutil.readData(res, ext=ext)
    return df.columns.tolist()
Example #13
0
def summitDist(peak1,
               peak2,
               CUTOFF=400,
               silent=1,
               GSIZE=None,
               as_fname=0,
               **kwargs):
    '''Find nearby summits within a distance cutoff.

    Each summit file is padded to radius CUTOFF//2 - 1 with
    `bedtools slop`, the padded intervals are intersected with
    `bedtools intersect -wo`, and the overlap column is converted back
    into a summit-to-summit distance. The table is also written to a
    TSV whose name encodes the inputs and the cutoff.

    Returns the output filename when `as_fname` is truthy, otherwise
    the parsed DataFrame. GSIZE falls back to the GSIZE environment
    variable and must be available.
'''
    if GSIZE is None:
        GSIZE = pyutil.os.environ.get('GSIZE', None)
    assert GSIZE is not None
    #### pad each summit to a radius so two summits within CUTOFF overlap
    RANGE = CUTOFF // 2 - 1
    infiles = [peak1, peak2]
    #     def file_ncol(fname):
    #         cmd = 'wc -l %s'%(fname)
    #         res = pyutil.shellexec(cmd,silent=silent)
    #         ncol = res[0].strip().split('\t')
    #     incols =
    incols = map(pyutil.file_ncol, infiles)

    ### padding/inflate the summit to have radius
    lst = []
    for infile in infiles:

        ofile = "{infile}.{RANGE}".format(**locals()).split('/')[-1]
        lst += [ofile]

        cmd = "bedtools slop -g {GSIZE} -b {RANGE} -i {infile} \
          | tee {ofile}".format(**locals())
        _ = pyutil.shellexec(cmd, silent=silent)

    slop1, slop2 = lst
    FOUT = 'infiles:'+ ":".join(map(pyutil.basename,infiles)) \
        + "__cutoff:{}.tsv".format(CUTOFF)

    # ### bed format 1=chrom, 2=start, 3=end
    # cols = ','.join(map(str,[2,3,] + [x + incols[0] for x in [2,3]]))
    # cmd = "bedtools closest -a {slop1} -b {slop2} \
    #   | bedtools overlap -cols {cols} \
    #   | tee {FOUT}".format(**locals())

    #### -wo reports both intervals plus the number of overlapping bases
    cmd = "bedtools intersect -wo -a {slop1} -b {slop2} \
      | tee {FOUT}".format(**locals())

    buf = pyutil.shellexec(cmd, silent=silent)

    ### [TBC]Memory-intensive, Replace with awk mutation in the future
    columns = header_closest(peak1, peak2)

    df = pyutil.readData(StringIO.StringIO(buf),
                         header=None,
                         ext='tsv',
                         guess_index=False,
                         columns=columns)
    #### convert overlap width back to summit-to-summit distance
    df.distance = CUTOFF - df.distance
    df.to_csv(FOUT, sep='\t', index=False)
    if as_fname:
        return FOUT
    else:
        return df
Example #14
0
def qc_narrowPeak(
    qfile,
    cutoff=0.98,
    ax=None,
    silent=1,
    keyFile=None,
    ofname=None,
    cutoff_key='per_FC',
    #                   cutoff = {'per_FC':0.98}
):
    '''
    Visualise the fold-change distribution of a narrowPeak-like file and
    keep the rows above a percentile cutoff.

    Rows with `cutoff_key` (a percentile column, e.g. 'per_FC') above
    `cutoff` are written to `ofname` (default '<basename>_chipTarg.tsv').
    When `keyFile` is given, the kept peaks are joined against it and
    displayed. When `silent` is falsy, a percentile-vs-value scatter is
    drawn on `ax` (created if needed).

    Returns (ofname, ax).
'''
    #### peek at the first field to decide whether the file has a header row
    #### [FIX] close the file deterministically via a context manager
    with open(qfile) as f:
        fline = f.readline()

    if fline.split('\t')[0] == 'chrom':
        qres = pyutil.readData(qfile, guess_index=0)
    else:
        qres = sdio.extract_peak(qfile)

    qres['per_FC'] = pyutil.dist2ppf(qres.FC)
    qres['per_score'] = pyutil.dist2ppf(qres.score)

    dfc = qres.query('%s > %.3f' % (cutoff_key, cutoff))

    #### [FIX] the `ofname` argument was previously ignored and always
    #### overwritten; only derive the default when none was supplied
    if ofname is None:
        ofname = '%s_chipTarg.tsv' % pyutil.basename(qfile)

    dfc.reset_index(drop=1).to_csv(ofname, sep='\t', index=None, header=None)
    print(qres.shape, dfc.shape)

    if keyFile is not None:
        keyDF = pyutil.readData(keyFile)

        #### deduplicate on feature accession before joining with the key set
        dfcc = dfc.set_index('feature_acc', drop=0)
        dfcc = dfcc.loc[~dfcc.index.duplicated(), ]

        keyTarg = pd.concat([dfcc[['FC']], keyDF], axis=1, join='inner')

        pyutil.ipd.display(keyTarg)

    if not silent:
        if ax is None:
            fig, axs = plt.subplots(1, 2, figsize=[12, 4])
            ax = axs[0]
        plt.sca(ax)
        #### 'per_FC' -> 'FC': plot percentile against the raw statistic
        raw_key = cutoff_key.split('_')[-1]
        ax.plot(qres[cutoff_key], qres[raw_key], 'x')

        ax.set_xlim(0.5, 1.1)
        ax.grid(1)
        ax.vlines(cutoff, *ax.get_ylim())
        ax.set_xlabel('percentile')
        ax.set_ylabel(raw_key)
        title = 'All=%d' % len(qres) + ', keep=%d' % len(dfc)
        ax.set_title(title)

    return ofname, ax
Example #15
0
# execfile('/home/feng/headers/header__import.py')
#### Build a release tarball index: dedupe the TOUCHED.list file index,
#### archive RNA entries, then copy everything tracked into ./dist.
import pymisca.util as pyutil
dfc = pyutil.readData(pyutil.base__file('TOUCHED.list'),ext='tsv',header=None)

#### drop duplicated index entries and persist the cleaned file index
ind = dfc.query('~index.duplicated()').sort_index()
# print (ind.to_csv())
ind.to_csv(pyutil.base__file('file.index',force=1))
#### bundle all RNA* entries into a single tarball
pyutil.shellexec('''
cd $BASE
cat file.index | grep ^RNA | xargs tar -cvzf RNA-seq.tar.gz
''')

#### rebuild tracking.index: non-RNA files plus archives/metadata globs
pyutil.shellexec('''
cd $BASE
echo > tracking.index
{
cat file.index | grep -v ^RNA-seq 
echo *.tar.gz 
echo *.index 
echo *.txt 
echo *.list 
echo "Snakefile README" 
} >> tracking.index
''')

#### copy every tracked path into dist/, preserving directory structure
pyutil.shellexec('''
cd $BASE
echo
mkdir -p dist;
cat tracking.index | xargs cp -avuf --parents -t dist
''')
Example #16
0
def main(
        #### necessary
        bedFile=None,
        bwFiles=None,
        ####
        DIR=None,
        figsize=[14, 14],
        debug=0,
        ylim=[0, 10],
        radius=2000,
        stepSize=10,
        NCORE=4,
        silent=0,
        gtfFile=None,
        cdsFile=None,
        annotation=None,
        GSIZE=None,
        center_summit=0,
        trackNames=None,
        backend='fluff',
        ext='png',
        **kwargs):
    '''Render per-peak genome-browser-style screenshots for every
    interval in `bedFile` over the bigwig tracks in `bwFiles`.

    Two rendering backends are supported: 'synotil' (extracts track
    matrices and draws with worker__drawPeak) and 'fluff' (delegates to
    worker__fluff per interval). An image-index TSV is written to DIR
    and, when possible, converted to an HTML gallery.

    Returns (indexFile, htmlFile); htmlFile is None if the HTML
    conversion fails.
    '''
    #     vlim = ylim
    figsize = map(int, figsize)
    # for peakAcc in df_near.acc.unique()[:1]:

    #### derive the output directory name from the bed filename
    prefix = 'PROG=chipShots_bedFile='
    #     bname  = pyutil.basename(bedFile)
    bname = pyutil.os.path.basename(bedFile)
    odname = prefix + bname
    if DIR == 'inplace':
        DIR = pyutil.os.path.dirname(bedFile) + odname
    elif DIR is None:
        DIR = odname
    pyutil.shellexec('mkdir -p %s' % DIR, silent=silent)
    DIR = pyutil.os.path.abspath(DIR)
    #     odname = pyutil.ospath

    if cdsFile is None:
        cdsFile = gtfFile + '.cds'
    if backend == 'synotil':
        # nearFile = './DE2017/type=closest_bed=lux22_radius=1_feat=genes.gtf.cds.tsv'
        # import synotil.filterByCDS
        #### find genes near each peak within `radius`
        nearFile = synotil.filterByCDS.main(
            peakFile=bedFile,
            cdsFile=cdsFile,
            downStream=radius,
            upStream=radius,
            peakRadius=1,
            GSIZE=GSIZE,
            center_summit=center_summit,
        )
        df_near = pyutil.readData(nearFile, )

        stderrLine('[MSG]Loading bed intervals from bigwig tracks....')

        chipTracks = sutil.extract_bigwig_multiple(
            fnames=bwFiles,
            bedFile=bedFile,
            radius=radius,
            stepSize=stepSize,
            callback=None,
            outIndex=trackNames,

            #                                               callback=callback,
            center_summit=center_summit,
            shift=0,  #### use positive coordinate
            stranded=False,
            NCORE=NCORE)
        #### default y-range: 99th percentile of all track values, floored at 0
        if ylim is None:
            ylim = pyutil.span(
                pyutil.np.hstack([x.values.flat for x in chipTracks]), 99)
            ylim = list(ylim)
            ylim[0] = 0.
        callback = lambda x: [prepare_chipTrack(ele, vlim=ylim) for ele in x]
        chipTracks = callback(chipTracks)

        if debug:
            stderrLine(chipTracks[0].columns)

        gtf = pyutil.readData(gtfFile, ext='tsv', header=None, guess_index=0)
        gtf = scount.countMatrix(gtf, look='gtf')
        gtfs = [gtf]

        #     uniqPeak = df_near.acc.unique()
        #     bedDF = pyutil.readData(bedFile,header=None,guess_index=0)
        #     bedDF.columns = sutil.bedHeader[:len(bedDF.columns)]
        bedDF = sutil.extract_peak(bedFile)
        #     uniqPeak
        #     uniqPeak = bedDF[bedDF.columns]

        #### one image per peak accession, drawn in parallel
        worker = pyutil.functools.partial(
            worker__drawPeak,
            DIR=DIR,
            chipTracks=chipTracks,
            df_near=df_near,
            gtfs=gtfs,
            radius=radius,
            figsize=figsize,
            ylim=ylim,
            debug=debug,
        )

        ofnames = pyutil.mp_map(
            worker,
            bedDF.acc,
            n_cpu=NCORE,
        )
    elif backend == 'fluff':
        bedDF = sdio.extract_peak(bedFile)

        #### build one argument record per interval for worker__fluff
        argDF = bedDF.copy()
        argDF = sdio.bed__addCol__interval(argDF)
        tracks = list(bwFiles)
        argDF['tracks'] = [tracks] * len(bedDF)
        argDF['annotation'] = annotation
        argDF['DIR'] = DIR
        argDF['ext'] = ext
        if trackNames is not None:
            argDF['labels'] = [list(trackNames)] * len(bedDF)

        ofnames = pyutil.mp_map(
            #         ofnames = map(
            worker__fluff,
            (vars(x) for x in argDF.itertuples()),
            n_cpu=NCORE,
        )
#         ofnames =

    #### NOTE(review): `bedDF`/`ofnames` are only defined inside the two
    #### backend branches; an unknown `backend` value raises NameError here.
    bedDF['img'] = ofnames
    indexFile = '%s/%s.index.tsv' % (DIR, bname)
    pyutil.to_tsv(bedDF, indexFile)
    indexFile = '%s/figureIndex.tsv' % (DIR)
    pyutil.to_tsv(bedDF, indexFile)

    #### HTML gallery generation is best-effort; failures only warn
    try:
        import synotil.shot2html as shot2html
        htmlFile = shot2html.shot2html(indexFile, localPath=True)
    except Exception as e:
        stderrLine('[WARN]:cannot produce html :%s' % e)
        htmlFile = None


#     print ('[OUTPUT]:',)
#     print ('html:',htmlFile)
#     print ('index:',indexFile)
    print(indexFile)
    print(htmlFile)
    return (indexFile, htmlFile)