Example #1
def df__deposit(dfc,
                runID='000R',
                ext='excel.count',
                DIR='.',
                init=1,
                silent=0,
                sep='\t',
                header=1):
    dfc = dfc.copy()
    fnames = assign__filename(dfc.columns,
                              runID=runID,
                              ext=ext,
                              DIR=DIR,
                              init=init)

    dfc.columns = fnames
    dfc.columns.name = 'fname'
    gp = dfc.reset_index().melt(value_name='TPM',
                                id_vars=['gene_id']).groupby('fname')
    for fname, df in gp:
        odf = df.drop(columns='fname').set_index('gene_id')
        pyutil.shellexec(
            'mkdir -p `dirname {fname}`'.format(**locals()),
            silent=silent,
        )
        odf.to_csv(fname, sep=sep, header=header)
    return dfc.columns
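A hedged usage sketch: split a genes-by-samples TPM matrix into one tab-separated file per sample. The DataFrame below is hypothetical; assign__filename() is assumed to come from the same module and to map each column name to an output path under DIR.

import pandas as pd

rnaseq = pd.DataFrame({'S1': [1.0, 2.0], 'S2': [3.0, 4.0]},
                      index=pd.Index(['g1', 'g2'], name='gene_id'))
fnames = df__deposit(rnaseq, runID='042R', DIR='counts')  # one file per sample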
Example #2
def bed__merge(bedFile, silent=1, opt='-c 4 -o first'):
    bname = pyutil.os.path.basename(bedFile)
    path = pyutil.os.path.dirname(bedFile)
    ofname = pyutil.os.path.join(path, 'merged__%s' % bname)
    cmd = 'bedtools sort -i {bedFile} |  bedtools merge -i - {opt} > {ofname}'.format(
        **locals())
    pyutil.shellexec(cmd, silent=silent)
    return ofname
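A minimal usage sketch, assuming bedtools is on the PATH and 'peaks.bed' is a hypothetical BED file whose 4th column holds peak names (the name of the first interval in each merged cluster is kept via '-c 4 -o first'):

merged = bed__merge('peaks.bed', silent=0)
# -> writes 'merged__peaks.bed' next to the input and returns that path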
Example #3
def bed_randomise(infile, GSIZE=None, silent=1):
    '''Create a randomly distributed bed file
'''
    ofile = pyutil.basename(infile) + '_type=random.bed'
    assert GSIZE is not None
    LC = pyutil.lineCount(infile)
    cmd = "bedtools random -g {GSIZE} -l 2 -n {LC}  | tee {ofile}".format(
        **locals())
    pyutil.shellexec(cmd, silent=silent)
    return ofile
Example #4
def bed__makewindows(bedFile, windowSize=100, stepSize=None, silent=1):
    if stepSize is None:
        stepSize = windowSize // 2
    ofname = '{bedFile}.w{windowSize}s{stepSize}'.format(**locals())
    cmd = "bedtools makewindows -i srcwinnum -w {windowSize} -s {stepSize} -b {bedFile} > {ofname} ".format(
        **locals())
    pyutil.shellexec(cmd, silent=silent)
    return ofname
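A usage sketch of the default half-overlapping tiling: with windowSize=100 the step defaults to 100 // 2 = 50, so consecutive windows overlap by 50 bp. 'regions.bed' is hypothetical.

ofname = bed__makewindows('regions.bed', windowSize=100)
# -> 'regions.bed.w100s50'; because of '-i srcwinnum', each window is
#    named <source-interval-name>_<window-number>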
Example #5
def bed__summit(peakFile,
                GSIZE=None,
                silent=1,
                opt='-s -l -0.5 -r -0.5 -pct',
                inplace=True):
    if GSIZE is None:
        GSIZE = pyutil.os.environ.get('GSIZE', None)
    assert GSIZE is not None
    ofname = '%s.summit' % peakFile
    if not inplace:
        ofname = pyutil.os.path.basename(ofname)
    cmd = 'cat {peakFile} \
    | bedtools slop -g {GSIZE} {opt} -i - \
    > {ofname}'.format(**locals())
    pyutil.shellexec(cmd, silent=silent)
    return ofname
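The default opt string is what produces the summit: with '-pct', '-l -0.5 -r -0.5' trims half of each interval's own length from both ends, collapsing every peak to its midpoint. A hedged sketch with a hypothetical input:

summitFile = bed__summit('peaks.narrowPeak', GSIZE='chrom.sizes')
# -> 'peaks.narrowPeak.summit', one midpoint interval per input peak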
Example #6
def bed__totalLength(bedFile, silent=1):
    '''Source: https://www.biostars.org/p/68283/#68292
'''
    cmd = "cat %s | awk -F'\t' 'BEGIN{SUM=0}{ SUM+=$3-$2 }END{print SUM}'" % bedFile
    res = pyutil.shellexec(cmd, silent=silent)
    res = int(res.strip())
    return res
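A worked toy example (hypothetical file): intervals chr1:0-100 and chr1:150-250 sum to (100 - 0) + (250 - 150) = 200 bases. Records are summed as-is, so overlapping intervals are double-counted unless the file is merged first.

with open('toy.bed', 'w') as fh:
    fh.write('chr1\t0\t100\nchr1\t150\t250\n')
assert bed__totalLength('toy.bed') == 200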
Example #7
def wig2bigwig(fname, chromSizes='chrom.sizes', silent=1):
    ofbase = pyutil.getBname(fname)
    ofname = '%s.bw' % ofbase
    cmd = '''wigToBigWig {fname} {chromSizes} {ofname}
    '''.format(**locals())
    res = pyutil.shellexec(cmd, silent=silent)
    return ofname
Example #8
def fig__fluffProfile(interval,
                      tracks,
                      ofname=None,
                      annotation=None,
                      scaleOpt=None,
                      fragmentSize=0,
                      labels=None,
                      silent=0):
    trackFlat = u' '.join(tracks)
    if scaleOpt is None:
        scaleOpt = ' -s 1:%d ' % (len(tracks))
    if ofname is None:
        ofname = interval + '.svg'
    cmd = ''
    cmd += ' fluff profile '
    cmd += scaleOpt
    cmd += ' -f {fragmentSize} '
    if annotation is not None:
        cmd += ' -a {annotation} '
    if labels is not None:
        labelFlat = u' '.join(labels)
        cmd += ' -l {labelFlat}'
    cmd += ' -o {ofname} -i {interval} -d {trackFlat} '
    cmd += ' -b white '
    cmd += ' 2>&1 '
    cmd = cmd.format(**locals())
    res = pyutil.shellexec(cmd, silent=silent)
    return ofname
Example #9
def bam__getHeader(fname, grepKey='SQ', silent=1, head=100):
    cmd = u'samtools view -H %s' % fname
    if grepKey is not None:
        cmd = u'{cmd} | grep {grepKey}'.format(**locals())
    if head is not None:
        cmd = u'{cmd} | head -n{head}'.format(**locals())
    res = pyutil.shellexec(cmd, silent=silent)
    return res
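Since '@SQ' header lines carry 'SN:' (sequence name) and 'LN:' (length) tags, the returned text can be parsed into a chrom.sizes-style table. A hedged sketch ('sample.bam' is hypothetical; pyutil.shellexec is assumed to return the command's stdout, as the other examples rely on):

hdr = bam__getHeader('sample.bam', grepKey='SQ')
sizes = []
for line in hdr.splitlines():
    tags = dict(x.split(':', 1) for x in line.split('\t')[1:])
    sizes.append((tags['SN'], int(tags['LN'])))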
Example #10
def bed__guessWidth(bedFile, silent=1, head=100):
    res = pyutil.shellexec('head -n{head} {bedFile}'.format(**locals()),
                           silent=silent)
    dfc = extract_peak(StringIO.StringIO(res))
    span = (dfc.end - dfc.start).values.ravel()
    M = np.median(span)
    if span.std() / len(span)**0.5 > 0.1 * M:
        pyutil.sys.stderr.write('[WARN]:estimation may be unstable\n')
    return int(M)
Example #11
def file__concat(bedFiles,
                 silent=1,
                 ofname='concated_file',
                 ext=None
                 #                 opt='-c 4 -o first'
                 ):
    '''Concatenate a list of files
'''
    if ext is None:
        sp = bedFiles[0].rsplit('.', 1)
        if len(sp) == 2:
            ext = sp[-1]
        else:
            ext = None
    if ext is not None:
        ofname = '.'.join([ofname, ext])
    flatName = ' '.join(bedFiles)
    cmd = 'cat {flatName} >{ofname}'.format(**locals())
    pyutil.shellexec(cmd, silent=silent)
    return ofname
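Usage sketch: when ext is not given, the extension of the first input is reused for the output name. File names are hypothetical.

ofname = file__concat(['a.bed', 'b.bed', 'c.bed'], ofname='all_peaks')
# -> 'all_peaks.bed', the plain concatenation of the three inputs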
Example #12
def closestAnnotation(
    bedFile,
    RANGE=1000,
    ANNOTATION_FILE=None,
    GSIZE=None,
    silent=True,
):
    '''
    Use bedtools to find the feature closest to the
    regions contained in the given bed file.
    The annotation is expanded by {RANGE} bp before being queried.
    chrom.sizes must be supplied as {GSIZE} to make bedtools happy.
'''

    FOUT = 'type=closest_bed=%s_feat=%s.tsv' % (
        pyutil.basename(bedFile), pyutil.basename(ANNOTATION_FILE))
    cmd = '''
bedtools slop -b {RANGE} -i {ANNO} -g {GSIZE} | bedtools sort -i - > {ANNOBASE}.{RANGE}
bedtools sort -i {bedFile} |\
bedtools closest -d -a - -b {ANNOBASE}.{RANGE} | tee {FOUT}.tmp
'''.format(
        GSIZE=GSIZE,
        ANNO=ANNOTATION_FILE,
        ANNOBASE=ANNOTATION_FILE.split('/')[-1],
        bedFile=bedFile,
        RANGE=RANGE,
        FOUT=FOUT,
    ).strip()
    buf = StringIO.StringIO(pyutil.shellexec(cmd, silent=silent))
    if buf.len:
        buf.seek(0)
        header = sum([
            guessBedHeader(x, prefix=k)
            for k, x in [('', bedFile), ('feat', ANNOTATION_FILE)]
        ], [])
        header += [
            'distance',
        ]
        df = pyutil.readData(buf, header=None, ext='tsv', guess_index=False)
        df.columns = header
    else:
        assert 0, ' Buffer is empty, check error msg'
    df = df[df['distance'] == 0]
    df.to_csv(FOUT, sep='\t', index=0)
    return FOUT
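A hedged usage sketch: because the annotation is slopped by RANGE first and the result is filtered to distance == 0, only peaks within RANGE bp of a feature survive. All file names are hypothetical.

outFile = closestAnnotation('peaks.bed',
                            RANGE=1000,
                            ANNOTATION_FILE='genes.gtf.cds',
                            GSIZE='chrom.sizes')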
Example #13
def readModels(DIR):
    DIR = DIR.rstrip('/')
    fnames = pyutil.shellexec("find %s/*randomState*.npy | grep normF" %
                              DIR).splitlines()

    res = map(scount.countMatrix.from_npy, fnames)
    meta = pyutil.flat2meta(
        [x.replace('/', '_').rsplit('.', 1)[0] for x in fnames])
    meta = pd.DataFrame(
        map(lambda x: dict([y for y in x if len(y) == 2]), meta))
    meta['fname_'] = list(fnames)
    meta['obj'] = res

    meta_model = meta
    return meta_model
Example #14
def rawFile__combineChunk(dfc, silent=0):
    '''Combine chunked files in df_raw
    according to "fname" and "fnameCombined"
'''
    dfc = dfc.copy()

    fnameFlat = ' \\\n'.join(dfc.fname)
    ofnames = dfc.fnameCombined.unique()
    assert len(ofnames) == 1, \
        'contains multiple fnameCombined!: %s' % ofnames
    ofname = ofnames[0]
    cmd = 'cat {fnameFlat} > {ofname}'.format(**locals())
    res = pyutil.shellexec(cmd, silent=silent)
    dfc['fnameCombinedSize'] = pyutil.os.path.getsize(ofname)
    dfc = dfc[idKeys +
              ['fnameCombined', 'fnameCombinedSize']].drop_duplicates()
    return dfc
Example #15
def findPromoter(
    INFILE=None,
    upStream=1000,
    downStream=500,
    opt='-s -i -',
    filterKey='CDS',
    OFILE=None,
    inplace=0,
    GSIZE=None,
    silent=1,
):
    '''Find the promoter from a GTF file
'''
    if GSIZE is None:
        TRY = os.environ.get('GSIZE', None)
        assert TRY is not None, 'Please specify chromosizes'
        GSIZE = TRY
    assert os.path.exists(GSIZE), 'File does not exist:"%s"' % GSIZE

    if OFILE is None:
        OFILE = os.path.basename(INFILE) + '.promoter'
    if inplace:
        OFILE = os.path.join(os.path.dirname(INFILE), OFILE)

    cmd = 'cat %s' % INFILE
    if filterKey is not None:
        cmd += '| grep {} \\\n'.format(filterKey)
    cmd += r'''
    | bedtools slop -s -l 0 -r -1.0 -pct {opt} \
    | bedtools slop -s -l {upStream} -r {downStream} {opt} \
    | sed "s/\"//g"  \
    >{OFILE}
    '''.format(
        OFILE=OFILE,
        upStream=upStream,
        downStream=downStream,
        opt='%s -g %s' % (opt, GSIZE),
    ).strip()
    res = pyutil.shellexec(cmd, silent=silent)
    return OFILE
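A hedged sketch of the two-step slop: the first call ('-l 0 -r -1.0 -pct') collapses each CDS record to its strand-aware 5' end, and the second extends that point by upStream/downStream. File names are hypothetical; GSIZE falls back to the $GSIZE environment variable when omitted.

promoterFile = findPromoter(INFILE='genes.gtf',
                            upStream=1000,
                            downStream=500,
                            GSIZE='chrom.sizes')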
Example #16
def guessBedHeader(fname,
                   silent=True,
                   ext='tsv',
                   guess_index=0,
                   prefix='',
                   **kwargs):
    cmd = 'head -n5 %s' % fname
    buf = StringIO.StringIO(pyutil.shellexec(cmd, silent=silent))
    df = pyutil.readData(buf,
                         ext=ext,
                         header=None,
                         guess_index=guess_index,
                         **kwargs)
    if len(df.columns) > len(bedHeader):
        header = bedHeader + list(df.columns)[len(bedHeader):]
    else:
        header = bedHeader[:len(df.columns)]
    if prefix:
        header = ['%s_%s' % (prefix, x) for x in header]
    return map(str, header)
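Usage sketch: for a hypothetical narrowPeak-like file, the first columns get the module-level bedHeader names, extra columns keep their positional names, and a prefix disambiguates when two headers are concatenated (as in closestAnnotation above).

header = guessBedHeader('peaks.narrowPeak', prefix='feat')
# e.g. ['feat_chrom', 'feat_start', 'feat_end', ...]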
Example #17
def job__chipTargPaired(
    bwCurr=None,
    bwMeta=None,
    control=None,
    treatment=None,
    xlab=None,
    ylab=None,
    name=None,
    NCORE=2,
    params__peakBW=None,
    CUTOFF_FC=3.0,
    CUTOFF_CHIPDIFF=0.7,
    innerRadius=100,
):
    figs = pyutil.collections.OrderedDict()

    if control is not None and treatment is not None:
        xlab, ylab = control, treatment
    if bwCurr is None:
        assert xlab is not None and ylab is not None, \
            'need (xlab, ylab) or (control, treatment) to index bwMeta'
        bwCurr = bwMeta.reindex([xlab, ylab])
    elif xlab is None or ylab is None:
        xlab, ylab = bwCurr.index

    if params__peakBW is None:

        params__peakBW = dict(
            outerRadius=500,
            innerRadius=innerRadius,
            NCORE=NCORE,
            outIndex=bwCurr.header,
        )
    params__peakBW['innerRadius'] = innerRadius

    if name is None:
        name = '{xlab}-{ylab}'.format(**locals())

    dfs = map(
        sdio.extract_peak,
        bwCurr.npkFile,
    )

    fig, ax = plt.subplots(1, 1, figsize=[7, 7])
    for df in dfs:
        df['per_FC'] = pyutil.dist2ppf(df.FC)
        df.plot.scatter('per_FC', 'FC', ax=ax)

    fnames = [
        pyutil.queryCopy(infile=fname,
                         query='FC>%.3f' % CUTOFF_FC,
                         reader=sdio.extract_peak,
                         inplace=False) for fname in bwCurr.npkFile
    ]

    peakFlat = ' '.join(fnames)
    ofname = '%s-combined.bed' % ('-'.join(bwCurr.index))
    pyutil.shellexec('cat {peakFlat}>{ofname}'.format(**locals()))
    ofname = sdio.npk_expandSummit(fname=ofname, radius=1)

    pyutil.lineCount(ofname)
    peakFileOrig = peakFile = ofname

    res = sjob.figs__peakBW(peakFile=peakFile,
                            bwFiles=bwCurr.RPKMFile,
                            name=name,
                            **params__peakBW)
    figs.update(res[0])

    bwTrack, bwAvg = res[1]
    bwAvg.columns = bwAvg.columns.map(
        pyutil.df2mapper(bwCurr, 'header', 'index').get)

    xs, ys = bwAvg[[xlab, ylab]].values.T
    clu = pd.DataFrame(pyutil.df__pad(bwAvg))
    query = ' val_{ylab} - val_{xlab} > {CUTOFF_CHIPDIFF} '.format(**locals())
    qsans = pyutil.sanitise_query(query)
    peakIndex = clu.query(query).index
    clu['clu'] = clu.eval('index in @peakIndex')

    stats = sdio.extract_peak(peakFile).set_index('acc', drop=0)
    stats['CHIPDIFF'] = clu.eval(query.split('>')[0])

    pyvis.qc_2var(xs, ys, clu=clu.clu, xlab=xlab, ylab=ylab)
    figs['scatterPlot__%s' % name] = plt.gcf()
    cluFile = ofname = qsans + '.csv'
    clu.to_csv(ofname)
    print(ofname, pyutil.lineCount(ofname))
    peakBase = pyutil.getBname(peakFile)
    ofname = '{peakBase}-{qsans}.bed'.format(**locals())
    peakFile = pyutil.to_tsv(stats.reindex(peakIndex), ofname)
    pyutil.shellexec('mkdir -p output/')
    pyutil.file__link(ofname, 'output/%s.bed' % name, force=True)


    pyutil.fileDict__save(d=locals(),
                          keys=['cluFile', 'peakFile', 'peakFileOrig'],
                          fname='FILE.json')
    return figs, clu
Example #18
def main(
        #### necessary
        bedFile=None,
        bwFiles=None,
        ####
        DIR=None,
        figsize=[14, 14],
        debug=0,
        ylim=[0, 10],
        radius=2000,
        stepSize=10,
        NCORE=4,
        silent=0,
        gtfFile=None,
        cdsFile=None,
        annotation=None,
        GSIZE=None,
        center_summit=0,
        trackNames=None,
        backend='fluff',
        ext='png',
        **kwargs):
    figsize = map(int, figsize)
    # for peakAcc in df_near.acc.unique()[:1]:

    prefix = 'PROG=chipShots_bedFile='
    bname = pyutil.os.path.basename(bedFile)
    odname = prefix + bname
    if DIR == 'inplace':
        DIR = pyutil.os.path.join(pyutil.os.path.dirname(bedFile), odname)
    elif DIR is None:
        DIR = odname
    pyutil.shellexec('mkdir -p %s' % DIR, silent=silent)
    DIR = pyutil.os.path.abspath(DIR)

    if cdsFile is None:
        cdsFile = gtfFile + '.cds'
    if backend == 'synotil':
        # nearFile = './DE2017/type=closest_bed=lux22_radius=1_feat=genes.gtf.cds.tsv'
        # import synotil.filterByCDS
        nearFile = synotil.filterByCDS.main(
            peakFile=bedFile,
            cdsFile=cdsFile,
            downStream=radius,
            upStream=radius,
            peakRadius=1,
            GSIZE=GSIZE,
            center_summit=center_summit,
        )
        df_near = pyutil.readData(nearFile, )

        stderrLine('[MSG]Loading bed intervals from bigwig tracks....')

        chipTracks = sutil.extract_bigwig_multiple(
            fnames=bwFiles,
            bedFile=bedFile,
            radius=radius,
            stepSize=stepSize,
            callback=None,
            outIndex=trackNames,

            center_summit=center_summit,
            shift=0,  #### use positive coordinate
            stranded=False,
            NCORE=NCORE)
        if ylim is None:
            ylim = pyutil.span(
                pyutil.np.hstack([x.values.flat for x in chipTracks]), 99)
            ylim = list(ylim)
            ylim[0] = 0.
        callback = lambda x: [prepare_chipTrack(ele, vlim=ylim) for ele in x]
        chipTracks = callback(chipTracks)

        if debug:
            stderrLine(chipTracks[0].columns)

        gtf = pyutil.readData(gtfFile, ext='tsv', header=None, guess_index=0)
        gtf = scount.countMatrix(gtf, look='gtf')
        gtfs = [gtf]

        bedDF = sutil.extract_peak(bedFile)

        worker = pyutil.functools.partial(
            worker__drawPeak,
            DIR=DIR,
            chipTracks=chipTracks,
            df_near=df_near,
            gtfs=gtfs,
            radius=radius,
            figsize=figsize,
            ylim=ylim,
            debug=debug,
        )

        ofnames = pyutil.mp_map(
            worker,
            bedDF.acc,
            n_cpu=NCORE,
        )
    elif backend == 'fluff':
        bedDF = sdio.extract_peak(bedFile)

        argDF = bedDF.copy()
        argDF = sdio.bed__addCol__interval(argDF)
        tracks = list(bwFiles)
        argDF['tracks'] = [tracks] * len(bedDF)
        argDF['annotation'] = annotation
        argDF['DIR'] = DIR
        argDF['ext'] = ext
        if trackNames is not None:
            argDF['labels'] = [list(trackNames)] * len(bedDF)

        ofnames = pyutil.mp_map(
            worker__fluff,
            (vars(x) for x in argDF.itertuples()),
            n_cpu=NCORE,
        )

    bedDF['img'] = ofnames
    indexFile = '%s/%s.index.tsv' % (DIR, bname)
    pyutil.to_tsv(bedDF, indexFile)
    indexFile = '%s/figureIndex.tsv' % (DIR)
    pyutil.to_tsv(bedDF, indexFile)

    try:
        import synotil.shot2html as shot2html
        htmlFile = shot2html.shot2html(indexFile, localPath=True)
    except Exception as e:
        stderrLine('[WARN]:cannot produce html :%s' % e)
        htmlFile = None


    print(indexFile)
    print(htmlFile)
    return (indexFile, htmlFile)
Example #19
# execfile('/home/feng/headers/header__import.py')
import pymisca.util as pyutil
dfc = pyutil.readData(pyutil.base__file('TOUCHED.list'),ext='tsv',header=None)

ind = dfc.query('~index.duplicated()').sort_index()
ind.to_csv(pyutil.base__file('file.index',force=1))
pyutil.shellexec('''
cd $BASE
cat file.index | grep ^RNA | xargs tar -cvzf RNA-seq.tar.gz
''')
pyutil.shellexec('''
cd $BASE
echo > tracking.index
{
cat file.index | grep -v ^RNA-seq 
echo *.tar.gz 
echo *.index 
echo *.txt 
echo *.list 
echo "Snakefile README" 
} >> tracking.index
''')

pyutil.shellexec('''
cd $BASE
echo
mkdir -p dist;
cat tracking.index | xargs cp -avuf --parents -t dist
''')
Example #20
def file__header(fname, head=10, silent=1):
    res = pyutil.shellexec('head -n{head} {fname}'.format(**locals()),
                           silent=silent)
    res = pyutil.StringIO.StringIO(res)
    return res
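Usage sketch: peek at the first rows of a large table without reading the whole file, then hand the buffer to a parser. The file name is hypothetical.

import pandas as pd

buf = file__header('big_table.tsv', head=5)
df = pd.read_csv(buf, sep='\t')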
Example #21
import json

import pymisca.util as pyutil
import wxpy


def make__dumpMsg(f):
    '''Build a callback that appends each message as one JSON line to f
'''

    def dumpMsg(msg):
        try:
            dct = msg.raw.copy()
            #### record time to precision
            json.dump(dct, f)
            f.write('\n')
        except Exception as e:
            print('[ERROR]', e)
        print(msg)

    return dumpMsg


if __name__ == '__main__':
    bot = wxpy.Bot()
    logDir = '%s-%s' % (
        bot.self.uin,
        bot.self.name,
    )
    pyutil.shellexec(u'mkdir -p {logDir}'.format(**locals()))
    logFile = '%s/messages.json' % logDir
    f = open(logFile, 'a', 0)

    callback = make__dumpMsg(f=f)
    chats = bot.friends(update=True) + bot.groups(update=True)
    bot.register(except_self=False, chats=chats)(callback)
    if not pyutil.hasIPD:
        wxpy.embed()
Example #22
def process(k=None, npkFile=None, gPar=None, dbg=0, ANNOTATION_FILE=None):
    if k is None:
        assert npkFile, 'must specify one arg'
        k = npkFile.rsplit('.', 1)[0].split('/')[-1]

    outd = {'files': {}}

    # e.g. 'fc=1.5 q=0.001 p=0.01'
    parameter_string = make_param_string(gPar)
    small_narrowPeak = '{key}.snpk'.format(key=k)

    ### PeakFiltering
    cmd = 'python {SCRIPT} {INFILE} {PARAM} > {OUTF}'.format(
        SCRIPT=PEAK_SELECT_SCRIPT,
        INFILE=npkFile,
        PARAM=parameter_string,
        OUTF=small_narrowPeak,
    )
    print(cmd)
    os.system(cmd)
    outd['param'] = parameter_string

    #### Fancy Histogram
    fc_thresholds = [x * 0.1 for x in range(10, 10 * MAX_FOLD_CHANGE, 2)]
    npeak_lst = calc_npeak(fc_thresholds, k + '_peaks.narrowPeak',
                           gPar['PVALUE'], gPar['QVALUE'])
    plotName = SUMMARY_DIR.strip('/') + '/' + 'npeaks_vs_fc_' + k + '.txt'
    #     plot_npeak_vs_fc(fc_thresholds, npeak_lst, plotName)

    #### Produce geneLists

    file_bedmap = '%s.bedmap.tsv' % k


    cmd = '''
bedtools slop -b {RANGE} -i {ANNO} -g $GSIZE | bedtools sort -i - > {ANNOBASE}.{RANGE}
bedtools closest -d -a {SNPK} -b {ANNOBASE}.{RANGE} | tee {FOUT}.tmp
'''.format(ANNO=ANNOTATION_FILE,
           ANNOBASE=ANNOTATION_FILE.split('/')[-1],
           SNPK=small_narrowPeak,
           RANGE=gPar['TARGET_RANGE'],
           FOUT=file_bedmap).strip()

    buf = StringIO.StringIO(pyutil.shellexec(cmd))
    if buf.len:
        buf.seek(0)
        df = sutil.parseBedClosest(fname=buf)
    else:
        assert 0, ' Buffer is empty, check error msg'
    df['condition'] = k
    df = df[df['distance'] == 0]

    df = df.sort_values('FC', ascending=False, inplace=False)

    #### deduplication on gene acc
    df = df.loc[~df.duplicated('hit')]
    res_bedmap = df
    df.to_csv(file_bedmap, sep='\t')

    genes = df

    outd['genes'] = None
    outd['nGene'] = len(df['hit'].unique())
    outd['file_bedmap'] = file_bedmap

    fname = '%s/%s.gene.txt' % (SUMMARY_DIR, k)
    dfc = df.copy()[[
        'hit',
        'FC',
        'acc',
    ]]
    dfc.columns = ['geneAcc', 'maxFoldChange', 'peakAcc']
    dfc.to_csv(fname, sep='\t')

    outd['glst_filename'] = fname
    #     outd['goenrich_filename'] =  make_goenrichment_file(SUMMARY_DIR + '/' + k, genes)
    outd['goenrich_filename'] = 'NotImplemented'
    outd['plot_file'] = plotName
    outd['peak_file'] = small_narrowPeak

    outd['key'] = k
    outd['extra'] = ''
    return outd
Example #23
    def check_DIR(self, DIR=None):
        DIR = self.DIR if DIR is None else DIR
        pyutil.shellexec('mkdir -p %s' % DIR)
        return DIR
Example #24
def summitDist(peak1,
               peak2,
               CUTOFF=400,
               silent=1,
               GSIZE=None,
               as_fname=0,
               **kwargs):
    '''Find nearby summits within a distance cutoff
'''
    if GSIZE is None:
        GSIZE = pyutil.os.environ.get('GSIZE', None)
    assert GSIZE is not None
    RANGE = CUTOFF // 2 - 1
    infiles = [peak1, peak2]
    incols = map(pyutil.file_ncol, infiles)

    ### padding/inflate the summit to have radius
    lst = []
    for infile in infiles:

        ofile = "{infile}.{RANGE}".format(**locals()).split('/')[-1]
        lst += [ofile]

        cmd = "bedtools slop -g {GSIZE} -b {RANGE} -i {infile} \
          | tee {ofile}".format(**locals())
        _ = pyutil.shellexec(cmd, silent=silent)

    slop1, slop2 = lst
    FOUT = 'infiles:'+ ":".join(map(pyutil.basename,infiles)) \
        + "__cutoff:{}.tsv".format(CUTOFF)


    cmd = "bedtools intersect -wo -a {slop1} -b {slop2} \
      | tee {FOUT}".format(**locals())

    buf = pyutil.shellexec(cmd, silent=silent)

    ### [TBC]Memory-intensive, Replace with awk mutation in the future
    columns = header_closest(peak1, peak2)

    df = pyutil.readData(StringIO.StringIO(buf),
                         header=None,
                         ext='tsv',
                         guess_index=False,
                         columns=columns)
    df.distance = CUTOFF - df.distance
    df.to_csv(FOUT, sep='\t', index=False)
    if as_fname:
        return FOUT
    else:
        return df
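A hedged usage sketch: both summit files are padded to radius CUTOFF // 2 - 1 and intersected with '-wo', so only summit pairs closer than CUTOFF survive; the last column is then rewritten as CUTOFF minus the reported overlap. File names are hypothetical.

pairs = summitDist('rep1.summit', 'rep2.summit',
                   CUTOFF=400, GSIZE='chrom.sizes')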