Example #1
    def test_sorted_stream(self):
        s = [(10, 0.8), (15, 2.8), (12, 19.5), (12, 1.4), (13, 0.1)]

        stream = fstream(s, fields=['start', 'score'])
        res = list(sorted_stream(stream, fields=['start']))
        expected = [(10, 0.8), (12, 19.5), (12, 1.4), (13, 0.1), (15, 2.8)]
        self.assertListEqual(res, expected)

        stream = fstream(s, fields=['start', 'score'])
        res = list(sorted_stream(stream, fields=['start', 'score']))
        expected = [(10, 0.8), (12, 1.4), (12, 19.5), (13, 0.1), (15, 2.8)]
        self.assertListEqual(res, expected)

        s = [('chrX', 0, 1, 0.8), ('chrIX', 3, 5, 2.8), ('chrIX', 3, 9, 1.4),
             ('chrIX', 2, 10, 0.1), ('chrIX', 7, 10, 0.8)]
        stream = fstream(s, fields=['chr', 'start', 'end', 'score'])
        res = list(sorted_stream(stream, fields=['start', 'chr']))
        expected = [('chrX', 0, 1, 0.8), ('chrIX', 2, 10, 0.1),
                    ('chrIX', 3, 5, 2.8), ('chrIX', 3, 9, 1.4),
                    ('chrIX', 7, 10, 0.8)]
        self.assertListEqual(res, expected)

        stream = fstream(s, fields=['chr', 'start', 'end', 'score'])
        res = list(
            sorted_stream(stream,
                          fields=['chr', 'start', 'score'],
                          chrnames=self.a.chrnames))
        expected = [('chrIX', 2, 10, 0.1), ('chrIX', 3, 9, 1.4),
                    ('chrIX', 3, 5, 2.8), ('chrIX', 7, 10, 0.8),
                    ('chrX', 0, 1, 0.8)]
        self.assertListEqual(res, expected)
Example #2
def fimo(motifs,fasta,qval=True):
    # Run Fimo
    if qval:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.01 --qv-thresh"
    else:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.000001"
    cmd = "fimo " + options + " %s %s" % (motifs, fasta)
    print "Running >>",cmd
    os.system(cmd)
    os.system("sort -k2,2n -k3,3n -k4,4n fimo_out/fimo.txt > fimo.txt")

    # Bed output
    t = track('fimo.txt', fields=["name","chr","start","end","strand","score","p-value","q-value","sequence"])
    t.fields = ["name","chr","start","end","strand","a","score","q","sequence"]
    s = t.read()
    s = select(s,['chr','start','end','name','score','strand'])
    s = apply(s,'chr',lambda x:x.split('|')[1])
    s = sorted_stream(s)
    s = cobble(s)
    s = apply(s,'name',lambda x:'|'.join(list(set(x.split('|')))))
    outname = 'fimo.bed'
    bed = track(outname,fields=s.fields)
    bed.make_header(name="TSS_motifs", description="Motifs +-XKb around TSS", mode='overwrite')
    bed.write(s)
    if os.path.exists("fimo_out"): shutil.rmtree("fimo_out")
Example #3
def fimo(motifs, fasta, qval=True):
    # Run Fimo
    if qval:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.01 --qv-thresh"
    else:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.000001"
    cmd = "fimo " + options + " %s %s" % (motifs, fasta)
    print "Running >>", cmd
    os.system(cmd)
    os.system("sort -k2,2n -k3,3n -k4,4n fimo_out/fimo.txt > fimo.txt")

    # Bed output
    t = track('fimo.txt',
              fields=[
                  "name", "chr", "start", "end", "strand", "score", "p-value",
                  "q-value", "sequence"
              ])
    t.fields = [
        "name", "chr", "start", "end", "strand", "a", "score", "q", "sequence"
    ]
    s = t.read()
    s = select(s, ['chr', 'start', 'end', 'name', 'score', 'strand'])
    s = apply(s, 'chr', lambda x: x.split('|')[1])
    s = sorted_stream(s)
    s = cobble(s)
    s = apply(s, 'name', lambda x: '|'.join(list(set(x.split('|')))))
    outname = 'fimo.bed'
    bed = track(outname, fields=s.fields)
    bed.make_header(name="TSS_motifs",
                    description="Motifs +-XKb around TSS",
                    mode='overwrite')
    bed.write(s)
    if os.path.exists("fimo_out"): shutil.rmtree("fimo_out")
Example #4
def plot_footprint_profile(ex, bedlist, signals, chrnames, groups, logfile):
    files = dict((gid, {'pdf': "", 'mat': []}) for gid in bedlist.keys())
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    for gid, motifbed in bedlist.iteritems():
        #        signals = [track(sig) for sig in siglist[gid]]
        snames = [sig.name for sig in signals[gid]]
        tmotif = track(motifbed, format='bed')
        data = {}
        numregs = {}
        for chrom in chrnames:
            fread = {}
            for r in tmotif.read(chrom):
                r2 = r[3].split(":")
                key = (r2[0], len(r2[1]))
                if key in fread: fread[key].append(r[1:3])
                else: fread[key] = [r[1:3]]
            for motif, regs in fread.iteritems():
                if motif not in data:
                    data[motif] = zeros(shape=(motif[1] + 2 * _plot_flank[1],
                                               len(signals[gid])))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                tFeat = sorted_stream(
                    segment_features(FeatureStream(regs,
                                                   fields=['start', 'end']),
                                     nbins=motif[1],
                                     upstream=_plot_flank,
                                     downstream=_plot_flank))
                for t in score_by_feature(
                    [s.read(chrom) for s in signals[gid]], tFeat):
                    data[motif][t[2]] += t[3:]
        files[gid]['pdf'] = unique_filename_in()
        new = True
        last = len(data)
        for motif, dat in data.iteritems():
            last -= 1
            mname, nbins = motif
            dat /= float(numregs[motif])
            X = range(-_plot_flank[1], _plot_flank[1] + nbins)
            for k in range(nbins):
                X[k + _plot_flank[1]] = str(k + 1)
            ####### Could do a heatmap (sort by intensity)...
            lineplot(X, [dat[:, n] for n in range(dat.shape[-1])],
                     mfrow=[4, 2],
                     output=files[gid]['pdf'],
                     new=new,
                     last=(last == 0),
                     legend=snames,
                     main=mname)
            new = False
            _datf = unique_filename_in()
            with open(_datf, "w") as dff:
                dff.write("\t".join([""] + [str(x) for x in X]) + "\n")
                for n, sn in enumerate(snames):
                    dff.write("\t".join([sn] + [str(x)
                                                for x in dat[:, n]]) + "\n")
            files[gid]['mat'].append((mname, _datf))
    return files
Example #5
def sort(*args, **kw):
    if len(args) < 1: raise Usage("No input file provided")
    chrmeta = _get_chrmeta(**kw)
    for infile in args:
        intrack = track.track(infile, format=kw['format'], chrmeta=chrmeta)
        outname = kw['output'] or intrack.name + '_sorted.' + intrack.format
        outtrack = track.track(outname, chrmeta=intrack.chrmeta)
        instream = intrack.read()
        s = sorted_stream(instream, chrnames=json.loads(kw['chromosomes']))
        outtrack.write(s)
        intrack.close()
    return 0
Example #6
def sort(*args,**kw):
    if len(args) < 1: raise Usage("No input file provided")
    chrmeta = _get_chrmeta(**kw)
    for infile in args:
        intrack = track.track(infile,format=kw['format'],chrmeta=chrmeta)
        outname = kw['output'] or intrack.name+'_sorted.'+intrack.format
        outtrack = track.track(outname, chrmeta=intrack.chrmeta)
        instream = intrack.read()
        s = sorted_stream(instream, chrnames=json.loads(kw['chromosomes']))
        outtrack.write(s)
        intrack.close()
    return 0
Example #7
    def test_sorted_stream(self):
        s = [(10,0.8),(15,2.8),(12,19.5),(12,1.4),(13,0.1)]

        stream = fstream(s, fields=['start','score'])
        res = list(sorted_stream(stream,fields=['start']))
        expected = [(10,0.8),(12,19.5),(12,1.4),(13,0.1),(15,2.8)]
        self.assertListEqual(res,expected)

        stream = fstream(s, fields=['start','score'])
        res = list(sorted_stream(stream,fields=['start','score']))
        expected = [(10,0.8),(12,1.4),(12,19.5),(13,0.1),(15,2.8)]
        self.assertListEqual(res,expected)

        s = [('chrX',0,1,0.8),('chrIX',3,5,2.8),('chrIX',3,9,1.4),('chrIX',2,10,0.1),('chrIX',7,10,0.8)]
        stream = fstream(s, fields=['chr','start','end','score'])
        res = list(sorted_stream(stream, fields=['start','chr']))
        expected = [('chrX',0,1,0.8),('chrIX',2,10,0.1),('chrIX',3,5,2.8),('chrIX',3,9,1.4),('chrIX',7,10,0.8)]
        self.assertListEqual(res,expected)

        stream = fstream(s, fields=['chr','start','end','score'])
        res = list(sorted_stream(stream, fields=['chr','start','score'], chrnames=self.a.chrnames))
        expected = [('chrIX',2,10,0.1),('chrIX',3,9,1.4),('chrIX',3,5,2.8),('chrIX',7,10,0.8),('chrX',0,1,0.8)]
        self.assertListEqual(res,expected)
Example #8
def plot_footprint_profile( ex, bedlist, signals, chrnames, groups, logfile ):
    files = dict((gid,{'pdf':"",'mat':[]}) for gid in bedlist.keys())
    logfile.write("Plotting footprints:\n");logfile.flush()
    for gid, motifbed in bedlist.iteritems():
#        signals = [track(sig) for sig in siglist[gid]]
        snames = [sig.name for sig in signals[gid]]
        tmotif = track(motifbed,format='bed')
        data = {}
        numregs = {}
        for chrom in chrnames:
            fread = {}
            for r in tmotif.read(chrom):
                r2 = r[3].split(":")
                key = (r2[0],len(r2[1]))
                if key in fread: fread[key].append(r[1:3])
                else: fread[key] = [r[1:3]]
            for motif, regs in fread.iteritems():
                if motif not in data:
                    data[motif] = zeros(shape=(motif[1]+2*_plot_flank[1], len(signals[gid])))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                tFeat = sorted_stream(segment_features(FeatureStream(regs,fields=['start','end']),
                                                       nbins=motif[1],upstream=_plot_flank,downstream=_plot_flank))
                for t in score_by_feature([s.read(chrom) for s in signals[gid]], tFeat): 
                    data[motif][t[2]] += t[3:]
        files[gid]['pdf'] = unique_filename_in()
        new = True
        last = len(data)
        for motif, dat in data.iteritems():
            last -= 1
            mname, nbins = motif
            dat /= float(numregs[motif])
            X = range(-_plot_flank[1],_plot_flank[1]+nbins)
            for k in range(nbins): X[k+_plot_flank[1]] = str(k+1)
####### Could do a heatmap (sort by intensity)...
            lineplot(X, [dat[:, n] for n in range(dat.shape[-1])], mfrow=[4,2],
                     output=files[gid]['pdf'], new=new, last=(last==0), 
                     legend=snames, main=mname)
            new = False
            _datf = unique_filename_in()
            with open(_datf,"w") as dff:
                dff.write("\t".join([""]+[str(x) for x in X])+"\n")
                for n,sn in enumerate(snames):
                    dff.write("\t".join([sn]+[str(x) for x in dat[:, n]])+"\n")
            files[gid]['mat'].append((mname,_datf))
    return files
Example #9
def summed_feature_matrix(trackScores,trackFeatures,method='mean',**kw):
    """
    Each feature in *trackFeatures* is segmented into bins using bbcflib.gfminer.stream.segment_features
    (with parameters passed from *\*\*kw*).
    This creates a matrix with a column for each track in *trackScores* and a row for each bin in the segmented features.
    The value of a matrix entry is the score from one track in *trackScores* in one bin, summed over all features.


    Example::

                      gene1                 gene2
        X: -----#####|#####|#####--------###|###|###-----  (features, nbins=3)
        Y: _____________666|66666________666|666|666_____
        Z: _____22222|22222|22222________________________

             Y   Z
        R: [[3.  1.],   # bin 0
            [4.  1.],   # bin 1
            [6.  1.]]   # bin 2

    Note: the whole segmented features track will be loaded in memory.

    :param trackScores: (FeatureStream, or list of FeatureStream objects) score track(s).
    :param trackFeatures: (FeatureStream) feature track.
    :param method: (str) Operation applied to the list of scores for one feature.
        It is the `method` argument to `stream.score_by_feature` - one of 'sum','mean','median','min','max'.
    :param **kw: arguments to pass to segment_features (`nbins`,`upstream`,`downstream`).
    :rtype: numpy.ndarray, int (number of features)
    """
    nfields = len(trackFeatures.fields)
    trackFeatures = sorted_stream(segment_features(trackFeatures,**kw))
    all_means = score_by_feature(trackScores,trackFeatures,method=method)
    if isinstance(trackScores,(list,tuple)):
        nscores = len(trackScores)
    else:
        nscores = 1
    nbins = kw.get('nbins',segment_features.func_defaults[0]) \
            + kw.get('upstream',(0,0))[1] \
            + kw.get('downstream',(0,0))[1]
    averages = numpy.zeros(shape=(nbins,nscores))
    ntot = -1
    for ntot,x in enumerate(all_means):
        averages[x[nfields]] += x[(nfields+1):]
    return averages, (ntot+1)/nbins
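A minimal usage sketch of summed_feature_matrix (not taken from the sources above), using toy in-memory streams: FeatureStream(data, fields=...) is used as in Example #4, while the two import paths below are assumptions about the bbcflib package layout and may need adjusting.

from bbcflib.gfminer import FeatureStream                   # assumed import path
from bbcflib.gfminer.numeric import summed_feature_matrix   # assumed import path

# Two named features and two score tracks on the same chromosome.
features = FeatureStream([('chr1', 10, 40, 'gene1'), ('chr1', 60, 90, 'gene2')],
                         fields=['chr', 'start', 'end', 'name'])
scores_Y = FeatureStream([('chr1', 25, 90, 6.0)],
                         fields=['chr', 'start', 'end', 'score'])
scores_Z = FeatureStream([('chr1', 10, 40, 2.0)],
                         fields=['chr', 'start', 'end', 'score'])

# 'nbins' is forwarded to segment_features through **kw; the result is a
# (nbins x number-of-score-tracks) array plus the number of features seen.
averages, nfeat = summed_feature_matrix([scores_Y, scores_Z], features,
                                        method='mean', nbins=3)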
Example #10
def coverageInRepeats(ex, infile, genomeName='mm9', repeatsPath=GlobalRepbasePath,
                      outdir=None, via='lsf'):
    """
    Completes the segment-info BED file with the repeat coverage of each segment.
    Currently works only for mm9, hg19 and dm3.
    """
    if not(isinstance(infile,dict)):
        infile = {"":infile}
    if outdir is None:
        resfile = unique_filename_in()+".bed"
        outf = open(resfile,'w')
    repeatsFile = os.path.join(repeatsPath, genomeName, genomeName+'_rmsk.bed')
    if not(os.path.exists(repeatsFile)):
        print("coverage in repeats not calculated as file "+repeatsFile+" does not exist.")
        if outdir is None:
            outf.close()
            cat([inf[0] for inf in infile.values()],out=resfile)
        else:
            for chrom,inf in infile.iteritems():
                shutil.copy(inf[0], os.path.join(outdir,chrom+".bed"))
            resfile = outdir
        return resfile
    futures = {}
    for chrom,inf in infile.iteritems():
        tmpfile = unique_filename_in()
        futures[chrom] = (tmpfile,coverageBed.nonblocking(ex,repeatsFile,inf[0],via=via,stdout=tmpfile))
    for chrom,fut in futures.iteritems():
        if not(outdir is None):
            resfile = os.path.join(outdir,chrom+".bed")
            outf = open(resfile,'w')
        fut[1].wait()
        coverout = track(fut[0],format='text',fields=['chr','start','end','name','c1','c2','c3','c4'])
        for s in sorted_stream(coverout.read(),[chrom]):
            s_split = s[3].split('|')
            infos = '|'.join(s_split[0:(len(s_split)-4)]+list(s[4:8]))
            outf.write('\t'.join([str(x) for x in s[0:3]+(infos,)])+'\n')
        if not(outdir is None):
            outf.close()
    if outdir is None: outf.close()
    else: resfile = outdir
    return resfile
Example #11
def main(argv = None):
    try:
        usage = "camelPeaks.py [OPTIONS]"
        desc = """A ChIP-seq peak deconvolution algorithm."""
        parser = optparse.OptionParser(usage=usage, description=desc)
        for opt in opts:
            parser.add_option(opt[0],opt[1],help=opt[2],**opt[3])
        (opt, args) = parser.parse_args()
        if not(opt.peaks and os.path.exists(opt.peaks)):
            parser.print_help()
            raise Usage("Specify a valid peaks file with -p.")
        if not(opt.forward and os.path.exists(opt.forward)):
            parser.print_help()
            raise Usage("Specify a valid forward strand density file with -f.")
        if not(opt.reverse and os.path.exists(opt.reverse)):
            parser.print_help()
            raise Usage("Specify a valid reverse strand density file with -r.")
####
        if opt.chromosome and opt.length: chrmeta = {opt.chromosome: {'length': opt.length}}
        else: chrmeta = opt.genome
        peak_track = track(opt.peaks,chrmeta=chrmeta)
        chrmeta = peak_track.chrmeta
        if opt.chromosome: chrmeta = {opt.chromosome: chrmeta[opt.chromosome]}
        track_info = {'datatype': peak_track.info.get('datatype','qualitative')}
        outbed = track(opt.output+"_peaks.bed", chrmeta=chrmeta,
                             fields=["chr","start","end","name","score"])
        outwig = track(opt.output+"_deconv.bedgraph", chrmeta=chrmeta)
        outwig.open(mode='overwrite')
        topts = {'chrmeta': chrmeta, 'readonly': True}
        for chrom,cv in chrmeta.iteritems():
            peak_stream = sorted_stream(peak_track.read(selection=chrom),[chrom])
            strands = {track(opt.forward,**topts).read(chrom,fields=['start','end','score']): 'plus',
                       track(opt.reverse,**topts).read(chrom,fields=['start','end','score']): 'minus'}
            robjects.r('options(stringsAsFactors=F)')
            robjects.r('counts=data.frame()')
            for row_count,peak in enumerate(peak_stream):
                start = int(peak[peak_stream.fields.index('start')])
                end = int(peak[peak_stream.fields.index('end')])
                if end-start > opt.sizecutoff: continue
                if start < 0: start = 0
                if not(end <= cv['length']): end = cv['length']
                if 'name' in peak_stream.fields:
                    reg_name = peak[peak_stream.fields.index('name')]
                else:
                    reg_name = str(row_count+1)
                data_block = robjects.DataFrame({'pos':   robjects.IntVector(range(start+1,end+1)),
                                                 'plus':  robjects.FloatVector([0]*(end-start)),
                                                 'minus': robjects.FloatVector([0]*(end-start)),
                                                 'name':  robjects.StrVector([reg_name]*(end-start))})
                for stream,strnd in strands.iteritems():
                    for row in stream:
                        if row[0]<start: continue
                        if row[1]>end: break
                        data_block.rx2(strnd)[(row[0]-start):(row[1]-start)] = \
                            robjects.FloatVector([row[2]]*(row[1]-row[0]))
                robjects.r.assign('newblock',data_block)
                robjects.r('counts=rbind(counts,newblock)')
            robjects.r('read.length=%i' %opt.extension)
            robjects.r('chr.name="%s"' %chrom)
            robjects.r('pdf.file="%s.pdf"' %opt.output)
            robjects.r('mu=%i' %opt.mu)
            robjects.r('ktype="%s"' %opt.kernel)
            robjects.r('source("%s")' %os.path.join(opt.script,"deconv_fcts.R"))
            robjects.r("""
    counts = split(counts[,c("pos","plus","minus")],counts$name)
    pdf(file=pdf.file,title='chip-seq',paper='a4',width=8,height=11)
    par(cex=1.5,lwd=1.5)
    ccf = cross.correlate(counts,threshold=.5)
    plot(ccf$lag,ccf$acf,t='l',ylim=c(0,1),
         xlab='Lag',ylab='Cross-correlation',
         main=paste('Strand cross-correlation',chr.name))
    cut.ccf = ccf$acf
    cut.ccf[which(ccf$lag<mu)] = 0
    lambda = ccf$lag[which.max(cut.ccf)]
    sol = inverse.solve(counts,mu=mu,lambda=lambda,len=read.length,regul=1e-3,optimize=TRUE,ktype=ktype)
    col = 'red'
    lab = paste('lambda=',sol$par$lambda,sep='')
    abline(v=sol$par$lambda,col=col)
    text(sol$par$lambda,0,lab,col=col,pos=4)
    col = 'blue'
    lab = paste('mu=',sol$par$mu,sep='')
    abline(v=sol$par$mu,col=col)
    text(sol$par$mu,0.3,lab,col=col,pos=4)
    col = 'darkgreen'
    lab = paste('l=',read.length,sep='')
    abline(v=read.length,col=col)
    text(read.length,0.6,lab,col=col,pos=4)
    par(mfrow=c(4,2))
    for (n in names(counts)) {
      if (sol$sol[[n]]$value>.65) next
      plot.sol(counts[[n]],sol$sol[[n]],sol$par)
      title(sub=chr.name)
    }
    dev.off()
    bed = data.frame()
    cutoff = 1e-3
    for (n in names(counts)) {
      I = which(sol$sol[[n]]$prob>cutoff*sum(sol$sol[[n]]$prob))
      if (length(I)<2) next
      interval = range(counts[[n]]$pos[I])
      score = sum(sol$sol[[n]]$prob[I])
      name = paste('ID=',n,';FERR=',round(sol$sol[[n]]$val,digits=4),sep='')
      bed = rbind(bed,data.frame(
          start=interval[1],end=interval[2],
          name=name,score=score))
    }
    bed[,'start'] = as.integer(bed[,'start']-1)
    wig = data.frame()
    for (n in names(counts)) {
      I = which(sol$sol[[n]]$prob>cutoff*sum(sol$sol[[n]]$prob))
      wig = rbind(wig,data.frame(
          pos = as.integer(counts[[n]]$pos[I]),
          score = as.numeric(sol$sol[[n]]$prob[I])))
    }
    """)
            nrow = robjects.r("nrow(bed)")[0]
            outbed.write(((robjects.r("bed").rx2('start')[ri],
                           robjects.r("bed").rx2('end')[ri],
                           robjects.r("bed").rx2('name')[ri],
                           robjects.r("bed").rx2('score')[ri]) for ri in xrange(nrow)),
                         fields=["start","end","name","score"], chrom=chrom, mode='append')
            nrow = robjects.r("nrow(wig)")[0]
            outwig.write(((robjects.r("wig").rx2('pos')[ri]-1,
                           robjects.r("wig").rx2('pos')[ri],
                           robjects.r("wig").rx2('score')[ri]) for ri in xrange(nrow)),
                         fields=["start","end","score"], chrom=chrom, mode='append')
        outwig.close()
        print "************OUTPUT FILES**********"
        print "\n".join([opt.output+".pdf",
                         opt.output+"_peaks.bed",
                         opt.output+"_deconv.bedgraph"])
        print "************PARAMETERS**********"
        print "lambda=%f|mu=%f|len=%i" %(robjects.r("sol$par$lambda")[0],robjects.r("sol$par$mu")[0],robjects.r("read.length")[0])
        sys.exit(0)
    except Usage, err:
        print >>sys.stderr, err.msg
        print >>sys.stderr, usage
        sys.exit(2)
Example #12
 def __call__(self, **kw):
     feature_type = int(kw.get('feature_type') or 0)
     assembly_id = kw.get('assembly') or None
     chrmeta = "guess"
     if assembly_id:
         assembly = genrep.Assembly(assembly_id)
         chrmeta = assembly.chrmeta
         genes = assembly.gene_track
         exons = assembly.exon_track
     elif not (feature_type == 3):
         raise ValueError("Please specify an assembly")
     signals = kw.get('SigMulti', {}).get('signals', [])
     if not isinstance(signals, list): signals = [signals]
     signals = [track(sig, chrmeta=chrmeta) for sig in signals]
     snames = [sig.name for sig in signals]
     if feature_type == 0:  #bodies
         features = genes
     elif feature_type == 1:  #promoters
         prom_pars = {
             'before_start': int(kw.get('upstream') or prom_up_def),
             'after_start': int(kw.get('downstream') or prom_down_def),
             'on_strand': True
         }
         features = lambda c: neighborhood(genes(c), **prom_pars)
     elif feature_type == 2:  #exons
         features = exons
     elif feature_type == 3:  #custom track
         _t = track(kw.get('features'), chrmeta=chrmeta)
         chrmeta = _t.chrmeta
         features = _t.read
     else:
         raise ValueError("Feature type not known: %i" % feature_type)
     highlights = kw.get('HiMulti', {}).get('highlights', [])
     if not isinstance(highlights, list): highlights = [highlights]
     if highlights is not None:
         highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]
         hinames = [t.name for t in highlights]
     pdf = self.temporary_path(fname='plot_pairs.pdf')
     narr = None
     set_index = []
     set_labels = []
     if int(kw['mode']) == 0:  #correl
         cormax = int(kw.get('cormax') or _cormax)
         xarr = array(range(-cormax, cormax + 1))
         srtdchrom = sorted(chrmeta.keys())
         features = [
             x[:3] for chrom in srtdchrom
             for x in sorted_stream(features(chrom))
         ]
         _f = ['chr', 'start', 'end', 'score']
         narr = correlation([s.read(fields=_f) for s in signals], features,
                            (-cormax, cormax), True)
     elif int(kw['mode']) == 1:  #density
         xarr = None
         for chrom in chrmeta:
             feat = features(chrom)
             if 'name' not in feat.fields:
                 feat = add_name_field(feat)
             means = score_by_feature([s.read(chrom) for s in signals],
                                      feat)
             mf = means.fields[len(feat.fields):]
             _n, _l = score_array(means, mf)
             if _n.size == 0: continue
             if narr is None: narr = _n
             else: narr = vstack((narr, _n))
         set_index = [narr.shape[0]]
         for hitrack in highlights:
             for chrom in chrmeta:
                 hiread = hitrack.read(chrom)
                 if 'name' not in hiread.fields:
                     hiread = add_name_field(hiread)
                 means = score_by_feature([s.read(chrom) for s in signals],
                                          hiread)
                 mf = means.fields[len(hiread.fields):]
                 _n, _l = score_array(means, mf)
                 if _n.size == 0: continue
                 narr = vstack((narr, _n))
                 set_labels.extend(_l)
             set_index.append(narr.shape[0])
     else:
         raise ValueError("Mode not implemented: %s" % kw['mode'])
     if narr is None:
         raise ValueError("No data")
     pairs(narr,
           xarr,
           labels=snames,
           output=pdf,
           highlights=[set_index, set_labels])
     self.new_file(pdf, 'plot_pairs')
     return self.display_time()
Example #13
    def __call__(self, **kw):
        feature_type = int(kw.get("feature_type") or 0)
        individual = kw.get("individual", False)
        if isinstance(individual, basestring):
            individual = individual.lower() in ["1", "true", "t", "on"]
        if individual and int(kw["mode"]) != 1:
            raise ValueError("Only correlation plots can work with the 'individual' option.")

        assembly_id = kw.get("assembly") or None
        chrmeta = "guess"
        if assembly_id:
            assembly = genrep.Assembly(assembly_id)
            chrmeta = assembly.chrmeta
            genes = assembly.gene_track
            exons = assembly.exon_track
        elif not (feature_type == 3):
            raise ValueError("Please specify an assembly")
        # signals = kw.get('SigMulti',{}).get('signals', [])
        signals = kw.get("signals", [])
        if not isinstance(signals, list):
            signals = [signals]
        signals = [track(sig, chrmeta=chrmeta) for sig in signals]
        snames = [sig.name for sig in signals]
        if feature_type == 0:  # bodies
            features = genes
        elif feature_type == 1:  # promoters
            prom_pars = {
                "before_start": int(kw.get("upstream") or prom_up_def),
                "after_start": int(kw.get("downstream") or prom_down_def),
                "on_strand": True,
            }
            features = lambda c: neighborhood(genes(c), **prom_pars)
        elif feature_type == 2:  # exons
            features = exons
        elif feature_type == 3:  # custom track
            _t = track(kw.get("features"), chrmeta=chrmeta)
            chrmeta = _t.chrmeta
            features = _t.read
        else:
            raise ValueError("Feature type not known: %i" % feature_type)
        # highlights = kw.get('HiMulti',{}).get('highlights', [])
        highlights = kw.get("highlights", [])
        if not isinstance(highlights, list):
            highlights = [highlights]
        if highlights is not None:
            highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]
            hinames = [t.name for t in highlights]
        pdf = self.temporary_path(fname="plot_pairs.pdf")
        narr = None
        set_index = []
        set_labels = []
        _new = True
        if int(kw["mode"]) == 1:  # correl
            cormax = int(kw.get("cormax") or _cormax)
            xarr = array(range(-cormax, cormax + 1))
            _f = ["chr", "start", "end", "score"]
            features = [x[:3] for chrom in chrmeta for x in sorted_stream(features(chrom))]
            table = self.temporary_path(fname="table.txt")
            with open(table, "w") as t:
                t.write("\t".join(["chr", "start", "end", "max(correlation)", "lag_max"]) + "\n")
                if individual:
                    for nplot, feature in enumerate(features):
                        if narr is not None and nplot < _MAX_PLOTS_:
                            pairs(narr, xarr, labels=snames, output=pdf, new=_new, last=False)
                            _new = False
                        narr = correlation([s.read(fields=_f) for s in signals], [feature], (-cormax, cormax), True)
                        list_corr = list(narr[0][0])
                        max_corr = max(list_corr)
                        lag_max = list_corr.index(max_corr) - cormax
                        t.write("\t".join([str(x) for x in feature[:3] + (max_corr, lag_max)]) + "\n")
                else:
                    narr = correlation([s.read(fields=_f) for s in signals], features, (-cormax, cormax), True)
                    list_corr = list(narr[0][0])
                    max_corr = max(list_corr)
                    lag_max = list_corr.index(max_corr) - cormax
                    t.write("\t".join(["-", "-", "-"] + [str(max_corr), str(lag_max)]) + "\n")
        elif int(kw["mode"]) == 0:  # density
            xarr = None
            for chrom in chrmeta:
                feat = features(chrom)
                if "name" not in feat.fields:
                    feat = add_name_field(feat)
                means = score_by_feature([s.read(chrom) for s in signals], feat)
                mf = means.fields[len(feat.fields) :]
                _n, _l = score_array(means, mf)
                if _n.size == 0:
                    continue
                if narr is None:
                    narr = _n
                else:
                    narr = vstack((narr, _n))
            set_index = [narr.shape[0]]
            for hitrack in highlights:
                for chrom in chrmeta:
                    hiread = hitrack.read(chrom)
                    if "name" not in hiread.fields:
                        hiread = add_name_field(hiread)
                    means = score_by_feature([s.read(chrom) for s in signals], hiread)
                    mf = means.fields[len(hiread.fields) :]
                    _n, _l = score_array(means, mf)
                    if _n.size == 0:
                        continue
                    narr = vstack((narr, _n))
                    set_labels.extend(_l)
                set_index.append(narr.shape[0])
        else:
            raise ValueError("Mode not implemented: %s" % kw["mode"])
        if narr is None:
            raise ValueError("No data")
        pairs(narr, xarr, labels=snames, output=pdf, highlights=[set_index, set_labels], new=_new, last=True)
        if int(kw["mode"]) == 1:
            self.new_file(table, "table")
        self.new_file(pdf, "plot_pairs")
        return self.display_time()
Example #14
 def __call__(self, **kw):
     feature_type = int(kw.get('feature_type') or 0)
     assembly_id = kw.get('assembly') or None
     chrmeta = "guess"
     if assembly_id:
         assembly = genrep.Assembly(assembly_id)
         chrmeta = assembly.chrmeta
         genes = assembly.gene_track
         exons = assembly.exon_track
     elif not(feature_type == 3):
         raise ValueError("Please specify an assembly")
     signals = kw.get('SigMulti',{}).get('signals', [])
     if not isinstance(signals, list): signals = [signals]
     signals = [track(sig, chrmeta=chrmeta) for sig in signals]
     snames = [sig.name for sig in signals]
     if feature_type == 0: #bodies
         features = genes
     elif feature_type == 1: #promoters
         prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                      'after_start': int(kw.get('downstream') or prom_down_def),
                      'on_strand': True}
         features = lambda c: neighborhood(genes(c), **prom_pars)
     elif feature_type == 2: #exons
         features = exons
     elif feature_type == 3: #custom track
         _t = track(kw.get('features'), chrmeta=chrmeta)
         chrmeta = _t.chrmeta
         features = _t.read
     else:
         raise ValueError("Feature type not known: %i" % feature_type)
     highlights = kw.get('HiMulti',{}).get('highlights', [])
     if not isinstance(highlights, list): highlights = [highlights]
     if highlights is not None:
         highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]
         hinames = [t.name for t in highlights]
     pdf = self.temporary_path(fname='plot_pairs.pdf')
     narr = None
     set_index = []
     set_labels = []
     if int(kw['mode']) == 0: #correl
         cormax = int(kw.get('cormax') or _cormax)
         xarr = array(range(-cormax, cormax + 1))
         srtdchrom = sorted(chrmeta.keys())
         features = [x[:3] for chrom in srtdchrom
                     for x in sorted_stream(features(chrom))]
         _f = ['chr', 'start', 'end', 'score']
         narr = correlation([s.read(fields=_f) for s in signals],
                            features, (-cormax, cormax), True)
     elif int(kw['mode']) == 1: #density
         xarr = None
         for chrom in chrmeta:
             feat = features(chrom)
             if 'name' not in feat.fields:
                 feat = add_name_field(feat)
             means = score_by_feature([s.read(chrom) for s in signals], feat)
             mf = means.fields[len(feat.fields):]
             _n, _l = score_array(means, mf)
             if _n.size == 0: continue
             if narr is None: narr = _n
             else:            narr = vstack((narr, _n))
         set_index = [narr.shape[0]]
         for hitrack in highlights:
             for chrom in chrmeta:
                 hiread = hitrack.read(chrom)
                 if 'name' not in hiread.fields:
                     hiread = add_name_field(hiread)
                 means = score_by_feature([s.read(chrom) for s in signals], hiread)
                 mf = means.fields[len(hiread.fields):]
                 _n, _l = score_array(means, mf)
                 if _n.size == 0: continue
                 narr = vstack((narr, _n))
                 set_labels.extend(_l)
             set_index.append(narr.shape[0])
     else:
         raise ValueError("Mode not implemented: %s" % kw['mode'])
     if narr is None:
         raise ValueError("No data")
     pairs(narr, xarr, labels=snames, output=pdf, highlights=[set_index,set_labels])
     self.new_file(pdf, 'plot_pairs')
     return self.display_time()
Example #15
def main(argv=None):
    try:
        usage = "camelPeaks.py [OPTIONS]"
        desc = """A ChIP-seq peak deconvolution algorithm."""
        parser = optparse.OptionParser(usage=usage, description=desc)
        for opt in opts:
            parser.add_option(opt[0], opt[1], help=opt[2], **opt[3])
        (opt, args) = parser.parse_args()
        if not (opt.peaks and os.path.exists(opt.peaks)):
            parser.print_help()
            raise Usage("Specify a valid peaks file with -p.")
        if not (opt.forward and os.path.exists(opt.forward)):
            parser.print_help()
            raise Usage("Specify a valid forward strand density file with -f.")
        if not (opt.reverse and os.path.exists(opt.reverse)):
            parser.print_help()
            raise Usage("Specify a valid reverse strand density file with -r.")


####
        if opt.chromosome and opt.length:
            chrmeta = {opt.chromosome: {'length': opt.length}}
        else:
            chrmeta = opt.genome
        peak_track = track(opt.peaks, chrmeta=chrmeta)
        chrmeta = peak_track.chrmeta
        if opt.chromosome: chrmeta = {opt.chromosome: chrmeta[opt.chromosome]}
        track_info = {
            'datatype': peak_track.info.get('datatype', 'qualitative')
        }
        outbed = track(opt.output + "_peaks.bed",
                       chrmeta=chrmeta,
                       fields=["chr", "start", "end", "name", "score"])
        outwig = track(opt.output + "_deconv.bedgraph", chrmeta=chrmeta)
        outwig.open(mode='overwrite')
        topts = {'chrmeta': chrmeta, 'readonly': True}
        for chrom, cv in chrmeta.iteritems():
            peak_stream = sorted_stream(peak_track.read(selection=chrom),
                                        [chrom])
            strands = {
                track(opt.forward, **topts).read(chrom,
                                                 fields=[
                                                     'start', 'end', 'score'
                                                 ]):
                'plus',
                track(opt.reverse, **topts).read(chrom,
                                                 fields=[
                                                     'start', 'end', 'score'
                                                 ]):
                'minus'
            }
            robjects.r('options(stringsAsFactors=F)')
            robjects.r('counts=data.frame()')
            for row_count, peak in enumerate(peak_stream):
                start = int(peak[peak_stream.fields.index('start')])
                end = int(peak[peak_stream.fields.index('end')])
                if end - start > opt.sizecutoff: continue
                if start < 0: start = 0
                if not (end <= cv['length']): end = cv['length']
                if 'name' in peak_stream.fields:
                    reg_name = peak[peak_stream.fields.index('name')]
                else:
                    reg_name = str(row_count + 1)
                data_block = robjects.DataFrame({
                    'pos':
                    robjects.IntVector(range(start + 1, end + 1)),
                    'plus':
                    robjects.FloatVector([0] * (end - start)),
                    'minus':
                    robjects.FloatVector([0] * (end - start)),
                    'name':
                    robjects.StrVector([reg_name] * (end - start))
                })
                for stream, strnd in strands.iteritems():
                    for row in stream:
                        if row[0] < start: continue
                        if row[1] > end: break
                        data_block.rx2(strnd)[(row[0]-start):(row[1]-start)] = \
                            robjects.FloatVector([row[2]]*(row[1]-row[0]))
                robjects.r.assign('newblock', data_block)
                robjects.r('counts=rbind(counts,newblock)')
            robjects.r('read.length=%i' % opt.extension)
            robjects.r('chr.name="%s"' % chrom)
            robjects.r('pdf.file="%s.pdf"' % opt.output)
            robjects.r('mu=%i' % opt.mu)
            robjects.r('ktype="%s"' % opt.kernel)
            robjects.r('source("%s")' %
                       os.path.join(opt.script, "deconv_fcts.R"))
            robjects.r("""
    counts = split(counts[,c("pos","plus","minus")],counts$name)
    pdf(file=pdf.file,title='chip-seq',paper='a4',width=8,height=11)
    par(cex=1.5,lwd=1.5)
    ccf = cross.correlate(counts,threshold=.5)
    plot(ccf$lag,ccf$acf,t='l',ylim=c(0,1),
         xlab='Lag',ylab='Cross-correlation',
         main=paste('Strand cross-correlation',chr.name))
    cut.ccf = ccf$acf
    cut.ccf[which(ccf$lag<mu)] = 0
    lambda = ccf$lag[which.max(cut.ccf)]
    sol = inverse.solve(counts,mu=mu,lambda=lambda,len=read.length,regul=1e-3,optimize=TRUE,ktype=ktype)
    col = 'red'
    lab = paste('lambda=',sol$par$lambda,sep='')
    abline(v=sol$par$lambda,col=col)
    text(sol$par$lambda,0,lab,col=col,pos=4)
    col = 'blue'
    lab = paste('mu=',sol$par$mu,sep='')
    abline(v=sol$par$mu,col=col)
    text(sol$par$mu,0.3,lab,col=col,pos=4)
    col = 'darkgreen'
    lab = paste('l=',read.length,sep='')
    abline(v=read.length,col=col)
    text(read.length,0.6,lab,col=col,pos=4)
    par(mfrow=c(4,2))
    for (n in names(counts)) {
      if (sol$sol[[n]]$value>.65) next
      plot.sol(counts[[n]],sol$sol[[n]],sol$par)
      title(sub=chr.name)
    }
    dev.off()
    bed = data.frame()
    cutoff = 1e-3
    for (n in names(counts)) {
      I = which(sol$sol[[n]]$prob>cutoff*sum(sol$sol[[n]]$prob))
      if (length(I)<2) next
      interval = range(counts[[n]]$pos[I])
      score = sum(sol$sol[[n]]$prob[I])
      name = paste('ID=',n,';FERR=',round(sol$sol[[n]]$val,digits=4),sep='')
      bed = rbind(bed,data.frame(
          start=interval[1],end=interval[2],
          name=name,score=score))
    }
    bed[,'start'] = as.integer(bed[,'start']-1)
    wig = data.frame()
    for (n in names(counts)) {
      I = which(sol$sol[[n]]$prob>cutoff*sum(sol$sol[[n]]$prob))
      wig = rbind(wig,data.frame(
          pos = as.integer(counts[[n]]$pos[I]),
          score = as.numeric(sol$sol[[n]]$prob[I])))
    }
    """)
            nrow = robjects.r("nrow(bed)")[0]
            outbed.write(((robjects.r("bed").rx2('start')[ri],
                           robjects.r("bed").rx2('end')[ri],
                           robjects.r("bed").rx2('name')[ri],
                           robjects.r("bed").rx2('score')[ri])
                          for ri in xrange(nrow)),
                         fields=["start", "end", "name", "score"],
                         chrom=chrom,
                         mode='append')
            nrow = robjects.r("nrow(wig)")[0]
            outwig.write(((robjects.r("wig").rx2('pos')[ri] - 1,
                           robjects.r("wig").rx2('pos')[ri],
                           robjects.r("wig").rx2('score')[ri])
                          for ri in xrange(nrow)),
                         fields=["start", "end", "score"],
                         chrom=chrom,
                         mode='append')
        outwig.close()
        print "************OUTPUT FILES**********"
        print "\n".join([
            opt.output + ".pdf", opt.output + "_peaks.bed",
            opt.output + "_deconv.bedgraph"
        ])
        print "************PARAMETERS**********"
        print "lambda=%f|mu=%f|len=%i" % (robjects.r("sol$par$lambda")[0],
                                          robjects.r("sol$par$mu")[0],
                                          robjects.r("read.length")[0])
        sys.exit(0)
    except Usage, err:
        print >> sys.stderr, err.msg
        print >> sys.stderr, usage
        sys.exit(2)
Example #16
def feature_matrix(trackScores,trackFeatures,segment=False,method='mean',**kw):
    """
    Return an array with as many rows as there are features in *trackFeatures*, and as many columns
    as there are score tracks in *trackScores*. Each element in the matrix thus corresponds to the
    (average) score of some genomic feature.

    If *segment* is True, each feature will be segmented into bins using
    bbcflib.gfminer.stream.intervals.segment_features (additional parameters in *\*\*kw* will be passed to this function).
    Then each element of the array is itself an array with *nbins* rows and one column for each track in *trackScores*.

    If *segment* is False, then each element of the array is an array with one element for each track in *trackScores*.

    Example::

                      gene1                 gene2
        X: -----#####|#####|#####--------###|###|###-----  (features)
        Y: _____________666|66666________666|666|666_____  (scores1)
        Z: _____22222|22222|22222________________________  (scores2)

        With segment=True, nbins=3:

              Y   Z
        R: [[[0.  2.],    # bin0 \
             [2.  2.],    # bin1  } gene 1
             [6.  2.]],   # bin2 /
            [[6.  0.],    # bin0 \
             [6.  0.],    # bin1  } gene2
             [6.  0.]]]   # bin2 /

        With segment=False:

              Y   Z
        R:  [[3.  2.]
             [6.  0.]]

    Note: the whole segmented features track will be loaded in memory.

    :param trackScores: (FeatureStream, or list of FeatureStream objects) score track(s).
    :param trackFeatures: (FeatureStream) feature track.
    :param segment: (bool) segment each feature into bins.[False]
    :param method: (str) Operation applied to the list of scores for one feature.
        It is the `method` argument to `stream.score_by_feature` - one of 'sum','mean','median','min','max'.
    :param **kw: arguments to pass to segment_features (`nbins`,`upstream`,`downstream`).
    :rtype: tuple (numpy.ndarray of strings, numpy.ndarray of floats)
    """
    nbins = 1
    nscores = 1
    if segment:
        trackFeatures = sorted_stream(segment_features(trackFeatures,**kw))
        nbins = kw.get('nbins',segment_features.func_defaults[0]) \
                + kw.get('upstream',(0,0))[1] \
                + kw.get('downstream',(0,0))[1]
    all_means = score_by_feature(trackScores,trackFeatures,method=method)
    nfields = len(trackFeatures.fields)
    if isinstance(trackScores,(list,tuple)):
        nscores = len(trackScores)
    scores_dict = {}
    if segment:
        empty_mat = numpy.zeros(shape=(nbins,nscores))
    else:
        empty_mat = numpy.zeros(nscores)
    name_idx = all_means.fields.index('name')
    for t in all_means:
        _n = t[name_idx]
        scores_dict.setdefault(_n, empty_mat.copy())
        if segment:
            scores_dict[_n][t[nfields-1]] = t[nfields:]
        else:
            scores_dict[_n] = t[nfields:]
    feat_names = numpy.array(scores_dict.keys())
    scores_mat = numpy.array(scores_dict.values())
    return (feat_names,scores_mat)
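A companion sketch (also not from the sources above) for feature_matrix with segment=False, which yields one row of mean scores per named feature; the same assumed import paths and toy streams as in the sketch after Example #9 apply.

from bbcflib.gfminer import FeatureStream              # assumed import path
from bbcflib.gfminer.numeric import feature_matrix     # assumed import path

features = FeatureStream([('chr1', 10, 40, 'gene1'), ('chr1', 60, 90, 'gene2')],
                         fields=['chr', 'start', 'end', 'name'])
scores_Y = FeatureStream([('chr1', 25, 90, 6.0)],
                         fields=['chr', 'start', 'end', 'score'])
scores_Z = FeatureStream([('chr1', 10, 40, 2.0)],
                         fields=['chr', 'start', 'end', 'score'])

# feat_names: array of feature names; scores_mat: one row per feature and one
# column per score track. Pass segment=True together with nbins=... to get a
# (nbins x tracks) block per feature instead.
feat_names, scores_mat = feature_matrix([scores_Y, scores_Z], features,
                                        segment=False, method='mean')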