def test_sorted_stream(self):
    """Exercise sorted_stream with one key, several keys and a chromosome list."""

    def check(rows, names, expected, **sort_opts):
        # Build a fresh stream for every case and compare the fully
        # materialised sort result with the expected row order.
        stream = fstream(rows, fields=names)
        self.assertListEqual(list(sorted_stream(stream, **sort_opts)), expected)

    scored = [(10, 0.8), (15, 2.8), (12, 19.5), (12, 1.4), (13, 0.1)]
    scored_fields = ['start', 'score']

    # Single key: the expected result keeps the two start=12 rows in input order.
    check(scored, scored_fields,
          [(10, 0.8), (12, 19.5), (12, 1.4), (13, 0.1), (15, 2.8)],
          fields=['start'])

    # Two keys: rows with equal 'start' are ordered by 'score'.
    check(scored, scored_fields,
          [(10, 0.8), (12, 1.4), (12, 19.5), (13, 0.1), (15, 2.8)],
          fields=['start', 'score'])

    regions = [('chrX', 0, 1, 0.8), ('chrIX', 3, 5, 2.8), ('chrIX', 3, 9, 1.4),
               ('chrIX', 2, 10, 0.1), ('chrIX', 7, 10, 0.8)]
    region_fields = ['chr', 'start', 'end', 'score']

    # 'start' is the primary key here, 'chr' only the secondary one.
    check(regions, region_fields,
          [('chrX', 0, 1, 0.8), ('chrIX', 2, 10, 0.1), ('chrIX', 3, 5, 2.8),
           ('chrIX', 3, 9, 1.4), ('chrIX', 7, 10, 0.8)],
          fields=['start', 'chr'])

    # Chromosome first, using the chromosome names of the test assembly.
    check(regions, region_fields,
          [('chrIX', 2, 10, 0.1), ('chrIX', 3, 9, 1.4), ('chrIX', 3, 5, 2.8),
           ('chrIX', 7, 10, 0.8), ('chrX', 0, 1, 0.8)],
          fields=['chr', 'start', 'score'], chrnames=self.a.chrnames)
def fimo(motifs, fasta, qval=True):
    """Run FIMO on *fasta* with *motifs* and write the merged hits to ``fimo.bed``.

    The external ``fimo`` program is run, its ``fimo_out/fimo.txt`` table is
    sorted numerically on columns 2-4 into ``fimo.txt``, and that table is
    converted to a BED track: columns are selected, the chromosome name is
    taken from the second ``|``-separated part of the sequence name, the
    stream is sorted and passed through ``cobble``, and duplicate names are
    removed from each merged ``name`` value.  The ``fimo_out`` directory is
    removed at the end.

    :param motifs: path to the motif file given to ``fimo``.
    :param fasta: path to the sequence file given to ``fimo``.
    :param qval: if True use ``--thresh 0.01 --qv-thresh``, otherwise
        ``--thresh 0.000001``.
    :raises subprocess.CalledProcessError: if ``fimo`` or ``sort`` exits with
        a non-zero status.
    :raises OSError: if one of the executables cannot be started.
    """
    # Local import so the block does not rely on a file-level import.
    import subprocess

    # Run Fimo
    options = ["--max-stored-scores", "1000000", "--verbosity", "1"]
    if qval:
        options += ["--thresh", "0.01", "--qv-thresh"]
    else:
        options += ["--thresh", "0.000001"]
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; check_call stops on failure instead of letting
    # an empty or stale table be processed below.
    cmd = ["fimo"] + options + [str(motifs), str(fasta)]
    print("Running >> %s" % " ".join(cmd))
    subprocess.check_call(cmd)
    with open("fimo.txt", "w") as sorted_out:
        subprocess.check_call(
            ["sort", "-k2,2n", "-k3,3n", "-k4,4n", "fimo_out/fimo.txt"],
            stdout=sorted_out)

    # Bed output
    t = track('fimo.txt',
              fields=["name", "chr", "start", "end", "strand",
                      "score", "p-value", "q-value", "sequence"])
    # The field names are replaced right away: the column declared "score"
    # becomes "a" and the one declared "p-value" becomes "score", so the
    # seventh column is what ends up as the BED score.
    # NOTE(review): presumably intentional - confirm against the FIMO output layout.
    t.fields = ["name", "chr", "start", "end", "strand",
                "a", "score", "q", "sequence"]
    s = t.read()
    s = select(s, ['chr', 'start', 'end', 'name', 'score', 'strand'])
    s = apply(s, 'chr', lambda x: x.split('|')[1])
    s = sorted_stream(s)
    s = cobble(s)
    # Drop repeated names inside each '|'-joined name value.
    s = apply(s, 'name', lambda x: '|'.join(list(set(x.split('|')))))
    outname = 'fimo.bed'
    bed = track(outname, fields=s.fields)
    bed.make_header(name="TSS_motifs",
                    description="Motifs +-XKb around TSS",
                    mode='overwrite')
    bed.write(s)
    if os.path.exists("fimo_out"):
        shutil.rmtree("fimo_out")
def fimo(motifs, fasta, qval=True):
    """Run FIMO on *fasta* with *motifs* and write the merged hits to ``fimo.bed``.

    The external ``fimo`` program is run, its ``fimo_out/fimo.txt`` table is
    sorted numerically on columns 2-4 into ``fimo.txt``, and that table is
    converted to a BED track: columns are selected, the chromosome name is
    taken from the second ``|``-separated part of the sequence name, the
    stream is sorted and passed through ``cobble``, and duplicate names are
    removed from each merged ``name`` value.  The ``fimo_out`` directory is
    removed at the end.

    :param motifs: path to the motif file given to ``fimo``.
    :param fasta: path to the sequence file given to ``fimo``.
    :param qval: if True use ``--thresh 0.01 --qv-thresh``, otherwise
        ``--thresh 0.000001``.
    :raises subprocess.CalledProcessError: if ``fimo`` or ``sort`` exits with
        a non-zero status.
    :raises OSError: if one of the executables cannot be started.
    """
    # Local import so the block does not rely on a file-level import.
    import subprocess

    # Run Fimo
    options = ["--max-stored-scores", "1000000", "--verbosity", "1"]
    if qval:
        options += ["--thresh", "0.01", "--qv-thresh"]
    else:
        options += ["--thresh", "0.000001"]
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact; check_call stops on failure instead of letting
    # an empty or stale table be processed below.
    cmd = ["fimo"] + options + [str(motifs), str(fasta)]
    print("Running >> %s" % " ".join(cmd))
    subprocess.check_call(cmd)
    with open("fimo.txt", "w") as sorted_out:
        subprocess.check_call(
            ["sort", "-k2,2n", "-k3,3n", "-k4,4n", "fimo_out/fimo.txt"],
            stdout=sorted_out)

    # Bed output
    t = track('fimo.txt',
              fields=[
                  "name", "chr", "start", "end", "strand", "score",
                  "p-value", "q-value", "sequence"
              ])
    # The field names are replaced right away: the column declared "score"
    # becomes "a" and the one declared "p-value" becomes "score", so the
    # seventh column is what ends up as the BED score.
    # NOTE(review): presumably intentional - confirm against the FIMO output layout.
    t.fields = [
        "name", "chr", "start", "end", "strand", "a", "score", "q",
        "sequence"
    ]
    s = t.read()
    s = select(s, ['chr', 'start', 'end', 'name', 'score', 'strand'])
    s = apply(s, 'chr', lambda x: x.split('|')[1])
    s = sorted_stream(s)
    s = cobble(s)
    # Drop repeated names inside each '|'-joined name value.
    s = apply(s, 'name', lambda x: '|'.join(list(set(x.split('|')))))
    outname = 'fimo.bed'
    bed = track(outname, fields=s.fields)
    bed.make_header(name="TSS_motifs",
                    description="Motifs +-XKb around TSS",
                    mode='overwrite')
    bed.write(s)
    if os.path.exists("fimo_out"):
        shutil.rmtree("fimo_out")
def plot_footprint_profile(ex, bedlist, signals, chrnames, groups, logfile):
    """Plot the average signal profile around motif occurrences, per group.

    For each group id in *bedlist* the motif BED file is read chromosome by
    chromosome, regions are grouped by (motif name, motif length), segmented
    into one bin per position plus flanks, and the scores of every signal
    track of that group are accumulated per bin.  One PDF (one plot per
    motif) and one tab-separated matrix file per motif are produced.

    :param ex: not used by this function.
    :param bedlist: dict mapping a group id to a motif BED file.
    :param signals: dict mapping a group id to a list of signal tracks.
    :param chrnames: chromosome names to iterate over.
    :param groups: not used by this function.
    :param logfile: writable object receiving a progress line.
    :returns: dict ``{gid: {'pdf': filename, 'mat': [(motif_name, filename), ...]}}``.
    """
    files = dict((gid, {'pdf': "", 'mat': []}) for gid in bedlist.keys())
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    flank = _plot_flank[1]
    for gid, motifbed in bedlist.iteritems():
        group_signals = signals[gid]
        snames = [sig.name for sig in group_signals]
        tmotif = track(motifbed, format='bed')
        data = {}      # (motif name, length) -> accumulated bins x signals matrix
        numregs = {}   # (motif name, length) -> number of regions accumulated
        for chrom in chrnames:
            # Group the regions of this chromosome by motif.  The name field
            # is split on ':'; the key is (first part, length of second part).
            by_motif = {}
            for row in tmotif.read(chrom):
                name_parts = row[3].split(":")
                motif_key = (name_parts[0], len(name_parts[1]))
                by_motif.setdefault(motif_key, []).append(row[1:3])
            for motif, regs in by_motif.iteritems():
                if motif not in data:
                    data[motif] = zeros(shape=(motif[1] + 2 * flank,
                                               len(group_signals)))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                region_stream = FeatureStream(regs, fields=['start', 'end'])
                binned = segment_features(region_stream,
                                          nbins=motif[1],
                                          upstream=_plot_flank,
                                          downstream=_plot_flank)
                tFeat = sorted_stream(binned)
                score_streams = [sig.read(chrom) for sig in group_signals]
                for scored in score_by_feature(score_streams, tFeat):
                    # scored[2] is the bin index, scored[3:] one score per signal
                    data[motif][scored[2]] += scored[3:]
        files[gid]['pdf'] = unique_filename_in()
        nmotifs = len(data)
        for rank, (motif, dat) in enumerate(data.iteritems()):
            mname, nbins = motif
            # Turn the accumulated sums into per-region averages (in place).
            dat /= float(numregs[motif])
            # X axis: negative offsets for the upstream flank, "1".."nbins"
            # as strings for the motif positions, then the downstream flank.
            X = (list(range(-flank, 0))
                 + [str(pos + 1) for pos in range(nbins)]
                 + list(range(nbins, flank + nbins)))
            ####### Could do a heatmap (sort by intensity)...
            columns = [dat[:, col] for col in range(dat.shape[-1])]
            lineplot(X, columns,
                     mfrow=[4, 2],
                     output=files[gid]['pdf'],
                     new=(rank == 0),
                     last=(rank == nmotifs - 1),
                     legend=snames,
                     main=mname)
            _datf = unique_filename_in()
            with open(_datf, "w") as dff:
                dff.write("\t".join([""] + [str(x) for x in X]) + "\n")
                for col, sname in enumerate(snames):
                    values = [str(x) for x in dat[:, col]]
                    dff.write("\t".join([sname] + values) + "\n")
            files[gid]['mat'].append((mname, _datf))
    return files
def sort(*args, **kw):
    """Write a sorted copy of each input track.

    :param args: paths of the track files to sort; at least one is required.
    :param kw: options; the keys read here are 'format', 'output' and
        'chromosomes' (a JSON string passed to ``sorted_stream`` as
        ``chrnames``).  All of *kw* is also forwarded to ``_get_chrmeta``.
    :returns: 0.
    :raises Usage: if no input file is given.
    """
    if not args:
        raise Usage("No input file provided")
    chrmeta = _get_chrmeta(**kw)
    for path in args:
        source = track.track(path, format=kw['format'], chrmeta=chrmeta)
        # Fall back to "<name>_sorted.<format>" when no output name is given.
        target_name = kw['output']
        if not target_name:
            target_name = source.name + '_sorted.' + source.format
        target = track.track(target_name, chrmeta=source.chrmeta)
        records = source.read()
        chromosome_order = json.loads(kw['chromosomes'])
        target.write(sorted_stream(records, chrnames=chromosome_order))
        source.close()
    return 0
def sort(*args, **kw):
    """Write a sorted copy of each input track.

    :param args: paths of the track files to sort; at least one is required.
    :param kw: options; the keys read here are 'format', 'output' and
        'chromosomes' (a JSON string passed to ``sorted_stream`` as
        ``chrnames``).  All of *kw* is also forwarded to ``_get_chrmeta``.
    :returns: 0.
    :raises Usage: if no input file is given.
    """
    if not args:
        raise Usage("No input file provided")
    chrmeta = _get_chrmeta(**kw)
    for path in args:
        source = track.track(path, format=kw['format'], chrmeta=chrmeta)
        # Fall back to "<name>_sorted.<format>" when no output name is given.
        target_name = kw['output']
        if not target_name:
            target_name = source.name + '_sorted.' + source.format
        target = track.track(target_name, chrmeta=source.chrmeta)
        records = source.read()
        chromosome_order = json.loads(kw['chromosomes'])
        target.write(sorted_stream(records, chrnames=chromosome_order))
        source.close()
    return 0
def test_sorted_stream(self):
    """Exercise sorted_stream with one key, several keys and a chromosome list."""

    def assert_sorted(rows, names, expected, **sort_opts):
        # A new stream is built for each case; the sorted output is
        # materialised and compared with the expected row order.
        result = list(sorted_stream(fstream(rows, fields=names), **sort_opts))
        self.assertListEqual(result, expected)

    pairs_in = [(10, 0.8), (15, 2.8), (12, 19.5), (12, 1.4), (13, 0.1)]
    pair_fields = ['start', 'score']

    # Single key: the expected result keeps the two start=12 rows in input order.
    assert_sorted(pairs_in, pair_fields,
                  [(10, 0.8), (12, 19.5), (12, 1.4), (13, 0.1), (15, 2.8)],
                  fields=['start'])

    # Two keys: rows with equal 'start' are ordered by 'score'.
    assert_sorted(pairs_in, pair_fields,
                  [(10, 0.8), (12, 1.4), (12, 19.5), (13, 0.1), (15, 2.8)],
                  fields=['start', 'score'])

    regions_in = [('chrX', 0, 1, 0.8), ('chrIX', 3, 5, 2.8), ('chrIX', 3, 9, 1.4),
                  ('chrIX', 2, 10, 0.1), ('chrIX', 7, 10, 0.8)]
    region_fields = ['chr', 'start', 'end', 'score']

    # 'start' is the primary key here, 'chr' only the secondary one.
    assert_sorted(regions_in, region_fields,
                  [('chrX', 0, 1, 0.8), ('chrIX', 2, 10, 0.1), ('chrIX', 3, 5, 2.8),
                   ('chrIX', 3, 9, 1.4), ('chrIX', 7, 10, 0.8)],
                  fields=['start', 'chr'])

    # Chromosome first, using the chromosome names of the test assembly.
    assert_sorted(regions_in, region_fields,
                  [('chrIX', 2, 10, 0.1), ('chrIX', 3, 9, 1.4), ('chrIX', 3, 5, 2.8),
                   ('chrIX', 7, 10, 0.8), ('chrX', 0, 1, 0.8)],
                  fields=['chr', 'start', 'score'], chrnames=self.a.chrnames)
def plot_footprint_profile( ex, bedlist, signals, chrnames, groups, logfile ):
    """Plot the average signal profile around motif occurrences, per group.

    For each group id in *bedlist* the motif BED file is read chromosome by
    chromosome, regions are grouped by (motif name, motif length), segmented
    into one bin per position plus flanks, and the scores of every signal
    track of that group are accumulated per bin.  One PDF (one plot per
    motif) and one tab-separated matrix file per motif are produced.

    :param ex: not used by this function.
    :param bedlist: dict mapping a group id to a motif BED file.
    :param signals: dict mapping a group id to a list of signal tracks.
    :param chrnames: chromosome names to iterate over.
    :param groups: not used by this function.
    :param logfile: writable object receiving a progress line.
    :returns: dict ``{gid: {'pdf': filename, 'mat': [(motif_name, filename), ...]}}``.
    """
    files = dict((gid, {'pdf': "", 'mat': []}) for gid in bedlist.keys())
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    flank = _plot_flank[1]
    for gid, motifbed in bedlist.iteritems():
        group_signals = signals[gid]
        snames = [sig.name for sig in group_signals]
        tmotif = track(motifbed, format='bed')
        data = {}      # (motif name, length) -> accumulated bins x signals matrix
        numregs = {}   # (motif name, length) -> number of regions accumulated
        for chrom in chrnames:
            # Group the regions of this chromosome by motif.  The name field
            # is split on ':'; the key is (first part, length of second part).
            regions_by_motif = {}
            for row in tmotif.read(chrom):
                name_parts = row[3].split(":")
                motif_key = (name_parts[0], len(name_parts[1]))
                regions_by_motif.setdefault(motif_key, []).append(row[1:3])
            for motif, regs in regions_by_motif.iteritems():
                if motif not in data:
                    matrix_shape = (motif[1] + 2 * flank, len(group_signals))
                    data[motif] = zeros(shape=matrix_shape)
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                binned = segment_features(
                    FeatureStream(regs, fields=['start', 'end']),
                    nbins=motif[1], upstream=_plot_flank, downstream=_plot_flank)
                tFeat = sorted_stream(binned)
                score_streams = [sig.read(chrom) for sig in group_signals]
                for scored in score_by_feature(score_streams, tFeat):
                    # scored[2] is the bin index, scored[3:] one score per signal
                    data[motif][scored[2]] += scored[3:]
        files[gid]['pdf'] = unique_filename_in()
        nmotifs = len(data)
        for rank, (motif, dat) in enumerate(data.iteritems()):
            mname, nbins = motif
            # Turn the accumulated sums into per-region averages (in place).
            dat /= float(numregs[motif])
            # X axis: negative offsets for the upstream flank, "1".."nbins"
            # as strings for the motif positions, then the downstream flank.
            X = (list(range(-flank, 0))
                 + [str(pos + 1) for pos in range(nbins)]
                 + list(range(nbins, flank + nbins)))
            ####### Could do a heatmap (sort by intensity)...
            columns = [dat[:, col] for col in range(dat.shape[-1])]
            lineplot(X, columns, mfrow=[4,2], output=files[gid]['pdf'],
                     new=(rank == 0), last=(rank == nmotifs - 1),
                     legend=snames, main=mname)
            _datf = unique_filename_in()
            with open(_datf, "w") as dff:
                dff.write("\t".join([""] + [str(x) for x in X]) + "\n")
                for col, sname in enumerate(snames):
                    values = [str(x) for x in dat[:, col]]
                    dff.write("\t".join([sname] + values) + "\n")
            files[gid]['mat'].append((mname, _datf))
    return files
def summed_feature_matrix(trackScores,trackFeatures,method='mean',**kw):
    r"""
    Accumulate per-bin scores over all features of *trackFeatures*.

    Every feature is cut into bins with `segment_features` (options taken
    from *\*\*kw*), the binned stream is sorted and scored against
    *trackScores* with `score_by_feature`.  The returned matrix has one row
    per bin and one column per score track; each entry is the sum, over all
    features, of the score obtained for that bin.

    Note: the whole segmented features track will be loaded in memory.

    :param trackScores: (FeatureStream, or list of FeatureStream objects) score track(s).
    :param trackFeatures: (FeatureStream) feature track.
    :param method: (str) passed as the `method` argument of `score_by_feature`.
    :param \*\*kw: arguments to pass to segment_features (`nbins`,`upstream`,`downstream`).
    :rtype: numpy.ndarray, int (number of scored bins divided by the number
        of bins per feature, i.e. the number of features)
    """
    # The bin index sits right after the original feature fields.
    nfields = len(trackFeatures.fields)
    binned = sorted_stream(segment_features(trackFeatures,**kw))
    scored = score_by_feature(trackScores,binned,method=method)
    nscores = len(trackScores) if isinstance(trackScores,(list,tuple)) else 1
    # Bins per feature: body bins plus the upstream and downstream flank bins.
    flank_up = kw.get('upstream',(0,0))[1]
    flank_down = kw.get('downstream',(0,0))[1]
    nbins = kw.get('nbins',segment_features.func_defaults[0]) + flank_up + flank_down
    averages = numpy.zeros(shape=(nbins,nscores))
    nrows = 0
    for row in scored:
        averages[row[nfields]] += row[(nfields+1):]
        nrows += 1
    return averages, nrows/nbins
def coverageInRepeats(ex, infile, genomeName='mm9', repeatsPath=GlobalRepbasePath,
                      outdir=None, via='lsf'):
    """
    Completes the segment info bed file with the coverage in repeats of each segment.
    For now, works only for mm9, hg19 and dm3.

    One `coverageBed` job is started per entry of *infile* against
    ``<repeatsPath>/<genomeName>/<genomeName>_rmsk.bed``.  For every result
    row the last four ``|``-separated parts of the name are replaced by the
    four coverage columns.  If the repeats file does not exist, the inputs
    are concatenated (or copied to *outdir*) unchanged.

    :param ex: execution object handed to `coverageBed.nonblocking`.
    :param infile: dict ``{chrom: inf}`` where ``inf[0]`` is the file to
        process; any other value is wrapped as ``{"": infile}``.
        NOTE(review): ``inf[0]`` is used as the path, so a bare string path
        would not work here - confirm what callers pass.
    :param genomeName: genome name used to locate the repeats file.
    :param repeatsPath: root directory of the repeats files.
    :param outdir: if given, one ``<chrom>.bed`` file is written there per
        chromosome; otherwise a single new ``.bed`` file is created.
    :param via: passed on to `coverageBed.nonblocking`.
    :returns: *outdir* if it was given, otherwise the name of the result file.
    """
    def write_coverage(cover_file, chrom, outf):
        # Append the reformatted coverageBed rows of one chromosome to outf.
        coverout = track(cover_file, format='text',
                         fields=['chr','start','end','name','c1','c2','c3','c4'])
        for s in sorted_stream(coverout.read(), [chrom]):
            s_split = s[3].split('|')
            infos = '|'.join(s_split[0:(len(s_split)-4)]+list(s[4:8]))
            outf.write('\t'.join([str(x) for x in s[0:3]+(infos,)])+'\n')

    if not isinstance(infile, dict):
        infile = {"": infile}
    if outdir is None:
        resfile = unique_filename_in()+".bed"
    repeatsFile = os.path.join(repeatsPath, genomeName, genomeName+'_rmsk.bed')
    if not os.path.exists(repeatsFile):
        print("coverage in repeats not calculated as file "+repeatsFile+" does not exist.")
        if outdir is None:
            # Create the (empty) result file first, then concatenate into it.
            open(resfile, 'w').close()
            cat([inf[0] for inf in infile.values()], out=resfile)
            return resfile
        for chrom, inf in infile.iteritems():
            shutil.copy(inf[0], os.path.join(outdir, chrom+".bed"))
        return outdir

    # Start all jobs first, then collect the results one by one.
    futures = {}
    for chrom, inf in infile.iteritems():
        tmpfile = unique_filename_in()
        futures[chrom] = (tmpfile,
                          coverageBed.nonblocking(ex, repeatsFile, inf[0],
                                                  via=via, stdout=tmpfile))

    # 'with' guarantees the output files are closed even if a job or the
    # parsing of its result raises.
    if outdir is None:
        with open(resfile, 'w') as outf:
            for chrom, fut in futures.iteritems():
                fut[1].wait()
                write_coverage(fut[0], chrom, outf)
        return resfile
    for chrom, fut in futures.iteritems():
        with open(os.path.join(outdir, chrom+".bed"), 'w') as outf:
            fut[1].wait()
            write_coverage(fut[0], chrom, outf)
    return outdir
def main(argv = None):
    """Command-line entry point of camelPeaks (ChIP-seq peak deconvolution).

    Options are parsed with optparse from the module-level ``opts`` table;
    *argv* is not used (``parse_args()`` is called without arguments).  The
    peaks file and both strand density files must exist.  For every
    chromosome the peak regions and the per-strand densities are loaded into
    an R data frame, ``deconv_fcts.R`` is sourced and the deconvolution is
    run in R; the results are appended to ``<output>_peaks.bed`` and
    ``<output>_deconv.bedgraph`` and plotted to ``<output>.pdf``.

    The function always ends the process: ``sys.exit(0)`` on success,
    ``sys.exit(2)`` after printing the message and usage string when a
    ``Usage`` error is raised.
    """
    try:
        usage = "camelPeaks.py [OPTIONS]"
        desc = """A ChIP-seq peak deconvolution algorithm."""
        parser = optparse.OptionParser(usage=usage, description=desc)
        for opt in opts:
            parser.add_option(opt[0],opt[1],help=opt[2],**opt[3])
        # From here on 'opt' is the parsed options object, not an entry of 'opts'.
        (opt, args) = parser.parse_args()
        if not(opt.peaks and os.path.exists(opt.peaks)):
            parser.print_help()
            raise Usage("Specify a valid peaks file with -p.")
        if not(opt.forward and os.path.exists(opt.forward)):
            parser.print_help()
            raise Usage("Specify a valid forward strand density file with -f.")
        if not(opt.reverse and os.path.exists(opt.reverse)):
            parser.print_help()
            raise Usage("Specify a valid reverse strand density file with -r.")
        ####
        # Chromosome metadata: a single explicit chromosome, or the genome option.
        if opt.chromosome and opt.length:
            chrmeta = {opt.chromosome: {'length': opt.length}}
        else:
            chrmeta = opt.genome
        peak_track = track(opt.peaks,chrmeta=chrmeta)
        chrmeta = peak_track.chrmeta
        # Restrict the run to the requested chromosome, if any.
        if opt.chromosome: chrmeta = {opt.chromosome: chrmeta[opt.chromosome]}
        # NOTE(review): track_info is not used anywhere below.
        track_info = {'datatype': peak_track.info.get('datatype','qualitative')}
        outbed = track(opt.output+"_peaks.bed", chrmeta=chrmeta,
                       fields=["chr","start","end","name","score"])
        outwig = track(opt.output+"_deconv.bedgraph", chrmeta=chrmeta)
        outwig.open(mode='overwrite')
        topts = {'chrmeta': chrmeta, 'readonly': True}
        for chrom,cv in chrmeta.iteritems():
            peak_stream = sorted_stream(peak_track.read(selection=chrom),[chrom])
            # Maps each density stream to the data-frame column it fills.
            strands = {track(opt.forward,**topts).read(chrom,fields=['start','end','score']): 'plus',
                       track(opt.reverse,**topts).read(chrom,fields=['start','end','score']): 'minus'}
            robjects.r('options(stringsAsFactors=F)')
            # 'counts' lives in the R global environment and is reset per chromosome.
            robjects.r('counts=data.frame()')
            for row_count,peak in enumerate(peak_stream):
                start = int(peak[peak_stream.fields.index('start')])
                end = int(peak[peak_stream.fields.index('end')])
                # Skip regions larger than the size cutoff; clip the rest
                # to the chromosome bounds.
                if end-start > opt.sizecutoff: continue
                if start < 0: start = 0
                if not(end <= cv['length']): end = cv['length']
                # Use the peak name when available, else its 1-based rank.
                if 'name' in peak_stream.fields:
                    reg_name = peak[peak_stream.fields.index('name')]
                else:
                    reg_name = str(row_count+1)
                # One row per position of the region, densities initialised to 0.
                data_block = robjects.DataFrame({'pos':   robjects.IntVector(range(start+1,end+1)),
                                                 'plus':  robjects.FloatVector([0]*(end-start)),
                                                 'minus': robjects.FloatVector([0]*(end-start)),
                                                 'name':  robjects.StrVector([reg_name]*(end-start))})
                for stream,strnd in strands.iteritems():
                    # The density streams are not re-created per peak:
                    # iteration resumes where the previous peak stopped.
                    for row in stream:
                        if row[0]<start: continue
                        if row[1]>end: break
                        data_block.rx2(strnd)[(row[0]-start):(row[1]-start)] = \
                            robjects.FloatVector([row[2]]*(row[1]-row[0]))
                robjects.r.assign('newblock',data_block)
                robjects.r('counts=rbind(counts,newblock)')
            # Export the run parameters to R and load the deconvolution functions.
            robjects.r('read.length=%i' %opt.extension)
            robjects.r('chr.name="%s"' %chrom)
            robjects.r('pdf.file="%s.pdf"' %opt.output)
            robjects.r('mu=%i' %opt.mu)
            robjects.r('ktype="%s"' %opt.kernel)
            robjects.r('source("%s")' %os.path.join(opt.script,"deconv_fcts.R"))
            # R side: split counts by region name, plot the strand
            # cross-correlation, solve the deconvolution, plot the solutions,
            # then build the 'bed' and 'wig' data frames read back below.
            robjects.r("""
counts = split(counts[,c("pos","plus","minus")],counts$name)
pdf(file=pdf.file,title='chip-seq',paper='a4',width=8,height=11)
par(cex=1.5,lwd=1.5)
ccf = cross.correlate(counts,threshold=.5)
plot(ccf$lag,ccf$acf,t='l',ylim=c(0,1),
     xlab='Lag',ylab='Cross-correlation',
     main=paste('Strand cross-correlation',chr.name))
cut.ccf = ccf$acf
cut.ccf[which(ccf$lag<mu)] = 0
lambda = ccf$lag[which.max(cut.ccf)]
sol = inverse.solve(counts,mu=mu,lambda=lambda,len=read.length,regul=1e-3,optimize=TRUE,ktype=ktype)
col = 'red'
lab = paste('lambda=',sol$par$lambda,sep='')
abline(v=sol$par$lambda,col=col)
text(sol$par$lambda,0,lab,col=col,pos=4)
col = 'blue'
lab = paste('mu=',sol$par$mu,sep='')
abline(v=sol$par$mu,col=col)
text(sol$par$mu,0.3,lab,col=col,pos=4)
col = 'darkgreen'
lab = paste('l=',read.length,sep='')
abline(v=read.length,col=col)
text(read.length,0.6,lab,col=col,pos=4)
par(mfrow=c(4,2))
for (n in names(counts)) {
    if (sol$sol[[n]]$value>.65) next
    plot.sol(counts[[n]],sol$sol[[n]],sol$par)
    title(sub=chr.name)
}
dev.off()
bed = data.frame()
cutoff = 1e-3
for (n in names(counts)) {
    I = which(sol$sol[[n]]$prob>cutoff*sum(sol$sol[[n]]$prob))
    if (length(I)<2) next
    interval = range(counts[[n]]$pos[I])
    score = sum(sol$sol[[n]]$prob[I])
    name = paste('ID=',n,';FERR=',round(sol$sol[[n]]$val,digits=4),sep='')
    bed = rbind(bed,data.frame(
        start=interval[1],end=interval[2],
        name=name,score=score))
}
bed[,'start'] = as.integer(bed[,'start']-1)
wig = data.frame()
for (n in names(counts)) {
    I = which(sol$sol[[n]]$prob>cutoff*sum(sol$sol[[n]]$prob))
    wig = rbind(wig,data.frame(
        pos = as.integer(counts[[n]]$pos[I]),
        score = as.numeric(sol$sol[[n]]$prob[I])))
}
""")
            # Copy the R 'bed' data frame row by row into the peaks track.
            nrow = robjects.r("nrow(bed)")[0]
            outbed.write(((robjects.r("bed").rx2('start')[ri],
                           robjects.r("bed").rx2('end')[ri],
                           robjects.r("bed").rx2('name')[ri],
                           robjects.r("bed").rx2('score')[ri]) for ri in xrange(nrow)),
                         fields=["start","end","name","score"],
                         chrom=chrom, mode='append')
            # Each 'wig' position becomes a one-base interval (pos-1, pos).
            nrow = robjects.r("nrow(wig)")[0]
            outwig.write(((robjects.r("wig").rx2('pos')[ri]-1,
                           robjects.r("wig").rx2('pos')[ri],
                           robjects.r("wig").rx2('score')[ri]) for ri in xrange(nrow)),
                         fields=["start","end","score"],
                         chrom=chrom, mode='append')
        outwig.close()
        print "************OUTPUT FILES**********"
        print "\n".join([opt.output+".pdf",
                         opt.output+"_peaks.bed",
                         opt.output+"_deconv.bedgraph"])
        print "************PARAMETERS**********"
        # The values are read back from R, i.e. from the last chromosome processed.
        print "lambda=%f|mu=%f|len=%i" %(robjects.r("sol$par$lambda")[0],robjects.r("sol$par$mu")[0],robjects.r("read.length")[0])
        sys.exit(0)
    except Usage, err:
        print >>sys.stderr, err.msg
        print >>sys.stderr, usage
        sys.exit(2)
def __call__(self, **kw):
    """Produce a pairs plot (correlation or density) of several signal tracks.

    Keys read from *kw*:

    * ``feature_type``: 0 gene bodies (default), 1 promoters, 2 exons,
      3 custom track given in ``features``.
    * ``assembly``: assembly id; required unless ``feature_type`` is 3.
    * ``SigMulti['signals']``: one signal file or a list of them.
    * ``HiMulti['highlights']``: optional highlight file(s); None means none.
    * ``upstream`` / ``downstream``: promoter extent (``feature_type`` 1).
    * ``mode``: 0 for cross-correlation, 1 for density.
    * ``cormax``: maximum lag for the correlation mode.

    The plot is written to ``plot_pairs.pdf`` and registered with
    ``self.new_file``.

    :returns: the value of ``self.display_time()``.
    :raises ValueError: if no assembly is given for a built-in feature type,
        the feature type or mode is unknown, or no data could be computed.
    """
    feature_type = int(kw.get('feature_type') or 0)
    assembly_id = kw.get('assembly') or None
    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif not (feature_type == 3):
        raise ValueError("Please specify an assembly")
    signals = kw.get('SigMulti', {}).get('signals', [])
    if not isinstance(signals, list):
        signals = [signals]
    signals = [track(sig, chrmeta=chrmeta) for sig in signals]
    snames = [sig.name for sig in signals]
    # 'features' is a callable returning the feature stream of one chromosome.
    if feature_type == 0:  #bodies
        features = genes
    elif feature_type == 1:  #promoters
        prom_pars = {
            'before_start': int(kw.get('upstream') or prom_up_def),
            'after_start': int(kw.get('downstream') or prom_down_def),
            'on_strand': True
        }
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type == 2:  #exons
        features = exons
    elif feature_type == 3:  #custom track
        _t = track(kw.get('features'), chrmeta=chrmeta)
        chrmeta = _t.chrmeta
        features = _t.read
    else:
        raise ValueError("Feature type not known: %i" % feature_type)
    highlights = kw.get('HiMulti', {}).get('highlights', [])
    # Test for None *before* wrapping: a None value used to become [None]
    # and was then passed to track().
    if highlights is None:
        highlights = []
    elif not isinstance(highlights, list):
        highlights = [highlights]
    highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]
    pdf = self.temporary_path(fname='plot_pairs.pdf')
    narr = None
    set_index = []
    set_labels = []
    mode = int(kw['mode'])
    if mode == 0:  #correl
        cormax = int(kw.get('cormax') or _cormax)
        xarr = array(range(-cormax, cormax + 1))
        srtdchrom = sorted(chrmeta.keys())
        features = [
            x[:3] for chrom in srtdchrom
            for x in sorted_stream(features(chrom))
        ]
        _f = ['chr', 'start', 'end', 'score']
        narr = correlation([s.read(fields=_f) for s in signals], features,
                           (-cormax, cormax), True)
    elif mode == 1:  #density
        xarr = None
        for chrom in chrmeta:
            feat = features(chrom)
            if 'name' not in feat.fields:
                feat = add_name_field(feat)
            means = score_by_feature([s.read(chrom) for s in signals], feat)
            mf = means.fields[len(feat.fields):]
            _n, _l = score_array(means, mf)
            if _n.size == 0:
                continue
            if narr is None:
                narr = _n
            else:
                narr = vstack((narr, _n))
        # Without this guard narr.shape below fails with AttributeError
        # when no chromosome produced any score.
        if narr is None:
            raise ValueError("No data")
        # set_index records the row boundary after the features and after
        # each highlight track.
        set_index = [narr.shape[0]]
        for hitrack in highlights:
            for chrom in chrmeta:
                hiread = hitrack.read(chrom)
                if 'name' not in hiread.fields:
                    hiread = add_name_field(hiread)
                means = score_by_feature([s.read(chrom) for s in signals],
                                         hiread)
                mf = means.fields[len(hiread.fields):]
                _n, _l = score_array(means, mf)
                if _n.size == 0:
                    continue
                narr = vstack((narr, _n))
                set_labels.extend(_l)
            set_index.append(narr.shape[0])
    else:
        raise ValueError("Mode not implemented: %s" % kw['mode'])
    if narr is None:
        raise ValueError("No data")
    pairs(narr, xarr, labels=snames, output=pdf,
          highlights=[set_index, set_labels])
    self.new_file(pdf, 'plot_pairs')
    return self.display_time()
def __call__(self, **kw):
    """Produce a pairs plot (correlation or density) of several signal tracks.

    Keys read from *kw*:

    * ``feature_type``: 0 gene bodies (default), 1 promoters, 2 exons,
      3 custom track given in ``features``.
    * ``individual``: boolean or string ("1", "true", "t", "on"); one
      correlation plot per feature.  Only accepted with ``mode`` 1.
    * ``assembly``: assembly id; required unless ``feature_type`` is 3.
    * ``signals``: one signal file or a list of them.
    * ``highlights``: optional highlight file(s); None means none.
    * ``upstream`` / ``downstream``: promoter extent (``feature_type`` 1).
    * ``mode``: 1 for cross-correlation, 0 for density.
    * ``cormax``: maximum lag for the correlation mode.

    The plot is written to ``plot_pairs.pdf``; in correlation mode a
    ``table.txt`` with the maximal correlation and its lag is written too.
    Both are registered with ``self.new_file``.

    :returns: the value of ``self.display_time()``.
    :raises ValueError: if ``individual`` is used outside correlation mode,
        no assembly is given for a built-in feature type, the feature type
        or mode is unknown, or no data could be computed.
    """
    feature_type = int(kw.get("feature_type") or 0)
    individual = kw.get("individual", False)
    if isinstance(individual, basestring):
        individual = individual.lower() in ["1", "true", "t", "on"]
    if individual and int(kw["mode"]) != 1:
        raise ValueError("Only correlation plots can work with the 'individual' option.")
    assembly_id = kw.get("assembly") or None
    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif not (feature_type == 3):
        raise ValueError("Please specify an assembly")
    # signals = kw.get('SigMulti',{}).get('signals', [])
    signals = kw.get("signals", [])
    if not isinstance(signals, list):
        signals = [signals]
    signals = [track(sig, chrmeta=chrmeta) for sig in signals]
    snames = [sig.name for sig in signals]
    # 'features' is a callable returning the feature stream of one chromosome.
    if feature_type == 0:  # bodies
        features = genes
    elif feature_type == 1:  # promoters
        prom_pars = {
            "before_start": int(kw.get("upstream") or prom_up_def),
            "after_start": int(kw.get("downstream") or prom_down_def),
            "on_strand": True,
        }
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type == 2:  # exons
        features = exons
    elif feature_type == 3:  # custom track
        _t = track(kw.get("features"), chrmeta=chrmeta)
        chrmeta = _t.chrmeta
        features = _t.read
    else:
        raise ValueError("Feature type not known: %i" % feature_type)
    # highlights = kw.get('HiMulti',{}).get('highlights', [])
    highlights = kw.get("highlights", [])
    # Test for None *before* wrapping: a None value used to become [None]
    # and was then passed to track().
    if highlights is None:
        highlights = []
    elif not isinstance(highlights, list):
        highlights = [highlights]
    highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]
    pdf = self.temporary_path(fname="plot_pairs.pdf")
    narr = None
    set_index = []
    set_labels = []
    _new = True
    mode = int(kw["mode"])
    if mode == 1:  # correl
        cormax = int(kw.get("cormax") or _cormax)
        xarr = array(range(-cormax, cormax + 1))
        _f = ["chr", "start", "end", "score"]
        features = [x[:3] for chrom in chrmeta for x in sorted_stream(features(chrom))]
        table = self.temporary_path(fname="table.txt")
        with open(table, "w") as t:
            t.write("\t".join(["chr", "start", "end", "max(correlation)", "lag_max"]) + "\n")
            if individual:
                for nplot, feature in enumerate(features):
                    # Plot the previous feature's result (at most
                    # _MAX_PLOTS_ of them); the last one is plotted by the
                    # final pairs() call below.
                    if narr is not None and nplot < _MAX_PLOTS_:
                        pairs(narr, xarr, labels=snames, output=pdf, new=_new, last=False)
                        _new = False
                    narr = correlation([s.read(fields=_f) for s in signals],
                                       [feature], (-cormax, cormax), True)
                    list_corr = list(narr[0][0])
                    max_corr = max(list_corr)
                    lag_max = list_corr.index(max_corr) - cormax
                    t.write("\t".join([str(x) for x in feature[:3] + (max_corr, lag_max)]) + "\n")
            else:
                narr = correlation([s.read(fields=_f) for s in signals],
                                   features, (-cormax, cormax), True)
                list_corr = list(narr[0][0])
                max_corr = max(list_corr)
                lag_max = list_corr.index(max_corr) - cormax
                t.write("\t".join(["-", "-", "-"] + [str(max_corr), str(lag_max)]) + "\n")
    elif mode == 0:  # density
        xarr = None
        for chrom in chrmeta:
            feat = features(chrom)
            if "name" not in feat.fields:
                feat = add_name_field(feat)
            means = score_by_feature([s.read(chrom) for s in signals], feat)
            mf = means.fields[len(feat.fields):]
            _n, _l = score_array(means, mf)
            if _n.size == 0:
                continue
            if narr is None:
                narr = _n
            else:
                narr = vstack((narr, _n))
        # Without this guard narr.shape below fails with AttributeError
        # when no chromosome produced any score.
        if narr is None:
            raise ValueError("No data")
        # set_index records the row boundary after the features and after
        # each highlight track.
        set_index = [narr.shape[0]]
        for hitrack in highlights:
            for chrom in chrmeta:
                hiread = hitrack.read(chrom)
                if "name" not in hiread.fields:
                    hiread = add_name_field(hiread)
                means = score_by_feature([s.read(chrom) for s in signals], hiread)
                mf = means.fields[len(hiread.fields):]
                _n, _l = score_array(means, mf)
                if _n.size == 0:
                    continue
                narr = vstack((narr, _n))
                set_labels.extend(_l)
            set_index.append(narr.shape[0])
    else:
        raise ValueError("Mode not implemented: %s" % kw["mode"])
    if narr is None:
        raise ValueError("No data")
    pairs(narr, xarr, labels=snames, output=pdf,
          highlights=[set_index, set_labels], new=_new, last=True)
    if mode == 1:
        self.new_file(table, "table")
    self.new_file(pdf, "plot_pairs")
    return self.display_time()
def __call__(self, **kw):
    """Produce a pairs plot (correlation or density) of several signal tracks.

    Keys read from *kw*:

    * ``feature_type``: 0 gene bodies (default), 1 promoters, 2 exons,
      3 custom track given in ``features``.
    * ``assembly``: assembly id; required unless ``feature_type`` is 3.
    * ``SigMulti['signals']``: one signal file or a list of them.
    * ``HiMulti['highlights']``: optional highlight file(s); None means none.
    * ``upstream`` / ``downstream``: promoter extent (``feature_type`` 1).
    * ``mode``: 0 for cross-correlation, 1 for density.
    * ``cormax``: maximum lag for the correlation mode.

    The plot is written to ``plot_pairs.pdf`` and registered with
    ``self.new_file``.

    :returns: the value of ``self.display_time()``.
    :raises ValueError: if no assembly is given for a built-in feature type,
        the feature type or mode is unknown, or no data could be computed.
    """
    feature_type = int(kw.get('feature_type') or 0)
    assembly_id = kw.get('assembly') or None
    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif not(feature_type == 3):
        raise ValueError("Please specify an assembly")
    signals = kw.get('SigMulti',{}).get('signals', [])
    if not isinstance(signals, list):
        signals = [signals]
    signals = [track(sig, chrmeta=chrmeta) for sig in signals]
    snames = [sig.name for sig in signals]
    # 'features' is a callable returning the feature stream of one chromosome.
    if feature_type == 0: #bodies
        features = genes
    elif feature_type == 1: #promoters
        prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                     'after_start': int(kw.get('downstream') or prom_down_def),
                     'on_strand': True}
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type == 2: #exons
        features = exons
    elif feature_type == 3: #custom track
        _t = track(kw.get('features'), chrmeta=chrmeta)
        chrmeta = _t.chrmeta
        features = _t.read
    else:
        raise ValueError("Feature type not known: %i" % feature_type)
    highlights = kw.get('HiMulti',{}).get('highlights', [])
    # Test for None *before* wrapping: a None value used to become [None]
    # and was then passed to track().
    if highlights is None:
        highlights = []
    elif not isinstance(highlights, list):
        highlights = [highlights]
    highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]
    pdf = self.temporary_path(fname='plot_pairs.pdf')
    narr = None
    set_index = []
    set_labels = []
    mode = int(kw['mode'])
    if mode == 0: #correl
        cormax = int(kw.get('cormax') or _cormax)
        xarr = array(range(-cormax, cormax + 1))
        srtdchrom = sorted(chrmeta.keys())
        features = [x[:3] for chrom in srtdchrom
                    for x in sorted_stream(features(chrom))]
        _f = ['chr', 'start', 'end', 'score']
        narr = correlation([s.read(fields=_f) for s in signals],
                           features, (-cormax, cormax), True)
    elif mode == 1: #density
        xarr = None
        for chrom in chrmeta:
            feat = features(chrom)
            if 'name' not in feat.fields:
                feat = add_name_field(feat)
            means = score_by_feature([s.read(chrom) for s in signals], feat)
            mf = means.fields[len(feat.fields):]
            _n, _l = score_array(means, mf)
            if _n.size == 0:
                continue
            if narr is None:
                narr = _n
            else:
                narr = vstack((narr, _n))
        # Without this guard narr.shape below fails with AttributeError
        # when no chromosome produced any score.
        if narr is None:
            raise ValueError("No data")
        # set_index records the row boundary after the features and after
        # each highlight track.
        set_index = [narr.shape[0]]
        for hitrack in highlights:
            for chrom in chrmeta:
                hiread = hitrack.read(chrom)
                if 'name' not in hiread.fields:
                    hiread = add_name_field(hiread)
                means = score_by_feature([s.read(chrom) for s in signals], hiread)
                mf = means.fields[len(hiread.fields):]
                _n, _l = score_array(means, mf)
                if _n.size == 0:
                    continue
                narr = vstack((narr, _n))
                set_labels.extend(_l)
            set_index.append(narr.shape[0])
    else:
        raise ValueError("Mode not implemented: %s" % kw['mode'])
    if narr is None:
        raise ValueError("No data")
    pairs(narr, xarr, labels=snames, output=pdf, highlights=[set_index,set_labels])
    self.new_file(pdf, 'plot_pairs')
    return self.display_time()
def main(argv=None):
    """Command-line entry point of camelPeaks (ChIP-seq peak deconvolution).

    Options are parsed with optparse from the module-level ``opts`` table;
    *argv* is not used (``parse_args()`` is called without arguments).  The
    peaks file and both strand density files must exist.  For every
    chromosome the peak regions and the per-strand densities are loaded into
    an R data frame, ``deconv_fcts.R`` is sourced and the deconvolution is
    run in R; the results are appended to ``<output>_peaks.bed`` and
    ``<output>_deconv.bedgraph`` and plotted to ``<output>.pdf``.

    The function always ends the process: ``sys.exit(0)`` on success,
    ``sys.exit(2)`` after printing the message and usage string when a
    ``Usage`` error is raised.
    """
    try:
        usage = "camelPeaks.py [OPTIONS]"
        desc = """A ChIP-seq peak deconvolution algorithm."""
        parser = optparse.OptionParser(usage=usage, description=desc)
        for opt in opts:
            parser.add_option(opt[0], opt[1], help=opt[2], **opt[3])
        # From here on 'opt' is the parsed options object, not an entry of 'opts'.
        (opt, args) = parser.parse_args()
        if not (opt.peaks and os.path.exists(opt.peaks)):
            parser.print_help()
            raise Usage("Specify a valid peaks file with -p.")
        if not (opt.forward and os.path.exists(opt.forward)):
            parser.print_help()
            raise Usage("Specify a valid forward strand density file with -f.")
        if not (opt.reverse and os.path.exists(opt.reverse)):
            parser.print_help()
            raise Usage("Specify a valid reverse strand density file with -r.")
        ####
        # Chromosome metadata: a single explicit chromosome, or the genome option.
        if opt.chromosome and opt.length:
            chrmeta = {opt.chromosome: {'length': opt.length}}
        else:
            chrmeta = opt.genome
        peak_track = track(opt.peaks, chrmeta=chrmeta)
        chrmeta = peak_track.chrmeta
        # Restrict the run to the requested chromosome, if any.
        if opt.chromosome:
            chrmeta = {opt.chromosome: chrmeta[opt.chromosome]}
        # NOTE(review): track_info is not used anywhere below.
        track_info = {
            'datatype': peak_track.info.get('datatype', 'qualitative')
        }
        outbed = track(opt.output + "_peaks.bed",
                       chrmeta=chrmeta,
                       fields=["chr", "start", "end", "name", "score"])
        outwig = track(opt.output + "_deconv.bedgraph", chrmeta=chrmeta)
        outwig.open(mode='overwrite')
        topts = {'chrmeta': chrmeta, 'readonly': True}
        for chrom, cv in chrmeta.iteritems():
            peak_stream = sorted_stream(peak_track.read(selection=chrom),
                                        [chrom])
            # Maps each density stream to the data-frame column it fills.
            strands = {
                track(opt.forward, **topts).read(chrom, fields=[
                    'start', 'end', 'score'
                ]): 'plus',
                track(opt.reverse, **topts).read(chrom, fields=[
                    'start', 'end', 'score'
                ]): 'minus'
            }
            robjects.r('options(stringsAsFactors=F)')
            # 'counts' lives in the R global environment and is reset per chromosome.
            robjects.r('counts=data.frame()')
            for row_count, peak in enumerate(peak_stream):
                start = int(peak[peak_stream.fields.index('start')])
                end = int(peak[peak_stream.fields.index('end')])
                # Skip regions larger than the size cutoff; clip the rest
                # to the chromosome bounds.
                if end - start > opt.sizecutoff:
                    continue
                if start < 0:
                    start = 0
                if not (end <= cv['length']):
                    end = cv['length']
                # Use the peak name when available, else its 1-based rank.
                if 'name' in peak_stream.fields:
                    reg_name = peak[peak_stream.fields.index('name')]
                else:
                    reg_name = str(row_count + 1)
                # One row per position of the region, densities initialised to 0.
                data_block = robjects.DataFrame({
                    'pos': robjects.IntVector(range(start + 1, end + 1)),
                    'plus': robjects.FloatVector([0] * (end - start)),
                    'minus': robjects.FloatVector([0] * (end - start)),
                    'name': robjects.StrVector([reg_name] * (end - start))
                })
                for stream, strnd in strands.iteritems():
                    # The density streams are not re-created per peak:
                    # iteration resumes where the previous peak stopped.
                    for row in stream:
                        if row[0] < start:
                            continue
                        if row[1] > end:
                            break
                        data_block.rx2(strnd)[(row[0]-start):(row[1]-start)] = \
                            robjects.FloatVector([row[2]]*(row[1]-row[0]))
                robjects.r.assign('newblock', data_block)
                robjects.r('counts=rbind(counts,newblock)')
            # Export the run parameters to R and load the deconvolution functions.
            robjects.r('read.length=%i' % opt.extension)
            robjects.r('chr.name="%s"' % chrom)
            robjects.r('pdf.file="%s.pdf"' % opt.output)
            robjects.r('mu=%i' % opt.mu)
            robjects.r('ktype="%s"' % opt.kernel)
            robjects.r('source("%s")' %
                       os.path.join(opt.script, "deconv_fcts.R"))
            # R side: split counts by region name, plot the strand
            # cross-correlation, solve the deconvolution, plot the solutions,
            # then build the 'bed' and 'wig' data frames read back below.
            robjects.r("""
counts = split(counts[,c("pos","plus","minus")],counts$name)
pdf(file=pdf.file,title='chip-seq',paper='a4',width=8,height=11)
par(cex=1.5,lwd=1.5)
ccf = cross.correlate(counts,threshold=.5)
plot(ccf$lag,ccf$acf,t='l',ylim=c(0,1),
     xlab='Lag',ylab='Cross-correlation',
     main=paste('Strand cross-correlation',chr.name))
cut.ccf = ccf$acf
cut.ccf[which(ccf$lag<mu)] = 0
lambda = ccf$lag[which.max(cut.ccf)]
sol = inverse.solve(counts,mu=mu,lambda=lambda,len=read.length,regul=1e-3,optimize=TRUE,ktype=ktype)
col = 'red'
lab = paste('lambda=',sol$par$lambda,sep='')
abline(v=sol$par$lambda,col=col)
text(sol$par$lambda,0,lab,col=col,pos=4)
col = 'blue'
lab = paste('mu=',sol$par$mu,sep='')
abline(v=sol$par$mu,col=col)
text(sol$par$mu,0.3,lab,col=col,pos=4)
col = 'darkgreen'
lab = paste('l=',read.length,sep='')
abline(v=read.length,col=col)
text(read.length,0.6,lab,col=col,pos=4)
par(mfrow=c(4,2))
for (n in names(counts)) {
    if (sol$sol[[n]]$value>.65) next
    plot.sol(counts[[n]],sol$sol[[n]],sol$par)
    title(sub=chr.name)
}
dev.off()
bed = data.frame()
cutoff = 1e-3
for (n in names(counts)) {
    I = which(sol$sol[[n]]$prob>cutoff*sum(sol$sol[[n]]$prob))
    if (length(I)<2) next
    interval = range(counts[[n]]$pos[I])
    score = sum(sol$sol[[n]]$prob[I])
    name = paste('ID=',n,';FERR=',round(sol$sol[[n]]$val,digits=4),sep='')
    bed = rbind(bed,data.frame(
        start=interval[1],end=interval[2],
        name=name,score=score))
}
bed[,'start'] = as.integer(bed[,'start']-1)
wig = data.frame()
for (n in names(counts)) {
    I = which(sol$sol[[n]]$prob>cutoff*sum(sol$sol[[n]]$prob))
    wig = rbind(wig,data.frame(
        pos = as.integer(counts[[n]]$pos[I]),
        score = as.numeric(sol$sol[[n]]$prob[I])))
}
""")
            # Copy the R 'bed' data frame row by row into the peaks track.
            nrow = robjects.r("nrow(bed)")[0]
            outbed.write(((robjects.r("bed").rx2('start')[ri],
                           robjects.r("bed").rx2('end')[ri],
                           robjects.r("bed").rx2('name')[ri],
                           robjects.r("bed").rx2('score')[ri])
                          for ri in xrange(nrow)),
                         fields=["start", "end", "name", "score"],
                         chrom=chrom,
                         mode='append')
            # Each 'wig' position becomes a one-base interval (pos-1, pos).
            nrow = robjects.r("nrow(wig)")[0]
            outwig.write(((robjects.r("wig").rx2('pos')[ri] - 1,
                           robjects.r("wig").rx2('pos')[ri],
                           robjects.r("wig").rx2('score')[ri])
                          for ri in xrange(nrow)),
                         fields=["start", "end", "score"],
                         chrom=chrom,
                         mode='append')
        outwig.close()
        print "************OUTPUT FILES**********"
        print "\n".join([
            opt.output + ".pdf", opt.output + "_peaks.bed",
            opt.output + "_deconv.bedgraph"
        ])
        print "************PARAMETERS**********"
        # The values are read back from R, i.e. from the last chromosome processed.
        print "lambda=%f|mu=%f|len=%i" % (robjects.r("sol$par$lambda")[0],
                                          robjects.r("sol$par$mu")[0],
                                          robjects.r("read.length")[0])
        sys.exit(0)
    except Usage, err:
        print >> sys.stderr, err.msg
        print >> sys.stderr, usage
        sys.exit(2)
def feature_matrix(trackScores,trackFeatures,segment=False,method='mean',**kw):
    r"""
    Score every feature of *trackFeatures* against each track of *trackScores*.

    The features are scored with `score_by_feature` and the results are
    collected per feature name (the stream must have a 'name' field).

    If *segment* is False, each feature gets one value per score track.
    If *segment* is True, each feature is first cut into bins with
    `segment_features` (options taken from *\*\*kw*) and sorted; each
    feature then gets an array with one row per bin and one column per
    score track.

    Note: the whole segmented features track will be loaded in memory.

    :param trackScores: (FeatureStream, or list of FeatureStream objects) score track(s).
    :param trackFeatures: (FeatureStream) feature track.
    :param segment: (bool) segment each feature into bins.[False]
    :param method: (str) passed as the `method` argument of `score_by_feature`.
    :param \*\*kw: arguments to pass to segment_features (`nbins`,`upstream`,`downstream`).
    :rtype: tuple (numpy.ndarray of feature names, numpy.ndarray of scores),
        both in the iteration order of the same dictionary.
    """
    nbins = 1
    if segment:
        trackFeatures = sorted_stream(segment_features(trackFeatures,**kw))
        # Bins per feature: body bins plus upstream and downstream flank bins.
        flank_up = kw.get('upstream',(0,0))[1]
        flank_down = kw.get('downstream',(0,0))[1]
        nbins = kw.get('nbins',segment_features.func_defaults[0]) + flank_up + flank_down
    all_means = score_by_feature(trackScores,trackFeatures,method=method)
    nfields = len(trackFeatures.fields)
    nscores = len(trackScores) if isinstance(trackScores,(list,tuple)) else 1
    name_idx = all_means.fields.index('name')
    scores_dict = {}
    if segment:
        empty_mat = numpy.zeros(shape=(nbins,nscores))
        for row in all_means:
            # The last feature field holds the bin index; the score columns follow.
            per_feature = scores_dict.setdefault(row[name_idx], empty_mat.copy())
            per_feature[row[nfields-1]] = row[nfields:]
    else:
        for row in all_means:
            scores_dict[row[name_idx]] = row[nfields:]
    feat_names = numpy.array(scores_dict.keys())
    scores_mat = numpy.array(scores_dict.values())
    return (feat_names,scores_mat)