def test_score_by_feature(self):
    """Check score_by_feature with the default method and with method=sum."""
    # Default method: each feature row gets one averaged score per signal track.
    feats = fstream([('chr',5,15,'gene1'),('chr',30,40,'gene2')],
                    fields=['chr','start','end','name'])
    sig_a = fstream([('chr',10,20,6.),('chr',30,40,6.)],
                    fields=['chr','start','end','score'])
    sig_b = fstream([('chr',30,40,2.)],
                    fields=['chr','start','end','score'])
    observed = list(score_by_feature([sig_a,sig_b],feats))
    self.assertListEqual(observed,
                         [('chr',5,15,'gene1',3.,0.),('chr',30,40,'gene2',6.,2.)])
    # normalize = False
    # With method=sum, totals over the overlap are reported instead of means
    # (gene1 overlaps sig_a on 5 bases of score 6 -> 30).
    feats = fstream([('chr',5,15,'gene1'),('chr',30,40,'gene2')],
                    fields=['chr','start','end','name'])
    sig_a = fstream([('chr',10,20,6.),('chr',30,40,6.)],
                    fields=['chr','start','end','score'])
    sig_b = fstream([('chr',2,8,2.),('chr',30,33,3.)],
                    fields=['chr','start','end','score'])
    observed = list(score_by_feature([sig_a,sig_b],feats,method=sum))
    self.assertListEqual(observed,
                         [('chr',5,15,'gene1',30.,6.),('chr',30,40,'gene2',60.,9.)])
def plot_footprint_profile(ex, bedlist, signals, chrnames, groups, logfile):
    """Plot average signal profiles around motif footprints, one PDF per group.

    For each group id in *bedlist*, reads the motif bed file, groups regions by
    motif (parsed from the 4th bed column as "name:sequence" — presumably; confirm
    against the producer of these bed files), accumulates per-bin scores from the
    group's signal tracks, and draws one lineplot page per motif.

    Returns a dict {gid: {'pdf': filename, 'mat': [(motif_name, matrix_file), ...]}}.
    """
    files = dict((gid, {'pdf': "", 'mat': []}) for gid in bedlist.keys())
    logfile.write("Plotting footprints:\n")
    logfile.flush()
    for gid, motifbed in bedlist.iteritems():
        # signals = [track(sig) for sig in siglist[gid]]
        snames = [sig.name for sig in signals[gid]]
        tmotif = track(motifbed, format='bed')
        data = {}      # (motif_name, motif_len) -> accumulated score matrix (bins x signals)
        numregs = {}   # (motif_name, motif_len) -> number of regions accumulated
        for chrom in chrnames:
            fread = {}
            for r in tmotif.read(chrom):
                # bed 'name' column encodes the motif; key by (name, motif length)
                r2 = r[3].split(":")
                key = (r2[0], len(r2[1]))
                if key in fread:
                    fread[key].append(r[1:3])
                else:
                    fread[key] = [r[1:3]]
            for motif, regs in fread.iteritems():
                if motif not in data:
                    # one row per bin (motif length + both flanks), one column per signal
                    data[motif] = zeros(shape=(motif[1] + 2 * _plot_flank[1], len(signals[gid])))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                # split each region into nbins bins plus up/downstream flanks
                tFeat = sorted_stream(
                    segment_features(FeatureStream(regs, fields=['start', 'end']),
                                     nbins=motif[1], upstream=_plot_flank, downstream=_plot_flank))
                for t in score_by_feature(
                        [s.read(chrom) for s in signals[gid]], tFeat):
                    # t[2] is the bin index; t[3:] are the per-signal scores
                    data[motif][t[2]] += t[3:]
        files[gid]['pdf'] = unique_filename_in()
        new = True
        last = len(data)
        for motif, dat in data.iteritems():
            last -= 1
            mname, nbins = motif
            dat /= float(numregs[motif])  # average over all regions of this motif
            # x axis: flank positions as ints, motif bins relabelled "1".."nbins"
            X = range(-_plot_flank[1], _plot_flank[1] + nbins)
            for k in range(nbins):
                X[k + _plot_flank[1]] = str(k + 1)
            ####### Could do a heatmap (sort by intensity)...
            lineplot(X, [dat[:, n] for n in range(dat.shape[-1])], mfrow=[4, 2],
                     output=files[gid]['pdf'], new=new, last=(last == 0),
                     legend=snames, main=mname)
            new = False
            # also dump the averaged matrix as a tab-separated text file
            _datf = unique_filename_in()
            with open(_datf, "w") as dff:
                dff.write("\t".join([""] + [str(x) for x in X]) + "\n")
                for n, sn in enumerate(snames):
                    dff.write("\t".join([sn] + [str(x) for x in dat[:, n]]) + "\n")
            files[gid]['mat'].append((mname, _datf))
    return files
def quantify(self,**kw):
    """Quantify signal tracks over a chosen feature set and write one output track.

    Keyword args (from the web form, presumably — confirm against caller):
    ``feature_type`` selects gene bodies / promoters / exons / custom track via the
    module-level ``ftypes`` table; ``score_op`` is the aggregation method passed to
    ``score_by_feature``; ``assembly`` picks a genrep assembly; ``output`` the format.
    Returns the path of the written file.
    """
    feature_type = kw.get('feature_type', 0)
    # kw values arrive as strings; coerce to int only if it matches a known ftype code
    if str(feature_type) in [str(x[0]) for x in ftypes]:
        feature_type = int(feature_type)
    func = str(kw.get('score_op', 'mean'))
    assembly_id = kw.get('assembly')
    format = kw.get('output') or 'txt'
    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif not(feature_type in ftypes[3]):
        # without an assembly only the custom-track feature type can work
        raise ValueError("Please specify an assembly")
    #signals = kw['SigMulti'].get('signals',[])
    signals = kw.get('signals',[])
    if not isinstance(signals, list): signals = [signals]
    signals = [track(sig, chrmeta=chrmeta) for sig in signals]
    # 'features' is set to a callable chrom -> FeatureStream in every branch
    if feature_type in ftypes[0]:
        features = genes
    elif feature_type in ftypes[1]:
        # promoters: a window around gene starts
        prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                     'after_start': int(kw.get('downstream') or prom_down_def),
                     'on_strand': True}
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type in ftypes[2]:
        features = exons
    elif feature_type in ftypes[3]:
        assert os.path.exists(str(kw.get('features'))), "Features file not found: '%s'" % kw.get("features")
        _t = track(kw['features'], chrmeta=chrmeta)
        chrmeta = _t.chrmeta  # custom track overrides the chromosome metadata
        features = _t.read
    else:
        raise ValueError("Take feature_type in %s." %ftypes)
    output = self.temporary_path(fname='quantification.'+format)
    # one score column per signal track
    if len(signals) > 1:
        _f = ["score%i"%i for i in range(len(signals))]
    else:
        _f = ["score"]
    # NOTE(review): datatype 'qualitative' for a quantification output looks
    # surprising — confirm against the track library's datatype conventions.
    tout = track(output, format, fields=['chr','start','end','name']+_f,
                 chrmeta=chrmeta, info={'datatype':'qualitative'})
    if format == 'txt':
        header = ['#chr','start','end','name']+[s.name for s in signals]
        tout.make_header("\t".join(header))
    for chrom in chrmeta:
        sread = [sig.read(chrom) for sig in signals]
        tout.write(score_by_feature(sread, features(chrom), method=func),
                   chrom=chrom, clip=True, mode="append")
    # NOTE(review): tout is never closed here — verify the track class flushes on GC.
    return output
def test_score_by_feature(self):
    """Check score_by_feature with the default (mean) method and with method=sum."""
    # Default method: one averaged score per feature and per signal track.
    features = fstream([('chr', 5, 15, 'gene1'), ('chr', 30, 40, 'gene2')],
                       fields=['chr', 'start', 'end', 'name'])
    scores1 = fstream([('chr', 10, 20, 6.), ('chr', 30, 40, 6.)],
                      fields=['chr', 'start', 'end', 'score'])
    scores2 = fstream([('chr', 30, 40, 2.)],
                      fields=['chr', 'start', 'end', 'score'])
    res = list(score_by_feature([scores1, scores2], features))
    expected = [('chr', 5, 15, 'gene1', 3., 0.), ('chr', 30, 40, 'gene2', 6., 2.)]
    self.assertListEqual(res, expected)
    # normalize = False
    # method=sum: totals over the overlap (gene1 x scores1: 5 bases * 6 = 30).
    features = fstream([('chr', 5, 15, 'gene1'), ('chr', 30, 40, 'gene2')],
                       fields=['chr', 'start', 'end', 'name'])
    scores1 = fstream([('chr', 10, 20, 6.), ('chr', 30, 40, 6.)],
                      fields=['chr', 'start', 'end', 'score'])
    scores2 = fstream([('chr', 2, 8, 2.), ('chr', 30, 33, 3.)],
                      fields=['chr', 'start', 'end', 'score'])
    res = list(score_by_feature([scores1, scores2], features, method=sum))
    expected = [('chr', 5, 15, 'gene1', 30., 6.), ('chr', 30, 40, 'gene2', 60., 9.)]
    self.assertListEqual(res, expected)
def plot_footprint_profile( ex, bedlist, signals, chrnames, groups, logfile ):
    """Plot average signal profiles around motif footprints, one PDF per group.

    For each group id in *bedlist*, reads its motif bed file, buckets regions by
    motif (parsed from the bed 'name' column as "name:sequence" — presumably;
    confirm against the bed producer), accumulates binned scores from the group's
    signal tracks, and draws one lineplot page per motif.

    Returns {gid: {'pdf': filename, 'mat': [(motif_name, matrix_file), ...]}}.
    """
    files = dict((gid,{'pdf':"",'mat':[]}) for gid in bedlist.keys())
    logfile.write("Plotting footprints:\n");logfile.flush()
    for gid, motifbed in bedlist.iteritems():
        # signals = [track(sig) for sig in siglist[gid]]
        snames = [sig.name for sig in signals[gid]]
        tmotif = track(motifbed,format='bed')
        data = {}      # (motif_name, motif_len) -> score matrix (bins x signals)
        numregs = {}   # (motif_name, motif_len) -> region count
        for chrom in chrnames:
            fread = {}
            for r in tmotif.read(chrom):
                # group regions by motif parsed from the bed name column
                r2 = r[3].split(":")
                key = (r2[0],len(r2[1]))
                if key in fread: fread[key].append(r[1:3])
                else: fread[key] = [r[1:3]]
            for motif, regs in fread.iteritems():
                if motif not in data:
                    # rows: motif bins plus both flanks; columns: signals
                    data[motif] = zeros(shape=(motif[1]+2*_plot_flank[1], len(signals[gid])))
                    numregs[motif] = 0
                numregs[motif] += len(regs)
                # segment each region into nbins bins plus flanks, then score
                tFeat = sorted_stream(segment_features(FeatureStream(regs,fields=['start','end']),
                                                       nbins=motif[1],upstream=_plot_flank,
                                                       downstream=_plot_flank))
                for t in score_by_feature([s.read(chrom) for s in signals[gid]], tFeat):
                    # t[2] is the bin index, t[3:] the per-signal scores
                    data[motif][t[2]] += t[3:]
        files[gid]['pdf'] = unique_filename_in()
        new = True
        last = len(data)
        for motif, dat in data.iteritems():
            last -= 1
            mname, nbins = motif
            dat /= float(numregs[motif])  # average over regions
            # x axis: int flank offsets, motif bins relabelled "1".."nbins"
            X = range(-_plot_flank[1],_plot_flank[1]+nbins)
            for k in range(nbins):
                X[k+_plot_flank[1]] = str(k+1)
            ####### Could do a heatmap (sort by intensity)...
            lineplot(X, [dat[:, n] for n in range(dat.shape[-1])], mfrow=[4,2],
                     output=files[gid]['pdf'], new=new, last=(last==0),
                     legend=snames, main=mname)
            new = False
            # dump the averaged matrix as tab-separated text
            _datf = unique_filename_in()
            with open(_datf,"w") as dff:
                dff.write("\t".join([""]+[str(x) for x in X])+"\n")
                for n,sn in enumerate(snames):
                    dff.write("\t".join([sn]+[str(x) for x in dat[:, n]])+"\n")
            files[gid]['mat'].append((mname,_datf))
    return files
def summed_feature_matrix(trackScores,trackFeatures,method='mean',**kw):
    """
    Each feature in *trackFeatures* is segmented into bins using
    bbcflib.gfminer.stream.segment_features (with parameters passed from *\*\*kw*).
    This creates a matrix with a column for each track in *trackScores* and a row
    for each bin in the segmented features. The value of a matrix entry is the
    score from one track in *trackScores* in one bin summed over all features.

    Example::

                      gene1                 gene2
        X: -----#####|#####|#####--------###|###|###-----  (features, nbins=3)
        Y: _____________666|66666________666|666|666_____
        Z: _____22222|22222|22222________________________

              Y   Z
        R: [[3.  1.],   # bin 0
            [4.  1.],   # bin 1
            [6.  1.]]   # bin 2

    Note: the whole segmented features track will be loaded in memory.

    :param trackScores: (FeatureStream, or list of FeatureStream objects) score track(s).
    :param trackFeatures: (FeatureStream) feature track.
    :param method: (str) Operation applied to the list of scores for one feature.
        It is the `method` argument to `stream.score_by_feature` - one of 'sum',
        'mean','median','min','max'.
    :param **kw: arguments to pass to segment_features (`nbins`,`upstream`,`downstream`).
    :rtype: numpy.ndarray, int (number of features)
    """
    # number of leading (non-score) fields; the bin index lands right after them
    nfields = len(trackFeatures.fields)
    trackFeatures = sorted_stream(segment_features(trackFeatures,**kw))
    all_means = score_by_feature(trackScores,trackFeatures,method=method)
    if isinstance(trackScores,(list,tuple)):
        nscores = len(trackScores)
    else:
        nscores = 1
    # total bin count = nbins (default taken from segment_features' signature,
    # Python 2 func_defaults) + upstream + downstream flank bins
    nbins = kw.get('nbins',segment_features.func_defaults[0]) \
            + kw.get('upstream',(0,0))[1] \
            + kw.get('downstream',(0,0))[1]
    averages = numpy.zeros(shape=(nbins,nscores))
    ntot = -1  # stays -1 if the stream is empty, so the count below is 0
    for ntot,x in enumerate(all_means):
        # x[nfields] is the bin index; x[nfields+1:] the per-track scores
        averages[x[nfields]] += x[(nfields+1):]
    # Python 2 integer division: (rows seen)/(bins per feature) = feature count
    return averages, (ntot+1)/nbins
def __call__(self, **kw):
    """Plugin entry point: pairwise plots of signal tracks over a feature set.

    Builds the feature source from ``feature_type`` (0 gene bodies, 1 promoters,
    2 exons, 3 custom track), then either computes cross-correlations (mode 0)
    or per-feature score densities (mode 1) and renders a 'pairs' PDF.
    NOTE(review): a sibling implementation in this file maps mode 1 to correlation
    and mode 0 to density — confirm which convention the form actually sends.
    """
    feature_type = int(kw.get('feature_type') or 0)
    assembly_id = kw.get('assembly') or None
    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif not (feature_type == 3):
        # only a custom features track can work without an assembly
        raise ValueError("Please specify an assembly")
    signals = kw.get('SigMulti', {}).get('signals', [])
    if not isinstance(signals, list): signals = [signals]
    signals = [track(sig, chrmeta=chrmeta) for sig in signals]
    snames = [sig.name for sig in signals]
    # 'features' becomes a callable chrom -> FeatureStream
    if feature_type == 0: #bodies
        features = genes
    elif feature_type == 1: #promoters
        prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                     'after_start': int(kw.get('downstream') or prom_down_def),
                     'on_strand': True}
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type == 2: #exons
        features = exons
    elif feature_type == 3: #custom track
        _t = track(kw.get('features'), chrmeta=chrmeta)
        chrmeta = _t.chrmeta
        features = _t.read
    else:
        raise ValueError("Feature type not known: %i" % feature_type)
    highlights = kw.get('HiMulti', {}).get('highlights', [])
    if not isinstance(highlights, list): highlights = [highlights]
    if highlights is not None:
        highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]
        hinames = [t.name for t in highlights]
    pdf = self.temporary_path(fname='plot_pairs.pdf')
    narr = None          # data matrix accumulated across chromosomes
    set_index = []       # row boundaries between main data and highlight sets
    set_labels = []      # labels for highlighted rows
    if int(kw['mode']) == 0: #correl
        cormax = int(kw.get('cormax') or _cormax)
        xarr = array(range(-cormax, cormax + 1))  # lag axis
        srtdchrom = sorted(chrmeta.keys())
        # materialize (chr,start,end) triplets for all chromosomes
        features = [x[:3] for chrom in srtdchrom
                    for x in sorted_stream(features(chrom))]
        _f = ['chr', 'start', 'end', 'score']
        narr = correlation([s.read(fields=_f) for s in signals], features,
                           (-cormax, cormax), True)
    elif int(kw['mode']) == 1: #density
        xarr = None
        for chrom in chrmeta:
            feat = features(chrom)
            if 'name' not in feat.fields:
                feat = add_name_field(feat)
            means = score_by_feature([s.read(chrom) for s in signals], feat)
            # score columns are those appended after the feature fields
            mf = means.fields[len(feat.fields):]
            _n, _l = score_array(means, mf)
            if _n.size == 0: continue
            if narr is None: narr = _n
            else: narr = vstack((narr, _n))
        set_index = [narr.shape[0]]
        # append highlight tracks below the main rows, one index per track
        for hitrack in highlights:
            for chrom in chrmeta:
                hiread = hitrack.read(chrom)
                if 'name' not in hiread.fields:
                    hiread = add_name_field(hiread)
                means = score_by_feature([s.read(chrom) for s in signals], hiread)
                mf = means.fields[len(hiread.fields):]
                _n, _l = score_array(means, mf)
                if _n.size == 0: continue
                narr = vstack((narr, _n))
                set_labels.extend(_l)
            set_index.append(narr.shape[0])
    else:
        raise ValueError("Mode not implemented: %s" % kw['mode'])
    if narr is None:
        raise ValueError("No data")
    pairs(narr, xarr, labels=snames, output=pdf,
          highlights=[set_index, set_labels])
    self.new_file(pdf, 'plot_pairs')
    return self.display_time()
def __call__(self, **kw):
    """Plugin entry point: pairwise plots of signal tracks over a feature set.

    Like the sibling implementation, but supports an ``individual`` flag that
    draws one correlation plot per feature (up to _MAX_PLOTS_) and writes a
    per-feature max-correlation/lag table. Here mode 1 is correlation and
    mode 0 is density — NOTE(review): the sibling uses the opposite mapping;
    confirm which one the form sends.
    """
    feature_type = int(kw.get("feature_type") or 0)
    individual = kw.get("individual", False)
    if isinstance(individual, basestring):
        # checkbox values arrive as strings
        individual = individual.lower() in ["1", "true", "t", "on"]
    if individual and int(kw["mode"]) != 1:
        raise ValueError("Only correlation plots can work with the 'individual' option.")
    assembly_id = kw.get("assembly") or None
    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif not (feature_type == 3):
        # only a custom features track can work without an assembly
        raise ValueError("Please specify an assembly")
    # signals = kw.get('SigMulti',{}).get('signals', [])
    signals = kw.get("signals", [])
    if not isinstance(signals, list):
        signals = [signals]
    signals = [track(sig, chrmeta=chrmeta) for sig in signals]
    snames = [sig.name for sig in signals]
    # 'features' becomes a callable chrom -> FeatureStream
    if feature_type == 0:  # bodies
        features = genes
    elif feature_type == 1:  # promoters
        prom_pars = {
            "before_start": int(kw.get("upstream") or prom_up_def),
            "after_start": int(kw.get("downstream") or prom_down_def),
            "on_strand": True,
        }
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type == 2:  # exons
        features = exons
    elif feature_type == 3:  # custom track
        _t = track(kw.get("features"), chrmeta=chrmeta)
        chrmeta = _t.chrmeta
        features = _t.read
    else:
        raise ValueError("Feature type not known: %i" % feature_type)
    # highlights = kw.get('HiMulti',{}).get('highlights', [])
    highlights = kw.get("highlights", [])
    if not isinstance(highlights, list):
        highlights = [highlights]
    if highlights is not None:
        highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]
        hinames = [t.name for t in highlights]
    pdf = self.temporary_path(fname="plot_pairs.pdf")
    narr = None          # data matrix accumulated across chromosomes
    set_index = []       # row boundaries between main data and highlight sets
    set_labels = []      # labels for highlighted rows
    _new = True          # first page flag for multi-page PDFs
    if int(kw["mode"]) == 1:  # correl
        cormax = int(kw.get("cormax") or _cormax)
        xarr = array(range(-cormax, cormax + 1))  # lag axis
        _f = ["chr", "start", "end", "score"]
        # materialize (chr,start,end) triplets for all chromosomes
        features = [x[:3] for chrom in chrmeta
                    for x in sorted_stream(features(chrom))]
        table = self.temporary_path(fname="table.txt")
        with open(table, "w") as t:
            t.write("\t".join(["chr", "start", "end", "max(correlation)", "lag_max"]) + "\n")
            if individual:
                # one correlation per feature; plot at most _MAX_PLOTS_ pages
                for nplot, feature in enumerate(features):
                    if narr is not None and nplot < _MAX_PLOTS_:
                        pairs(narr, xarr, labels=snames, output=pdf, new=_new, last=False)
                        _new = False
                    narr = correlation([s.read(fields=_f) for s in signals],
                                       [feature], (-cormax, cormax), True)
                    list_corr = list(narr[0][0])
                    max_corr = max(list_corr)
                    lag_max = list_corr.index(max_corr) - cormax
                    t.write("\t".join([str(x) for x in feature[:3] + (max_corr, lag_max)]) + "\n")
            else:
                # one global correlation over all features
                narr = correlation([s.read(fields=_f) for s in signals],
                                   features, (-cormax, cormax), True)
                list_corr = list(narr[0][0])
                max_corr = max(list_corr)
                lag_max = list_corr.index(max_corr) - cormax
                t.write("\t".join(["-", "-", "-"] + [str(max_corr), str(lag_max)]) + "\n")
    elif int(kw["mode"]) == 0:  # density
        xarr = None
        for chrom in chrmeta:
            feat = features(chrom)
            if "name" not in feat.fields:
                feat = add_name_field(feat)
            means = score_by_feature([s.read(chrom) for s in signals], feat)
            # score columns are those appended after the feature fields
            mf = means.fields[len(feat.fields):]
            _n, _l = score_array(means, mf)
            if _n.size == 0:
                continue
            if narr is None:
                narr = _n
            else:
                narr = vstack((narr, _n))
        set_index = [narr.shape[0]]
        # append highlight tracks below the main rows, one index per track
        for hitrack in highlights:
            for chrom in chrmeta:
                hiread = hitrack.read(chrom)
                if "name" not in hiread.fields:
                    hiread = add_name_field(hiread)
                means = score_by_feature([s.read(chrom) for s in signals], hiread)
                mf = means.fields[len(hiread.fields):]
                _n, _l = score_array(means, mf)
                if _n.size == 0:
                    continue
                narr = vstack((narr, _n))
                set_labels.extend(_l)
            set_index.append(narr.shape[0])
    else:
        raise ValueError("Mode not implemented: %s" % kw["mode"])
    if narr is None:
        raise ValueError("No data")
    pairs(narr, xarr, labels=snames, output=pdf,
          highlights=[set_index, set_labels], new=_new, last=True)
    if int(kw["mode"]) == 1:
        self.new_file(table, "table")
    self.new_file(pdf, "plot_pairs")
    return self.display_time()
def __call__(self, **kw):
    """Plugin entry point: pairwise plots of signal tracks over a feature set.

    Builds the feature source from ``feature_type`` (0 gene bodies, 1 promoters,
    2 exons, 3 custom track), then either computes cross-correlations (mode 0)
    or per-feature score densities (mode 1) and renders a 'pairs' PDF.
    NOTE(review): another implementation in this file maps mode 1 to correlation
    and mode 0 to density — confirm which convention the form actually sends.
    """
    feature_type = int(kw.get('feature_type') or 0)
    assembly_id = kw.get('assembly') or None
    chrmeta = "guess"
    if assembly_id:
        assembly = genrep.Assembly(assembly_id)
        chrmeta = assembly.chrmeta
        genes = assembly.gene_track
        exons = assembly.exon_track
    elif not(feature_type == 3):
        # only a custom features track can work without an assembly
        raise ValueError("Please specify an assembly")
    signals = kw.get('SigMulti',{}).get('signals', [])
    if not isinstance(signals, list): signals = [signals]
    signals = [track(sig, chrmeta=chrmeta) for sig in signals]
    snames = [sig.name for sig in signals]
    # 'features' becomes a callable chrom -> FeatureStream
    if feature_type == 0: #bodies
        features = genes
    elif feature_type == 1: #promoters
        prom_pars = {'before_start': int(kw.get('upstream') or prom_up_def),
                     'after_start': int(kw.get('downstream') or prom_down_def),
                     'on_strand': True}
        features = lambda c: neighborhood(genes(c), **prom_pars)
    elif feature_type == 2: #exons
        features = exons
    elif feature_type == 3: #custom track
        _t = track(kw.get('features'), chrmeta=chrmeta)
        chrmeta = _t.chrmeta
        features = _t.read
    else:
        raise ValueError("Feature type not known: %i" % feature_type)
    highlights = kw.get('HiMulti',{}).get('highlights', [])
    if not isinstance(highlights, list): highlights = [highlights]
    if highlights is not None:
        highlights = [track(hi, chrmeta=chrmeta) for hi in highlights]
        hinames = [t.name for t in highlights]
    pdf = self.temporary_path(fname='plot_pairs.pdf')
    narr = None          # data matrix accumulated across chromosomes
    set_index = []       # row boundaries between main data and highlight sets
    set_labels = []      # labels for highlighted rows
    if int(kw['mode']) == 0: #correl
        cormax = int(kw.get('cormax') or _cormax)
        xarr = array(range(-cormax, cormax + 1))  # lag axis
        srtdchrom = sorted(chrmeta.keys())
        # materialize (chr,start,end) triplets for all chromosomes
        features = [x[:3] for chrom in srtdchrom
                    for x in sorted_stream(features(chrom))]
        _f = ['chr', 'start', 'end', 'score']
        narr = correlation([s.read(fields=_f) for s in signals],
                           features, (-cormax, cormax), True)
    elif int(kw['mode']) == 1: #density
        xarr = None
        for chrom in chrmeta:
            feat = features(chrom)
            if 'name' not in feat.fields:
                feat = add_name_field(feat)
            means = score_by_feature([s.read(chrom) for s in signals], feat)
            # score columns are those appended after the feature fields
            mf = means.fields[len(feat.fields):]
            _n, _l = score_array(means, mf)
            if _n.size == 0: continue
            if narr is None: narr = _n
            else: narr = vstack((narr, _n))
        set_index = [narr.shape[0]]
        # append highlight tracks below the main rows, one index per track
        for hitrack in highlights:
            for chrom in chrmeta:
                hiread = hitrack.read(chrom)
                if 'name' not in hiread.fields:
                    hiread = add_name_field(hiread)
                means = score_by_feature([s.read(chrom) for s in signals], hiread)
                mf = means.fields[len(hiread.fields):]
                _n, _l = score_array(means, mf)
                if _n.size == 0: continue
                narr = vstack((narr, _n))
                set_labels.extend(_l)
            set_index.append(narr.shape[0])
    else:
        raise ValueError("Mode not implemented: %s" % kw['mode'])
    if narr is None:
        raise ValueError("No data")
    pairs(narr, xarr, labels=snames, output=pdf,
          highlights=[set_index,set_labels])
    self.new_file(pdf, 'plot_pairs')
    return self.display_time()
def chipseq_workflow( ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf' ):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,
    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with key 'groups',
        'files' and 'options' if applicable,
    :param assembly: a genrep.Assembly object,
    :param script_path: only needed if 'run_deconv' is in the job options, must point
        to the location of the R scripts.

    Defaults ``macs`` parameters (overriden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwith')
    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*,
    if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of a dictionary with keys *group_id* from the job groups, *macs* and *deconv*
    if applicable and values file description dictionaries and a dictionary
    of *group_ids* to *names* used in file descriptions.
    """
    options = {}
    if logfile is None: logfile = sys.stdout
    # Accept either a frontend.Job or a plain dict description of the job.
    if isinstance(job_or_dict,frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict,dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files',{})
    else:
        raise TypeError("job_or_dict must be a frontend. Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands',-1))
    suffixes = ["fwd","rev"]
    # Boolean options may arrive as strings from the frontend.
    peak_deconvolution = options.get('peak_deconvolution',False)
    if isinstance(peak_deconvolution,basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1','true','t']
    run_meme = options.get('run_meme',False)
    if isinstance(run_meme,basestring):
        run_meme = run_meme.lower() in ['1','true','t']
    macs_args = options.get('macs_args',["--bw","200"])
    b2w_args = options.get('b2w_args',[])
    if not(isinstance(mapseq_files,dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    # Collect bam files, stats and Poisson thresholds per group; merge runs if needed.
    for gid,mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not(isinstance(mapped,dict)):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped: mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name+"_"+str(k)
            if not 'stats' in mapped[k]:
                # launch bamstats asynchronously; collected below
                futures[k] = mapseq.bamstats.nonblocking( ex, mapped[k]["bam"], via=via )
            if mapped[k].get('poisson_threshold',-1)>0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns)>0:
            # BUGFIX: was p_thresh['group_name'] (string literal), so every group
            # overwrote the same entry; key by the actual group name instead.
            p_thresh[group_name] = sum(ptruns)/len(ptruns)
        for k in futures.keys():
            # BUGFIX: was 'f.wait()' with 'f' undefined (NameError whenever stats
            # had to be computed); wait on the future launched for this run.
            mapped[k]['stats'] = futures[k].wait()
        if len(mapped)>1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid,group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid,group_name))
        read_length.append(mapped.values()[0]['stats']['read_length'])
        genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls)<1:
        # no control group: MACS runs without a control sample
        controls = [None]
        names['controls'] = [(0,None)]
    logfile.write("Starting MACS.\n");logfile.flush()
    processed = {'macs': add_macs_results( ex, read_length, genome_size, tests,
                                           ctrlbam=controls, name=names,
                                           poisson_threshold=p_thresh,
                                           macs_args=macs_args, via=via )}
    logfile.write("Done MACS.\n");logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6
    _select = {'score':(6,sys.maxint)}
    _fields = ['chr','start','end','name','score']
    # Build one fused peak track per test group from the MACS summits.
    for i,name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name,names['controls'][0])
            macsbed = track(processed['macs'][ctrl]+"_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            # several controls: tag each summit name with the control index
            macsbed = concatenate([apply(track(processed['macs'][(name,x)]+"_summits.bed",
                                               chrmeta=chrlist, fields=_fields).read(selection=_select),
                                         'name', lambda __n,_n=xn: "%s:%i" %(__n,_n))
                                   for xn,x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood( macsbed, before_start=150, after_end=150 )
        peak_list[name] = unique_filename_in()+".sql"
        macs_final = track( peak_list[name], chrmeta=chrlist,
                            info={'datatype':'qualitative'},
                            fields=['start','end','name','score'] )
        macs_final.write(fusion(macs_neighb),clip=True)
        macs_final.close()
        ##############################
    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1:
        options['read_extension'] = read_length[0]
    # regenerate wigs when strands must be merged or the extension is unusable
    make_wigs = merge_strands >= 0 or options['read_extension']>100
    if options['read_extension'] > 100:
        options['read_extension'] = 50
    for gid,mapped in mapseq_files.iteritems():
#        if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not('wig' in m) or len(m['wig'])<2:
                output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                      nreads=m["stats"]["total"],
                                                      merge=-1,
                                                      read_extension=options['read_extension'],
                                                      convert=False,
                                                      b2w_args=b2w_args, via=via )
                wig.append(dict((s,output+s+'.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s,merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]
    if peak_deconvolution:
        ##############################
        def _filter_deconv( stream, pval ):
            # keep peaks whose FERR (fit error) tag is below pval; recentre to +-150bp
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream(
                ((x[0],)+((x[2]+x[1])/2-150,(x[2]+x[1])/2+150)+x[3:]
                 for x in stream
                 if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval),
                fields=stream.fields )
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1]+" deconvolution.\n");logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name,names['controls'][0])
                macsbed = processed['macs'][ctrl]+"_peaks.bed"
            else:
                macsbed = intersect_many_bed( ex, [processed['macs'][(name,x)]+"_peaks.bed"
                                                   for x in names['controls']], via=via )
            deconv = run_deconv( ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                 options['read_extension'], script_path, via=via )
            peak_list[name] = unique_filename_in()+".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed,0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1]+'_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1]+'_deconv.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'],(bigwig,"bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1]+'_deconv.bw', type='bigWig',
                                                  ucsc='1', step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                # bigWig conversion is best-effort: log and keep going
                logfile.write(str(e));logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1]+'_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv
    ##############################
    def _join_macs( stream, xlsl, _f ):
        # join each peak row with its row(s) from the parsed MACS xls table(s),
        # resolving the peak id embedded in the 'name' field
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p+xlsl[0][nb-1][1:]
                    else:
                        # several controls: id is "row:table"
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p+xlsl[int(nb[1])][int(nb[0])-1][1:]
        return FeatureStream( _macs_row(stream), fields=_f )
    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist,chrmeta=chrlist,fields=["chr","start","end","name","score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name,_c)]+"_peaks.xls"
                                     for _c in names['controls']])
        try:
            ###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                      +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height','gene(s)','location_type','distance']+_fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom),_feat),
                                         xlsl, _fields), mode='append')
        except ValueError:
            _fields = ['chr','start','end','name','score']+["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            # NOTE(review): _fields[8:] here skips the first three MACS columns of a
            # 5-field layout — _fields[5:] may have been intended; confirm.
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height']+_fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields),
                              mode='append')
        peakout.close()
        gzipfile(ex,peakfile)
        peakfile_list.append(track(peakfile+".gz", format='txt', fields=_fields))
        ex.add(peakfile+".gz",
               description=set_file_descr(name[1]+'_annotated_peaks.txt.gz',type='text',
                                          step='annotation',groupId=name[0]))
    stracks = [track(wig,info={'name':name+"_"+st})
               for name,wigdict in merged_wig.iteritems()
               for st,wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile,"w") as _tf:
        _pnames = ["MACS_%s_vs_%s" %(_s[1],_c[1]) if _c[1] else "MACS_%s" %_s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome','start','end',]+_pnames+[s.name for s in stracks])+"\n")
#### need to do something about peak origin (split names, write to separate columns?)
    # Quantify every signal over the fused union of all peak tracks, one chromosome at a time.
    for chrom in assembly.chrnames:
        pk_lst = [apply(pt.read(chrom,fields=['chr','start','end','name']),
                        'name', lambda __n,_n=npt: "%s:%i" %(__n,_n))
                  for npt,pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr','start','end','name'],
                                      remove_duplicates=True, group_by=['chr','start','end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile,"a") as _tf:
            for row in quantifs:
                # decode the "name:index[:ctrl]" tags back into one column per test/control pair
                pcols = ['']*_ns*_nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while ( _k < len(_rnsplit)-1-int(_nc>1) ):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k-1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])*_nc+int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in row[:nidx]+tuple(pcols)+row[nidx+1:])+"\n")
    gzipfile(ex,tablefile)
    ex.add(tablefile+".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',type='text',
                                      step='summary'))
    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n");logfile.flush()
        processed['meme'] = parallel_meme( ex, assembly, peak_list.values(),
                                           name=peak_list.keys(), chip=True,
                                           meme_args=['-meme-nmotifs','4',
                                                      '-meme-mod','zoops'],
                                           via=via )
    return processed
def chipseq_workflow(ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf'):
    """Runs a chipseq workflow over bam files obtained by mapseq.
    Will optionally run ``macs`` and ``run_deconv``.

    :param ex: a 'bein' execution environment to run jobs in,
    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with keys
        'groups', 'files' and 'options' if applicable,
    :param assembly: a genrep.Assembly object,
    :param script_path: only needed if 'run_deconv' is in the job options,
        must point to the location of the R scripts,
    :param logfile: writable file-like object for progress messages
        (defaults to ``sys.stdout``; ``None`` also falls back to stdout),
    :param via: submission method passed to the 'bein' nonblocking calls.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``)
    are set as follows:

    * ``'-bw'``: 200 ('bandwidth')
    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*,
    if available, as *(min(30,5*(T+1)),50*(T+1))*.

    :returns: a dictionary with key 'macs' (and 'deconv' / 'meme' when those
        steps run) mapping to file description dictionaries.
    :raises TypeError: if *job_or_dict* or the mapseq file description does
        not have the expected structure.
    """
    options = {}
    if logfile is None:
        logfile = sys.stdout
    # Accept either a frontend.Job object or a plain dict description of the job.
    if isinstance(job_or_dict, frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict, dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not ('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files', {})
    else:
        raise TypeError("job_or_dict must be a frontend. Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands', -1))
    suffixes = ["fwd", "rev"]
    # String-typed booleans may come from the web frontend; normalize them.
    peak_deconvolution = options.get('peak_deconvolution', False)
    if isinstance(peak_deconvolution, basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1', 'true', 't']
    run_meme = options.get('run_meme', False)
    if isinstance(run_meme, basestring):
        run_meme = run_meme.lower() in ['1', 'true', 't']
    macs_args = options.get('macs_args', ["--bw", "200"])
    b2w_args = options.get('b2w_args', [])
    if not (isinstance(mapseq_files, dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    # Collect bam files, run statistics and Poisson thresholds per group.
    for gid, mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not (isinstance(mapped, dict)):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            # Single-bam shorthand: wrap it as a one-run dictionary.
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name + "_" + str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking(ex, mapped[k]["bam"], via=via)
            if mapped[k].get('poisson_threshold', -1) > 0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns) > 0:
            # Average the runs' thresholds, keyed by this group's name.
            # BUGFIX: was p_thresh['group_name'] (a string literal), so every
            # group overwrote the same single entry.
            p_thresh[group_name] = sum(ptruns) / len(ptruns)
        for k in futures.keys():
            # BUGFIX: was `f.wait()` with `f` undefined (NameError); wait on
            # this run's own future.
            mapped[k]['stats'] = futures[k].wait()
        if len(mapped) > 1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid, group_name))
            # One read length per test bam, kept aligned with `tests` for
            # add_macs_results below.
            read_length.append(mapped.values()[0]['stats']['read_length'])
        genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls) < 1:
        # No control group: MACS runs without a background sample.
        controls = [None]
        names['controls'] = [(0, None)]
    logfile.write("Starting MACS.\n")
    logfile.flush()
    processed = {'macs': add_macs_results(ex, read_length, genome_size, tests,
                                          ctrlbam=controls, name=names,
                                          poisson_threshold=p_thresh,
                                          macs_args=macs_args, via=via)}
    logfile.write("Done MACS.\n")
    logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6
    _select = {'score': (6, sys.maxint)}
    _fields = ['chr', 'start', 'end', 'name', 'score']
    for name in names['tests']:
        if len(names['controls']) < 2:
            ctrl = (name, names['controls'][0])
            macsbed = track(processed['macs'][ctrl] + "_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            # Several controls: tag each summit name with the index of the
            # control it was called against, then merge the streams.
            macsbed = concatenate(
                [apply(track(processed['macs'][(name, x)] + "_summits.bed",
                             chrmeta=chrlist, fields=_fields).read(selection=_select),
                       'name', lambda __n, _n=xn: "%s:%i" % (__n, _n))
                 for xn, x in enumerate(names['controls'])])
        ##############################
        # Extend summits by 150bp on each side and fuse overlapping ones.
        macs_neighb = neighborhood(macsbed, before_start=150, after_end=150)
        peak_list[name] = unique_filename_in() + ".sql"
        macs_final = track(peak_list[name], chrmeta=chrlist,
                           info={'datatype': 'qualitative'},
                           fields=['start', 'end', 'name', 'score'])
        macs_final.write(fusion(macs_neighb), clip=True)
        macs_final.close()
        ##############################
    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1:
        options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension'] > 100
    if options['read_extension'] > 100:
        options['read_extension'] = 50
    # Build (or reuse) strand-specific density tracks for every group.
    for gid, mapped in mapseq_files.iteritems():
        # if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        # NOTE(review): single-'bam' groups are not re-wrapped here as they
        # were in the first loop; verify mapped.values() has the expected
        # run-dict shape for that case.
        for m in mapped.values():
            if make_wigs or not ('wig' in m) or len(m['wig']) < 2:
                output = mapseq.parallel_density_sql(
                    ex, m["bam"], assembly.chrmeta,
                    nreads=m["stats"]["total"], merge=-1,
                    read_extension=options['read_extension'],
                    convert=False, b2w_args=b2w_args, via=via)
                wig.append(dict((s, output + s + '.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]
    if peak_deconvolution:
        ##############################
        def _filter_deconv(stream, pval):
            # Keep peaks whose FERR annotation is <= pval, recentered to a
            # fixed 300bp window around the peak midpoint.
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream(
                ((x[0],) + ((x[2] + x[1]) / 2 - 150, (x[2] + x[1]) / 2 + 150) + x[3:]
                 for x in stream
                 if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval),
                fields=stream.fields)
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1] + " deconvolution.\n")
            logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name, names['controls'][0])
                macsbed = processed['macs'][ctrl] + "_peaks.bed"
            else:
                macsbed = intersect_many_bed(
                    ex, [processed['macs'][(name, x)] + "_peaks.bed"
                         for x in names['controls']], via=via)
            deconv = run_deconv(ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                options['read_extension'], script_path, via=via)
            peak_list[name] = unique_filename_in() + ".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed, 0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1] + '_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1] + '_deconv.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'], (bigwig, "bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1] + '_deconv.bw',
                                                  type='bigWig', ucsc='1',
                                                  step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                # bigWig conversion is best-effort: log and continue.
                logfile.write(str(e))
                logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1] + '_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv
    ##############################
    def _join_macs(stream, xlsl, _f):
        # Append, to every peak row, the matching row of the parsed MACS xls
        # table(s); the peak 'name' field encodes the xls row (and, with
        # several controls, the control index) to join on.
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p + xlsl[0][nb - 1][1:]
                    else:
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p + xlsl[int(nb[1])][int(nb[0]) - 1][1:]
        return FeatureStream(_macs_row(stream), fields=_f)
    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist, chrmeta=chrlist,
                       fields=["chr", "start", "end", "name", "score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name, _c)] + "_peaks.xls"
                                     for _c in names['controls']])
        try:
            ###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                      +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#" + "\t".join(
                ['chromosome', 'start', 'end', 'info', 'peak_height', 'gene(s)',
                 'location_type', 'distance'] + _fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom), _feat),
                                         xlsl, _fields), mode='append')
        except ValueError:
            # No gene annotations available: emit the unannotated table.
            _fields = ['chr', 'start', 'end', 'name', 'score'] \
                      + ["MACS_%s" % h for h in xlsh[1:5]] + xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#" + "\t".join(
                ['chromosome', 'start', 'end', 'info', 'peak_height'] + _fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields),
                              mode='append')
        peakout.close()
        gzipfile(ex, peakfile)
        peakfile_list.append(track(peakfile + ".gz", format='txt', fields=_fields))
        ex.add(peakfile + ".gz",
               description=set_file_descr(name[1] + '_annotated_peaks.txt.gz',
                                          type='text', step='annotation',
                                          groupId=name[0]))
    stracks = [track(wig, info={'name': name + "_" + st})
               for name, wigdict in merged_wig.iteritems()
               for st, wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile, "w") as _tf:
        _pnames = ["MACS_%s_vs_%s" % (_s[1], _c[1]) if _c[1] else "MACS_%s" % _s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome', 'start', 'end', ] + _pnames +
                            [s.name for s in stracks]) + "\n")
    #### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        # Tag every peak name with the index of its originating peak file,
        # then merge and fuse all peak streams for this chromosome.
        pk_lst = [apply(pt.read(chrom, fields=['chr', 'start', 'end', 'name']),
                        'name', lambda __n, _n=npt: "%s:%i" % (__n, _n))
                  for npt, pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr', 'start', 'end', 'name'],
                                      remove_duplicates=True,
                                      group_by=['chr', 'start', 'end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile, "a") as _tf:
            for row in quantifs:
                # Decode the fused name back into one column per
                # (test, control) pair.
                pcols = [''] * _ns * _nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while (_k < len(_rnsplit) - 1 - int(_nc > 1)):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k - 1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0]) * _nc + int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in
                                    row[:nidx] + tuple(pcols) + row[nidx + 1:]) + "\n")
    gzipfile(ex, tablefile)
    ex.add(tablefile + ".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',
                                      type='text', step='summary'))
    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n")
        logfile.flush()
        processed['meme'] = parallel_meme(
            ex, assembly, peak_list.values(), name=peak_list.keys(), chip=True,
            meme_args=['-meme-nmotifs', '4', '-meme-mod', 'zoops'], via=via)
    return processed
def feature_matrix(trackScores, trackFeatures, segment=False, method='mean', **kw):
    """Build a score matrix with one row per feature of *trackFeatures* and
    one column per score track in *trackScores*.

    Each matrix element is the score (aggregated with *method*) of one score
    track over one genomic feature. When *segment* is True, every feature is
    first cut into bins with
    bbcflib.gfminer.stream.intervals.segment_features (extra keyword
    arguments in *\*\*kw* are forwarded to it), and each element of the
    result is itself an array with *nbins* lines and one column per score
    track. When *segment* is False, each element is an array with one entry
    per score track.

    Example::

                         gene1                 gene2
        X: -----#####|#####|#####--------###|###|###-----  (features)
        Y: _____________666|66666________666|666|666_____  (scores1)
        Z: _____22222|22222|22222________________________  (scores2)

        With segment=True, nbins=3:

                  Y    Z
        R: [[[0.   2.],   # bin0 \
             [2.   2.],   # bin1  } gene 1
             [6.   2.]],  # bin2 /
            [[6.   0.],   # bin0 \
             [6.   0.],   # bin1  } gene2
             [6.   0.]]]  # bin2 /

        With segment=False:

                 Y   Z
        R: [[3.  2.]
            [6.  0.]]

    Note: the whole segmented features track will be loaded in memory.

    :param trackScores: (FeatureStream, or list of FeatureStream objects) score track(s).
    :param trackFeatures: (FeatureStream) feature track.
    :param segment: (bool) segment each feature into bins.[False]
    :param method: (str) operation applied to the list of scores for one feature -
        the `method` argument to `stream.score_by_feature`, one of
        'sum','mean','median','min','max'.
    :param **kw: arguments to pass to segment_features (`nbins`,`upstream`,`downstream`).
    :rtype: tuple (numpy.ndarray of strings, numpy.ndarray of floats)
    """
    if segment:
        trackFeatures = sorted_stream(segment_features(trackFeatures, **kw))
        # Total bin count = inner bins + upstream bins + downstream bins.
        nbins = (kw.get('nbins', segment_features.func_defaults[0])
                 + kw.get('upstream', (0, 0))[1]
                 + kw.get('downstream', (0, 0))[1])
    else:
        nbins = 1
    all_means = score_by_feature(trackScores, trackFeatures, method=method)
    # Number of non-score fields (counted after segmentation, if any).
    nfields = len(trackFeatures.fields)
    nscores = len(trackScores) if isinstance(trackScores, (list, tuple)) else 1
    # Zero template copied the first time each feature name is seen.
    template = numpy.zeros(shape=(nbins, nscores)) if segment else numpy.zeros(nscores)
    name_idx = all_means.fields.index('name')
    per_feature = {}
    for row in all_means:
        key = row[name_idx]
        mat = per_feature.setdefault(key, template.copy())
        if segment:
            # The last feature field is the bin index within the feature.
            mat[row[nfields - 1]] = row[nfields:]
        else:
            per_feature[key] = row[nfields:]
    feat_names = numpy.array(per_feature.keys())
    scores_mat = numpy.array(per_feature.values())
    return (feat_names, scores_mat)