def fimo(motifs, fasta, qval=True):
    # Run FIMO
    if qval:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.01 --qv-thresh"
    else:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.000001"
    cmd = "fimo " + options + " %s %s" % (motifs, fasta)
    print "Running >>", cmd
    os.system(cmd)
    # Sort FIMO hits by coordinates
    os.system("sort -k2,2n -k3,3n -k4,4n fimo_out/fimo.txt > fimo.txt")
    # Bed output
    t = track('fimo.txt',
              fields=["name", "chr", "start", "end", "strand",
                      "score", "p-value", "q-value", "sequence"])
    # Rename columns so that the p-value column is exported as the BED 'score'
    t.fields = ["name", "chr", "start", "end", "strand",
                "a", "score", "q", "sequence"]
    s = t.read()
    s = select(s, ['chr', 'start', 'end', 'name', 'score', 'strand'])
    # Sequence names are '|'-separated; keep the second token as the chromosome
    s = apply(s, 'chr', lambda x: x.split('|')[1])
    s = sorted_stream(s)
    # Resolve overlapping hits, then deduplicate the '|'-joined motif names
    s = cobble(s)
    s = apply(s, 'name', lambda x: '|'.join(list(set(x.split('|')))))
    outname = 'fimo.bed'
    bed = track(outname, fields=s.fields)
    bed.make_header(name="TSS_motifs", description="Motifs +-XKb around TSS", mode='overwrite')
    bed.write(s)
    if os.path.exists("fimo_out"):
        shutil.rmtree("fimo_out")

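# The short sketch below only reproduces the command-line string that fimo()
# assembles, so the effect of the 'qval' switch can be checked without FIMO,
# bbcflib or any input files. The motif/FASTA file names are hypothetical.
def _fimo_command(motifs, fasta, qval=True):
    if qval:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.01 --qv-thresh"
    else:
        options = "--max-stored-scores 1000000 --verbosity 1 --thresh 0.000001"
    return "fimo " + options + " %s %s" % (motifs, fasta)

print _fimo_command("jaspar.meme", "tss_regions.fa")               # q-value threshold 0.01
print _fimo_command("jaspar.meme", "tss_regions.fa", qval=False)   # p-value threshold 1e-6
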
def test_apply(self):
    stream = fstream([(10, 12, 0.5), (14, 15, 1.2)], fields=['start', 'end', 'score'])
    res = list(apply(stream, 'score', lambda x: 2 * x))
    expected = [(10, 12, 1.), (14, 15, 2.4)]
    self.assertListEqual(res, expected)

    stream = fstream([(10, 12, 0.5), (14, 15, 1.2)], fields=['start', 'end', 'score'])
    res = list(apply(stream, ['score', 'end'], [lambda x: 2 * x, lambda x: x - 3]))
    expected = [(10, 9, 1.), (14, 12, 2.4)]
    self.assertListEqual(res, expected)

def __call__(self, **kw):
    func = kw.get('function', "log2")
    l_track = kw.get('SigMulti', {}).get('track', [])
    if not isinstance(l_track, list):
        l_track = [l_track]
    outall = []
    for tname in l_track:
        tinput = track(tname)
        if 'score' not in tinput.fields:
            continue
        format = kw.get('format', tinput.format)
        out_name = tinput.name + '_' + func + '.' + format
        outtemp = self.temporary_path(out_name)
        out_track = track(outtemp, chrmeta=tinput.chrmeta)
        # Log transforms are undefined at 0, so keep only strictly positive scores
        filtered = score_threshold(tinput, strict=(func[:3] == "log"))
        out_track.write(apply(filtered, 'score', eval(func)), mode='write')
        out_track.close()
        outall.append(outtemp)
        tinput.close()
    if len(outall) == 1:
        self.new_file(outall[0], 'converted_track')
    elif len(outall) > 1:
        tar_name = self.temporary_path(fname="numeric_operation_out.tgz")
        tar = tarfile.open(tar_name, "w:gz")
        for f in outall:
            tar.add(f, arcname=os.path.basename(f))
        tar.close()
        self.new_file(tar_name, 'converted_track_tar')
    return self.display_time()

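# Why the threshold above is strict for "log" functions: log2/log10 are undefined
# at 0, so only strictly positive scores may be transformed. A pure-Python sketch
# of the same idea, independent of bbcflib's score_threshold()/apply():
import math

scores = [0.0, 0.5, 2.0, 4.0]
kept = [s for s in scores if s > 0]            # strict threshold, as when func[:3] == "log"
print [math.log(s, 2) for s in kept]           # [-1.0, 1.0, 2.0]
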
def __call__(self, **kw): func = kw.get('function',"log2") #l_track = kw.get('SigMulti', {}).get('track',[]) l_track = kw.get('track',[]) if not isinstance(l_track, list): l_track = [l_track] outall = [] for tname in l_track : tinput = track(tname) if 'score' not in tinput.fields: continue format = kw.get('output',tinput.format) out_name = tinput.name+'_'+func+'.'+format outtemp = self.temporary_path(out_name) out_track = track(outtemp,chrmeta=tinput.chrmeta) filtered = score_threshold(tinput, strict=(func[:3] == "log")) out_track.write(apply(filtered,'score',eval(func)), mode='write') out_track.close() outall.append(outtemp) tinput.close() if len(outall) == 1: self.new_file(outall[0], 'converted_track') elif len(outall) > 1: tar_name = self.temporary_path(fname="numeric_operation_out.tgz") tar = tarfile.open(tar_name, "w:gz") [tar.add(f,arcname=os.path.basename(f)) for f in outall] tar.close() self.new_file(tar_name, 'converted_track_tar') return self.display_time()
def convert_junc_file(self, filename):
    """Convert a .junc SOAPsplice output file to bed format. Return the file name.

    :param filename: (str) name of the .junc file to convert.
    """
    t = track(filename, format='txt',
              fields=['chr', 'start', 'end', 'strand', 'score'],
              chrmeta=self.assembly.chrmeta)
    stream = t.read()
    # Translate chromosome names
    s1 = map_chromosomes(stream, self.assembly.chromosomes)
    # Add junction IDs
    s2 = duplicate(s1, 'strand', 'name')
    C = itertools.count()
    s3 = apply(s2, 'name', lambda x: 'junction' + str(C.next()))
    # Convert to bed format
    outfile = unique_filename_in()
    bed = outfile + '.bed'
    out = track(bed, fields=s3.fields, chrmeta=self.assembly.chrmeta)
    out.write(s3)
    return bed

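# Pure-Python sketch of the junction-naming step above: itertools.count() yields
# consecutive integers and every record receives a 'junction<N>' name built from
# the counter. The sample records are made up for illustration.
import itertools

records = [('chrI', 100, 250, '+', 12.0), ('chrI', 400, 900, '-', 3.0)]
C = itertools.count()
named = [rec + ('junction' + str(C.next()),) for rec in records]
print named   # [('chrI', 100, 250, '+', 12.0, 'junction0'), ('chrI', 400, 900, '-', 3.0, 'junction1')]
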
def chipseq_workflow(ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf'):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and ``run_deconv``.

    :param ex: a 'bein' execution environment to run jobs in,
    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with keys 'groups', 'files' and 'options' if applicable,
    :param assembly: a genrep.Assembly object,
    :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'--bw'``: 200 ('bandwidth')
    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of a dictionary with keys *group_id* from the job groups, *macs* and *deconv* if applicable and values file description dictionaries and a dictionary of *group_ids* to *names* used in file descriptions.
    """
    options = {}
    if logfile is None:
        logfile = sys.stdout
    if isinstance(job_or_dict, frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict, dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not 'name' in groups[gid]:
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files', {})
    else:
        raise TypeError("job_or_dict must be a frontend.Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands', -1))
    suffixes = ["fwd", "rev"]
    peak_deconvolution = options.get('peak_deconvolution', False)
    if isinstance(peak_deconvolution, basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1', 'true', 't']
    run_meme = options.get('run_meme', False)
    if isinstance(run_meme, basestring):
        run_meme = run_meme.lower() in ['1', 'true', 't']
    macs_args = options.get('macs_args', ["--bw", "200"])
    b2w_args = options.get('b2w_args', [])
    if not isinstance(mapseq_files, dict):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid, mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not isinstance(mapped, dict):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name + "_" + str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking(ex, mapped[k]["bam"], via=via)
            if mapped[k].get('poisson_threshold', -1) > 0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns) > 0:
            p_thresh['group_name'] = sum(ptruns) / len(ptruns)
        for k in futures.keys():
            mapped[k]['stats'] = futures[k].wait()
        if len(mapped) > 1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid, group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
            genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    logfile.write("Starting MACS.\n")
    logfile.flush()
    processed = {'macs': add_macs_results(ex, read_length, genome_size, tests,
                                          ctrlbam=controls, name=names,
                                          poisson_threshold=p_thresh,
                                          macs_args=macs_args, via=via)}
    logfile.write("Done MACS.\n")
    logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6
    _select = {'score': (6, sys.maxint)}
    _fields = ['chr', 'start', 'end', 'name', 'score']
    for i, name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name, names['controls'][0])
            macsbed = track(processed['macs'][ctrl] + "_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate(
                [apply(track(processed['macs'][(name, x)] + "_summits.bed",
                             chrmeta=chrlist, fields=_fields).read(selection=_select),
                       'name', lambda __n, _n=xn: "%s:%i" % (__n, _n))
                 for xn, x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood(macsbed, before_start=150, after_end=150)
        peak_list[name] = unique_filename_in() + ".sql"
        macs_final = track(peak_list[name], chrmeta=chrlist,
                           info={'datatype': 'qualitative'},
                           fields=['start', 'end', 'name', 'score'])
        macs_final.write(fusion(macs_neighb), clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1:
        options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension'] > 100
    if options['read_extension'] > 100:
        options['read_extension'] = 50
    for gid, mapped in mapseq_files.iteritems():
        # if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not 'wig' in m or len(m['wig']) < 2:
                output = mapseq.parallel_density_sql(ex, m["bam"], assembly.chrmeta,
                                                     nreads=m["stats"]["total"],
                                                     merge=-1,
                                                     read_extension=options['read_extension'],
                                                     convert=False,
                                                     b2w_args=b2w_args, via=via)
                wig.append(dict((s, output + s + '.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv(stream, pval):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream(
                ((x[0],) + ((x[2] + x[1]) / 2 - 150, (x[2] + x[1]) / 2 + 150) + x[3:]
                 for x in stream
                 if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval),
                fields=stream.fields)
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1] + " deconvolution.\n")
            logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name, names['controls'][0])
                macsbed = processed['macs'][ctrl] + "_peaks.bed"
            else:
                macsbed = intersect_many_bed(ex,
                                             [processed['macs'][(name, x)] + "_peaks.bed"
                                              for x in names['controls']],
                                             via=via)
            deconv = run_deconv(ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                options['read_extension'], script_path, via=via)
            peak_list[name] = unique_filename_in() + ".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed, 0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1] + '_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1] + '_deconv.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'], (bigwig, "bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1] + '_deconv.bw', type='bigWig',
                                                  ucsc='1', step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e))
                logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1] + '_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs(stream, xlsl, _f):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p + xlsl[0][nb - 1][1:]
                    else:
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p + xlsl[int(nb[1])][int(nb[0]) - 1][1:]
        return FeatureStream(_macs_row(stream), fields=_f)
    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist, chrmeta=chrlist, fields=["chr", "start", "end", "name", "score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name, _c)] + "_peaks.xls"
                                     for _c in names['controls']])
        try:
            ###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr', 'start', 'end', 'name', 'score', 'gene', 'location_type', 'distance'] \
                      + ["MACS_%s" % h for h in xlsh[1:5]] + xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#" + "\t".join(['chromosome', 'start', 'end', 'info', 'peak_height',
                                                 'gene(s)', 'location_type', 'distance'] + _fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom), _feat),
                                         xlsl, _fields), mode='append')
        except ValueError:
            _fields = ['chr', 'start', 'end', 'name', 'score'] \
                      + ["MACS_%s" % h for h in xlsh[1:5]] + xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#" + "\t".join(['chromosome', 'start', 'end', 'info', 'peak_height']
                                                + _fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields),
                              mode='append')
        peakout.close()
        gzipfile(ex, peakfile)
        peakfile_list.append(track(peakfile + ".gz", format='txt', fields=_fields))
        ex.add(peakfile + ".gz",
               description=set_file_descr(name[1] + '_annotated_peaks.txt.gz', type='text',
                                          step='annotation', groupId=name[0]))

    stracks = [track(wig, info={'name': name + "_" + st})
               for name, wigdict in merged_wig.iteritems()
               for st, wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile, "w") as _tf:
        _pnames = ["MACS_%s_vs_%s" % (_s[1], _c[1]) if _c[1] else "MACS_%s" % _s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome', 'start', 'end'] + _pnames + [s.name for s in stracks]) + "\n")
    #### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [apply(pt.read(chrom, fields=['chr', 'start', 'end', 'name']),
                        'name', lambda __n, _n=npt: "%s:%i" % (__n, _n))
                  for npt, pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr', 'start', 'end', 'name'],
                                      remove_duplicates=True,
                                      group_by=['chr', 'start', 'end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile, "a") as _tf:
            for row in quantifs:
                pcols = [''] * _ns * _nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while _k < len(_rnsplit) - 1 - int(_nc > 1):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k - 1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0]) * _nc + int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in row[:nidx] + tuple(pcols) + row[nidx + 1:]) + "\n")
    gzipfile(ex, tablefile)
    ex.add(tablefile + ".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz', type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n")
        logfile.flush()
        processed['meme'] = parallel_meme(ex, assembly,
                                          peak_list.values(), name=peak_list.keys(),
                                          chip=True,
                                          meme_args=['-meme-nmotifs', '4', '-meme-mod', 'zoops'],
                                          via=via)
    return processed

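# Hedged sketch of the dictionary form accepted by chipseq_workflow() (see the
# docstring above): keys 'groups' and 'files' indexed by group id, plus optional
# 'options'. All names, paths and values below are hypothetical placeholders;
# missing per-run 'stats' are computed by the workflow itself via bamstats.
job_dict = {
    'groups': {
        1: {'name': 'IP_sample',  'control': False},
        2: {'name': 'input_ctrl', 'control': True},
    },
    'files': {
        1: {'bam': 'ip_sample.bam'},
        2: {'bam': 'input_ctrl.bam'},
    },
    'options': {
        'peak_deconvolution': False,
        'run_meme': False,
        'macs_args': ["--bw", "200"],
    },
}
# Within a bein execution 'ex' and with a genrep.Assembly 'assembly', the call
# would then be: chipseq_workflow(ex, job_dict, assembly)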