def filterSeq(ex, fastqFiles, seqToFilter, gid, grp_name, via='lsf'):
    # Build a bowtie index per set of sequences to filter, align the corresponding
    # fastq against it, and collect the unaligned reads ("--un") per sample key.
    indexSeqToFilter = {}
    indexFiles = {}
    global bcDelimiter
    for k, f in seqToFilter.iteritems():
        if os.path.getsize(f) == 0: continue
        ex.add(f, description=set_file_descr(grp_name + "_" + k.replace(bcDelimiter, "_") + "_seqToFilter.fa",
                                             groupId=gid, step="filtering",
                                             type="fa", view="admin"))
        if k in fastqFiles:
            indexFiles[k] = bowtie_build.nonblocking(ex, f, via=via)
    unalignedFiles = {}
    futures = []
    bwtarg = ["-a", "-q", "-n", "2", "-l", "20", "--un"]
    for k, f in indexFiles.iteritems():
        unalignedFiles[k] = unique_filename_in()
        touch(ex, unalignedFiles[k])
        futures.append(bowtie.nonblocking(ex, f.wait(), fastqFiles[k],
                                          bwtarg + [unalignedFiles[k]], via=via))
    for f in futures:
        f.wait()
    return unalignedFiles

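# --- Usage sketch for filterSeq (illustrative, not part of the original module) ---
# Assumes this module's imports (bowtie/bowtie_build wrappers, set_file_descr, touch,
# unique_filename_in) are in scope; the MiniLIMS name, sample key and file names below
# are hypothetical.
from bein import execution, MiniLIMS

bcDelimiter = "_"  # module-level delimiter consulted by filterSeq
M = MiniLIMS("demux_lims")
with execution(M) as ex:
    unaligned = filterSeq(ex,
                          fastqFiles={"s1": "s1.fastq"},
                          seqToFilter={"s1": "s1_seqToFilter.fa"},
                          gid=1, grp_name="groupA", via="local")
    # unaligned["s1"] is the fastq of reads that did NOT align to the filtered sequences
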
def test_browse_executions(self):
    ex_desc = "browse_ex_test"
    with execution(M, description=ex_desc) as ex:
        touch(ex, "boris")
    ex_found = M.browse_executions(with_description=ex_desc)
    # self.assertIs(ex.id, ex_found)
    M.delete_execution(ex.id)

def test_unique_filename_exact_match(self):
    with execution(None) as ex:
        st = random.getstate()
        f = touch(ex)
        random.setstate(st)
        g = touch(ex)
        self.assertNotEqual(f, g)

def test_unique_filename_beginnings_match(self):
    with execution(None) as ex:
        st = random.getstate()
        f = unique_filename_in()
        touch(ex, f + 'abcdefg')
        random.setstate(st)
        g = touch(ex)
        self.assertNotEqual(f, g)

def test_resolve_alias_with_alias(self):
    with execution(None) as ex:
        f = touch(ex)
        M = MiniLIMS("boris")
        a = M.import_file(f)
        M.add_alias(a, 'hilda')
        self.assertEqual(M.resolve_alias('hilda'), a)

def test_associate_with_id(self):
    try:
        fid = M.import_file('test.py')
        with execution(M) as ex:
            touch(ex, "hilda")
            ex.add("hilda", associate_to_id=fid, template="%s.meep")
        hilda_id = M.search_files(source=('execution', ex.id))[0]
        hilda_name = M.fetch_file(hilda_id)['repository_name']
        fid_name = M.fetch_file(fid)['repository_name']
        self.assertEqual("%s.meep" % fid_name, hilda_name)
    finally:
        try:
            M.delete_execution(ex.id)
            M.delete_file(fid)
        except:
            pass

def save_wellington(ex, wellout, chrmeta):
    bedlist = {}
    for name, wlist in wellout.iteritems():
        wellall = unique_filename_in()
        #### Dummy file
        touch(ex, wellall)
        ex.add(wellall,
               description=set_file_descr(name[1] + '_wellington_files', type='none',
                                          view='admin', step='footprints', groupId=name[0]))
        #### BED at FDR 1%
        bedlist[name[0]] = wellall + "FDR01.bed.gz"
        bedzip = gzip.open(bedlist[name[0]], 'wb')
        bedzip.write("track name='" + name[1] + "_WellingtonFootprints_FDR_0.01'\n")
        for x in wlist:
            with open(os.path.join(*x) + ".WellingtonFootprints.FDR.0.01.bed") as _bed:
                [bedzip.write(l) for l in _bed]
        bedzip.close()
        ex.add(wellall + "FDR01.bed.gz",
               description=set_file_descr(name[1] + '_WellingtonFootprintsFDR01.bed.gz',
                                          type='bed', ucsc='1', step='footprints',
                                          groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprintsFDR01.bed.gz')
        #### BED at p-values [...]
        bedzip = gzip.open(wellall + "PvalCutoffs.bed.gz", 'wb')
        for bfile in os.listdir(os.path.join(wlist[0][0], "p_value_cutoffs")):
            cut = os.path.splitext(bfile[:-4])[1][1:]  # between . ([1:]) and .bed ([:-4])
            bedzip.write("track name='" + name[1] + "_WellingtonFootprints_Pval_%s'\n" % cut)
            for wdir, wpref in wlist:
                _bedpath = os.path.join(wdir, "p_value_cutoffs",
                                        wpref + ".WellingtonFootprints." + cut + ".bed")
                with open(_bedpath) as _bed:
                    [bedzip.write(l) for l in _bed]
        bedzip.close()
        ex.add(wellall + "PvalCutoffs.bed.gz",
               description=set_file_descr(name[1] + '_WellingtonFootprintsPvalCutoffs.bed.gz',
                                          type='bed', ucsc='1', step='footprints',
                                          groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprintsPvalCutoffs.bed.gz')
        #### WIG
        cat([os.path.join(*x) + ".WellingtonFootprints.wig" for x in wlist], wellall + ".wig")
        #convert(wellall+".wig", wellall+".bw", chrmeta=chrmeta)
        #ex.add(wellall+".bw",
        #       description=set_file_descr(name[1]+'_WellingtonFootprints.bw',
        #                                  type='bigWig', ucsc='1', step='footprints', groupId=name[0]),
        #       associate_to_filename=wellall, template='%s_WellingtonFootprints.bw')
        ex.add(wellall + ".wig",
               description=set_file_descr(name[1] + '_WellingtonFootprints.wig',
                                          type='wig', ucsc='1', step='footprints',
                                          groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprints.wig')
    return bedlist

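# --- Usage note for save_wellington (illustrative, not part of the original module) ---
# `wellout` maps (groupId, groupName) tuples to lists of (directory, filePrefix) pairs,
# matching the os.path.join(*x) calls above; the paths and LIMS name are hypothetical.
from bein import execution, MiniLIMS

M = MiniLIMS("dnaseseq_lims")
with execution(M) as ex:
    wellout = {(1, "groupA"): [("wellington_out", "groupA_chr1")]}  # hypothetical layout
    bedlist = save_wellington(ex, wellout, chrmeta={})
    # bedlist[1] -> gzipped BED of the FDR 0.01 footprints for group 1
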
def test_associate_with_names(self):
    try:
        with execution(M) as ex:
            touch(ex, "boris")
            touch(ex, "hilda")
            ex.add("boris")
            ex.add("hilda", associate_to_filename="boris", template="%s.meep")
        boris_id = M.search_files(source=('execution', ex.id), with_text="boris")[0]
        hilda_id = M.search_files(source=('execution', ex.id), with_text="hilda")[0]
        boris_name = M.fetch_file(boris_id)['repository_name']
        hilda_name = M.fetch_file(hilda_id)['repository_name']
        self.assertEqual("%s.meep" % boris_name, hilda_name)
    finally:
        try:
            M.delete_execution(ex.id)
        except:
            pass

def createLibrary(ex, assembly_or_fasta, params, url=GlobalHtsUrl, via='local'):
    """
    Main call to create the library
    """
    if len(params['primary']) < 2:
        print('Some parameters are missing, cannot create the library')
        print('primary=' + params['primary'] + " ; " + 'secondary=' + params['secondary'])
        return [None, None, None, None]
    if not isinstance(assembly_or_fasta, genrep.Assembly):
        assembly_or_fasta = genrep.Assembly(ex=ex, fasta=assembly_or_fasta)
    chrnames = assembly_or_fasta.chrnames
    chrom_map = dict((v['ac'], k) for k, v in assembly_or_fasta.chrmeta.iteritems())
    allfiles = assembly_or_fasta.fasta_by_chrom  # assembly_or_fasta.untar_genome_fasta()
    libfiles = dict((c, getRestEnzymeOccAndSeq.nonblocking(ex, f,
                                                           params['primary'], params['secondary'],
                                                           params['length'], params['type'],
                                                           via=via))
                    for c, f in allfiles.iteritems())
    resfile = unique_filename_in()
    os.mkdir(resfile)
    bedfiles = {}
    for chrom, future in libfiles.iteritems():
        libfiles[chrom] = future.wait()
        if not os.path.getsize(libfiles[chrom][1]) > 0:
            time.sleep(60)
            touch(ex, libfiles[chrom][1])
        bedfiles[chrom] = parse_fragFile(libfiles[chrom][1], chrom_map)
    rescov = coverageInRepeats(ex, bedfiles, params['species'], outdir=resfile, via=via)
    bedchrom = [os.path.join(resfile, chrom + ".bed") for chrom in chrnames]
    cat(bedchrom, out=resfile + ".bed")
    gzipfile(ex, [resfile + ".bed"] + bedchrom)
    # resfile_sql = resfile+".sql"
    # track.convert((resfile,'bed'),(resfile_sql,'sql'),assembly=params['species'])
    enz_list = []
    infos_lib = {'assembly_name': params['species'],
                 'enzyme1_id': getEnzymeSeqId(params['primary'], True, enz_list, url),
                 'enzyme2_id': getEnzymeSeqId(params['secondary'], True, enz_list, url),
                 'segment_length': params['length'],
                 'type': params['type'],
                 'filename': resfile}
    return [libfiles, bedfiles, resfile, infos_lib]

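# --- Usage sketch for createLibrary (illustrative, not part of the original module) ---
# The params keys are exactly those read by the function; the enzyme sites, segment
# length, library type and species values below are examples only.
from bein import execution, MiniLIMS

M = MiniLIMS("c4seq_lims")
params = {'primary': "AAGCTT",   # primary restriction site
          'secondary': "GATC",   # secondary restriction site
          'length': 30,          # segment length
          'type': "typeI",       # library type tag
          'species': "mm9"}      # assembly/species name
with execution(M) as ex:
    libfiles, bedfiles, resfile, infos_lib = createLibrary(ex, "mm9", params, via="local")
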
def test_path_to_file_on_execution(self):
    with execution(None) as ignoreme:
        f = touch(ignoreme)
        M = MiniLIMS("boris")
        fid = M.import_file(f)
        mpath = M.path_to_file(fid)
        with execution(M) as ex:
            fpath = ex.path_to_file(fid)
        used_files = M.fetch_execution(ex.id)['used_files']
        self.assertEqual(used_files, [fid])
        self.assertEqual(mpath, fpath)

def test_immutability_dropped(self):
    executions = []
    with execution(M) as ex:
        touch(ex, "boris")
        ex.add("boris")
    exid1 = ex.id
    borisid = M.search_files(source=('execution', ex.id))[0]
    self.assertFalse(M.fetch_file(borisid)['immutable'])
    with execution(M) as ex:
        ex.use(borisid)
    exid2 = ex.id
    self.assertTrue(M.fetch_file(borisid)['immutable'])
    M.delete_execution(exid2)
    self.assertFalse(M.fetch_file(borisid)['immutable'])
    M.delete_execution(exid1)
    self.assertEqual(M.search_files(source=('execution', exid1)), [])

def test_hierarchical_association(self):
    try:
        with execution(M) as ex:
            touch(ex, "a")
            touch(ex, "b")
            touch(ex, "c")
            ex.add("a")
            ex.add("b", associate_to_filename="a", template="%s.step")
            ex.add("c", associate_to_filename="b", template="%s.step")
        a_id = M.search_files(source=('execution', ex.id), with_text='a')[0]
        b_id = M.search_files(source=('execution', ex.id), with_text='b')[0]
        c_id = M.search_files(source=('execution', ex.id), with_text='c')[0]
        a_name = M.fetch_file(a_id)['repository_name']
        b_name = M.fetch_file(b_id)['repository_name']
        c_name = M.fetch_file(c_id)['repository_name']
        self.assertEqual("%s.step" % a_name, b_name)
        self.assertEqual("%s.step.step" % a_name, c_name)
    finally:
        try:
            M.delete_execution(ex.id)
        except:
            pass

def parse_meme_xml(ex, meme_file, chrmeta):
    """ Parse meme xml file and convert to track """
    from xml.etree import ElementTree as ET
    touch(ex, meme_file)
    tree = ET.parse(meme_file)
    ncol = {}
    allmatrices = {}
    for motif in tree.find('motifs').findall('motif'):
        mid = motif.attrib['id']
        ncol[mid] = 0
        allmatrices[mid] = unique_filename_in()
        with open(allmatrices[mid], 'w') as mat_out:
            for parray in motif.find('probabilities')[0].findall('alphabet_array'):
                ncol[mid] += 1
                m = {'letter_A': 0, 'letter_C': 0, 'letter_G': 0, 'letter_T': 0}
                for col in parray:
                    m[col.attrib['letter_id']] = float(col.text)
                mat_out.write("1\t%f\t%f\t%f\t%f\n" % (m['letter_A'], m['letter_C'],
                                                       m['letter_G'], m['letter_T']))

    def _xmltree(_t):  # (_c,_t):
        seq_name = {}
        seq_chr = None
        for it in _t.getiterator():
            if it.tag == 'sequence':
                seq_name[it.attrib['id']] = it.attrib['name']
            if it.tag == 'scanned_sites':
                name = seq_name[it.attrib['sequence_id']]
                name, seq_chr, start, end = re.search(r'(.*)\|(.+):(\d+)-(\d+)', name).groups()
            if it.tag == 'scanned_site':  # and _c == seq_chr:
                start = int(start) + int(it.attrib['position']) - 1
                end = start + ncol[it.attrib['motif_id']]
                strnd = it.attrib['strand'] == 'plus' and 1 or -1
                score = it.attrib['pvalue']
                yield (seq_chr, str(start), str(end), it.attrib['motif_id'], score, strnd)

    outsql = unique_filename_in() + ".sql"
    outtrack = track(outsql, chrmeta=chrmeta, info={'datatype': 'qualitative'},
                     fields=['start', 'end', 'name', 'score', 'strand'])
    outtrack.write(FeatureStream(_xmltree(tree), fields=['chr'] + outtrack.fields))
    outtrack.close()
    return {'sql': outsql, 'matrices': allmatrices}

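# --- Usage sketch for parse_meme_xml (illustrative, not part of the original module) ---
# Sequence names in the MEME xml are expected to look like "name|chrN:start-end"
# (see the regex in _xmltree); the file name and chrmeta below are hypothetical.
from bein import execution, MiniLIMS

M = MiniLIMS("motif_lims")
with execution(M) as ex:
    res = parse_meme_xml(ex, "meme.xml", chrmeta={'chr1': {'length': 197195432}})
    print res['sql']       # qualitative track of motif occurrences
    print res['matrices']  # dict {motif_id: matrix file}
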
def filterSeq(ex, fastqFiles, seqToFilter, gid, grp_name, via='lsf'):
    #seqToFilter=`awk -v primer=${curPrimer} '{if($0 ~ /^>/){n=split($0,a,"|");curPrimer=a[1];gsub(">","",curPrimer); if(curPrimer == primer){seqToFilter="";for(i=5;i<n;i++){if(a[i] !~ /Exclude=/){seqToFilter=seqToFilter""a[i]","}} if(a[n] !~ /Exclude=/){seqToFilter=seqToFilter""a[n]}else{gsub(/,$/,"",seqToFilter)};print seqToFilter}}}' ${primersFile}`
    indexSeqToFilter = {}
    indexFiles = {}
    for k, f in seqToFilter.iteritems():
        if os.path.getsize(f) == 0: continue
        ex.add(f, description=set_file_descr(grp_name + "_" + k + "_seqToFilter.fa",
                                             groupId=gid, step="filtering",
                                             type="fa", view="admin"))
        if k in fastqFiles:
            indexFiles[k] = bowtie_build.nonblocking(ex, f, via=via)
    unalignedFiles = {}
    futures = []
    bwtarg = ["-a", "-q", "-n", "2", "-l", "20", "--un"]
    for k, f in indexFiles.iteritems():
        unalignedFiles[k] = unique_filename_in()
        touch(ex, unalignedFiles[k])
        futures.append(bowtie.nonblocking(ex, f.wait(), fastqFiles[k],
                                          bwtarg + [unalignedFiles[k]], via=via))
    for f in futures:
        f.wait()
    return unalignedFiles

def add_file(ex):
    touch(ex, "boris")
    ex.add("boris", description="test")

def test_resolve_alias_returns_int_if_exists(self):
    with execution(None) as ex:
        f = touch(ex)
        M = MiniLIMS("boris")
        a = M.import_file(f)
        self.assertEqual(M.resolve_alias(a), a)

def density_to_countsPerFrag(ex, file_dict, groups, assembly, regToExclude, script_path, via='lsf'):
    '''
    Main function to compute normalised counts per fragments from a density file.
    '''
    futures = {}
    results = {}
    for gid, group in groups.iteritems():
        reffile = file_dict['lib'][gid]
        futures[gid] = {}
        results[gid] = {}
        for rid, run in group['runs'].iteritems():
            density_file = file_dict['4cseq']['density_files'][gid][rid]
            gm_futures = []
            for ch in assembly.chrnames:
                chref = os.path.join(reffile, ch + ".bed.gz")
                if not (os.path.exists(chref)): chref = reffile
                # features = track(chref,'bed')
                # outbed.write(gMiner.stream.mean_score_by_feature(
                #     scores.read(selection=ch),
                #     features.read(selection=ch)), mode='append')
                bedfile = unique_filename_in() + ".bed"
                gfminer_job = {"operation": "score_by_feature",
                               "output": bedfile,
                               "datatype": "qualitative",
                               "args": "'" + json.dumps({"trackScores": density_file,
                                                         "trackFeatures": chref,
                                                         "chromosome": ch}) + "'"}
                gm_futures.append((gfminer_run.nonblocking(ex, gfminer_job, via=via),
                                   bedfile))
            outsql = unique_filename_in() + ".sql"
            sqlouttr = track(outsql, chrmeta=assembly.chrmeta,
                             info={'datatype': 'quantitative'},
                             fields=['start', 'end', 'score'])
            outbed_all = []
            for n, f in enumerate(gm_futures):
                f[0].wait()
                fout = f[1]
                if not (os.path.exists(fout)):
                    time.sleep(60)
                    touch(ex, fout)
                outbed_all.append(fout)
                outbed = track(fout, chrmeta=assembly.chrmeta)
                sqlouttr.write(outbed.read(fields=['start', 'end', 'score'],
                                           selection={'score': (0.01, sys.maxint)}),
                               chrom=assembly.chrnames[n])
            sqlouttr.close()
            countsPerFragFile = unique_filename_in() + ".bed"
            countsPerFragFile = cat(outbed_all, out=countsPerFragFile)
            results[gid][rid] = [countsPerFragFile, outsql]
            FragFile = unique_filename_in()
            touch(ex, FragFile)
            futures[gid][rid] = (FragFile,
                                 segToFrag.nonblocking(ex, countsPerFragFile,
                                                       regToExclude[gid], script_path,
                                                       via=via, stdout=FragFile, memory=4))

    def _parse_select_frag(stream):
        for s in stream:
            sr = s.strip().split('\t')
            if 'IsValid' in sr[2] and not any([w in sr[8] for w in ['_and_', 'BothRepeats', 'notValid']]):
                patt = re.search(r'([^:]+):(\d+)-(\d+)', sr[1])
                if patt:
                    coord = patt.groups()
                    # if float(sr[11])>0.0:
                    yield (coord[0], int(coord[1]) - 1, int(coord[2]), float(sr[11]))

    for gid, dict_gid in futures.iteritems():
        for rid, res in dict_gid.iteritems():
            res[1].wait()
            touch(ex, res[0])
            segOut = open(res[0], "r")
            resBedGraph = unique_filename_in() + ".sql"
            sqlTr = track(resBedGraph, fields=['start', 'end', 'score'],
                          info={'datatype': 'quantitative'}, chrmeta=assembly.chrmeta)
            sqlTr.write(_parse_select_frag(segOut), fields=['chr', 'start', 'end', 'score'])
            sqlTr.close()
            segOut.close()
            results[gid][rid].extend([res[0], resBedGraph])
    return results  # [countsPerFrag_allBed, countsPerFrag_selectSql, segToFrag_out, segToFrag_sql]

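# Note on the format consumed by _parse_select_frag above (inferred from the parsing
# code itself, not from segToFrag documentation): each segToFrag output line is
# tab-separated with the fragment coordinates "chr:start-end" in column 2, a validity
# flag containing 'IsValid' in column 3, a repeat/status string in column 9 and the
# score in column 12; only valid fragments outside repeats are written to the sql track.
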
def test_syntaxerror_outside_execution(self):
    with execution(M) as ex:
        pass
    M.delete_execution(ex.id)
    with self.assertRaises(SyntaxError):
        touch(ex)

def dnaseseq_workflow(ex, job, assembly, logfile=sys.stdout, via='lsf'):
    """
    This workflow performs the following steps:

      * BAM files from replicates within the same group are merged
      * MACS is called to identify enriched regions (only peak summit +/- 300 will be used);
        this can be bypassed by providing a bed file to any group
      * Wellington is called to identify footprints within these enriched regions
      * If a list of motifs is provided (by group), footprints are scanned and motif
        occurrences (log-likelihood ratio > 0) are recorded in a bed file
      * Average DNase profiles around motifs are plotted
    """
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    supdir = os.path.split(ex.remote_working_directory)[0]
    for gid, mapped in job.files.iteritems():
        group_name = job.groups[gid]['name']
        if not isinstance(mapped, dict):
            raise TypeError("Files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped: mapped = {'_': mapped}
        if len(mapped) > 1:
            bamfile = merge_bam(ex, [m['bam'] for m in mapped.values()])
            index = index_bam(ex, bamfile)
        else:
            bamfile = mapped.values()[0]['bam']
        if job.groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            if os.path.exists(job.groups[gid].get('bedfile', 'null')):
                bedfile = job.groups[gid]['bedfile']
            elif os.path.exists(os.path.join(supdir, job.groups[gid].get('bedfile', 'null'))):
                bedfile = os.path.join(supdir, job.groups[gid]['bedfile'])
            else:
                bedfile = None
            tests.append((bedfile, bamfile))
            names['tests'].append((gid, group_name))
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    tests = macs_bedfiles(ex, assembly.chrmeta, tests, controls, names,
                          job.options.get('macs_args', ["--keep-dup", "10"]), via, logfile)
    bedlist = run_wellington(ex, tests, names, assembly, via, logfile)
    ######################### Motif scanning / plotting
    if any([gr.get('motif') != 'null' and gr.get('motif')
            for gr in job.groups.values()]):
        motifbeds = motif_scan(ex, bedlist, assembly, job.groups, via, logfile)
        siglist = dict((gid[0], []) for gid in names['tests'])
        for gid, mapped in job.files.iteritems():
            wig = []
            suffixes = ["fwd", "rev"]
            merge_strands = int(job.options.get('merge_strands', -1))
            read_extension = int(job.options.get('read_extension') or -1)
            make_wigs = merge_strands >= 0 or read_extension != 1
            for m in mapped.values():
                if make_wigs or not ('wig' in m) or len(m['wig']) < 2:
                    output = mapseq.parallel_density_sql(ex, m["bam"], assembly.chrmeta,
                                                         nreads=m["stats"]["total"],
                                                         merge=-1, read_extension=1,
                                                         convert=False, b2w_args=[], via=via)
                    wig.append(dict((s, output + s + '.sql') for s in suffixes))
                else:
                    wig.append(m['wig'])
            if len(wig) > 1:
                wig[0] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via))
                              for s in suffixes)
            _trn = job.groups[gid]['name'] + "_%s"
            if job.groups[gid]['control']:
                for s, w in wig[0].iteritems():
                    for _g in siglist.keys():
                        siglist[_g].append(track(w, info={'name': _trn % s}))
            else:
                siglist[gid].extend([track(w, info={'name': _trn % s})
                                     for s, w in wig[0].iteritems()])
        plot_files = plot_footprint_profile(ex, motifbeds, siglist,
                                            assembly.chrnames, job.groups, logfile)
        for gid, flist in plot_files.iteritems():
            gname = job.groups[gid]['name']
            plotall = unique_filename_in()
            touch(ex, plotall)
            ex.add(plotall, description=set_file_descr(gname + '_footprints_plots',
                                                       type='none', view='admin',
                                                       step='motifs', groupId=gid))
            ex.add(flist['pdf'], description=set_file_descr(gname + '_footprints_plots.pdf',
                                                            type='pdf', step='motifs',
                                                            groupId=gid),
                   associate_to_filename=plotall, template='%s.pdf')
            tarname = unique_filename_in()
            tarfh = tarfile.open(tarname, "w:gz")
            for mname, matf in flist['mat']:
                tarfh.add(matf, arcname="%s_%s.txt" % (gname, mname))
            tarfh.close()
            ex.add(tarname, description=set_file_descr(gname + '_footprints_plots.tar.gz',
                                                       type='tar', step='motifs', groupId=gid),
                   associate_to_filename=plotall, template='%s.tar.gz')
    logfile.write("\nDone.\n ")
    logfile.flush()
    return 0

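# --- Usage sketch for dnaseseq_workflow (illustrative, not part of the original module) ---
# `job` objects normally come from the bbcflib frontend; only job.files, job.groups and
# job.options are accessed above. All names and values below are hypothetical.
from bein import execution, MiniLIMS
from bbcflib import genrep

class _Job(object):
    """Minimal stand-in for a frontend job object."""
    def __init__(self, files, groups, options):
        self.files, self.groups, self.options = files, groups, options

M = MiniLIMS("dnaseseq_lims")
assembly = genrep.Assembly('mm9')
job = _Job(files={1: {'run1': {'bam': "groupA.bam", 'stats': {'total': 1000000}}}},
           groups={1: {'name': "groupA", 'control': False}},
           options={'macs_args': ["--keep-dup", "10"]})
with execution(M) as ex:
    dnaseseq_workflow(ex, job, assembly, via='local')
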
def add_macs_results(ex, read_length, genome_size, bamfile,
                     ctrlbam=None, name=None, poisson_threshold=None,
                     alias=None, macs_args=None, via='lsf'):
    """Calls the ``macs`` function on each possible pair of test and control bam files
    and adds the respective outputs to the execution repository.

    ``macs`` options can be controlled with `macs_args`.
    If a dictionary of Poisson thresholds for each sample is given, then the enrichment
    bounds ('-m' option) are computed from them; otherwise the default is '-m 10,100'.

    Returns the set of file prefixes.
    """
    if not (isinstance(bamfile, list)): bamfile = [bamfile]
    if not (isinstance(ctrlbam, list)): ctrlbam = [ctrlbam]
    if poisson_threshold is None: poisson_threshold = {}
    if macs_args is None: macs_args = []
    futures = {}
    rl = read_length
    for i, bam in enumerate(bamfile):
        n = name['tests'][i]
        if poisson_threshold.get(n) > 0:
            low = (poisson_threshold.get(n) + 1) * 5
            enrich_bounds = str(min(30, low)) + "," + str(10 * low)
        else:
            enrich_bounds = "10,100"
        if not ("-m" in macs_args): macs_args += ["-m", enrich_bounds]
        if isinstance(read_length, list): rl = read_length[i]
        for j, cam in enumerate(ctrlbam):
            m = name['controls'][j]
            nm = (n, m)
            futures[nm] = macs.nonblocking(ex, rl, genome_size, bam, cam,
                                           args=macs_args, via=via, memory=12)
    prefixes = {}
    for n, f in futures.iteritems():
        p = f.wait()
        prefixes[n] = p
        macs_descr0 = {'step': 'macs', 'type': 'none', 'view': 'admin', 'groupId': n[0][0]}
        macs_descr1 = {'step': 'macs', 'type': 'xls', 'groupId': n[0][0]}
        macs_descr2 = {'step': 'macs', 'type': 'bed', 'groupId': n[0][0], 'ucsc': '1'}
        filename = "_vs_".join([x[1] for x in n if x[0]])
        touch(ex, p)
        ex.add(p, description=set_file_descr(filename, **macs_descr0), alias=alias)
        ex.add(p + "_peaks.xls",
               description=set_file_descr(filename + "_peaks.xls", **macs_descr1),
               associate_to_filename=p, template='%s_peaks.xls')
        bedzip = gzip.open(p + "_peaks.bed.gz", 'wb')
        bedzip.write("track name='" + filename + "_macs_peaks'\n")
        with open(p + "_peaks.bed") as bedinf:
            [bedzip.write(l) for l in bedinf]
        bedzip.close()
        ex.add(p + "_peaks.bed.gz",
               description=set_file_descr(filename + "_peaks.bed.gz", **macs_descr2),
               associate_to_filename=p, template='%s_peaks.bed.gz')
        bedzip = gzip.open(p + "_summits.bed.gz", 'wb')
        bedzip.write("track name='" + filename + "_macs_summits'\n")
        with open(p + "_summits.bed") as bedinf:
            [bedzip.write(l) for l in bedinf]
        bedzip.close()
        ex.add(p + "_summits.bed.gz",
               description=set_file_descr(filename + "_summits.bed.gz", **macs_descr2),
               associate_to_filename=p, template='%s_summits.bed.gz')
        if n[1][0]:
            ex.add(p + "_negative_peaks.xls",
                   description=set_file_descr(filename + "_negative_peaks.xls", **macs_descr0),
                   associate_to_filename=p, template='%s_negative_peaks.xls')
    return prefixes

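# --- Usage sketch for add_macs_results (illustrative, not part of the original module) ---
# `name` follows the same {'tests': [(gid, name)], 'controls': [(gid, name)]} convention
# used elsewhere in this code; bam paths and genome size below are hypothetical.
from bein import execution, MiniLIMS

M = MiniLIMS("chipseq_lims")
names = {'tests': [(1, "ip")], 'controls': [(2, "input")]}
with execution(M) as ex:
    prefixes = add_macs_results(ex, read_length=50, genome_size=2.7e9,
                                bamfile="ip.bam", ctrlbam="input.bam",
                                name=names, via='local')
    # prefixes[((1, "ip"), (2, "input"))] -> MACS output prefix for that test/control pair
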
def c4seq_workflow(ex, job, primers_dict, assembly,
                   c4_url=None, script_path='', logfile=sys.stdout, via='lsf'):
    '''
    Main:
      * open the 4C-seq minilims and create execution
      * 0. get/create the library
      * 1. if necessary, calculate the density file from the bam file (mapseq.parallel_density_sql)
      * 2. calculate the counts per fragment for each density file (with gfminer's score_by_feature)
    '''
    mapseq_files = job.files
    ### outputs
    processed = {'lib': {}, 'density': {}, '4cseq': {}}
    processed['4cseq'] = {'density_files': {},
                          'countsPerFrag': {},
                          'countsPerFrag_grp': {},
                          'norm': {},
                          'norm_grp': {},
                          'profileCorrection': {},
                          'profileCorrection_grp': {},
                          'smooth_grp': {},
                          'domainogram_grp': {},
                          'bricks2frags': {}}
    # was 'smoothFrag': {}, 'domainogram': {}}
    regToExclude = {}
    new_libs = []
    ### options
    run_domainogram = {}
    before_profile_correction = {}
    if not job.options.get('viewpoints_chrs', False):
        out_chromosomes = ','.join([ch for ch in assembly.chrnames])
    else:
        out_chromosomes = ','.join([primers_dict.get(group['name'], {}).get('baitcoord').split(':')[0]
                                    for gid, group in job.groups.iteritems()])
    print "out_chromosomes=" + out_chromosomes + "\n"
    sizeExt = job.options.get('norm_reg', 1000000)
    print "region considered for normalisation: mid viewpoint +/-" + str(sizeExt) + 'bps'
    ### do it
    for gid, group in job.groups.iteritems():
        run_domainogram[gid] = group.get('run_domainogram', False)
        if isinstance(run_domainogram[gid], basestring):
            run_domainogram[gid] = (run_domainogram[gid].lower() in ['1', 'true', 'on', 't'])
        before_profile_correction[gid] = group.get('before_profile_correction', False)
        if isinstance(before_profile_correction[gid], basestring):
            before_profile_correction[gid] = (before_profile_correction[gid].lower() in ['1', 'true', 'on', 't'])
        processed['lib'][gid] = get_libForGrp(ex, group, assembly, new_libs, gid, c4_url, via=via)
        #reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed'
        processed['4cseq']['density_files'][gid] = {}
        regToExclude[gid] = primers_dict.get(group['name'], {}).get('regToExclude', "").replace('\r', '')
        # if no regToExclude defined, set it as mid_baitCoord +/-5kb
        if len(regToExclude[gid]) == 0:
            baitcoord_mid = int(0.5 * (int(primers_dict.get(group['name'], {}).get('baitcoord').split(':')[1].split('-')[0])
                                       + int(primers_dict.get(group['name'], {}).get('baitcoord').split(':')[1].split('-')[1])))
            regToExclude[gid] = primers_dict.get(group['name'], {}).get('baitcoord').split(':')[0] \
                                + ':' + str(baitcoord_mid - 5000) + '-' + str(baitcoord_mid + 5000)
        #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()]))
        print(primers_dict.get(group['name'], {}))
        print "regToExclude[" + str(gid) + "]=" + regToExclude[gid]
        for rid, run in group['runs'].iteritems():
            libname = mapseq_files[gid][rid]['libname']
            if job.options.get('merge_strands') != 0 or not ('wig' in mapseq_files[gid][rid]):
                density_file = parallel_density_sql(ex, mapseq_files[gid][rid]['bam'],
                                                    assembly.chrmeta,
                                                    nreads=mapseq_files[gid][rid]['stats']["total"],
                                                    merge=0,
                                                    read_extension=mapseq_files[gid][rid]['stats']['read_length'],
                                                    convert=False, via=via)
                density_file += "merged.sql"
                ex.add(density_file,
                       description=set_file_descr("density_file_" + libname + ".sql",
                                                  groupId=gid, step="density", type="sql",
                                                  view='admin', gdv="1"))
            else:
                density_file = mapseq_files[gid][rid]['wig']['merged']
            #density_files.append(density_file)
            processed['4cseq']['density_files'][gid][rid] = density_file

    # back to grp level!
    # not anymore:
    # processed['density'][gid] = merge_sql(ex, density_files, via=via)

    processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag(ex, processed, job.groups,
                                                                   assembly, regToExclude,
                                                                   script_path, via)
    ## access per gid+rid
    futures_norm = {}
    countsPerFrags_bedGraph = {}
    futures_merged_raw = {}
    for gid, group in job.groups.iteritems():
        futures_norm[gid] = {}
        countsPerFrags_bedGraph[gid] = {}
        processed['4cseq']['norm'][gid] = {}
        for rid, run in group['runs'].iteritems():
            normfile = unique_filename_in()
            touch(ex, normfile)
            resfile = unique_filename_in() + ".bedGraph"
            resfiles = processed['4cseq']['countsPerFrag'][gid][rid]  # _all.sql
            convert(resfiles[3], resfile)
            countsPerFrags_bedGraph[gid][rid] = resfile
            print "call normFrags: infiles=" + resfile + ", normfile=" + normfile \
                  + "baitCoord=" + primers_dict[group['name']]['baitcoord'] \
                  + ", sizeExt=sizeExt, name=" + group['name'] + "rep_" + str(rid) \
                  + "regToExclude=" + regToExclude[gid] + "\n"
            futures_norm[gid][rid] = normFrags.nonblocking(ex, resfile, normfile,
                                                           baitCoord=primers_dict[group['name']]['baitcoord'],
                                                           sizeExt=sizeExt,
                                                           name=group['name'] + "rep_" + str(rid),
                                                           regToExclude=regToExclude[gid],
                                                           script_path=script_path, via=via)
            processed['4cseq']['norm'][gid][rid] = normfile

        if len(group) > 1:
            ## merge replicates before normalisation.
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_raw_mergedRep"
            print "gid=" + group['name']
            print "call mergeRep for replicates before normalisation: infiles=" \
                  + ",".join([res_rid for rid, res_rid in countsPerFrags_bedGraph[gid].iteritems()]) \
                  + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[gid] + "\n"
            futures_merged_raw[gid] = mergeRep.nonblocking(ex,
                                                           ",".join([res_rid for rid, res_rid in countsPerFrags_bedGraph[gid].iteritems()]),
                                                           mergefile, regToExclude[gid],
                                                           name=titleName,
                                                           script_path=script_path, via=via, memory=8)
            processed['4cseq']['countsPerFrag_grp'][gid] = mergefile
        else:
            futures_merged_raw[gid] = None
            processed['4cseq']['countsPerFrag_grp'][gid] = countsPerFrags_bedGraph[gid][0]  # if no replicates, then the file we want is the 1st one

    print "***** profile correction / sample + merge normalised data"
    futures_merged = {}   # per gid
    futures_profcor = {}  # per gid, per rid
    for gid, group in job.groups.iteritems():
        ## run profile correction per run then merge them
        futures_profcor[gid] = {}
        processed['4cseq']['profileCorrection'][gid] = {}
        for rid, run in group['runs'].iteritems():
            # wait for normalisation of all replicates to be finished
            futures_norm[gid][rid].wait()  ## normalised files, per grp, per rep
            normfile = processed['4cseq']['norm'][gid][rid]
            file1 = unique_filename_in()  # track file
            touch(ex, file1)
            file2 = unique_filename_in()  # report file
            touch(ex, file2)
            file3 = unique_filename_in()  # table file
            touch(ex, file3)
            print "call profileCorrection: normfile=" + normfile \
                  + ", baitCoord=" + primers_dict[group['name']]['baitcoord'] \
                  + ", name=" + group['name'] \
                  + ", file1=" + file1 + ", file2=" + file2 + ", file3= " + file3 + "\n"
            futures_profcor[gid][rid] = profileCorrection.nonblocking(ex, normfile,
                                                                      primers_dict[group['name']]['baitcoord'],
                                                                      group['name'],
                                                                      file1, file2, file3,
                                                                      script_path, via=via)
            processed['4cseq']['profileCorrection'][gid][rid] = [file1, file2, file3]

        ## merge replicates before profile correction. Needs all normalisations for the
        ## given grp to be finished, which is why it comes after the rid loop.
        if len(group) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_norm_mergedRep"
            print "gid=" + group['name']
            print "call mergeRep: infiles=" \
                  + ",".join([res_rid for rid, res_rid in processed['4cseq']['norm'][gid].iteritems()]) \
                  + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[gid] + "\n"
            futures_merged[gid] = mergeRep.nonblocking(ex,
                                                       ",".join([res_rid for rid, res_rid in processed['4cseq']['norm'][gid].iteritems()]),
                                                       mergefile, regToExclude[gid],
                                                       name=titleName,
                                                       script_path=script_path, via=via, memory=8)
            processed['4cseq']['norm_grp'][gid] = mergefile
        else:
            futures_merged[gid] = None
            processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][gid][0]  ## if no replicates, then the file we want is the 1st one

    print "***** merge profile corrected data"
    futures_profcor_merged = {}  # per gid
    for gid, group in job.groups.iteritems():
        processed['4cseq']['profileCorrection_grp'][gid] = {}
        for rid, run in group['runs'].iteritems():
            futures_profcor[gid][rid].wait()  ## wait for ProfileCorrection to be finished
        ## merge replicates after profile correction
        if len(group) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_ProfCor_mergedRep"
            pcfiles = [processed['4cseq']['profileCorrection'][gid][rid][0]
                       for rid, res_rid in processed['4cseq']['profileCorrection'][gid].iteritems()]
            print "call mergeRep (for PC tables): infiles=" + ",".join(pcfiles) \
                  + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[gid] + "\n"
            futures_profcor_merged[gid] = mergeRep.nonblocking(ex, ",".join(pcfiles),
                                                               mergefile, regToExclude[gid],
                                                               name=titleName,
                                                               script_path=script_path, via=via, memory=8)
            processed['4cseq']['profileCorrection_grp'][gid] = mergefile
        else:
            futures_profcor_merged[gid] = None
            processed['4cseq']['profileCorrection_grp'][gid] = processed['4cseq']['profileCorrection'][gid][0]  ## if no replicates, then the file we want is the 1st one

    print "***** smooth data"
    futures_smoothed = {}
    for gid, group in job.groups.iteritems():
        file1 = unique_filename_in()
        touch(ex, file1)
        file2 = unique_filename_in()
        touch(ex, file2)
        file3 = unique_filename_in()
        touch(ex, file3)
        nFragsPerWin = group['window_size']
        futures_merged_raw[gid].wait()  ## wait for merging of raw_grp to be completed
        futures_smoothed[gid] = (smoothFragFile.nonblocking(ex, processed['4cseq']['countsPerFrag_grp'][gid],
                                                            nFragsPerWin, group['name'],
                                                            file1, regToExclude[gid],
                                                            script_path=script_path, via=via, memory=6),)
        futures_merged[gid].wait()  ## wait for merging of norm_grp to be completed
        futures_smoothed[gid] += (smoothFragFile.nonblocking(ex, processed['4cseq']['norm_grp'][gid],
                                                             nFragsPerWin, group['name'] + "_norm",
                                                             file2, regToExclude[gid],
                                                             script_path=script_path, via=via, memory=6),)
        futures_profcor_merged[gid].wait()  # wait for the merging of profile corrected data to be done
        futures_smoothed[gid] += (smoothFragFile.nonblocking(ex, processed['4cseq']['profileCorrection_grp'][gid],
                                                             nFragsPerWin, group['name'] + "_fromProfileCorrected",
                                                             file3, regToExclude[gid],
                                                             script_path=script_path, via=via, memory=6),)
        processed['4cseq']['smooth_grp'][gid] = [file1, file2, file3]  # [smoothed file before norm, smoothed file before PC, smoothed file after PC]

    print "***** Domainograms"
    futures_domainograms = {}
    for gid, group in job.groups.iteritems():
        grName = job.groups[gid]['name']
        if run_domainogram[gid]:
            regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord']
            if before_profile_correction[gid]:
                futures_domainograms[gid] = runDomainogram.nonblocking(ex, processed['4cseq']['norm_grp'][gid],
                                                                       grName, regCoord=regCoord, skip=1,
                                                                       script_path=script_path, via=via, memory=15)
            else:
                futures_domainograms[gid] = runDomainogram.nonblocking(ex, processed['4cseq']['profileCorrection_grp'][gid],
                                                                       grName, regCoord=regCoord.split(':')[0], skip=1,
                                                                       script_path=script_path, via=via, memory=15)

    ## prepare tar files for domainogram results (if any)
    ## and create "BRICKS to frags" files
    print "***** BRICKS to Frags"
    futures_BRICKS2Frags = {}
    for gid, f in futures_domainograms.iteritems():
        if run_domainogram[gid]:  # if domainogram has been run
            resFiles = []
            logFile = f.wait()
            start = False
            tarname = job.groups[gid]['name'] + "_domainogram.tar.gz"
            res_tar = tarfile.open(tarname, "w:gz")
            futures_BRICKS2Frags[gid] = []
            processed['4cseq']['bricks2frags'][gid] = []
            if logFile is None: continue
            with open(logFile) as f:
                for s in f:
                    s = s.strip()
                    if '####resfiles####' in s:
                        start = True
                    elif start and "RData" not in s:
                        resFiles.append(s)
                        res_tar.add(s)
                    if start and "foundBRICKS" in s:
                        bricks2fragsfile = unique_filename_in() + ".bedGraph"
                        touch(ex, bricks2fragsfile)
                        futures_BRICKS2Frags[gid] += [BRICKSToFrag.nonblocking(ex, s,
                                                                               processed['4cseq']['norm_grp'][gid],
                                                                               bricks2fragsfile,
                                                                               script_path=script_path, via=via, memory=4)]
                        processed['4cseq']['bricks2frags'][gid] += [bricks2fragsfile]
            res_tar.close()
            processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname]

    ############### prepare tables for global results
    print "***** combine results into tables "
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        for rid, run in group['runs'].iteritems():
            allNames += [group['name'] + "_rep" + str(rid) + "_norm",
                         group['name'] + "_rep" + str(rid) + "_fit"]
            allFiles += [processed['4cseq']['profileCorrection'][gid][rid][2]]
            allRegToExclude += [regToExclude[gid]]
    tablePC = unique_filename_in() + ".txt"
    print("***will call makeTable with:")
    print(",".join(allFiles))
    print("resfile=" + tablePC)
    print(",".join(allNames))
    touch(ex, tablePC)
    #regToExclude[gid]
    futures_tables = (makeTable.nonblocking(ex, ",".join(allFiles), tablePC, ",".join(allNames),
                                            idCols="4,5", all_regToExclude=','.join(allRegToExclude),
                                            script_path=script_path, via=via, memory=8),)

    # wait for all smoothing to be done
    for gid, fg in futures_smoothed.iteritems():
        for f in fg: f.wait()

    ## make Table raw/smoothed_raw
    print("** make Table raw/smoothed_raw")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        futures_merged_raw[gid].wait()
        allNames += [group['name'] + "_raw", group['name'] + "_rawSmoothed"]
        allFiles += [processed['4cseq']['countsPerFrag_grp'][gid],
                     processed['4cseq']['smooth_grp'][gid][0]]
        allRegToExclude += ['NA', regToExclude[gid]]
    tableSmoothedRaw_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothedRaw_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedRaw_grp, ",".join(allNames),
                                             idCols="4", out_chromosomes=out_chromosomes,
                                             all_regToExclude=','.join(allRegToExclude),
                                             script_path=script_path, via=via, memory=8),)

    ## make Table norm/smoothed_norm before PC
    print("** make Table norm/smoothed_norm before PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [group['name'] + "_norm", group['name'] + "_smoothed"]
        allFiles += [processed['4cseq']['norm_grp'][gid],
                     processed['4cseq']['smooth_grp'][gid][1]]
        allRegToExclude += [regToExclude[gid], regToExclude[gid]]
    tableSmoothed_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothed_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothed_grp, ",".join(allNames),
                                             idCols="4", out_chromosomes=out_chromosomes,
                                             all_regToExclude=','.join(allRegToExclude),
                                             script_path=script_path, via=via, memory=8),)

    ## make Table norm/smoothed_norm after PC
    print("** make Table norm/smoothed_norm after PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [group['name'] + "_normPC", group['name'] + "_smoothedPC"]
        allFiles += [processed['4cseq']['profileCorrection_grp'][gid],
                     processed['4cseq']['smooth_grp'][gid][2]]
        allRegToExclude += [regToExclude[gid], regToExclude[gid]]
    tableSmoothedPC_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothedPC_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedPC_grp, ",".join(allNames),
                                             idCols="4", out_chromosomes=out_chromosomes,
                                             all_regToExclude=','.join(allRegToExclude),
                                             script_path=script_path, via=via, memory=8),)

    ## combine BRICKS2Frags files
    allNames = []
    allFiles = []
    for gid, fg in futures_BRICKS2Frags.iteritems():
        for f in fg: f.wait()
        allNames += [job.groups[gid]['name'] + "_BRICKSpval"]
        cat_bricks2frags = unique_filename_in() + ".txt"
        print ','.join(processed['4cseq']['bricks2frags'][gid])
        cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid], out=cat_bricks2frags)
        allFiles += [cat_bricks2frags]
    for gid, fg in futures_smoothed.iteritems():
        for f in fg: f.wait()
    tableBRICKS2Frags = unique_filename_in() + ".txt"
    touch(ex, tableBRICKS2Frags)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableBRICKS2Frags, ",".join(allNames),
                                             idCols="4", out_chromosomes=out_chromosomes, defVal="NA",
                                             script_path=script_path, via=via, memory=8),)

    for f in futures_tables: f.wait()

    ################ Add everything to minilims below!
    step = "density"
    for gid in processed['4cseq']['density_files'].keys():
        for rid, sql in processed['4cseq']['density_files'][gid].iteritems():
            fname = "density_file_" + job.groups[gid]['name'] + "_merged_rep" + str(rid)
            ex.add(sql, description=set_file_descr(fname + ".sql",
                                                   groupId=gid, step=step, type="sql", gdv="1"))
            wig = unique_filename_in() + ".bw"
            convert(sql, wig)
            ex.add(wig, description=set_file_descr(fname + ".bw",
                                                   groupId=gid, step=step, type="bigWig", ucsc="1"))

    step = "counts_per_frag"  # was _norm_counts_per_frags
    # before normalisation process, per replicate
    for gid in processed['4cseq']['countsPerFrag'].keys():
        for rid, resfiles in processed['4cseq']['countsPerFrag'][gid].iteritems():
            fname = "meanScorePerFeature_" + job.groups[gid]['name'] + "_rep" + str(rid)
            ex.add(resfiles[1], description=set_file_descr(fname + ".sql",
                                                           groupId=gid, step=step, type="sql",
                                                           view="admin", gdv='1'))
            #gzipfile(ex,resfiles[0])
            #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz",
            #                                                       groupId=gid,step=step,type="bed",view="admin" ))
            fname = "segToFrag_" + job.groups[gid]['name'] + "_rep" + str(rid)
            ex.add(resfiles[3], description=set_file_descr(fname + "_all.sql",
                                                           groupId=gid, step=step, type="sql",
                                                           comment="all informative frags - null included"))
            trsql = track(resfiles[3])
            bwig = unique_filename_in() + ".bw"
            trwig = track(bwig, chrmeta=trsql.chrmeta)
            trwig.write(trsql.read(fields=['chr', 'start', 'end', 'score'],
                                   selection={'score': (0.01, sys.maxint)}))
            trwig.close()
            ex.add(bwig, set_file_descr(fname + ".bw", groupId=gid, step=step,
                                        type="bigWig", ucsc='1'))
        ## add segToFrags before normalisation
        futures_merged_raw[gid].wait()
        trbedgraph = track(removeNA(processed['4cseq']['countsPerFrag_grp'][gid]), format='bedgraph')
        bwig = unique_filename_in() + ".bw"
        trwig = track(bwig, chrmeta=assembly.chrmeta)
        trwig.write(trbedgraph.read(fields=['chr', 'start', 'end', 'score'],
                                    selection={'score': (0.01, sys.maxint)}))
        trwig.close()
        fname = "segToFrag_" + job.groups[gid]['name']
        ex.add(bwig, description=set_file_descr(fname + ".bw",
                                                groupId=gid, step=step, type="bigWig",
                                                comment="segToFrag file before normalisation"))

    step = "norm_counts_per_frags"
    # after new normalisation process, combined replicates
    for gid, resfile in processed['4cseq']['norm_grp'].iteritems():
        fname = "normalised_scorePerFeature_" + job.groups[gid]['name']
        gzipfile(ex, resfile)
        ex.add(resfile + ".gz", description=set_file_descr(fname + ".bedGraph.gz",
                                                           groupId=gid, step=step,
                                                           type="bedGraph", ucsc='1'))
    # norm files, per replicate (might be removed)
    for gid, dict_gid in processed['4cseq']['norm'].iteritems():
        for rid, resfile in dict_gid.iteritems():
            fname = "normalised_scorePerFeature_" + job.groups[gid]['name'] + "_rep" + str(rid)
            gzipfile(ex, resfile)
            ex.add(resfile + ".gz",
                   description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step,
                                              type="bedGraph", ucsc='1', gdv='1'))

    step = "profile_correction"
    # Profile corrected data, combined replicates
    for gid, profileCorrectedFile in processed['4cseq']['profileCorrection_grp'].iteritems():
        fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected"
        gzipfile(ex, profileCorrectedFile)
        ex.add(profileCorrectedFile + ".gz",
               description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step,
                                          type="bedGraph", ucsc='1', gdv='1'))
    # Profile corrected, per replicate (might be removed)
    for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems():
        for rid, resfiles in dict_gid.iteritems():
            # profileCorrectedFile = resfiles[0]
            reportProfileCorrection = resfiles[1]
            fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected_rep" + str(rid)
            # gzipfile(ex,profileCorrectedFile)
            # ex.add( profileCorrectedFile+".gz",
            #         description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1'))
            ex.add(reportProfileCorrection,
                   description=set_file_descr(fname + ".pdf", groupId=gid, step=step, type="pdf"))

    step = "smoothing"
    for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems():
        rawSmoothFile = resfiles[0]
        smoothFile = resfiles[1]
        afterProfileCorrection = resfiles[2]
        nFrags = str(job.groups[gid]['window_size'])
        ## smoothed file before normalisation
        fname = "segToFrag_" + job.groups[gid]['name'] + "_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, rawSmoothFile)
        ex.add(rawSmoothFile + ".gz",
               description=set_file_descr(fname, groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1'))
        ## smoothed file after normalisation, before profile correction
        fname = "segToFrag_" + job.groups[gid]['name'] + "_norm_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, smoothFile)
        ex.add(smoothFile + ".gz",
               description=set_file_descr(fname, groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1'))
        ## smoothed file after normalisation, after profile correction
        fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, afterProfileCorrection)
        ex.add(afterProfileCorrection + ".gz",
               description=set_file_descr(fname, groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1'))

    step = "domainograms"
    for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems():
        tarFile = resfiles.pop()
        fname = job.groups[gid]['name'] + "_domainogram.tar.gz"
        ex.add(tarFile, description=set_file_descr(fname, groupId=gid, step=step, type="tgz"))
        for s in resfiles:
            if s[-8:] == "bedGraph":
                gzipfile(ex, s)
                s += ".gz"
                ex.add(s, description=set_file_descr(s, groupId=gid, step=step,
                                                     type="bedGraph", ucsc="1", gdv="1"))

    step = "combined_results"
    gzipfile(ex, tableSmoothedRaw_grp)
    ex.add(tableSmoothedRaw_grp + ".gz",
           description=set_file_descr("table_segToFrags_smoothed_combined_replicates.txt.gz",
                                      step=step, type="txt"))
    gzipfile(ex, tableSmoothed_grp)
    ex.add(tableSmoothed_grp + ".gz",
           description=set_file_descr("table_normalised_smoothed_combined_replicates.txt.gz",
                                      step=step, type="txt"))
    gzipfile(ex, tableSmoothedPC_grp)
    ex.add(tableSmoothedPC_grp + ".gz",
           description=set_file_descr("table_profileCorrected_smoothed_combined_replicates.txt.gz",
                                      step=step, type="txt"))
    gzipfile(ex, tablePC)
    ex.add(tablePC + ".gz",
           description=set_file_descr("table_normalised_fit_per_replicates.txt.gz",
                                      step=step, type="txt"))
    gzipfile(ex, tableBRICKS2Frags)
    ex.add(tableBRICKS2Frags + ".gz",
           description=set_file_descr("table_frags_in_BRICKS_combined_replicates.txt.gz",
                                      step=step, type="txt"))
    return processed

def c4seq_workflow(ex, job, primers_dict, assembly, c4_url=None, script_path='', logfile=sys.stdout, via='lsf'): ''' Main * open the 4C-seq minilims and create execution * 0. get/create the library * 1. if necessary, calculate the density file from the bam file (mapseq.parallel_density_sql) * 2. calculate the count per fragment for each denstiy file with gfminer:score_by_feature to calculate) ''' mapseq_files = job.files ### outputs processed = {'lib': {}, 'density': {}, '4cseq': {}} processed['4cseq'] = { 'density_files': {}, 'countsPerFrag': {}, 'countsPerFrag_grp': {}, 'norm': {}, 'norm_grp': {}, 'profileCorrection': {}, 'profileCorrection_grp': {}, 'smooth_grp': {}, 'domainogram_grp': {}, 'bricks2frags': {} } # was 'smoothFrag': {}, 'domainogram': {}} regToExclude = {} new_libs = [] ### options run_domainogram = {} before_profile_correction = {} if not job.options.get('viewpoints_chrs', False): out_chromosomes = ','.join([ch for ch in assembly.chrnames]) else: out_chromosomes = ','.join([ primers_dict.get(group['name'], {}).get('baitcoord').split(':')[0] for gid, group in job.groups.iteritems() ]) print "out_chromosomes=" + out_chromosomes + "\n" sizeExt = job.options.get('norm_reg', 1000000) print "region considered for normalisation: mid viewpoint +/-" + str( sizeExt) + 'bps' ### do it for gid, group in job.groups.iteritems(): run_domainogram[gid] = group.get('run_domainogram', False) if isinstance(run_domainogram[gid], basestring): run_domainogram[gid] = (run_domainogram[gid].lower() in ['1', 'true', 'on', 't']) before_profile_correction[gid] = group.get('before_profile_correction', False) if isinstance(before_profile_correction[gid], basestring): before_profile_correction[gid] = ( before_profile_correction[gid].lower() in ['1', 'true', 'on', 't']) processed['lib'][gid] = get_libForGrp(ex, group, assembly, new_libs, gid, c4_url, via=via) #reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed' processed['4cseq']['density_files'][gid] = {} regToExclude[gid] = primers_dict.get(group['name'], {}).get('regToExclude', "").replace('\r', '') # if no regToExclude defined, set it as mid_baitCoord +/-5kb if len(regToExclude[gid]) == 0: baitcoord_mid = int(0.5 * (int( primers_dict.get(group['name'], {}).get('baitcoord').split(':') [1].split('-')[0]) + int( primers_dict.get(group['name'], {}).get('baitcoord').split( ':')[1].split('-')[1]))) regToExclude[gid] = primers_dict.get( group['name'], {}).get('baitcoord').split(':')[0] + ':' + str( baitcoord_mid - 5000) + '-' + str(baitcoord_mid + 5000) #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()])) print(primers_dict.get(group['name'], {})) print "regToExclude[" + str(gid) + "]=" + regToExclude[gid] for rid, run in group['runs'].iteritems(): libname = mapseq_files[gid][rid]['libname'] if job.options.get('merge_strands') != 0 or not ( 'wig' in mapseq_files[gid][rid]): density_file = parallel_density_sql( ex, mapseq_files[gid][rid]['bam'], assembly.chrmeta, nreads=mapseq_files[gid][rid]['stats']["total"], merge=0, read_extension=mapseq_files[gid][rid]['stats'] ['read_length'], convert=False, via=via) density_file += "merged.sql" ex.add(density_file, description=set_file_descr("density_file_" + libname + ".sql", groupId=gid, step="density", type="sql", view='admin', gdv="1")) else: density_file = mapseq_files[gid][rid]['wig']['merged'] #density_files.append(density_file) processed['4cseq']['density_files'][gid][rid] = density_file # back to grp level! 
# not anymore: # processed['density'][gid] = merge_sql(ex, density_files, via=via) processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag( ex, processed, job.groups, assembly, regToExclude, script_path, via) ## access per gid+rid futures_norm = {} countsPerFrags_bedGraph = {} futures_merged_raw = {} for gid, group in job.groups.iteritems(): futures_norm[gid] = {} countsPerFrags_bedGraph[gid] = {} processed['4cseq']['norm'][gid] = {} for rid, run in group['runs'].iteritems(): normfile = unique_filename_in() touch(ex, normfile) resfile = unique_filename_in() + ".bedGraph" resfiles = processed['4cseq']['countsPerFrag'][gid][ rid] # _all.sql convert(resfiles[3], resfile) countsPerFrags_bedGraph[gid][rid] = resfile print "call normFrags: infiles=" + resfile + ", normfile=" + normfile + "baitCoord=" + primers_dict[ group['name']][ 'baitcoord'] + ", sizeExt=sizeExt, name=" + group[ 'name'] + "rep_" + str( rid) + "regToExclude=" + regToExclude[gid] + "\n" futures_norm[gid][rid] = normFrags.nonblocking( ex, resfile, normfile, baitCoord=primers_dict[group['name']]['baitcoord'], sizeExt=sizeExt, name=group['name'] + "rep_" + str(rid), regToExclude=regToExclude[gid], script_path=script_path, via=via) processed['4cseq']['norm'][gid][rid] = normfile if len(group) > 1: ## merge replicates before normalisation. mergefile = unique_filename_in() touch(ex, mergefile) titleName = group['name'] + "_raw_mergedRep" print "gid=" + group['name'] print "call mergeRep for replicates before normalisation: infiles=" + ",".join( [ res_rid for rid, res_rid in countsPerFrags_bedGraph[gid].iteritems() ] ) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[ gid] + "\n" futures_merged_raw[gid] = mergeRep.nonblocking( ex, ",".join([ res_rid for rid, res_rid in countsPerFrags_bedGraph[gid].iteritems() ]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via, memory=8) processed['4cseq']['countsPerFrag_grp'][gid] = mergefile else: futures_merged_raw[gid] = None processed['4cseq']['countsPerFrag_grp'][ gid] = countsPerFrags_bedGraph[gid][ 0] #if no replicates, then the file we want is the 1st one print "***** profile correction / sample + merge normalised data" futures_merged = {} # per gid futures_profcor = {} # per gid, per rid for gid, group in job.groups.iteritems(): ## run profile correction per run then merge them futures_profcor[gid] = {} processed['4cseq']['profileCorrection'][gid] = {} for rid, run in group['runs'].iteritems(): # wait for normalisation of all replicates to be finished futures_norm[gid][rid].wait( ) ## normalised files, per grp, per rep normfile = processed['4cseq']['norm'][gid][rid] file1 = unique_filename_in() #track file touch(ex, file1) file2 = unique_filename_in() #report file touch(ex, file2) file3 = unique_filename_in() #table file touch(ex, file3) print "call profileCorrection: normfile=" + normfile + ", baitCoord=" + primers_dict[ group['name']]['baitcoord'] + ", name=" + group[ 'name'] + ", file1=" + file1 + ", file2=" + file2 + ", file3= " + file3 + "\n" futures_profcor[gid][rid] = profileCorrection.nonblocking( ex, normfile, primers_dict[group['name']]['baitcoord'], group['name'], file1, file2, file3, script_path, via=via) processed['4cseq']['profileCorrection'][gid][rid] = [ file1, file2, file3 ] ## merge replicates before profile correction. Needs all normalisation for the given grp to be finished, this is why it comes after the rid loop. 
if len(group) > 1: mergefile = unique_filename_in() touch(ex, mergefile) titleName = group['name'] + "_norm_mergedRep" print "gid=" + group['name'] print "call mergeRep: infiles=" + ",".join([ res_rid for rid, res_rid in processed['4cseq']['norm'] [gid].iteritems() ]) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[ gid] + "\n" futures_merged[gid] = mergeRep.nonblocking( ex, ",".join([ res_rid for rid, res_rid in processed['4cseq']['norm'] [gid].iteritems() ]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via, memory=8) processed['4cseq']['norm_grp'][gid] = mergefile else: futures_merged[gid] = None processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][ gid][ 0] ##if no replicates, then the file we want is the 1st one print "***** merge profile corrected data" futures_profcor_merged = {} # per gid for gid, group in job.groups.iteritems(): processed['4cseq']['profileCorrection_grp'][gid] = {} for rid, run in group['runs'].iteritems(): futures_profcor[gid][rid].wait( ) ## wait for ProfileCorrection to be finished ## merge replicates after profile correction if len(group) > 1: mergefile = unique_filename_in() touch(ex, mergefile) titleName = group['name'] + "_ProfCor_mergedRep" pcfiles = [ processed['4cseq']['profileCorrection'][gid][rid][0] for rid, res_rid in processed['4cseq']['profileCorrection'] [gid].iteritems() ] print "call mergeRep (for PC tables): infiles=" + ",".join( pcfiles ) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[ gid] + "\n" futures_profcor_merged[gid] = mergeRep.nonblocking( ex, ",".join(pcfiles), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via, memory=8) processed['4cseq']['profileCorrection_grp'][gid] = mergefile else: futures_profcor_merged[gid] = None processed['4cseq']['profileCorrection_grp'][gid] = processed[ '4cseq']['profileCorrection'][gid][ 0] ##if no replicates, then the file we want is the 1st one print "***** smooth data" futures_smoothed = {} for gid, group in job.groups.iteritems(): file1 = unique_filename_in() touch(ex, file1) file2 = unique_filename_in() touch(ex, file2) file3 = unique_filename_in() touch(ex, file3) nFragsPerWin = group['window_size'] futures_merged_raw[gid].wait( ) ## wait for merging of raw_grp to be completed futures_smoothed[gid] = (smoothFragFile.nonblocking( ex, processed['4cseq']['countsPerFrag_grp'][gid], nFragsPerWin, group['name'], file1, regToExclude[gid], script_path=script_path, via=via, memory=6), ) futures_merged[gid].wait( ) ## wait for merging of norm_grp to be completed futures_smoothed[gid] += (smoothFragFile.nonblocking( ex, processed['4cseq']['norm_grp'][gid], nFragsPerWin, group['name'] + "_norm", file2, regToExclude[gid], script_path=script_path, via=via, memory=6), ) futures_profcor_merged[gid].wait( ) # wait for the merging of profile corrected data to be done futures_smoothed[gid] += (smoothFragFile.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid], nFragsPerWin, group['name'] + "_fromProfileCorrected", file3, regToExclude[gid], script_path=script_path, via=via, memory=6), ) processed['4cseq']['smooth_grp'][gid] = [ file1, file2, file3 ] #[smoothed_file_before_Norm, smoothed file before PC, smoothed file after PC] print "***** Domainograms" futures_domainograms = {} for gid, group in job.groups.iteritems(): grName = job.groups[gid]['name'] if run_domainogram[gid]: regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord'] if before_profile_correction[gid]: futures_domainograms[gid] 
            if before_profile_correction[gid]:
                futures_domainograms[gid] = runDomainogram.nonblocking(ex,
                                                                       processed['4cseq']['norm_grp'][gid],
                                                                       grName, regCoord=regCoord, skip=1,
                                                                       script_path=script_path, via=via, memory=15)
            else:
                futures_domainograms[gid] = runDomainogram.nonblocking(ex,
                                                                       processed['4cseq']['profileCorrection_grp'][gid],
                                                                       grName, regCoord=regCoord.split(':')[0], skip=1,
                                                                       script_path=script_path, via=via, memory=15)

    ## prepare tar files for domainogram results (if any)
    ## and create "BRICKS to frags" files
    print "***** BRICKS to Frags"
    futures_BRICKS2Frags = {}
    for gid, f in futures_domainograms.iteritems():
        if run_domainogram[gid]:  # domainogram has been run for this group
            resFiles = []
            logFile = f.wait()
            start = False
            tarname = job.groups[gid]['name'] + "_domainogram.tar.gz"
            res_tar = tarfile.open(tarname, "w:gz")
            futures_BRICKS2Frags[gid] = []
            processed['4cseq']['bricks2frags'][gid] = []
            if logFile is None:
                continue
            with open(logFile) as _log:
                for s in _log:
                    s = s.strip()
                    if '####resfiles####' in s:
                        start = True
                    elif start and "RData" not in s:
                        resFiles.append(s)
                        res_tar.add(s)
                    if start and "foundBRICKS" in s:
                        bricks2fragsfile = unique_filename_in() + ".bedGraph"
                        touch(ex, bricks2fragsfile)
                        futures_BRICKS2Frags[gid] += [BRICKSToFrag.nonblocking(ex, s,
                                                                               processed['4cseq']['norm_grp'][gid],
                                                                               bricks2fragsfile,
                                                                               script_path=script_path,
                                                                               via=via, memory=4)]
                        processed['4cseq']['bricks2frags'][gid] += [bricks2fragsfile]
            res_tar.close()
            processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname]

    ############### prepare tables for global results
    print "***** combine results into tables"
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        for rid, run in group['runs'].iteritems():
            allNames += [group['name'] + "_rep" + str(rid) + "_norm",
                         group['name'] + "_rep" + str(rid) + "_fit"]
            allFiles += [processed['4cseq']['profileCorrection'][gid][rid][2]]
            allRegToExclude += [regToExclude[gid]]
    tablePC = unique_filename_in() + ".txt"
    print("***will call makeTable with:")
    print(",".join(allFiles))
    print("resfile=" + tablePC)
    print(",".join(allNames))
    touch(ex, tablePC)
    futures_tables = (makeTable.nonblocking(ex, ",".join(allFiles), tablePC, ",".join(allNames),
                                            idCols="4,5",
                                            all_regToExclude=','.join(allRegToExclude),
                                            script_path=script_path, via=via, memory=8), )

    ## wait for all smoothing to be done
    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()

    ## make Table raw/smoothed_raw
    print("** make Table raw/smoothed_raw")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        futures_merged_raw[gid].wait()
        allNames += [group['name'] + "_raw", group['name'] + "_rawSmoothed"]
        allFiles += [processed['4cseq']['countsPerFrag_grp'][gid],
                     processed['4cseq']['smooth_grp'][gid][0]]
        allRegToExclude += ['NA', regToExclude[gid]]
    tableSmoothedRaw_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothedRaw_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedRaw_grp,
                                             ",".join(allNames),
                                             idCols="4", out_chromosomes=out_chromosomes,
                                             all_regToExclude=','.join(allRegToExclude),
                                             script_path=script_path, via=via, memory=8), )

    ## make Table norm/smoothed_norm before PC
    print("** make Table norm/smoothed_norm before PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [group['name'] + "_norm", group['name'] + "_smoothed"]
        allFiles += [processed['4cseq']['norm_grp'][gid],
                     processed['4cseq']['smooth_grp'][gid][1]]
        allRegToExclude += [regToExclude[gid], regToExclude[gid]]
    tableSmoothed_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothed_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothed_grp,
                                             ",".join(allNames),
                                             idCols="4", out_chromosomes=out_chromosomes,
                                             all_regToExclude=','.join(allRegToExclude),
                                             script_path=script_path, via=via, memory=8), )

    ## make Table norm/smoothed_norm after PC
    print("** make Table norm/smoothed_norm after PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [group['name'] + "_normPC", group['name'] + "_smoothedPC"]
        allFiles += [processed['4cseq']['profileCorrection_grp'][gid],
                     processed['4cseq']['smooth_grp'][gid][2]]
        allRegToExclude += [regToExclude[gid], regToExclude[gid]]
    tableSmoothedPC_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothedPC_grp)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedPC_grp,
                                             ",".join(allNames),
                                             idCols="4", out_chromosomes=out_chromosomes,
                                             all_regToExclude=','.join(allRegToExclude),
                                             script_path=script_path, via=via, memory=8), )

    ## combine BRICKS2Frags files
    allNames = []
    allFiles = []
    for gid, fg in futures_BRICKS2Frags.iteritems():
        for f in fg:
            f.wait()
        allNames += [job.groups[gid]['name'] + "_BRICKSpval"]
        cat_bricks2frags = unique_filename_in() + ".txt"
        print ','.join(processed['4cseq']['bricks2frags'][gid])
        cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid], out=cat_bricks2frags)
        allFiles += [cat_bricks2frags]
    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()
    tableBRICKS2Frags = unique_filename_in() + ".txt"
    touch(ex, tableBRICKS2Frags)
    futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableBRICKS2Frags,
                                             ",".join(allNames),
                                             idCols="4", out_chromosomes=out_chromosomes,
                                             defVal="NA",
                                             script_path=script_path, via=via, memory=8), )

    for f in futures_tables:
        f.wait()

    ################ Add everything to minilims below!
    step = "density"
    for gid in processed['4cseq']['density_files'].keys():
        for rid, sql in processed['4cseq']['density_files'][gid].iteritems():
            fname = "density_file_" + job.groups[gid]['name'] + "_merged_rep" + str(rid)
            ex.add(sql, description=set_file_descr(fname + ".sql", groupId=gid, step=step,
                                                   type="sql", gdv="1"))
            wig = unique_filename_in() + ".bw"
            convert(sql, wig)
            ex.add(wig, description=set_file_descr(fname + ".bw", groupId=gid, step=step,
                                                   type="bigWig", ucsc="1"))

    step = "counts_per_frag"  # was _norm_counts_per_frags
    ## before normalisation process, per replicate
    for gid in processed['4cseq']['countsPerFrag'].keys():
        for rid, resfiles in processed['4cseq']['countsPerFrag'][gid].iteritems():
            fname = "meanScorePerFeature_" + job.groups[gid]['name'] + "_rep" + str(rid)
            ex.add(resfiles[1], description=set_file_descr(fname + ".sql", groupId=gid, step=step,
                                                           type="sql", view="admin", gdv='1'))
            #gzipfile(ex, resfiles[0])
            #ex.add(resfiles[0] + ".gz",
            #       description=set_file_descr(fname + ".bed.gz",
            #                                  groupId=gid, step=step, type="bed", view="admin"))
            fname = "segToFrag_" + job.groups[gid]['name'] + "_rep" + str(rid)
            ex.add(resfiles[3], description=set_file_descr(fname + "_all.sql", groupId=gid,
                                                           step=step, type="sql",
                                                           comment="all informative frags - null included"))
            trsql = track(resfiles[3])
            bwig = unique_filename_in() + ".bw"
            trwig = track(bwig, chrmeta=trsql.chrmeta)
            trwig.write(trsql.read(fields=['chr', 'start', 'end', 'score'],
                                   selection={'score': (0.01, sys.maxint)}))
            trwig.close()
            ex.add(bwig, set_file_descr(fname + ".bw", groupId=gid, step=step,
                                        type="bigWig", ucsc='1'))
        ## add segToFrags before normalisation
        futures_merged_raw[gid].wait()
        trbedgraph = track(removeNA(processed['4cseq']['countsPerFrag_grp'][gid]),
                           format='bedgraph')
        bwig = unique_filename_in() + ".bw"
        trwig = track(bwig, chrmeta=assembly.chrmeta)
        trwig.write(trbedgraph.read(fields=['chr', 'start', 'end', 'score'],
                                    selection={'score': (0.01, sys.maxint)}))
        trwig.close()
        fname = "segToFrag_" + job.groups[gid]['name']
        ex.add(bwig, description=set_file_descr(fname + ".bw", groupId=gid, step=step,
                                                type="bigWig",
                                                comment="segToFrag file before normalisation"))

    step = "norm_counts_per_frags"  # after new normalisation process, combined replicates
    for gid, resfile in processed['4cseq']['norm_grp'].iteritems():
        fname = "normalised_scorePerFeature_" + job.groups[gid]['name']
        gzipfile(ex, resfile)
        ex.add(resfile + ".gz",
               description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step,
                                          type="bedGraph", ucsc='1'))
    ## norm files, per replicate (might be removed)
    for gid, dict_gid in processed['4cseq']['norm'].iteritems():
        for rid, resfile in dict_gid.iteritems():
            fname = "normalised_scorePerFeature_" + job.groups[gid]['name'] + "_rep" + str(rid)
            gzipfile(ex, resfile)
            ex.add(resfile + ".gz",
                   description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step,
                                              type="bedGraph", ucsc='1', gdv='1'))

    step = "profile_correction"
    ## profile corrected data, combined replicates
    for gid, profileCorrectedFile in processed['4cseq']['profileCorrection_grp'].iteritems():
        fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected"
        gzipfile(ex, profileCorrectedFile)
        ex.add(profileCorrectedFile + ".gz",
               description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step,
                                          type="bedGraph", ucsc='1', gdv='1'))
    ## profile corrected, per replicate (might be removed)
    for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems():
        for rid, resfiles in dict_gid.iteritems():
            #profileCorrectedFile = resfiles[0]
            reportProfileCorrection = resfiles[1]
            fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected_rep" + str(rid)
            #gzipfile(ex, profileCorrectedFile)
            #ex.add(profileCorrectedFile + ".gz",
            #       description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step,
            #                                  type="bedGraph", ucsc='1', gdv='1'))
            ex.add(reportProfileCorrection,
                   description=set_file_descr(fname + ".pdf", groupId=gid, step=step, type="pdf"))

    step = "smoothing"
    for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems():
        rawSmoothFile = resfiles[0]
        smoothFile = resfiles[1]
        afterProfileCorrection = resfiles[2]
        nFrags = str(job.groups[gid]['window_size'])
        ## smoothed file before normalisation
        fname = "segToFrag_" + job.groups[gid]['name'] + "_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, rawSmoothFile)
        ex.add(rawSmoothFile + ".gz",
               description=set_file_descr(fname, groupId=gid, step=step,
                                          type="bedGraph", ucsc='1', gdv='1'))
        ## smoothed file after normalisation, before profile correction
        fname = "segToFrag_" + job.groups[gid]['name'] + "_norm_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, smoothFile)
        ex.add(smoothFile + ".gz",
               description=set_file_descr(fname, groupId=gid, step=step,
                                          type="bedGraph", ucsc='1', gdv='1'))
        ## smoothed file after normalisation, after profile correction
        fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, afterProfileCorrection)
        ex.add(afterProfileCorrection + ".gz",
               description=set_file_descr(fname, groupId=gid, step=step,
                                          type="bedGraph", ucsc='1', gdv='1'))

    step = "domainograms"
    for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems():
        tarFile = resfiles.pop()
        fname = job.groups[gid]['name'] + "_domainogram.tar.gz"
        ex.add(tarFile, description=set_file_descr(fname, groupId=gid, step=step, type="tgz"))
        for s in resfiles:
            if s.endswith("bedGraph"):
                gzipfile(ex, s)
                s += ".gz"
                ex.add(s, description=set_file_descr(s, groupId=gid, step=step,
                                                     type="bedGraph", ucsc="1", gdv="1"))

    step = "combined_results"
    gzipfile(ex, tableSmoothedRaw_grp)
    ex.add(tableSmoothedRaw_grp + ".gz",
           description=set_file_descr("table_segToFrags_smoothed_combined_replicates.txt.gz",
                                      step=step, type="txt"))
    gzipfile(ex, tableSmoothed_grp)
    ex.add(tableSmoothed_grp + ".gz",
           description=set_file_descr("table_normalised_smoothed_combined_replicates.txt.gz",
                                      step=step, type="txt"))
    gzipfile(ex, tableSmoothedPC_grp)
    ex.add(tableSmoothedPC_grp + ".gz",
           description=set_file_descr("table_profileCorrected_smoothed_combined_replicates.txt.gz",
                                      step=step, type="txt"))
    gzipfile(ex, tablePC)
    ex.add(tablePC + ".gz",
           description=set_file_descr("table_normalised_fit_per_replicates.txt.gz",
                                      step=step, type="txt"))
    gzipfile(ex, tableBRICKS2Frags)
    ex.add(tableBRICKS2Frags + ".gz",
           description=set_file_descr("table_frags_in_BRICKS_combined_replicates.txt.gz",
                                      step=step, type="txt"))
    return processed
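
# A minimal sketch (an illustration, not produced by this module; all file names are
# invented) of the `processed['4cseq']` layout the workflow above fills in and then
# registers in the MiniLIMS. Per-replicate entries are keyed by group id (gid) then
# run id (rid); merged ("_grp") entries are keyed by gid only:
_EXAMPLE_PROCESSED_LAYOUT = {
    '4cseq': {
        'density_files':         {1: {1: 'dens.sql'}},           # per gid/rid density tracks
        'countsPerFrag':         {1: {1: ['cpf.bed', 'cpf.sql', 'seg.out', 'seg.sql']}},
        'countsPerFrag_grp':     {1: 'raw_merged.bedGraph'},     # merged over replicates
        'norm':                  {1: {1: 'norm.bedGraph'}},
        'norm_grp':              {1: 'norm_merged.bedGraph'},
        'profileCorrection':     {1: {1: ['pc.bedGraph', 'report.pdf', 'fit_table.txt']}},
        'profileCorrection_grp': {1: 'pc_merged.bedGraph'},
        'smooth_grp':            {1: ['sm_raw', 'sm_norm', 'sm_pc']},
        'domainogram_grp':       {1: ['res.bedGraph', 'dom.tar.gz']},
        'bricks2frags':          {1: ['b2f.bedGraph']},
    }
}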
def density_to_countsPerFrag(ex, file_dict, groups, assembly, regToExclude, script_path, via='lsf'):
    '''
    Main function to compute normalised counts per fragment from a density file.
    '''
    futures = {}
    results = {}
    for gid, group in groups.iteritems():
        reffile = file_dict['lib'][gid]
        futures[gid] = {}
        results[gid] = {}
        for rid, run in group['runs'].iteritems():
            density_file = file_dict['4cseq']['density_files'][gid][rid]
            gm_futures = []
            for ch in assembly.chrnames:
                chref = os.path.join(reffile, ch + ".bed.gz")
                if not os.path.exists(chref):
                    chref = reffile
#                features = track(chref, 'bed')
#                outbed.write(gMiner.stream.mean_score_by_feature(
#                        scores.read(selection=ch),
#                        features.read(selection=ch)), mode='append')
                bedfile = unique_filename_in() + ".bed"
                gfminer_job = {"operation": "score_by_feature",
                               "output": bedfile,
                               "datatype": "qualitative",
                               "args": "'" + json.dumps({"trackScores": density_file,
                                                         "trackFeatures": chref,
                                                         "chromosome": ch}) + "'"}
                gm_futures.append((gfminer_run.nonblocking(ex, gfminer_job, via=via),
                                   bedfile))
            outsql = unique_filename_in() + ".sql"
            sqlouttr = track(outsql, chrmeta=assembly.chrmeta,
                             info={'datatype': 'quantitative'},
                             fields=['start', 'end', 'score'])
            outbed_all = []
            for n, f in enumerate(gm_futures):
                f[0].wait()
                fout = f[1]
                if not os.path.exists(fout):
                    time.sleep(60)
                    touch(ex, fout)
                outbed_all.append(fout)
                outbed = track(fout, chrmeta=assembly.chrmeta)
                sqlouttr.write(outbed.read(fields=['start', 'end', 'score'],
                                           selection={'score': (0.01, sys.maxint)}),
                               chrom=assembly.chrnames[n])
            sqlouttr.close()
            countsPerFragFile = unique_filename_in() + ".bed"
            countsPerFragFile = cat(outbed_all, out=countsPerFragFile)
            results[gid][rid] = [countsPerFragFile, outsql]
            FragFile = unique_filename_in()
            touch(ex, FragFile)
            futures[gid][rid] = (FragFile,
                                 segToFrag.nonblocking(ex, countsPerFragFile,
                                                       regToExclude[gid], script_path,
                                                       via=via, stdout=FragFile, memory=4))

    def _parse_select_frag(stream):
        ## keep only valid, unambiguous fragments from segToFrag output and
        ## yield (chr, start, end, score) tuples, start converted to 0-based
        for s in stream:
            sr = s.strip().split('\t')
            if 'IsValid' in sr[2] and \
                    not any([w in sr[8] for w in ['_and_', 'BothRepeats', 'notValid']]):
                patt = re.search(r'([^:]+):(\d+)-(\d+)', sr[1])
                if patt:
                    coord = patt.groups()
                    #if float(sr[11]) > 0.0:
                    yield (coord[0], int(coord[1]) - 1, int(coord[2]), float(sr[11]))

    for gid, dict_gid in futures.iteritems():
        for rid, res in dict_gid.iteritems():
            res[1].wait()
            touch(ex, res[0])
            segOut = open(res[0], "r")
            resBedGraph = unique_filename_in() + ".sql"
            sqlTr = track(resBedGraph, fields=['start', 'end', 'score'],
                          info={'datatype': 'quantitative'}, chrmeta=assembly.chrmeta)
            sqlTr.write(_parse_select_frag(segOut), fields=['chr', 'start', 'end', 'score'])
            sqlTr.close()
            segOut.close()
            results[gid][rid].extend([res[0], resBedGraph])
    return results  # [countsPerFrag_allBed, countsPerFrag_selectSql, segToFrag_out, segToFrag_sql]
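
# A standalone illustration (the input line below is invented, not pipeline output) of
# the filtering done by _parse_select_frag above: a segToFrag row is kept only when
# column 3 flags it 'IsValid' and column 9 carries none of the ambiguity markers;
# the coordinates are then pulled out of column 2 ("chr:start-end") and start is
# shifted to 0-based.
def _example_parse_frag_line(line):
    sr = line.strip().split('\t')
    if 'IsValid' not in sr[2] or any([w in sr[8] for w in ['_and_', 'BothRepeats', 'notValid']]):
        return None  # ambiguous or invalid fragment: dropped
    patt = re.search(r'([^:]+):(\d+)-(\d+)', sr[1])
    if patt is None:
        return None  # no parseable coordinate: dropped
    chrom, start, end = patt.groups()
    return (chrom, int(start) - 1, int(end), float(sr[11]))

# e.g. _example_parse_frag_line("id\tchr1:100-200\tIsValid\t.\t.\t.\t.\t.\tok\t.\t.\t3.5")
# returns ('chr1', 99, 200, 3.5)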
def dnaseseq_workflow( ex, job, assembly, logfile=sys.stdout, via='lsf' ):
    """
    This workflow performs the following steps:

      * BAM files from replicates within the same group are merged
      * MACS is called to identify enriched regions (only the peak summit +- 300 bp is
        used); this step can be bypassed by providing a bed file for a group
      * Wellington is called to identify footprints within these enriched regions
      * If a list of motifs is provided (by group), footprints are scanned and motif
        occurrences (log-likelihood ratio > 0) are recorded in a bed file
      * Average DNAse profiles around motifs are plotted
    """
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    supdir = os.path.split(ex.remote_working_directory)[0]
    for gid, mapped in job.files.iteritems():
        group_name = job.groups[gid]['name']
        if not isinstance(mapped, dict):
            raise TypeError("Files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        if len(mapped) > 1:
            bamfile = merge_bam(ex, [m['bam'] for m in mapped.values()])
            index = index_bam(ex, bamfile)
        else:
            bamfile = mapped.values()[0]['bam']
        if job.groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            if os.path.exists(job.groups[gid].get('bedfile', 'null')):
                bedfile = job.groups[gid]['bedfile']
            elif os.path.exists(os.path.join(supdir, job.groups[gid].get('bedfile', 'null'))):
                bedfile = os.path.join(supdir, job.groups[gid]['bedfile'])
            else:
                bedfile = None
            tests.append((bedfile, bamfile))
            names['tests'].append((gid, group_name))
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    tests = macs_bedfiles( ex, assembly.chrmeta, tests, controls, names,
                           job.options.get('macs_args', ["--keep-dup", "10"]), via, logfile )
    bedlist = run_wellington(ex, tests, names, assembly, via, logfile)
    ######################### Motif scanning / plotting
    if any([gr.get('motif') != 'null' and gr.get('motif')
            for gr in job.groups.values()]):
        motifbeds = motif_scan( ex, bedlist, assembly, job.groups, via, logfile )
        siglist = dict((gid[0], []) for gid in names['tests'])
        for gid, mapped in job.files.iteritems():
            wig = []
            suffixes = ["fwd", "rev"]
            merge_strands = int(job.options.get('merge_strands', -1))
            read_extension = int(job.options.get('read_extension') or -1)
            make_wigs = merge_strands >= 0 or read_extension != 1
            for m in mapped.values():
                if make_wigs or not ('wig' in m) or len(m['wig']) < 2:
                    output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                          nreads=m["stats"]["total"],
                                                          merge=-1, read_extension=1,
                                                          convert=False, b2w_args=[], via=via )
                    wig.append(dict((s, output + s + '.sql') for s in suffixes))
                else:
                    wig.append(m['wig'])
            if len(wig) > 1:
                wig[0] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via))
                              for s in suffixes)
            _trn = job.groups[gid]['name'] + "_%s"
            if job.groups[gid]['control']:
                for s, w in wig[0].iteritems():
                    for _g in siglist.keys():
                        siglist[_g].append(track(w, info={'name': _trn % s}))
            else:
                siglist[gid].extend([track(w, info={'name': _trn % s})
                                     for s, w in wig[0].iteritems()])
        plot_files = plot_footprint_profile( ex, motifbeds, siglist,
                                             assembly.chrnames, job.groups, logfile )
        for gid, flist in plot_files.iteritems():
            gname = job.groups[gid]['name']
            plotall = unique_filename_in()
            touch( ex, plotall )
            ex.add(plotall, description=set_file_descr(gname + '_footprints_plots',
                                                       type='none', view='admin',
                                                       step='motifs', groupId=gid))
            ex.add(flist['pdf'], description=set_file_descr(gname + '_footprints_plots.pdf',
                                                            type='pdf', step='motifs',
                                                            groupId=gid),
                   associate_to_filename=plotall, template='%s.pdf')
            tarname = unique_filename_in()
            tarfh = tarfile.open(tarname, "w:gz")
            for mname, matf in flist['mat']:
                tarfh.add(matf, arcname="%s_%s.txt" % (gname, mname))
            tarfh.close()
            ex.add( tarname, description=set_file_descr(gname + '_footprints_plots.tar.gz',
                                                        type='tar', step='motifs',
                                                        groupId=gid),
                    associate_to_filename=plotall, template='%s.tar.gz' )
    logfile.write("\nDone.\n")
    logfile.flush()
    return 0
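
# A minimal sketch (all values invented) of the `job` attributes dnaseseq_workflow
# reads: `job.files` maps a group id to per-run mappings that must at least provide a
# 'bam' (optionally 'wig' and 'stats'), while `job.groups` carries the 'name', the
# 'control' flag and the optional 'bedfile'/'motif' entries used above.
_EXAMPLE_DNASE_JOB_FILES = {
    1: {101: {'bam': 'rep1.bam', 'stats': {'total': 12345678}},
        102: {'bam': 'rep2.bam', 'stats': {'total': 11222333}}},
    2: {201: {'bam': 'input.bam', 'stats': {'total': 9876543}}},
}
_EXAMPLE_DNASE_JOB_GROUPS = {
    1: {'name': 'liver_DNase', 'control': False, 'bedfile': 'null', 'motif': 'MA0139'},
    2: {'name': 'input',       'control': True,  'motif': 'null'},
}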
def add_macs_results(ex, read_length, genome_size, bamfile,
                     ctrlbam=None, name=None, poisson_threshold=None,
                     alias=None, macs_args=None, via='lsf'):
    """Calls the ``macs`` function on each possible pair of test and control bam
    files and adds the respective outputs to the execution repository.

    ``macs`` options can be controlled with `macs_args`. If a dictionary of Poisson
    thresholds for each sample is given, then the enrichment bounds ('-m' option)
    are computed from them; otherwise the default is '-m 10,100'.

    Returns the set of file prefixes.
    """
    if not isinstance(bamfile, list):
        bamfile = [bamfile]
    if not isinstance(ctrlbam, list):
        ctrlbam = [ctrlbam]
    if poisson_threshold is None:
        poisson_threshold = {}
    if macs_args is None:
        macs_args = []
    futures = {}
    rl = read_length
    for i, bam in enumerate(bamfile):
        n = name['tests'][i]
        if poisson_threshold.get(n) > 0:
            low = (poisson_threshold.get(n) + 1) * 5
            enrich_bounds = str(min(30, low)) + "," + str(10 * low)
        else:
            enrich_bounds = "10,100"
        ## build per-sample arguments without mutating `macs_args` itself,
        ## otherwise the first sample's '-m' bounds would stick for every
        ## following sample
        if "-m" in macs_args:
            margs = macs_args
        else:
            margs = macs_args + ["-m", enrich_bounds]
        if isinstance(read_length, list):
            rl = read_length[i]
        for j, cam in enumerate(ctrlbam):
            m = name['controls'][j]
            nm = (n, m)
            futures[nm] = macs.nonblocking(ex, rl, genome_size, bam, cam,
                                           args=margs, via=via, memory=12)
    prefixes = {}
    for n, f in futures.iteritems():
        p = f.wait()
        prefixes[n] = p
        macs_descr0 = {'step': 'macs', 'type': 'none', 'view': 'admin', 'groupId': n[0][0]}
        macs_descr1 = {'step': 'macs', 'type': 'xls', 'groupId': n[0][0]}
        macs_descr2 = {'step': 'macs', 'type': 'bed', 'groupId': n[0][0], 'ucsc': '1'}
        filename = "_vs_".join([x[1] for x in n if x[0]])
        touch(ex, p)
        ex.add(p, description=set_file_descr(filename, **macs_descr0), alias=alias)
        ex.add(p + "_peaks.xls",
               description=set_file_descr(filename + "_peaks.xls", **macs_descr1),
               associate_to_filename=p, template='%s_peaks.xls')
        bedzip = gzip.open(p + "_peaks.bed.gz", 'wb')
        bedzip.write("track name='" + filename + "_macs_peaks'\n")
        with open(p + "_peaks.bed") as bedinf:
            [bedzip.write(l) for l in bedinf]
        bedzip.close()
        ex.add(p + "_peaks.bed.gz",
               description=set_file_descr(filename + "_peaks.bed.gz", **macs_descr2),
               associate_to_filename=p, template='%s_peaks.bed.gz')
        bedzip = gzip.open(p + "_summits.bed.gz", 'wb')
        bedzip.write("track name='" + filename + "_macs_summits'\n")
        with open(p + "_summits.bed") as bedinf:
            [bedzip.write(l) for l in bedinf]
        bedzip.close()
        ex.add(p + "_summits.bed.gz",
               description=set_file_descr(filename + "_summits.bed.gz", **macs_descr2),
               associate_to_filename=p, template='%s_summits.bed.gz')
        if n[1][0]:
            ex.add(p + "_negative_peaks.xls",
                   description=set_file_descr(filename + "_negative_peaks.xls", **macs_descr0),
                   associate_to_filename=p, template='%s_negative_peaks.xls')
    return prefixes
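
# A worked example (thresholds invented) of the '-m' enrichment bounds computed in
# add_macs_results: with a Poisson threshold of 4, low = (4 + 1) * 5 = 25 and the
# bounds are "25,250"; with a threshold of 8, low = 45, the lower bound is capped at
# 30 and the bounds are "30,450". Samples without a positive threshold fall back to
# the MACS default "10,100".
def _example_enrich_bounds(threshold):
    if threshold > 0:
        low = (threshold + 1) * 5
        return str(min(30, low)) + "," + str(10 * low)
    return "10,100"

# _example_enrich_bounds(4) == "25,250"; _example_enrich_bounds(8) == "30,450"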