def save_wellington(ex, wellout, chrmeta):
    """
    Merge the per-sample Wellington outputs of each group and add them to `ex`.

    For every key of `wellout`, three merged files are produced and added
    to `ex`, all associated to an empty placeholder file:

    * a gzipped BED with the footprints at FDR 0.01,
    * a gzipped BED with one track per p-value cutoff (the cutoffs are read
      from the ``p_value_cutoffs`` directory of the first sample),
    * the concatenation of the per-sample WIG files.

    :param ex: object whose ``add`` method registers the produced files.
    :param wellout: (dict) ``{(group_id, group_name): [(directory, prefix), ...]}``,
        one (directory, prefix) pair per Wellington output to merge.
    :param chrmeta: unused here; only the disabled bigWig conversion needed it.
    :rtype: dict ``{group_id: name of the FDR 0.01 bed.gz file}``
    """
    bedlist = {}
    for name, wlist in wellout.iteritems():
        group_id, group_name = name[0], name[1]
        wellall = unique_filename_in()
        #### Dummy file: the other files are associated to it
        touch(ex, wellall)
        ex.add(wellall,
               description=set_file_descr(group_name+'_wellington_files',
                                          type='none', view='admin',
                                          step='footprints', groupId=group_id))
        #### BED at FDR 1%
        fdr_bed = wellall+"FDR01.bed.gz"
        bedlist[group_id] = fdr_bed
        bedzip = gzip.open(fdr_bed, 'wb')
        # try/finally so the gzip handle is released even if an input is missing
        try:
            bedzip.write("track name='"+group_name+"_WellingtonFootprints_FDR_0.01'\n")
            for x in wlist:
                with open(os.path.join(*x)+".WellingtonFootprints.FDR.0.01.bed") as _bed:
                    for line in _bed:
                        bedzip.write(line)
        finally:
            bedzip.close()
        ex.add(fdr_bed,
               description=set_file_descr(group_name+'_WellingtonFootprintsFDR01.bed.gz',
                                          type='bed', ucsc='1',
                                          step='footprints', groupId=group_id),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsFDR01.bed.gz')
        #### BED at p-values [...]
        pval_bed = wellall+"PvalCutoffs.bed.gz"
        bedzip = gzip.open(pval_bed, 'wb')
        try:
            for bfile in os.listdir(os.path.join(wlist[0][0], "p_value_cutoffs")):
                # the cutoff is the text between the last '.' ([1:]) and '.bed' ([:-4])
                cut = os.path.splitext(bfile[:-4])[1][1:]
                bedzip.write("track name='"+group_name+"_WellingtonFootprints_Pval_%s'\n" % cut)
                for wdir, wpref in wlist:
                    _bedpath = os.path.join(wdir, "p_value_cutoffs",
                                            wpref+".WellingtonFootprints."+cut+".bed")
                    with open(_bedpath) as _bed:
                        for line in _bed:
                            bedzip.write(line)
        finally:
            bedzip.close()
        ex.add(pval_bed,
               description=set_file_descr(group_name+'_WellingtonFootprintsPvalCutoffs.bed.gz',
                                          type='bed', ucsc='1',
                                          step='footprints', groupId=group_id),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsPvalCutoffs.bed.gz')
        #### WIG (the conversion to bigWig is disabled: the plain WIG is added)
        cat([os.path.join(*x)+".WellingtonFootprints.wig" for x in wlist], wellall+".wig")
        ex.add(wellall+".wig",
               description=set_file_descr(group_name+'_WellingtonFootprints.wig',
                                          type='wig', ucsc='1',
                                          step='footprints', groupId=group_id),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprints.wig')
    return bedlist
def createLibrary(ex, assembly_or_fasta, params, url=GlobalHtsUrl, via='local'):
    """
    Main call to create the library.

    One `getRestEnzymeOccAndSeq` job is launched per chromosome fasta file,
    each fragment file is parsed with `parse_fragFile`, the coverage in
    repeats is written per chromosome into a fresh directory, and the
    per-chromosome bed files are concatenated and passed to `gzipfile`.

    :param ex: execution handed over to every helper called here.
    :param assembly_or_fasta: a `genrep.Assembly`, or anything accepted as
        the `fasta` argument of `genrep.Assembly`.
    :param params: (dict) read keys: 'primary', 'secondary', 'length',
        'type' and 'species'.
    :param url: forwarded to `getEnzymeSeqId`.
    :param via: forwarded to the jobs launched here.
    :rtype: list ``[libfiles, bedfiles, resfile, infos_lib]``, or four
        ``None`` when ``params['primary']`` is shorter than 2.
    """
    primary = params['primary']
    if len(primary) < 2:
        print('Some parameters are missing, cannot create the library')
        print('primary='+primary+" ; "+'secondary='+params['secondary'])
        return [None, None, None, None]

    assembly = assembly_or_fasta
    if not isinstance(assembly, genrep.Assembly):
        assembly = genrep.Assembly(ex=ex, fasta=assembly)
    chrnames = assembly.chrnames
    # maps the 'ac' entry of each chromosome's metadata back to its key
    chrom_map = {}
    for chrom_key, meta in assembly.chrmeta.iteritems():
        chrom_map[meta['ac']] = chrom_key
    allfiles = assembly.fasta_by_chrom

    # launch one job per chromosome; the futures are replaced by results below
    libfiles = {}
    for chrom, fasta in allfiles.iteritems():
        libfiles[chrom] = getRestEnzymeOccAndSeq.nonblocking(
            ex, fasta,
            params['primary'], params['secondary'],
            params['length'], params['type'],
            via=via)

    resfile = unique_filename_in()
    os.mkdir(resfile)
    bedfiles = {}
    for chrom, future in libfiles.iteritems():
        result = future.wait()
        libfiles[chrom] = result
        frag_file = result[1]
        if os.path.getsize(frag_file) <= 0:
            # empty fragment file: pause, then touch it before parsing
            time.sleep(60)
            touch(ex, frag_file)
        bedfiles[chrom] = parse_fragFile(frag_file, chrom_map)

    coverageInRepeats(ex, bedfiles, params['species'], outdir=resfile, via=via)
    merged_bed = resfile+".bed"
    bedchrom = [os.path.join(resfile, chrom+".bed") for chrom in chrnames]
    cat(bedchrom, out=merged_bed)
    gzipfile(ex, [merged_bed]+bedchrom)

    enz_list = []
    enzyme1_id = getEnzymeSeqId(params['primary'], True, enz_list, url)
    enzyme2_id = getEnzymeSeqId(params['secondary'], True, enz_list, url)
    infos_lib = {
        'assembly_name': params['species'],
        'enzyme1_id': enzyme1_id,
        'enzyme2_id': enzyme2_id,
        'segment_length': params['length'],
        'type': params['type'],
        'filename': resfile,
    }
    return [libfiles, bedfiles, resfile, infos_lib]
def coverageInRepeats(ex, infile, genomeName='mm9', repeatsPath=GlobalRepbasePath, outdir=None, via='lsf'):
    """
    Completes the segment info bed file with the coverage in repeats of each segment.
    For now, works only for mm9, hg19 and dm3.

    :param ex: execution passed to `coverageBed`.
    :param infile: (dict) ``{chromosome: item}`` where ``item[0]`` is the path
        of the bed file to annotate; any non-dict value is treated as a
        single item stored under the key ``""``.
    :param genomeName: (str) the repeats file is looked up as
        ``<repeatsPath>/<genomeName>/<genomeName>_rmsk.bed``.
    :param repeatsPath: (str) root directory of the repeats files.
    :param outdir: (str) if given, one ``<chromosome>.bed`` file is written
        into this directory; otherwise everything goes into a single new file.
    :param via: forwarded to `coverageBed`.
    :rtype: str -- `outdir` if it was given, else the name of the single
        output file. If the repeats file does not exist, the input bed
        files are copied/concatenated to the output without annotation.
    """
    if not isinstance(infile, dict):
        infile = {"": infile}
    single_output = outdir is None
    outf = None
    # try/finally: the output handle is closed even if a job or the parsing fails
    try:
        if single_output:
            # created up-front, before any other unique name is drawn
            resfile = unique_filename_in()+".bed"
            outf = open(resfile, 'w')
        repeatsFile = os.path.join(repeatsPath, genomeName, genomeName+'_rmsk.bed')
        if not os.path.exists(repeatsFile):
            print("coverage in repeats not calculated as file "+repeatsFile+" does not exist.")
            if single_output:
                outf.close()
                cat([inf[0] for inf in infile.values()], out=resfile)
                return resfile
            for chrom, inf in infile.iteritems():
                shutil.copy(inf[0], os.path.join(outdir, chrom+".bed"))
            return outdir
        # launch all the coverage jobs first, then collect them one by one
        futures = {}
        for chrom, inf in infile.iteritems():
            tmpfile = unique_filename_in()
            futures[chrom] = (tmpfile,
                              coverageBed.nonblocking(ex, repeatsFile, inf[0],
                                                      via=via, stdout=tmpfile))
        for chrom, (tmpfile, future) in futures.iteritems():
            if not single_output:
                outf = open(os.path.join(outdir, chrom+".bed"), 'w')
            future.wait()
            coverout = track(tmpfile, format='text',
                             fields=['chr','start','end','name','c1','c2','c3','c4'])
            for s in sorted_stream(coverout.read(), [chrom]):
                # append the four coverage columns to the leading
                # '|'-separated fields of the name
                s_split = s[3].split('|')
                infos = '|'.join(s_split[0:(len(s_split)-4)]+list(s[4:8]))
                outf.write('\t'.join([str(x) for x in s[0:3]+(infos,)])+'\n')
            if not single_output:
                outf.close()
    finally:
        # closing an already closed file is a no-op
        if outf is not None:
            outf.close()
    if single_output:
        return resfile
    return outdir
def save_wellington(ex, wellout, chrmeta):
    """
    Merge the per-sample Wellington outputs of each group and add them to `ex`.

    For every key of `wellout`, three merged files are produced and added
    to `ex`, all associated to an empty placeholder file:

    * a gzipped BED with the footprints at FDR 0.01,
    * a gzipped BED with one track per p-value cutoff (the cutoffs are read
      from the ``p_value_cutoffs`` directory of the first sample),
    * the concatenation of the per-sample WIG files.

    :param ex: object whose ``add`` method registers the produced files.
    :param wellout: (dict) ``{(group_id, group_name): [(directory, prefix), ...]}``,
        one (directory, prefix) pair per Wellington output to merge.
    :param chrmeta: unused here; only the disabled bigWig conversion needed it.
    :rtype: dict ``{group_id: name of the FDR 0.01 bed.gz file}``
    """
    bedlist = {}
    for name, wlist in wellout.iteritems():
        group_id, group_name = name[0], name[1]
        wellall = unique_filename_in()
        #### Dummy file: the other files are associated to it
        touch(ex, wellall)
        ex.add(wellall,
               description=set_file_descr(group_name + '_wellington_files',
                                          type='none', view='admin',
                                          step='footprints', groupId=group_id))
        #### BED at FDR 1%
        fdr_bed = wellall + "FDR01.bed.gz"
        bedlist[group_id] = fdr_bed
        bedzip = gzip.open(fdr_bed, 'wb')
        # try/finally so the gzip handle is released even if an input is missing
        try:
            bedzip.write("track name='" + group_name + "_WellingtonFootprints_FDR_0.01'\n")
            for x in wlist:
                with open(os.path.join(*x) + ".WellingtonFootprints.FDR.0.01.bed") as _bed:
                    for line in _bed:
                        bedzip.write(line)
        finally:
            bedzip.close()
        ex.add(fdr_bed,
               description=set_file_descr(group_name + '_WellingtonFootprintsFDR01.bed.gz',
                                          type='bed', ucsc='1',
                                          step='footprints', groupId=group_id),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsFDR01.bed.gz')
        #### BED at p-values [...]
        pval_bed = wellall + "PvalCutoffs.bed.gz"
        bedzip = gzip.open(pval_bed, 'wb')
        try:
            for bfile in os.listdir(os.path.join(wlist[0][0], "p_value_cutoffs")):
                # the cutoff is the text between the last '.' ([1:]) and '.bed' ([:-4])
                cut = os.path.splitext(bfile[:-4])[1][1:]
                bedzip.write("track name='" + group_name + "_WellingtonFootprints_Pval_%s'\n" % cut)
                for wdir, wpref in wlist:
                    _bedpath = os.path.join(wdir, "p_value_cutoffs",
                                            wpref + ".WellingtonFootprints." + cut + ".bed")
                    with open(_bedpath) as _bed:
                        for line in _bed:
                            bedzip.write(line)
        finally:
            bedzip.close()
        ex.add(pval_bed,
               description=set_file_descr(group_name + '_WellingtonFootprintsPvalCutoffs.bed.gz',
                                          type='bed', ucsc='1',
                                          step='footprints', groupId=group_id),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprintsPvalCutoffs.bed.gz')
        #### WIG (the conversion to bigWig is disabled: the plain WIG is added)
        cat([os.path.join(*x) + ".WellingtonFootprints.wig" for x in wlist], wellall + ".wig")
        ex.add(wellall + ".wig",
               description=set_file_descr(group_name + '_WellingtonFootprints.wig',
                                          type='wig', ucsc='1',
                                          step='footprints', groupId=group_id),
               associate_to_filename=wellall,
               template='%s_WellingtonFootprints.wig')
    return bedlist
def find_junctions(self, soapsplice_index=None, path_to_soapsplice=None, soapsplice_options=None):
    """
    Retrieve unmapped reads from a precedent mapping and runs SOAPsplice on them.
    Return the names of a .bed track indicating the junctions positions, as well as
    of a bam file of the alignments attesting the junctions.

    The options actually used are, by increasing priority: the defaults
    ('-p' 16, '-q' 1), `soapsplice_options`, and the 'soapsplice_options'
    entry of the job options. The dictionary given by the caller is not modified.

    :param soapsplice_index: (str) path to the SOAPsplice index.
    :param path_to_soapsplice: (str) specify the path to the program if it is not in your $PATH.
    :param soapsplice_options: (dict) SOAPsplice options, e.g. {'-m':2}.
    :rtype: str, str -- the bed and bam of the last group processed; an element
        is None if it was not produced. None is returned if soapsplice is
        not found or if a SOAPsplice run fails.
    """
    @program
    def soapsplice(unmapped_R1, unmapped_R2, index, output=None, path_to_soapsplice=None, options=None):
        """Bind 'soapsplice'. Return a text file containing the list of junctions.

        :param unmapped_R1: (str) path to the fastq file containing the 'left' reads.
        :param unmapped_R2: (str) path to the fastq file containing the 'right' reads.
        :param index: (str) path to the SOAPsplice index.
        :param output: (str) output file name.
        :param path_to_soapsplice: (str) path to the SOAPsplice executable.
            If not specified, the program must be in your $PATH.
        :param options: (dict) SOAPsplice options, given as {opt: value}.
        :rtype: str

        Main options::

            -p: number of threads, <= 20. [1]
            -S: 1: forward strand, 2: reverse strand, 3: both. [3]
            -m: maximum mismatch for one-segment alignment, <= 5. [3]
            -g: maximum indel for one-segment alignment, <= 2. [2]
            -i: length of tail that can be ignored in one-segment alignment. [7]
            -t: longest gap between two segments in two-segment alignment. [500000]
            -a: shortest length of a segment in two-segment alignment. [8]
            -q: input quality type in FASTQ file (0: old Illumina, 1: Sanger). [0]
            -L: maximum distance between paired-end reads. [500000]
            -l: minimum distance between paired-end reads. [50]
            -I: insert length of paired-end reads.
        """
        if not output:
            output = unique_filename_in()
        path_to_soapsplice = path_to_soapsplice or 'soapsplice'
        args = [
            path_to_soapsplice, '-d', index, '-1', unmapped_R1, '-2', unmapped_R2,
            '-o', output, '-f', '2'
        ]
        opts = []
        for k, v in (options or {}).iteritems():
            opts.extend([str(k), str(v)])
        return {"arguments": args + opts, "return_value": output}

    if not program_exists('soapsplice'):
        self.write_debug("Skipped junctions search: soapsplice not found.")
        return
    self.assembly.set_index_path(intype=3)
    soapsplice_index = soapsplice_index or self.assembly.index_path
    # Private copy: neither the caller's dict nor a shared default is mutated.
    soapsplice_options = dict(soapsplice_options or {})
    soapsplice_options.update(self.job.options.get('soapsplice_options', {}))
    soapsplice_options.setdefault('-p', 16)  # number of threads
    soapsplice_options.setdefault('-q', 1)  # Sanger format
    # Defined up-front so that the final return never hits an unbound name.
    bed = None
    bam = None
    unmapped_fastq = {}
    for gid, group in self.job.groups.iteritems():
        unmapped_fastq[gid] = []
        for rid, run in group['runs'].iteritems():
            unmapped = self.job.files[gid][rid].get('unmapped_fastq')
            if not unmapped:
                self.write_log(
                    "No unmapped reads found for group %s, run %d. Skip." % (gid, rid))
                continue
            elif not isinstance(unmapped, tuple):
                self.write_log("Pair-end reads required. Skip.")
                continue
            unmapped_fastq[gid].append(unmapped)
        if len(unmapped_fastq[gid]) == 0:
            continue
        # pool the reads of all the runs of the group, mate by mate
        R1 = cat([pair[0] for pair in unmapped_fastq[gid]])
        R2 = cat([pair[1] for pair in unmapped_fastq[gid]])
        future = soapsplice.nonblocking(
            self.ex, R1, R2, soapsplice_index,
            path_to_soapsplice=path_to_soapsplice,
            options=soapsplice_options,
            via=self.via, memory=8, threads=soapsplice_options['-p'])
        try:
            template = future.wait()
        except Exception as err:
            self.write_debug("SOAPsplice failed: %s." % str(err))
            return
        if template is None:
            self.write_debug("SOAPsplice failed.")
            return
        junc_file = template + '.junc'
        bed = self.convert_junc_file(junc_file, self.assembly)
        bed_descr = set_file_descr('junctions_%s.bed' % group['name'],
                                   groupId=gid, type='bed', step='junctions', ucsc=1)
        bam_descr = set_file_descr('junctions_%s.bam' % group['name'],
                                   groupId=gid, type='bam', step='junctions', ucsc=0)
        sam = template + '.sam'
        try:
            bam = sam_to_bam(self.ex, sam, reheader=self.assembly.name)
            add_and_index_bam(self.ex, bam, description=bam_descr)
            self.ex.add(bam, description=bam_descr)
        except Exception as e:
            # best effort: the bed track is still added without the bam
            self.write_debug(
                "%s\n(Qualities may be in the wrong format, try with '-q 0'.)" % str(e))
        self.ex.add(bed, description=bed_descr)
    return bed, bam
def find_junctions(self, soapsplice_index=None, path_to_soapsplice=None, soapsplice_options=None):
    """
    Retrieve unmapped reads from a precedent mapping and runs SOAPsplice on them.
    Return the names of a .bed track indicating the junctions positions, as well as
    of a bam file of the alignments attesting the junctions.

    The options actually used are, by increasing priority: the defaults
    ('-p' 16, '-q' 1), `soapsplice_options`, and the 'soapsplice_options'
    entry of the job options. The dictionary given by the caller is not modified.

    :param soapsplice_index: (str) path to the SOAPsplice index.
    :param path_to_soapsplice: (str) specify the path to the program if it is not in your $PATH.
    :param soapsplice_options: (dict) SOAPsplice options, e.g. {'-m':2}.
    :rtype: str, str -- the bed and bam of the last group processed; an element
        is None if it was not produced. None is returned if soapsplice is
        not found or if a SOAPsplice run fails.
    """
    @program
    def soapsplice(unmapped_R1, unmapped_R2, index, output=None, path_to_soapsplice=None, options=None):
        """Bind 'soapsplice'. Return a text file containing the list of junctions.

        :param unmapped_R1: (str) path to the fastq file containing the 'left' reads.
        :param unmapped_R2: (str) path to the fastq file containing the 'right' reads.
        :param index: (str) path to the SOAPsplice index.
        :param output: (str) output file name.
        :param path_to_soapsplice: (str) path to the SOAPsplice executable.
            If not specified, the program must be in your $PATH.
        :param options: (dict) SOAPsplice options, given as {opt: value}.
        :rtype: str

        Main options::

            -p: number of threads, <= 20. [1]
            -S: 1: forward strand, 2: reverse strand, 3: both. [3]
            -m: maximum mismatch for one-segment alignment, <= 5. [3]
            -g: maximum indel for one-segment alignment, <= 2. [2]
            -i: length of tail that can be ignored in one-segment alignment. [7]
            -t: longest gap between two segments in two-segment alignment. [500000]
            -a: shortest length of a segment in two-segment alignment. [8]
            -q: input quality type in FASTQ file (0: old Illumina, 1: Sanger). [0]
            -L: maximum distance between paired-end reads. [500000]
            -l: minimum distance between paired-end reads. [50]
            -I: insert length of paired-end reads.
        """
        if not output:
            output = unique_filename_in()
        path_to_soapsplice = path_to_soapsplice or 'soapsplice'
        args = [path_to_soapsplice,'-d',index,'-1',unmapped_R1,'-2',unmapped_R2,'-o',output,'-f','2']
        opts = []
        for k,v in (options or {}).iteritems():
            opts.extend([str(k),str(v)])
        return {"arguments": args+opts, "return_value": output}

    if not program_exists('soapsplice'):
        self.write_debug("Skipped junctions search: soapsplice not found.")
        return
    self.assembly.set_index_path(intype=3)
    soapsplice_index = soapsplice_index or self.assembly.index_path
    # Private copy: neither the caller's dict nor a shared default is mutated.
    soapsplice_options = dict(soapsplice_options or {})
    soapsplice_options.update(self.job.options.get('soapsplice_options',{}))
    soapsplice_options.setdefault('-p',16) # number of threads
    soapsplice_options.setdefault('-q',1)  # Sanger format
    # Defined up-front so that the final return never hits an unbound name.
    bed = None
    bam = None
    unmapped_fastq = {}
    for gid, group in self.job.groups.iteritems():
        unmapped_fastq[gid] = []
        for rid, run in group['runs'].iteritems():
            unmapped = self.job.files[gid][rid].get('unmapped_fastq')
            if not unmapped:
                self.write_log("No unmapped reads found for group %s, run %d. Skip." % (gid,rid))
                continue
            elif not isinstance(unmapped,tuple):
                self.write_log("Pair-end reads required. Skip.")
                continue
            unmapped_fastq[gid].append(unmapped)
        if len(unmapped_fastq[gid]) == 0:
            continue
        # pool the reads of all the runs of the group, mate by mate
        R1 = cat([pair[0] for pair in unmapped_fastq[gid]])
        R2 = cat([pair[1] for pair in unmapped_fastq[gid]])
        future = soapsplice.nonblocking(self.ex,R1,R2,soapsplice_index,
                                        path_to_soapsplice=path_to_soapsplice,
                                        options=soapsplice_options,
                                        via=self.via, memory=8,
                                        threads=soapsplice_options['-p'])
        try:
            template = future.wait()
        except Exception as err:
            self.write_debug("SOAPsplice failed: %s." % str(err))
            return
        if template is None:
            self.write_debug("SOAPsplice failed.")
            return
        junc_file = template+'.junc'
        bed = self.convert_junc_file(junc_file,self.assembly)
        bed_descr = set_file_descr('junctions_%s.bed' % group['name'],
                                   groupId=gid,type='bed',step='junctions', ucsc=1)
        bam_descr = set_file_descr('junctions_%s.bam' % group['name'],
                                   groupId=gid,type='bam',step='junctions', ucsc=0)
        sam = template+'.sam'
        try:
            bam = sam_to_bam(self.ex,sam,reheader=self.assembly.name)
            add_and_index_bam(self.ex, bam, description=bam_descr)
            self.ex.add(bam, description=bam_descr)
        except Exception as e:
            # best effort: the bed track is still added without the bam
            self.write_debug("%s\n(Qualities may be in the wrong format, try with '-q 0'.)" %str(e))
        self.ex.add(bed, description=bed_descr)
    return bed, bam
def c4seq_workflow( ex, job, primers_dict, assembly, c4_url=None, script_path='', logfile=sys.stdout, via='lsf' ): ''' Main * open the 4C-seq minilims and create execution * 0. get/create the library * 1. if necessary, calculate the density file from the bam file (mapseq.parallel_density_sql) * 2. calculate the count per fragment for each denstiy file with gfminer:score_by_feature to calculate) ''' mapseq_files = job.files ### outputs processed = {'lib': {}, 'density': {}, '4cseq': {}} processed['4cseq'] = {'density_files' : {}, 'countsPerFrag' : {}, 'countsPerFrag_grp' : {}, 'norm' : {}, 'norm_grp' : {}, 'profileCorrection': {}, 'profileCorrection_grp' : {}, 'smooth_grp' : {}, 'domainogram_grp' : {}, 'bricks2frags' : {}} # was 'smoothFrag': {}, 'domainogram': {}} regToExclude = {} new_libs=[] ### options run_domainogram = {} before_profile_correction = {} if not job.options.get('viewpoints_chrs',False): out_chromosomes = ','.join([ch for ch in assembly.chrnames]) else: out_chromosomes = ','.join([primers_dict.get(group['name'],{}).get('baitcoord').split(':')[0] for gid,group in job.groups.iteritems()]) print "out_chromosomes=" + out_chromosomes + "\n" sizeExt = job.options.get('norm_reg',1000000) print "region considered for normalisation: mid viewpoint +/-" + str(sizeExt) + 'bps' ### do it for gid, group in job.groups.iteritems(): run_domainogram[gid] = group.get('run_domainogram',False) if isinstance(run_domainogram[gid],basestring): run_domainogram[gid] = (run_domainogram[gid].lower() in ['1','true','on','t']) before_profile_correction[gid] = group.get('before_profile_correction',False) if isinstance(before_profile_correction[gid],basestring): before_profile_correction[gid] = (before_profile_correction[gid].lower() in ['1','true','on','t']) processed['lib'][gid] = get_libForGrp(ex, group, assembly, new_libs, gid, c4_url, via=via) #reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed' 
processed['4cseq']['density_files'][gid] = {} regToExclude[gid] = primers_dict.get(group['name'],{}).get('regToExclude',"").replace('\r','') # if no regToExclude defined, set it as mid_baitCoord +/-5kb if len(regToExclude[gid])==0 : baitcoord_mid = int(0.5 * (int(primers_dict.get(group['name'],{}).get('baitcoord').split(':')[1].split('-')[0]) + int(primers_dict.get(group['name'],{}).get('baitcoord').split(':')[1].split('-')[1]) )) regToExclude[gid] = primers_dict.get(group['name'],{}).get('baitcoord').split(':')[0] + ':' + str(baitcoord_mid-5000) + '-' + str(baitcoord_mid+5000) #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()])) print(primers_dict.get(group['name'],{})) print "regToExclude["+str(gid)+"]="+regToExclude[gid] for rid,run in group['runs'].iteritems(): libname = mapseq_files[gid][rid]['libname'] if job.options.get('merge_strands') != 0 or not('wig' in mapseq_files[gid][rid]): density_file=parallel_density_sql( ex, mapseq_files[gid][rid]['bam'], assembly.chrmeta, nreads=mapseq_files[gid][rid]['stats']["total"], merge=0, read_extension=mapseq_files[gid][rid]['stats']['read_length'], convert=False, via=via ) density_file += "merged.sql" ex.add( density_file, description=set_file_descr("density_file_"+libname+".sql", groupId=gid,step="density",type="sql",view='admin',gdv="1") ) else: density_file = mapseq_files[gid][rid]['wig']['merged'] #density_files.append(density_file) processed['4cseq']['density_files'][gid][rid]=density_file # back to grp level! 
# not anymore: # processed['density'][gid] = merge_sql(ex, density_files, via=via) processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag( ex, processed, job.groups, assembly, regToExclude, script_path, via ) ## access per gid+rid futures_norm = {} countsPerFrags_bedGraph = {} futures_merged_raw = {} for gid, group in job.groups.iteritems(): futures_norm[gid] = {} countsPerFrags_bedGraph[gid] = {} processed['4cseq']['norm'][gid] = {} for rid,run in group['runs'].iteritems(): normfile = unique_filename_in() touch(ex, normfile) resfile = unique_filename_in()+".bedGraph" resfiles = processed['4cseq']['countsPerFrag'][gid][rid] # _all.sql convert(resfiles[3],resfile) countsPerFrags_bedGraph[gid][rid] = resfile print "call normFrags: infiles="+resfile+", normfile="+normfile+"baitCoord="+primers_dict[group['name']]['baitcoord']+", sizeExt=sizeExt, name="+ group['name']+"rep_"+str(rid) + "regToExclude="+regToExclude[gid]+"\n" futures_norm[gid][rid] = normFrags.nonblocking( ex, resfile, normfile, baitCoord=primers_dict[group['name']]['baitcoord'], sizeExt=sizeExt, name=group['name']+"rep_"+str(rid) ,regToExclude=regToExclude[gid], script_path=script_path, via=via ) processed['4cseq']['norm'][gid][rid] = normfile if len(group) > 1: ## merge replicates before normalisation. 
mergefile = unique_filename_in() touch(ex, mergefile) titleName=group['name']+"_raw_mergedRep" print "gid="+group['name'] print "call mergeRep for replicates before normalisation: infiles="+",".join([res_rid for rid,res_rid in countsPerFrags_bedGraph[gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n" futures_merged_raw[gid] = mergeRep.nonblocking( ex, ",".join([res_rid for rid,res_rid in countsPerFrags_bedGraph[gid].iteritems()]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8) processed['4cseq']['countsPerFrag_grp'][gid] = mergefile else: futures_merged_raw[gid] = None processed['4cseq']['countsPerFrag_grp'][gid] = countsPerFrags_bedGraph[gid][0] #if no replicates, then the file we want is the 1st one print "***** profile correction / sample + merge normalised data" futures_merged = {} # per gid futures_profcor = {} # per gid, per rid for gid, group in job.groups.iteritems(): ## run profile correction per run then merge them futures_profcor[gid] = {} processed['4cseq']['profileCorrection'][gid] = {} for rid, run in group['runs'].iteritems(): # wait for normalisation of all replicates to be finished futures_norm[gid][rid].wait() ## normalised files, per grp, per rep normfile = processed['4cseq']['norm'][gid][rid] file1 = unique_filename_in() #track file touch(ex,file1) file2 = unique_filename_in() #report file touch(ex,file2) file3 = unique_filename_in() #table file touch(ex, file3) print "call profileCorrection: normfile="+normfile+", baitCoord="+primers_dict[group['name']]['baitcoord']+", name="+group['name']+", file1="+file1+", file2="+file2+", file3= "+file3+"\n" futures_profcor[gid][rid] = profileCorrection.nonblocking( ex, normfile, primers_dict[group['name']]['baitcoord'], group['name'], file1, file2, file3, script_path, via=via ) processed['4cseq']['profileCorrection'][gid][rid] = [file1, file2, file3] ## merge replicates before profile correction. 
Needs all normalisation for the given grp to be finished, this is why it comes after the rid loop. if len(group)>1: mergefile = unique_filename_in() touch(ex, mergefile) titleName=group['name']+"_norm_mergedRep" print "gid="+group['name'] print "call mergeRep: infiles="+",".join([res_rid for rid,res_rid in processed['4cseq']['norm'][gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n" futures_merged[gid] = mergeRep.nonblocking( ex, ",".join([res_rid for rid,res_rid in processed['4cseq']['norm'][gid].iteritems()]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8) processed['4cseq']['norm_grp'][gid] = mergefile else: futures_merged[gid] = None processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][gid][0] ##if no replicates, then the file we want is the 1st one print "***** merge profile corrected data" futures_profcor_merged = {} # per gid for gid, group in job.groups.iteritems(): processed['4cseq']['profileCorrection_grp'][gid] = {} for rid, run in group['runs'].iteritems(): futures_profcor[gid][rid].wait() ## wait for ProfileCorrection to be finished ## merge replicates after profile correction if len(group)>1: mergefile = unique_filename_in() touch(ex, mergefile) titleName=group['name']+"_ProfCor_mergedRep" pcfiles = [ processed['4cseq']['profileCorrection'][gid][rid][0] for rid,res_rid in processed['4cseq']['profileCorrection'][gid].iteritems()] print "call mergeRep (for PC tables): infiles="+",".join(pcfiles)+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n" futures_profcor_merged[gid] = mergeRep.nonblocking( ex, ",".join(pcfiles), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8) processed['4cseq']['profileCorrection_grp'][gid] = mergefile else: futures_profcor_merged[gid] = None processed['4cseq']['profileCorrection_grp'][gid] = processed['4cseq']['profileCorrection'][gid][0] ##if no replicates, then the file we want 
is the 1st one print "***** smooth data" futures_smoothed = {} for gid, group in job.groups.iteritems(): file1 = unique_filename_in() touch(ex,file1) file2 = unique_filename_in() touch(ex, file2) file3 = unique_filename_in() touch(ex, file3) nFragsPerWin = group['window_size'] futures_merged_raw[gid].wait() ## wait for merging of raw_grp to be completed futures_smoothed[gid] = ( smoothFragFile.nonblocking( ex, processed['4cseq']['countsPerFrag_grp'][gid], nFragsPerWin, group['name'], file1, regToExclude[gid], script_path=script_path, via=via, memory=6 ), ) futures_merged[gid].wait() ## wait for merging of norm_grp to be completed futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['norm_grp'][gid], nFragsPerWin, group['name']+"_norm", file2, regToExclude[gid], script_path=script_path, via=via, memory=6 ), ) futures_profcor_merged[gid].wait() # wait for the merging of profile corrected data to be done futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid], nFragsPerWin, group['name']+"_fromProfileCorrected", file3, regToExclude[gid], script_path=script_path, via=via, memory=6 ), ) processed['4cseq']['smooth_grp'][gid] = [file1,file2,file3] #[smoothed_file_before_Norm, smoothed file before PC, smoothed file after PC] print "***** Domainograms" futures_domainograms = {} for gid, group in job.groups.iteritems(): grName = job.groups[gid]['name'] if run_domainogram[gid]: regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord'] if before_profile_correction[gid]: futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['norm_grp'][gid], grName, regCoord=regCoord, skip=1, script_path=script_path, via=via, memory=15 ) else: futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid], grName, regCoord=regCoord.split(':')[0], skip=1, script_path=script_path, via=via, memory=15 ) ## prepare tar files for domainogram 
results (if any) ## and create "BRICKS to frags" files print "***** BRICKS to Frags" futures_BRICKS2Frags = {} for gid, f in futures_domainograms.iteritems(): if run_domainogram[gid]: # if domainogram has been run resFiles = [] logFile = f.wait() start = False tarname = job.groups[gid]['name']+"_domainogram.tar.gz" res_tar = tarfile.open(tarname, "w:gz") futures_BRICKS2Frags[gid] = [] processed['4cseq']['bricks2frags'][gid] = [] if logFile is None: continue with open(logFile) as f: for s in f: s = s.strip() if '####resfiles####' in s: start = True elif start and "RData" not in s: resFiles.append(s) res_tar.add(s) if start and "foundBRICKS" in s: bricks2fragsfile = unique_filename_in()+".bedGraph" touch(ex, bricks2fragsfile) futures_BRICKS2Frags[gid] += [ BRICKSToFrag.nonblocking(ex, s, processed['4cseq']['norm_grp'][gid], bricks2fragsfile, script_path=script_path, via=via, memory=4 ) ] processed['4cseq']['bricks2frags'][gid] += [ bricks2fragsfile ] res_tar.close() processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname] ############### prepare tables for global results print "***** combine results into tables " allNames=[] allFiles=[] allRegToExclude=[] for gid, group in job.groups.iteritems(): for rid,run in group['runs'].iteritems(): allNames += [ group['name']+"_rep"+str(rid)+"_norm", group['name']+"_rep"+str(rid)+"_fit" ] allFiles += [ processed['4cseq']['profileCorrection'][gid][rid][2] ] allRegToExclude += [ regToExclude[gid] ] tablePC=unique_filename_in()+".txt" print("***will call makeTable with:") print(",".join(allFiles)) print("resfile="+tablePC) print(",".join(allNames)) touch(ex,tablePC) #regToExclude[gid] futures_tables = (makeTable.nonblocking(ex, ",".join(allFiles), tablePC, ",".join(allNames), idCols="4,5", all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), ) # wait for all smoothing to be done for gid, fg in futures_smoothed.iteritems(): for f in fg: f.wait() ## make Table raw/smoothed_raw print("** 
make Table raw/smoothed_raw") allNames=[] allFiles=[] allRegToExclude=[] for gid, group in job.groups.iteritems(): futures_merged_raw[gid].wait() allNames += [ group['name']+"_raw", group['name']+"_rawSmoothed" ] allFiles += [ processed['4cseq']['countsPerFrag_grp'][gid], processed['4cseq']['smooth_grp'][gid][0] ] allRegToExclude += [ 'NA', regToExclude[gid] ] tableSmoothedRaw_grp=unique_filename_in()+".txt" touch(ex,tableSmoothedRaw_grp) futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedRaw_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), ) ## make Table norm/smoothed_norm before PC print("** make Table norm/smoothed_norm befor PC") allNames=[] allFiles=[] allRegToExclude=[] for gid, group in job.groups.iteritems(): allNames += [ group['name']+"_norm", group['name']+"_smoothed" ] allFiles += [ processed['4cseq']['norm_grp'][gid], processed['4cseq']['smooth_grp'][gid][1] ] allRegToExclude += [ regToExclude[gid], regToExclude[gid] ] tableSmoothed_grp=unique_filename_in()+".txt" touch(ex,tableSmoothed_grp) futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothed_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), ) ## make Table norm/smoothed_norm after PC print("** make Table norm/smoothed_norm after PC") allNames=[] allFiles=[] allRegToExclude=[] for gid, group in job.groups.iteritems(): allNames += [ group['name']+"_normPC", group['name']+"_smoothedPC" ] allFiles += [ processed['4cseq']['profileCorrection_grp'][gid], processed['4cseq']['smooth_grp'][gid][2] ] allRegToExclude += [ regToExclude[gid], regToExclude[gid] ] tableSmoothedPC_grp=unique_filename_in()+".txt" touch(ex,tableSmoothedPC_grp) futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedPC_grp, 
",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), ) ## combine BRICKS2Frags files allNames=[] allFiles=[] for gid, fg in futures_BRICKS2Frags.iteritems(): for f in fg: f.wait() allNames += [ job.groups[gid]['name']+"_BRICKSpval" ] cat_bricks2frags = unique_filename_in()+".txt" print ','.join(processed['4cseq']['bricks2frags'][gid]) cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid],out=cat_bricks2frags) allFiles += [ cat_bricks2frags ] for gid, fg in futures_smoothed.iteritems(): for f in fg: f.wait() tableBRICKS2Frags = unique_filename_in()+".txt" touch(ex,tableBRICKS2Frags) futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableBRICKS2Frags, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, defVal="NA", script_path=script_path, via=via, memory=8 ), ) for f in futures_tables: f.wait() ################ Add everything to minilims below! step = "density" for gid in processed['4cseq']['density_files'].keys(): for rid, sql in processed['4cseq']['density_files'][gid].iteritems(): fname = "density_file_"+job.groups[gid]['name']+"_merged_rep"+str(rid) ex.add( sql, description=set_file_descr( fname+".sql", groupId=gid,step=step,type="sql",gdv="1" ) ) wig = unique_filename_in()+".bw" convert( sql, wig ) ex.add( wig, description=set_file_descr( fname+".bw", groupId=gid,step=step,type="bigWig",ucsc="1") ) step = "counts_per_frag" #was _norm_counts_per_frags # before normalisation process, per replicate for gid in processed['4cseq']['countsPerFrag'].keys(): for rid, resfiles in processed['4cseq']['countsPerFrag'][gid].iteritems(): fname = "meanScorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid) ex.add( resfiles[1], description=set_file_descr( fname+".sql", groupId=gid,step=step,type="sql",view="admin",gdv='1')) #gzipfile(ex,resfiles[0]) #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz", # 
groupId=gid,step=step,type="bed",view="admin" )) fname = "segToFrag_"+job.groups[gid]['name']+"_rep"+str(rid) ex.add( resfiles[3], description=set_file_descr( fname+"_all.sql", groupId=gid,step=step,type="sql", comment="all informative frags - null included" )) trsql = track(resfiles[3]) bwig = unique_filename_in()+".bw" trwig = track(bwig,chrmeta=trsql.chrmeta) trwig.write(trsql.read(fields=['chr','start','end','score'], selection={'score':(0.01,sys.maxint)})) trwig.close() ex.add( bwig, set_file_descr(fname+".bw",groupId=gid,step=step,type="bigWig",ucsc='1')) ## add segToFrags before normalisation futures_merged_raw[gid].wait() trbedgraph = track(removeNA(processed['4cseq']['countsPerFrag_grp'][gid]),format='bedgraph') bwig = unique_filename_in()+".bw" trwig = track(bwig,chrmeta=assembly.chrmeta) trwig.write(trbedgraph.read(fields=['chr','start','end','score'], selection={'score':(0.01,sys.maxint)})) trwig.close() fname = "segToFrag_"+job.groups[gid]['name'] ex.add( bwig, description=set_file_descr( fname+".bw", groupId=gid,step=step,type="bigWig", comment="segToFrag file before normalisation" )) step = "norm_counts_per_frags" # after new normalisation process, combined replicates for gid, resfile in processed['4cseq']['norm_grp'].iteritems(): fname = "normalised_scorePerFeature_"+job.groups[gid]['name'] gzipfile(ex,resfile) ex.add( resfile+".gz", description=set_file_descr( fname+".bedGraph.gz", groupId=gid,step=step, type="bedGraph",ucsc='1')) # norm files, per replicates (might be removed) for gid, dict_gid in processed['4cseq']['norm'].iteritems(): for rid, resfile in dict_gid.iteritems(): fname = "normalised_scorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid) gzipfile(ex,resfile) ex.add(resfile+".gz", description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) step = "profile_correction" # Profile corrected data, combined replicates for gid, profileCorrectedFile in 
processed['4cseq']['profileCorrection_grp'].iteritems(): fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected" gzipfile(ex,profileCorrectedFile) ex.add( profileCorrectedFile+".gz", description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) # Profile corrected, per replicate (might be removed) for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems(): for rid, resfiles in dict_gid.iteritems(): # profileCorrectedFile = resfiles[0] reportProfileCorrection = resfiles[1] fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_rep"+str(rid) # gzipfile(ex,profileCorrectedFile) # ex.add( profileCorrectedFile+".gz", # description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) ex.add( reportProfileCorrection, description=set_file_descr(fname+".pdf", groupId=gid,step=step,type="pdf")) step = "smoothing" for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems(): rawSmoothFile = resfiles[0] smoothFile = resfiles[1] afterProfileCorrection = resfiles[2] nFrags = str(job.groups[gid]['window_size']) ## smoothed file before normalisation fname = "segToFrag_"+job.groups[gid]['name']+"_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz" gzipfile(ex,rawSmoothFile) ex.add(rawSmoothFile+".gz", description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) ## smoothed file after normalisation, before Profile correction fname = "segToFrag_"+job.groups[gid]['name']+"_norm_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz" gzipfile(ex,smoothFile) ex.add(smoothFile+".gz", description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) ## smoothed file after normalisation, after Profile correction fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz" gzipfile(ex,afterProfileCorrection) ex.add(afterProfileCorrection+".gz", 
description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) step = "domainograms" for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems(): tarFile = resfiles.pop() fname = job.groups[gid]['name']+"_domainogram.tar.gz" ex.add(tarFile, description=set_file_descr(fname, groupId=gid,step=step,type="tgz")) for s in resfiles: if s[-8:] == "bedGraph": gzipfile(ex,s) s += ".gz" ex.add( s, description=set_file_descr( s, groupId=gid,step=step,type="bedGraph",ucsc="1",gdv="1")) step = "combined_results" gzipfile(ex,tableSmoothedRaw_grp) ex.add(tableSmoothedRaw_grp+".gz", description=set_file_descr("table_segToFrags_smoothed_combined_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tableSmoothed_grp) ex.add(tableSmoothed_grp+".gz", description=set_file_descr("table_normalised_smoothed_combined_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tableSmoothedPC_grp) ex.add(tableSmoothedPC_grp+".gz", description=set_file_descr("table_profileCorrected_smoothed_combined_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tablePC) ex.add(tablePC+".gz", description=set_file_descr("table_normalised_fit_per_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tableBRICKS2Frags) ex.add(tableBRICKS2Frags+".gz", description=set_file_descr("table_frags_in_BRICKS_combined_replicates.txt.gz",step=step,type="txt")) return processed
def density_to_countsPerFrag( ex, file_dict, groups, assembly, regToExclude, script_path, via='lsf' ):
    '''
    Compute the counts per fragment of every run from its density file.

    For each group and run, one `score_by_feature` gfminer job is launched per
    chromosome; the per-chromosome outputs are gathered into one sql track and
    one concatenated bed file, which is then passed to `segToFrag`. The stdout
    of `segToFrag` is parsed to build a second sql track.

    NOTE(review): a second definition of this function appears later in the
    file and would replace this one at import time - confirm which copy to keep.

    :param ex: execution handed to `touch` and to every `nonblocking` call.
    :param file_dict: dictionary read at `['lib'][gid]` (fragment library) and
        `['4cseq']['density_files'][gid][rid]` (density file).
    :param groups: dictionary `{gid: group}`; only `group['runs']` is read.
    :param assembly: object providing `chrnames` and `chrmeta`.
    :param regToExclude: dictionary `{gid: region}` forwarded to `segToFrag`.
    :param script_path: forwarded to `segToFrag`.
    :param via: forwarded to the `nonblocking` calls.
    :returns: `{gid: {rid: [countsPerFrag_allBed, countsPerFrag_selectSql,
        segToFrag_out, segToFrag_sql]}}`.
    '''
    futures = {}
    results = {}
    for gid, group in groups.items():
        reffile = file_dict['lib'][gid]
        futures[gid] = {}
        results[gid] = {}
        for rid in group['runs']:
            density_file = file_dict['4cseq']['density_files'][gid][rid]
            gm_futures = []
            for ch in assembly.chrnames:
                # use the per-chromosome library file when there is one,
                # otherwise fall back on the library path itself
                chref = os.path.join(reffile,ch+".bed.gz")
                if not os.path.exists(chref):
                    chref = reffile
                bedfile = unique_filename_in()+".bed"
                gfminer_job = {"operation": "score_by_feature",
                               "output": bedfile,
                               "datatype": "qualitative",
                               "args": "'"+json.dumps({"trackScores":density_file,
                                                       "trackFeatures":chref,
                                                       "chromosome":ch})+"'"}
                gm_futures.append((gfminer_run.nonblocking(ex,gfminer_job,via=via),
                                   bedfile))
            outsql = unique_filename_in()+".sql"
            sqlouttr = track( outsql, chrmeta=assembly.chrmeta,
                              info={'datatype':'quantitative'},
                              fields=['start', 'end', 'score'] )
            outbed_all = []
            try:
                # gm_futures holds exactly one entry per chromosome, in order
                for chrom, (gm_future, fout) in zip(assembly.chrnames, gm_futures):
                    gm_future.wait()
                    if not os.path.exists(fout):
                        # presumably leaves time for the output to show up on
                        # a shared file system - TODO confirm
                        time.sleep(60)
                        touch(ex,fout)
                    outbed_all.append(fout)
                    outbed = track(fout, chrmeta=assembly.chrmeta)
                    sqlouttr.write( outbed.read(fields=['start', 'end', 'score'],
                                                selection={'score':(0.01,sys.maxint)}),
                                    chrom=chrom )
            finally:
                # close the sql track even when a job or a read fails
                sqlouttr.close()
            countsPerFragFile = unique_filename_in()+".bed"
            countsPerFragFile = cat(outbed_all,out=countsPerFragFile)
            results[gid][rid] = [ countsPerFragFile, outsql ]
            FragFile = unique_filename_in()
            touch(ex,FragFile)
            futures[gid][rid] = (FragFile,
                                 segToFrag.nonblocking( ex, countsPerFragFile, regToExclude[gid],
                                                        script_path, via=via, stdout=FragFile,
                                                        memory=4 ))

    def _parse_select_frag(stream):
        # Yield (chr, start, end, score) for the valid fragments found in the
        # tab-separated output of segToFrag.
        for s in stream:
            sr = s.strip().split('\t')
            if len(sr) < 12:
                # blank or truncated line: columns 3, 9 and 12 are needed below
                continue
            if 'IsValid' not in sr[2]:
                continue
            if any(w in sr[8] for w in ('_and_','BothRepeats','notValid')):
                continue
            patt = re.search(r'([^:]+):(\d+)-(\d+)',sr[1])
            if patt:
                coord = patt.groups()
                # the start coordinate is shifted down by one
                yield (coord[0], int(coord[1])-1, int(coord[2]), float(sr[11]))

    for gid, dict_gid in futures.items():
        for rid, (frag_file, seg_future) in dict_gid.items():
            seg_future.wait()
            touch(ex,frag_file)
            with open(frag_file,"r") as segOut:
                resBedGraph = unique_filename_in()+".sql"
                sqlTr = track( resBedGraph, fields=['start','end','score'],
                               info={'datatype':'quantitative'}, chrmeta=assembly.chrmeta )
                try:
                    sqlTr.write(_parse_select_frag(segOut),fields=['chr','start','end','score'])
                finally:
                    sqlTr.close()
            results[gid][rid].extend([frag_file,resBedGraph])
    return results #[countsPerFrag_allBed, countsPerFrag_selectSql, segToFrag_out, segToFrag_sql]
def c4seq_workflow(ex, job, primers_dict, assembly, c4_url=None, script_path='', logfile=sys.stdout, via='lsf'):
    '''
    Run the whole 4C-seq analysis for `job` and register the results in `ex`.

    Steps, in order:

    * 0. get/create the fragment library of each group (`get_libForGrp`)
    * 1. if necessary, compute the density file from the bam file
         (`parallel_density_sql`)
    * 2. compute the counts per fragment (`density_to_countsPerFrag`)
    * 3. normalise each run and merge the runs of each group
    * 4. profile correction per run, then merge per group
    * 5. smoothing of the raw, normalised and profile-corrected group files
    * 6. optional domainograms and "BRICKS to frags" files
    * 7. summary tables, then registration of every output file with `ex.add`

    :param ex: execution used for every external call and `ex.add`.
    :param job: object providing `files`, `options` and `groups`.
    :param primers_dict: dictionary keyed by group name; the entries
        'baitcoord' ("chr:start-end") and 'regToExclude' are read.
    :param assembly: object providing `chrnames` and `chrmeta`.
    :param c4_url: forwarded to `get_libForGrp`.
    :param script_path: forwarded to the external scripts.
    :param logfile: not used in this function.
    :param via: forwarded to the `nonblocking` calls.
    :returns: the `processed` dictionary with the keys 'lib', 'density' and
        '4cseq' (the latter holding one sub-dictionary per analysis step).
    '''
    mapseq_files = job.files
    ### outputs
    processed = {'lib': {}, 'density': {}, '4cseq': {}}
    processed['4cseq'] = {
        'density_files': {},
        'countsPerFrag': {},
        'countsPerFrag_grp': {},
        'norm': {},
        'norm_grp': {},
        'profileCorrection': {},
        'profileCorrection_grp': {},
        'smooth_grp': {},
        'domainogram_grp': {},
        'bricks2frags': {}
    }  # was 'smoothFrag': {}, 'domainogram': {}}
    regToExclude = {}
    new_libs = []
    ### options
    run_domainogram = {}
    before_profile_correction = {}
    if not job.options.get('viewpoints_chrs', False):
        out_chromosomes = ','.join([ch for ch in assembly.chrnames])
    else:
        # restrict the tables to the chromosomes of the viewpoints
        out_chromosomes = ','.join([
            primers_dict.get(group['name'], {}).get('baitcoord').split(':')[0]
            for gid, group in job.groups.iteritems()
        ])
    print("out_chromosomes=" + out_chromosomes + "\n")
    sizeExt = job.options.get('norm_reg', 1000000)
    print("region considered for normalisation: mid viewpoint +/-" + str(sizeExt) + 'bps')

    ### do it
    truthy = ['1', 'true', 'on', 't']
    for gid, group in job.groups.iteritems():
        # options may come as strings: convert them to booleans
        run_domainogram[gid] = group.get('run_domainogram', False)
        if isinstance(run_domainogram[gid], basestring):
            run_domainogram[gid] = (run_domainogram[gid].lower() in truthy)
        before_profile_correction[gid] = group.get('before_profile_correction', False)
        if isinstance(before_profile_correction[gid], basestring):
            before_profile_correction[gid] = (before_profile_correction[gid].lower() in truthy)
        processed['lib'][gid] = get_libForGrp(ex, group, assembly, new_libs, gid, c4_url, via=via)
        processed['4cseq']['density_files'][gid] = {}
        primer = primers_dict.get(group['name'], {})
        regToExclude[gid] = primer.get('regToExclude', "").replace('\r', '')
        # if no regToExclude defined, set it as mid_baitCoord +/-5kb
        if len(regToExclude[gid]) == 0:
            baitcoord = primer.get('baitcoord').split(':')
            bounds = baitcoord[1].split('-')
            baitcoord_mid = int(0.5 * (int(bounds[0]) + int(bounds[1])))
            regToExclude[gid] = baitcoord[0] + ':' + str(baitcoord_mid - 5000) + '-' + str(baitcoord_mid + 5000)
        print(primer)
        print("regToExclude[" + str(gid) + "]=" + regToExclude[gid])
        for rid, run in group['runs'].iteritems():
            libname = mapseq_files[gid][rid]['libname']
            if job.options.get('merge_strands') != 0 or not ('wig' in mapseq_files[gid][rid]):
                density_file = parallel_density_sql(
                    ex, mapseq_files[gid][rid]['bam'], assembly.chrmeta,
                    nreads=mapseq_files[gid][rid]['stats']["total"],
                    merge=0,
                    read_extension=mapseq_files[gid][rid]['stats']['read_length'],
                    convert=False, via=via)
                density_file += "merged.sql"
                ex.add(density_file,
                       description=set_file_descr("density_file_" + libname + ".sql",
                                                  groupId=gid, step="density", type="sql",
                                                  view='admin', gdv="1"))
            else:
                density_file = mapseq_files[gid][rid]['wig']['merged']
            processed['4cseq']['density_files'][gid][rid] = density_file

    # back to grp level!
    processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag(
        ex, processed, job.groups, assembly, regToExclude, script_path, via)
    ## access per gid+rid

    futures_norm = {}
    countsPerFrags_bedGraph = {}
    futures_merged_raw = {}
    for gid, group in job.groups.iteritems():
        futures_norm[gid] = {}
        countsPerFrags_bedGraph[gid] = {}
        processed['4cseq']['norm'][gid] = {}
        for rid, run in group['runs'].iteritems():
            normfile = unique_filename_in()
            touch(ex, normfile)
            resfile = unique_filename_in() + ".bedGraph"
            resfiles = processed['4cseq']['countsPerFrag'][gid][rid]
            convert(resfiles[3], resfile)  # _all.sql
            countsPerFrags_bedGraph[gid][rid] = resfile
            baitcoord = primers_dict[group['name']]['baitcoord']
            run_name = group['name'] + "rep_" + str(rid)
            print("call normFrags: infiles=" + resfile + ", normfile=" + normfile
                  + ", baitCoord=" + baitcoord + ", sizeExt=" + str(sizeExt)
                  + ", name=" + run_name + ", regToExclude=" + regToExclude[gid] + "\n")
            futures_norm[gid][rid] = normFrags.nonblocking(
                ex, resfile, normfile,
                baitCoord=baitcoord,
                sizeExt=sizeExt,
                name=run_name,
                regToExclude=regToExclude[gid],
                script_path=script_path, via=via)
            processed['4cseq']['norm'][gid][rid] = normfile
        # NOTE(review): len(group) counts the keys of the group description,
        # not its replicates - presumably len(group['runs']) was meant. Confirm
        # before changing: the fallback branch looks up replicate key 0 and
        # stores None where a future is later waited on.
        if len(group) > 1:
            ## merge replicates before normalisation.
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_raw_mergedRep"
            infiles = ",".join(countsPerFrags_bedGraph[gid].values())
            print("gid=" + group['name'])
            print("call mergeRep for replicates before normalisation: infiles=" + infiles
                  + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[gid] + "\n")
            futures_merged_raw[gid] = mergeRep.nonblocking(
                ex, infiles, mergefile, regToExclude[gid],
                name=titleName, script_path=script_path, via=via, memory=8)
            processed['4cseq']['countsPerFrag_grp'][gid] = mergefile
        else:
            futures_merged_raw[gid] = None
            # if no replicates, then the file we want is the 1st one
            processed['4cseq']['countsPerFrag_grp'][gid] = countsPerFrags_bedGraph[gid][0]

    print("***** profile correction / sample + merge normalised data")
    futures_merged = {}  # per gid
    futures_profcor = {}  # per gid, per rid
    for gid, group in job.groups.iteritems():
        ## run profile correction per run then merge them
        futures_profcor[gid] = {}
        processed['4cseq']['profileCorrection'][gid] = {}
        for rid, run in group['runs'].iteritems():
            # wait for normalisation of all replicates to be finished
            futures_norm[gid][rid].wait()
            ## normalised files, per grp, per rep
            normfile = processed['4cseq']['norm'][gid][rid]
            file1 = unique_filename_in()  #track file
            touch(ex, file1)
            file2 = unique_filename_in()  #report file
            touch(ex, file2)
            file3 = unique_filename_in()  #table file
            touch(ex, file3)
            print("call profileCorrection: normfile=" + normfile
                  + ", baitCoord=" + primers_dict[group['name']]['baitcoord']
                  + ", name=" + group['name'] + ", file1=" + file1
                  + ", file2=" + file2 + ", file3= " + file3 + "\n")
            futures_profcor[gid][rid] = profileCorrection.nonblocking(
                ex, normfile, primers_dict[group['name']]['baitcoord'],
                group['name'], file1, file2, file3, script_path, via=via)
            processed['4cseq']['profileCorrection'][gid][rid] = [file1, file2, file3]
        ## merge replicates before profile correction. Needs all normalisation
        ## for the given grp to be finished, this is why it comes after the rid loop.
        if len(group) > 1:  # NOTE(review): see the remark on len(group) above
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_norm_mergedRep"
            infiles = ",".join(processed['4cseq']['norm'][gid].values())
            print("gid=" + group['name'])
            print("call mergeRep: infiles=" + infiles + ", mergedfile=" + mergefile
                  + ", regToExclude=" + regToExclude[gid] + "\n")
            futures_merged[gid] = mergeRep.nonblocking(
                ex, infiles, mergefile, regToExclude[gid],
                name=titleName, script_path=script_path, via=via, memory=8)
            processed['4cseq']['norm_grp'][gid] = mergefile
        else:
            futures_merged[gid] = None
            ##if no replicates, then the file we want is the 1st one
            processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][gid][0]

    print("***** merge profile corrected data")
    futures_profcor_merged = {}  # per gid
    for gid, group in job.groups.iteritems():
        processed['4cseq']['profileCorrection_grp'][gid] = {}
        for rid, run in group['runs'].iteritems():
            ## wait for ProfileCorrection to be finished
            futures_profcor[gid][rid].wait()
        ## merge replicates after profile correction
        if len(group) > 1:  # NOTE(review): see the remark on len(group) above
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name'] + "_ProfCor_mergedRep"
            # first file of each replicate = profile corrected track
            pcfiles = [files[0] for files in processed['4cseq']['profileCorrection'][gid].values()]
            print("call mergeRep (for PC tables): infiles=" + ",".join(pcfiles)
                  + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[gid] + "\n")
            futures_profcor_merged[gid] = mergeRep.nonblocking(
                ex, ",".join(pcfiles), mergefile, regToExclude[gid],
                name=titleName, script_path=script_path, via=via, memory=8)
            processed['4cseq']['profileCorrection_grp'][gid] = mergefile
        else:
            futures_profcor_merged[gid] = None
            ##if no replicates, then the file we want is the 1st one
            processed['4cseq']['profileCorrection_grp'][gid] = processed['4cseq']['profileCorrection'][gid][0]

    print("***** smooth data")
    futures_smoothed = {}
    for gid, group in job.groups.iteritems():
        file1 = unique_filename_in()
        touch(ex, file1)
        file2 = unique_filename_in()
        touch(ex, file2)
        file3 = unique_filename_in()
        touch(ex, file3)
        nFragsPerWin = group['window_size']
        ## wait for merging of raw_grp to be completed
        futures_merged_raw[gid].wait()
        futures_smoothed[gid] = (smoothFragFile.nonblocking(
            ex, processed['4cseq']['countsPerFrag_grp'][gid], nFragsPerWin,
            group['name'], file1, regToExclude[gid],
            script_path=script_path, via=via, memory=6), )
        ## wait for merging of norm_grp to be completed
        futures_merged[gid].wait()
        futures_smoothed[gid] += (smoothFragFile.nonblocking(
            ex, processed['4cseq']['norm_grp'][gid], nFragsPerWin,
            group['name'] + "_norm", file2, regToExclude[gid],
            script_path=script_path, via=via, memory=6), )
        # wait for the merging of profile corrected data to be done
        futures_profcor_merged[gid].wait()
        futures_smoothed[gid] += (smoothFragFile.nonblocking(
            ex, processed['4cseq']['profileCorrection_grp'][gid], nFragsPerWin,
            group['name'] + "_fromProfileCorrected", file3, regToExclude[gid],
            script_path=script_path, via=via, memory=6), )
        #[smoothed_file_before_Norm, smoothed file before PC, smoothed file after PC]
        processed['4cseq']['smooth_grp'][gid] = [file1, file2, file3]

    print("***** Domainograms")
    futures_domainograms = {}
    for gid, group in job.groups.iteritems():
        grName = job.groups[gid]['name']
        if run_domainogram[gid]:
            regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord']
            if before_profile_correction[gid]:
                futures_domainograms[gid] = runDomainogram.nonblocking(
                    ex, processed['4cseq']['norm_grp'][gid], grName,
                    regCoord=regCoord, skip=1,
                    script_path=script_path, via=via, memory=15)
            else:
                # only the chromosome name is passed in this case
                futures_domainograms[gid] = runDomainogram.nonblocking(
                    ex, processed['4cseq']['profileCorrection_grp'][gid], grName,
                    regCoord=regCoord.split(':')[0], skip=1,
                    script_path=script_path, via=via, memory=15)

    ## prepare tar files for domainogram results (if any)
    ## and create "BRICKS to frags" files
    print("***** BRICKS to Frags")
    futures_BRICKS2Frags = {}
    for gid, domainogram_future in futures_domainograms.iteritems():
        if not run_domainogram[gid]:
            continue
        # domainogram has been run
        resFiles = []
        logFile = domainogram_future.wait()
        start = False
        tarname = job.groups[gid]['name'] + "_domainogram.tar.gz"
        futures_BRICKS2Frags[gid] = []
        processed['4cseq']['bricks2frags'][gid] = []
        if logFile is None:
            # nothing to archive: skip before opening the tar file so that no
            # archive handle is left open
            continue
        res_tar = tarfile.open(tarname, "w:gz")
        try:
            with open(logFile) as log:
                for s in log:
                    s = s.strip()
                    # result files are listed after the '####resfiles####' marker
                    if '####resfiles####' in s:
                        start = True
                    elif start and "RData" not in s:
                        resFiles.append(s)
                        res_tar.add(s)
                    if start and "foundBRICKS" in s:
                        bricks2fragsfile = unique_filename_in() + ".bedGraph"
                        touch(ex, bricks2fragsfile)
                        futures_BRICKS2Frags[gid] += [
                            BRICKSToFrag.nonblocking(
                                ex, s, processed['4cseq']['norm_grp'][gid],
                                bricks2fragsfile,
                                script_path=script_path, via=via, memory=4)
                        ]
                        processed['4cseq']['bricks2frags'][gid] += [bricks2fragsfile]
        finally:
            # close the archive even when reading the log fails
            res_tar.close()
        processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname]

    ############### prepare tables for global results
    print("***** combine results into tables ")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        for rid, run in group['runs'].iteritems():
            allNames += [
                group['name'] + "_rep" + str(rid) + "_norm",
                group['name'] + "_rep" + str(rid) + "_fit"
            ]
            allFiles += [processed['4cseq']['profileCorrection'][gid][rid][2]]
            allRegToExclude += [regToExclude[gid]]
    tablePC = unique_filename_in() + ".txt"
    print("***will call makeTable with:")
    print(",".join(allFiles))
    print("resfile=" + tablePC)
    print(",".join(allNames))
    touch(ex, tablePC)
    #regToExclude[gid]
    futures_tables = (makeTable.nonblocking(
        ex, ",".join(allFiles), tablePC, ",".join(allNames),
        idCols="4,5",
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path, via=via, memory=8), )

    # wait for all smoothing to be done
    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()

    ## make Table raw/smoothed_raw
    print("** make Table raw/smoothed_raw")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        futures_merged_raw[gid].wait()
        allNames += [group['name'] + "_raw", group['name'] + "_rawSmoothed"]
        allFiles += [
            processed['4cseq']['countsPerFrag_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][0]
        ]
        allRegToExclude += ['NA', regToExclude[gid]]
    tableSmoothedRaw_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothedRaw_grp)
    futures_tables += (makeTable.nonblocking(
        ex, ",".join(allFiles), tableSmoothedRaw_grp, ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path, via=via, memory=8), )

    ## make Table norm/smoothed_norm before PC
    print("** make Table norm/smoothed_norm befor PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [group['name'] + "_norm", group['name'] + "_smoothed"]
        allFiles += [
            processed['4cseq']['norm_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][1]
        ]
        allRegToExclude += [regToExclude[gid], regToExclude[gid]]
    tableSmoothed_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothed_grp)
    futures_tables += (makeTable.nonblocking(
        ex, ",".join(allFiles), tableSmoothed_grp, ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path, via=via, memory=8), )

    ## make Table norm/smoothed_norm after PC
    print("** make Table norm/smoothed_norm after PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [group['name'] + "_normPC", group['name'] + "_smoothedPC"]
        allFiles += [
            processed['4cseq']['profileCorrection_grp'][gid],
            processed['4cseq']['smooth_grp'][gid][2]
        ]
        allRegToExclude += [regToExclude[gid], regToExclude[gid]]
    tableSmoothedPC_grp = unique_filename_in() + ".txt"
    touch(ex, tableSmoothedPC_grp)
    futures_tables += (makeTable.nonblocking(
        ex, ",".join(allFiles), tableSmoothedPC_grp, ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        all_regToExclude=','.join(allRegToExclude),
        script_path=script_path, via=via, memory=8), )

    ## combine BRICKS2Frags files
    allNames = []
    allFiles = []
    for gid, fg in futures_BRICKS2Frags.iteritems():
        for f in fg:
            f.wait()
        allNames += [job.groups[gid]['name'] + "_BRICKSpval"]
        cat_bricks2frags = unique_filename_in() + ".txt"
        print(','.join(processed['4cseq']['bricks2frags'][gid]))
        cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid], out=cat_bricks2frags)
        allFiles += [cat_bricks2frags]
    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()
    tableBRICKS2Frags = unique_filename_in() + ".txt"
    touch(ex, tableBRICKS2Frags)
    futures_tables += (makeTable.nonblocking(
        ex, ",".join(allFiles), tableBRICKS2Frags, ",".join(allNames),
        idCols="4",
        out_chromosomes=out_chromosomes,
        defVal="NA",
        script_path=script_path, via=via, memory=8), )
    for f in futures_tables:
        f.wait()

    ################ Add everything to minilims below!
    step = "density"
    for gid in processed['4cseq']['density_files'].keys():
        for rid, sql in processed['4cseq']['density_files'][gid].iteritems():
            fname = "density_file_" + job.groups[gid]['name'] + "_merged_rep" + str(rid)
            ex.add(sql,
                   description=set_file_descr(fname + ".sql", groupId=gid, step=step,
                                              type="sql", gdv="1"))
            wig = unique_filename_in() + ".bw"
            convert(sql, wig)
            ex.add(wig,
                   description=set_file_descr(fname + ".bw", groupId=gid, step=step,
                                              type="bigWig", ucsc="1"))

    step = "counts_per_frag"  #was _norm_counts_per_frags
    # before normalisation process, per replicate
    for gid in processed['4cseq']['countsPerFrag'].keys():
        for rid, resfiles in processed['4cseq']['countsPerFrag'][gid].iteritems():
            fname = "meanScorePerFeature_" + job.groups[gid]['name'] + "_rep" + str(rid)
            ex.add(resfiles[1],
                   description=set_file_descr(fname + ".sql", groupId=gid, step=step,
                                              type="sql", view="admin", gdv='1'))
            fname = "segToFrag_" + job.groups[gid]['name'] + "_rep" + str(rid)
            ex.add(resfiles[3],
                   description=set_file_descr(fname + "_all.sql", groupId=gid, step=step,
                                              type="sql",
                                              comment="all informative frags - null included"))
            # bigWig version, keeping only the scores of at least 0.01
            trsql = track(resfiles[3])
            bwig = unique_filename_in() + ".bw"
            trwig = track(bwig, chrmeta=trsql.chrmeta)
            trwig.write(trsql.read(fields=['chr', 'start', 'end', 'score'],
                                   selection={'score': (0.01, sys.maxint)}))
            trwig.close()
            ex.add(bwig,
                   set_file_descr(fname + ".bw", groupId=gid, step=step,
                                  type="bigWig", ucsc='1'))
        ## add segToFrags before normalisation
        futures_merged_raw[gid].wait()
        trbedgraph = track(removeNA(processed['4cseq']['countsPerFrag_grp'][gid]),
                           format='bedgraph')
        bwig = unique_filename_in() + ".bw"
        trwig = track(bwig, chrmeta=assembly.chrmeta)
        trwig.write(trbedgraph.read(fields=['chr', 'start', 'end', 'score'],
                                    selection={'score': (0.01, sys.maxint)}))
        trwig.close()
        fname = "segToFrag_" + job.groups[gid]['name']
        ex.add(bwig,
               description=set_file_descr(fname + ".bw", groupId=gid, step=step,
                                          type="bigWig",
                                          comment="segToFrag file before normalisation"))

    step = "norm_counts_per_frags"
    # after new normalisation process, combined replicates
    for gid, resfile in processed['4cseq']['norm_grp'].iteritems():
        fname = "normalised_scorePerFeature_" + job.groups[gid]['name']
        gzipfile(ex, resfile)
        ex.add(resfile + ".gz",
               description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step,
                                          type="bedGraph", ucsc='1'))
    # norm files, per replicates (might be removed)
    for gid, dict_gid in processed['4cseq']['norm'].iteritems():
        for rid, resfile in dict_gid.iteritems():
            fname = "normalised_scorePerFeature_" + job.groups[gid]['name'] + "_rep" + str(rid)
            gzipfile(ex, resfile)
            ex.add(resfile + ".gz",
                   description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step,
                                              type="bedGraph", ucsc='1', gdv='1'))

    step = "profile_correction"
    # Profile corrected data, combined replicates
    for gid, profileCorrectedFile in processed['4cseq']['profileCorrection_grp'].iteritems():
        fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected"
        gzipfile(ex, profileCorrectedFile)
        ex.add(profileCorrectedFile + ".gz",
               description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step,
                                          type="bedGraph", ucsc='1', gdv='1'))
    # Profile corrected, per replicate (might be removed): only the report is added
    for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems():
        for rid, resfiles in dict_gid.iteritems():
            reportProfileCorrection = resfiles[1]
            fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected_rep" + str(rid)
            ex.add(reportProfileCorrection,
                   description=set_file_descr(fname + ".pdf", groupId=gid, step=step,
                                              type="pdf"))

    step = "smoothing"
    for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems():
        rawSmoothFile = resfiles[0]
        smoothFile = resfiles[1]
        afterProfileCorrection = resfiles[2]
        nFrags = str(job.groups[gid]['window_size'])
        ## smoothed file before normalisation
        fname = "segToFrag_" + job.groups[gid]['name'] + "_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, rawSmoothFile)
        ex.add(rawSmoothFile + ".gz",
               description=set_file_descr(fname, groupId=gid, step=step,
                                          type="bedGraph", ucsc='1', gdv='1'))
        ## smoothed file after normalisation, before Profile correction
        fname = "segToFrag_" + job.groups[gid]['name'] + "_norm_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, smoothFile)
        ex.add(smoothFile + ".gz",
               description=set_file_descr(fname, groupId=gid, step=step,
                                          type="bedGraph", ucsc='1', gdv='1'))
        ## smoothed file after normalisation, after Profile correction
        fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz"
        gzipfile(ex, afterProfileCorrection)
        ex.add(afterProfileCorrection + ".gz",
               description=set_file_descr(fname, groupId=gid, step=step,
                                          type="bedGraph", ucsc='1', gdv='1'))

    step = "domainograms"
    for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems():
        # the tar archive was appended last to the list of result files
        tarFile = resfiles.pop()
        fname = job.groups[gid]['name'] + "_domainogram.tar.gz"
        ex.add(tarFile,
               description=set_file_descr(fname, groupId=gid, step=step, type="tgz"))
        for s in resfiles:
            if s[-8:] == "bedGraph":
                gzipfile(ex, s)
                s += ".gz"
                ex.add(s,
                       description=set_file_descr(s, groupId=gid, step=step,
                                                  type="bedGraph", ucsc="1", gdv="1"))

    step = "combined_results"
    gzipfile(ex, tableSmoothedRaw_grp)
    ex.add(tableSmoothedRaw_grp + ".gz",
           description=set_file_descr("table_segToFrags_smoothed_combined_replicates.txt.gz",
                                      step=step, type="txt"))
    gzipfile(ex, tableSmoothed_grp)
    ex.add(tableSmoothed_grp + ".gz",
           description=set_file_descr("table_normalised_smoothed_combined_replicates.txt.gz",
                                      step=step, type="txt"))
    gzipfile(ex, tableSmoothedPC_grp)
    ex.add(tableSmoothedPC_grp + ".gz",
           description=set_file_descr("table_profileCorrected_smoothed_combined_replicates.txt.gz",
                                      step=step, type="txt"))
    gzipfile(ex, tablePC)
    ex.add(tablePC + ".gz",
           description=set_file_descr("table_normalised_fit_per_replicates.txt.gz",
                                      step=step, type="txt"))
    gzipfile(ex, tableBRICKS2Frags)
    ex.add(tableBRICKS2Frags + ".gz",
           description=set_file_descr("table_frags_in_BRICKS_combined_replicates.txt.gz",
                                      step=step, type="txt"))
    return processed
def density_to_countsPerFrag(ex, file_dict, groups, assembly, regToExclude,
                             script_path, via='lsf'):
    '''
    Main function to compute normalised counts per fragments from a density file.

    For every run of every group, one gfminer ``score_by_feature`` job is
    submitted per chromosome of `assembly`, scoring the run's density file
    against the group's library file. Once those jobs are done, their bed
    outputs are concatenated into one file, the rows selected on
    ``score`` in ``(0.01, sys.maxint)`` are written to an sql track, and a
    `segToFrag` job is submitted on the concatenated file. In a second pass
    all `segToFrag` jobs are awaited and their output is parsed into a
    second sql track.

    :param ex: execution passed through to every helper and job submission.
    :param file_dict: dictionary read at ``file_dict['lib'][gid]`` and
        ``file_dict['4cseq']['density_files'][gid][rid]``.
    :param groups: dictionary ``{gid: group}``; ``group['runs']`` is iterated
        for its run ids.
    :param assembly: object providing `chrnames` and `chrmeta`.
    :param regToExclude: dictionary ``{gid: ...}`` forwarded to `segToFrag`.
    :param script_path: forwarded to `segToFrag`.
    :param via: job submission mode forwarded to the nonblocking calls.
    :returns: ``results[gid][rid]``, a list
        ``[countsPerFrag_allBed, countsPerFrag_selectSql, segToFrag_out, segToFrag_sql]``.
    '''
    # Tags which, when found in column 9 of a segToFrag line, discard that line.
    rejected_tags = ('_and_', 'BothRepeats', 'notValid')
    coord_pattern = re.compile(r'([^:]+):(\d+)-(\d+)')

    def _parse_select_frag(stream):
        '''Yield (chr, start, end, score) for the accepted lines of a segToFrag output.'''
        for s in stream:
            sr = s.strip().split('\t')
            if 'IsValid' not in sr[2]:
                continue
            if any(w in sr[8] for w in rejected_tags):
                continue
            patt = coord_pattern.search(sr[1])
            if patt:
                coord = patt.groups()
                # Start is shifted by -1 (presumably 1-based to 0-based
                # coordinates -- TODO confirm). Scores are not filtered here.
                yield (coord[0], int(coord[1]) - 1, int(coord[2]), float(sr[11]))

    futures = {}
    results = {}
    for gid, group in groups.iteritems():
        reffile = file_dict['lib'][gid]
        futures[gid] = {}
        results[gid] = {}
        for rid, _run in group['runs'].iteritems():
            density_file = file_dict['4cseq']['density_files'][gid][rid]
            # Submit one scoring job per chromosome; the chromosome name is
            # kept with its job so it never has to be recovered by position.
            gm_futures = []
            for ch in assembly.chrnames:
                # Use the per-chromosome library file when there is one,
                # otherwise fall back on the library path itself.
                chref = os.path.join(reffile, ch + ".bed.gz")
                if not os.path.exists(chref):
                    chref = reffile
                bedfile = unique_filename_in() + ".bed"
                job_args = json.dumps({
                    "trackScores": density_file,
                    "trackFeatures": chref,
                    "chromosome": ch
                })
                gfminer_job = {
                    "operation": "score_by_feature",
                    "output": bedfile,
                    "datatype": "qualitative",
                    "args": "'" + job_args + "'"
                }
                gm_futures.append(
                    (gfminer_run.nonblocking(ex, gfminer_job, via=via),
                     bedfile, ch))
            outsql = unique_filename_in() + ".sql"
            sqlouttr = track(outsql, chrmeta=assembly.chrmeta,
                             info={'datatype': 'quantitative'},
                             fields=['start', 'end', 'score'])
            outbed_all = []
            try:
                for job, fout, ch in gm_futures:
                    job.wait()
                    if not os.path.exists(fout):
                        # Output not visible yet: wait a minute, then touch it
                        # (presumably so the following steps find a file even
                        # if the job produced nothing -- TODO confirm).
                        time.sleep(60)
                        touch(ex, fout)
                    outbed_all.append(fout)
                    outbed = track(fout, chrmeta=assembly.chrmeta)
                    sqlouttr.write(
                        outbed.read(fields=['start', 'end', 'score'],
                                    selection={'score': (0.01, sys.maxint)}),
                        chrom=ch)
            finally:
                # Close the track even if a job or a write failed.
                sqlouttr.close()
            countsPerFragFile = unique_filename_in() + ".bed"
            countsPerFragFile = cat(outbed_all, out=countsPerFragFile)
            results[gid][rid] = [countsPerFragFile, outsql]
            FragFile = unique_filename_in()
            touch(ex, FragFile)
            futures[gid][rid] = (
                FragFile,
                segToFrag.nonblocking(ex, countsPerFragFile, regToExclude[gid],
                                      script_path, via=via, stdout=FragFile,
                                      memory=4))

    # Second pass: collect every segToFrag job and convert its output.
    for gid, dict_gid in futures.iteritems():
        for rid, (frag_file, frag_job) in dict_gid.iteritems():
            frag_job.wait()
            touch(ex, frag_file)
            with open(frag_file, "r") as segOut:
                resBedGraph = unique_filename_in() + ".sql"
                sqlTr = track(resBedGraph, fields=['start', 'end', 'score'],
                              info={'datatype': 'quantitative'},
                              chrmeta=assembly.chrmeta)
                try:
                    sqlTr.write(_parse_select_frag(segOut),
                                fields=['chr', 'start', 'end', 'score'])
                finally:
                    sqlTr.close()
            results[gid][rid].extend([frag_file, resBedGraph])
    return results