def createLibrary(ex, assembly_or_fasta, params, url=GlobalHtsUrl, via='local'):
    """
    Main call to create the library
    """
    if len(params['primary']) < 2:
        print('Some parameters are missing, cannot create the library')
        print('primary='+params['primary']+" ; "+'secondary='+params['secondary'])
        return [None, None, None, None]

    if not isinstance(assembly_or_fasta, genrep.Assembly):
        assembly_or_fasta = genrep.Assembly( ex=ex, fasta=assembly_or_fasta )
    chrnames = assembly_or_fasta.chrnames
    chrom_map = dict((v['ac'], k) for k, v in assembly_or_fasta.chrmeta.iteritems())
    allfiles = assembly_or_fasta.fasta_by_chrom  # assembly_or_fasta.untar_genome_fasta()

    # Find restriction enzyme occurrences per chromosome (one job per chromosome)
    libfiles = dict((c, getRestEnzymeOccAndSeq.nonblocking( ex, f,
                                                            params['primary'], params['secondary'],
                                                            params['length'], params['type'],
                                                            via=via ))
                    for c, f in allfiles.iteritems())
    resfile = unique_filename_in()
    os.mkdir(resfile)
    bedfiles = {}
    for chrom, future in libfiles.iteritems():
        libfiles[chrom] = future.wait()
        if not os.path.getsize(libfiles[chrom][1]) > 0:
            time.sleep(60)
            touch(ex, libfiles[chrom][1])
        bedfiles[chrom] = parse_fragFile(libfiles[chrom][1], chrom_map)
    rescov = coverageInRepeats(ex, bedfiles, params['species'], outdir=resfile, via=via)
    bedchrom = [os.path.join(resfile, chrom+".bed") for chrom in chrnames]
    cat(bedchrom, out=resfile+".bed")
    gzipfile(ex, [resfile+".bed"]+bedchrom)
#    resfile_sql = resfile+".sql"
#    track.convert((resfile,'bed'),(resfile_sql,'sql'),assembly=params['species'])
    enz_list = []
    infos_lib = { 'assembly_name':  params['species'],
                  'enzyme1_id':     getEnzymeSeqId(params['primary'], True, enz_list, url),
                  'enzyme2_id':     getEnzymeSeqId(params['secondary'], True, enz_list, url),
                  'segment_length': params['length'],
                  'type':           params['type'],
                  'filename':       resfile }
    return [ libfiles, bedfiles, resfile, infos_lib ]
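
# Hedged usage sketch (not part of the pipeline): one way createLibrary might be
# invoked from a bein execution. The MiniLIMS name, enzyme sites, segment length
# and library type below are illustrative assumptions, not values taken from this module.
def _example_createLibrary_call():
    from bein import execution, MiniLIMS
    M = MiniLIMS("c4seq_minilims")               # hypothetical MiniLIMS repository
    params = {'primary':   'AAGCTT',             # primary restriction site (e.g. HindIII)
              'secondary': 'GATC',               # secondary restriction site (e.g. DpnII)
              'length':    30,                   # segment length passed to the library scripts
              'type':      'typeI',              # library type string, forwarded as-is
              'species':   'mm9'}                # assembly name
    with execution(M) as ex:
        libfiles, bedfiles, resfile, infos_lib = createLibrary(ex, 'mm9', params, via='local')
    return infos_lib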
def count_reads(self, bamfiles, gtf):
    self.write_log("* Counting reads")

    # Count reads on genes and transcripts with "rnacounter"
    ncond = len(self.conditions)
    tablenames = [None]*ncond
    futures = [None]*ncond
    max_rlen = 0
    counter_options = ["--nh"]
    for bam in bamfiles:
        sam = pysam.Samfile(bam, 'rb')
        max_rlen = max(max_rlen, sam.next().rlen)
        sam.close()
    counter_options += ["--exon_cutoff", str(max_rlen)]
    bwt_args = self.job.options.get('map_args', {}).get('bwt_args', [])
    #if not "--local" in bwt_args:
    #    counter_options += ["--nh"]
    if hasattr(self.assembly, "fasta_origin") or self.assembly.intype == 2:
        counter_options += ["--type", "transcripts", "--method", "raw"]
    else:
        counter_options += ["--type", "genes,transcripts", "--method", "raw,nnls"]
    if self.stranded:
        counter_options += ["--stranded"]
    for i, c in enumerate(self.conditions):
        tablenames[i] = unique_filename_in()
        futures[i] = rnacounter.nonblocking(self.ex, bamfiles[i], gtf, stdout=tablenames[i],
                                            via=self.via, options=counter_options)

    # Put samples together
    for i, c in enumerate(self.conditions):
        try:
            futures[i].wait()
        except Exception as err:
            self.write_debug("Counting failed: %s." % str(err))
            raise err
        if futures[i] is None:
            self.write_debug("Counting failed.")
            raise ValueError("Counting failed.")
    if len(tablenames) > 1:
        joined = unique_filename_in()
        rnacounter_join.nonblocking(self.ex, tablenames, stdout=joined, via=self.via).wait()
    else:
        joined = tablenames[0]

    # Split genes and transcripts into separate files
    genes_filename = unique_filename_in()
    trans_filename = unique_filename_in()
    genes_file = open(genes_filename, "wb")
    trans_file = open(trans_filename, "wb")
    if self.stranded:
        genes_anti_filename = unique_filename_in()
        trans_anti_filename = unique_filename_in()
        genes_anti_file = open(genes_anti_filename, "wb")
        trans_anti_file = open(trans_anti_filename, "wb")
    with open(joined) as jfile:
        header = jfile.readline()
        hconds = ["counts."+c for c in self.conditions] + ["rpkm."+c for c in self.conditions]
        hinfo = header.strip().split('\t')[2*ncond+1:]
        header = '\t'.join(["ID"] + hconds + hinfo) + '\n'
        genes_file.write(header)
        trans_file.write(header)
        type_idx = header.split('\t').index("Type")
        if self.stranded:
            genes_anti_file.write(header)
            trans_anti_file.write(header)
            sense_idx = header.split('\t').index("Sense")
            for line in jfile:
                L = line.split('\t')
                ftype = L[type_idx].lower()
                sense = L[sense_idx].lower()
                if ftype == 'gene':
                    if sense == 'antisense':
                        genes_anti_file.write(line)
                    else:
                        genes_file.write(line)
                elif ftype == 'transcript':
                    if sense == 'antisense':
                        trans_anti_file.write(line)
                    else:
                        trans_file.write(line)
        else:
            for line in jfile:
                L = line.split('\t')
                ftype = L[type_idx].lower()
                if ftype == 'gene':
                    genes_file.write(line)
                elif ftype == 'transcript':
                    trans_file.write(line)
    genes_file.close()
    trans_file.close()
    if self.stranded:
        genes_anti_file.close()
        trans_anti_file.close()

    # Keep intermediate tables
    for i, c in enumerate(self.conditions):
        #shutil.copy(tablenames[i], "../counts%d.txt"%i)
        descr = set_file_descr(self.conditions[i]+'_'+tablenames[i]+'.gz',
                               type='txt', step='pileup', view='admin')
        gzipfile(self.ex, tablenames[i])
        self.ex.add(tablenames[i]+'.gz', description=descr)

    if self.stranded:
        count_files = {'genes': genes_filename, 'transcripts': trans_filename,
                       'genes_anti': genes_anti_filename, 'transcripts_anti': trans_anti_filename}
    else:
        count_files = {'genes': genes_filename, 'transcripts': trans_filename}
    return count_files
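
# Hedged helper sketch: reading back one of the tables written by count_reads().
# It assumes only what the code above writes: a tab-separated file whose header
# starts with "ID", then one "counts.<condition>" and one "rpkm.<condition>"
# column per condition, followed by annotation columns. Not part of the pipeline.
def _example_read_count_table(filename, conditions):
    counts = {}
    with open(filename) as fh:
        header = fh.readline().rstrip('\n').split('\t')
        cidx = [header.index("counts."+c) for c in conditions]
        for line in fh:
            fields = line.rstrip('\n').split('\t')
            counts[fields[0]] = [float(fields[i]) for i in cidx]
    return counts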
def c4seq_workflow( ex, job, primers_dict, assembly,
                    c4_url=None, script_path='', logfile=sys.stdout, via='lsf' ):
    '''
    Main:

    * open the 4C-seq minilims and create execution
    * 0. get/create the library
    * 1. if necessary, calculate the density file from the bam file (mapseq.parallel_density_sql)
    * 2. calculate the counts per fragment for each density file with gfminer's score_by_feature
    '''
    mapseq_files = job.files
    ### outputs
    processed = {'lib': {}, 'density': {}, '4cseq': {}}
    processed['4cseq'] = {'density_files': {},
                          'countsPerFrag': {},
                          'countsPerFrag_grp': {},
                          'norm': {},
                          'norm_grp': {},
                          'profileCorrection': {},
                          'profileCorrection_grp': {},
                          'smooth_grp': {},
                          'domainogram_grp': {},
                          'bricks2frags': {}}
    # was 'smoothFrag': {}, 'domainogram': {}
    regToExclude = {}
    new_libs = []
    ### options
    run_domainogram = {}
    before_profile_correction = {}
    if not job.options.get('viewpoints_chrs', False):
        out_chromosomes = ','.join([ch for ch in assembly.chrnames])
    else:
        out_chromosomes = ','.join([primers_dict.get(group['name'], {}).get('baitcoord').split(':')[0]
                                    for gid, group in job.groups.iteritems()])
    print "out_chromosomes=" + out_chromosomes + "\n"

    sizeExt = job.options.get('norm_reg', 1000000)
    print "region considered for normalisation: mid viewpoint +/-" + str(sizeExt) + "bps"

    ### do it
    for gid, group in job.groups.iteritems():
        run_domainogram[gid] = group.get('run_domainogram', False)
        if isinstance(run_domainogram[gid], basestring):
            run_domainogram[gid] = (run_domainogram[gid].lower() in ['1', 'true', 'on', 't'])
        before_profile_correction[gid] = group.get('before_profile_correction', False)
        if isinstance(before_profile_correction[gid], basestring):
            before_profile_correction[gid] = (before_profile_correction[gid].lower() in ['1', 'true', 'on', 't'])
        processed['lib'][gid] = get_libForGrp(ex, group, assembly, new_libs, gid, c4_url, via=via)
        #reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed'
        processed['4cseq']['density_files'][gid] = {}
        regToExclude[gid] = primers_dict.get(group['name'], {}).get('regToExclude', "").replace('\r', '')
        # if no regToExclude is defined, set it to the bait midpoint +/- 5kb
        if len(regToExclude[gid]) == 0:
            baitcoord_mid = int(0.5 * (int(primers_dict.get(group['name'], {}).get('baitcoord').split(':')[1].split('-')[0])
                                       + int(primers_dict.get(group['name'], {}).get('baitcoord').split(':')[1].split('-')[1])))
            regToExclude[gid] = primers_dict.get(group['name'], {}).get('baitcoord').split(':')[0] \
                                + ':' + str(baitcoord_mid-5000) + '-' + str(baitcoord_mid+5000)
        #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()]))
        print(primers_dict.get(group['name'], {}))
        print "regToExclude["+str(gid)+"]=" + regToExclude[gid]
        for rid, run in group['runs'].iteritems():
            libname = mapseq_files[gid][rid]['libname']
            if job.options.get('merge_strands') != 0 or not('wig' in mapseq_files[gid][rid]):
                density_file = parallel_density_sql( ex, mapseq_files[gid][rid]['bam'],
                                                     assembly.chrmeta,
                                                     nreads=mapseq_files[gid][rid]['stats']["total"],
                                                     merge=0,
                                                     read_extension=mapseq_files[gid][rid]['stats']['read_length'],
                                                     convert=False,
                                                     via=via )
                density_file += "merged.sql"
                ex.add( density_file,
                        description=set_file_descr("density_file_"+libname+".sql", groupId=gid,
                                                   step="density", type="sql", view='admin', gdv="1") )
            else:
                density_file = mapseq_files[gid][rid]['wig']['merged']
            #density_files.append(density_file)
            processed['4cseq']['density_files'][gid][rid] = density_file

    # back to grp level!
    # not anymore:
    # processed['density'][gid] = merge_sql(ex, density_files, via=via)
    processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag( ex, processed, job.groups, assembly,
                                                                    regToExclude, script_path, via )
    ## access per gid+rid

    futures_norm = {}
    countsPerFrags_bedGraph = {}
    futures_merged_raw = {}
    for gid, group in job.groups.iteritems():
        futures_norm[gid] = {}
        countsPerFrags_bedGraph[gid] = {}
        processed['4cseq']['norm'][gid] = {}
        for rid, run in group['runs'].iteritems():
            normfile = unique_filename_in()
            touch(ex, normfile)
            resfile = unique_filename_in()+".bedGraph"
            resfiles = processed['4cseq']['countsPerFrag'][gid][rid]  # _all.sql
            convert(resfiles[3], resfile)
            countsPerFrags_bedGraph[gid][rid] = resfile

            print "call normFrags: infiles="+resfile+", normfile="+normfile+", baitCoord="+primers_dict[group['name']]['baitcoord']+", sizeExt="+str(sizeExt)+", name="+group['name']+"rep_"+str(rid)+", regToExclude="+regToExclude[gid]+"\n"
            futures_norm[gid][rid] = normFrags.nonblocking( ex, resfile, normfile,
                                                            baitCoord=primers_dict[group['name']]['baitcoord'],
                                                            sizeExt=sizeExt,
                                                            name=group['name']+"rep_"+str(rid),
                                                            regToExclude=regToExclude[gid],
                                                            script_path=script_path, via=via )
            processed['4cseq']['norm'][gid][rid] = normfile

        if len(group) > 1:
            ## merge replicates before normalisation
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name']+"_raw_mergedRep"
            print "gid="+group['name']
            print "call mergeRep for replicates before normalisation: infiles="+",".join([res_rid for rid, res_rid in countsPerFrags_bedGraph[gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_merged_raw[gid] = mergeRep.nonblocking( ex,
                                                            ",".join([res_rid for rid, res_rid in countsPerFrags_bedGraph[gid].iteritems()]),
                                                            mergefile, regToExclude[gid], name=titleName,
                                                            script_path=script_path, via=via, memory=8 )
            processed['4cseq']['countsPerFrag_grp'][gid] = mergefile
        else:
            futures_merged_raw[gid] = None
            processed['4cseq']['countsPerFrag_grp'][gid] = countsPerFrags_bedGraph[gid][0]  # if no replicates, the file we want is the first one

    print "***** profile correction / sample + merge normalised data"
    futures_merged = {}   # per gid
    futures_profcor = {}  # per gid, per rid
    for gid, group in job.groups.iteritems():
        ## run profile correction per run, then merge the runs
        futures_profcor[gid] = {}
        processed['4cseq']['profileCorrection'][gid] = {}
        for rid, run in group['runs'].iteritems():
            # wait for normalisation of all replicates to be finished
            futures_norm[gid][rid].wait()  ## normalised files, per grp, per rep
            normfile = processed['4cseq']['norm'][gid][rid]
            file1 = unique_filename_in()   # track file
            touch(ex, file1)
            file2 = unique_filename_in()   # report file
            touch(ex, file2)
            file3 = unique_filename_in()   # table file
            touch(ex, file3)
            print "call profileCorrection: normfile="+normfile+", baitCoord="+primers_dict[group['name']]['baitcoord']+", name="+group['name']+", file1="+file1+", file2="+file2+", file3="+file3+"\n"
            futures_profcor[gid][rid] = profileCorrection.nonblocking( ex, normfile,
                                                                       primers_dict[group['name']]['baitcoord'],
                                                                       group['name'], file1, file2, file3,
                                                                       script_path, via=via )
            processed['4cseq']['profileCorrection'][gid][rid] = [file1, file2, file3]

        ## merge replicates before profile correction. This needs all normalisations of the given grp
        ## to be finished, which is why it comes after the rid loop.
        if len(group) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name']+"_norm_mergedRep"
            print "gid="+group['name']
            print "call mergeRep: infiles="+",".join([res_rid for rid, res_rid in processed['4cseq']['norm'][gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_merged[gid] = mergeRep.nonblocking( ex,
                                                        ",".join([res_rid for rid, res_rid in processed['4cseq']['norm'][gid].iteritems()]),
                                                        mergefile, regToExclude[gid], name=titleName,
                                                        script_path=script_path, via=via, memory=8 )
            processed['4cseq']['norm_grp'][gid] = mergefile
        else:
            futures_merged[gid] = None
            processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][gid][0]  ## if no replicates, the file we want is the first one

    print "***** merge profile corrected data"
    futures_profcor_merged = {}  # per gid
    for gid, group in job.groups.iteritems():
        processed['4cseq']['profileCorrection_grp'][gid] = {}
        for rid, run in group['runs'].iteritems():
            futures_profcor[gid][rid].wait()  ## wait for ProfileCorrection to be finished
        ## merge replicates after profile correction
        if len(group) > 1:
            mergefile = unique_filename_in()
            touch(ex, mergefile)
            titleName = group['name']+"_ProfCor_mergedRep"
            pcfiles = [ processed['4cseq']['profileCorrection'][gid][rid][0]
                        for rid, res_rid in processed['4cseq']['profileCorrection'][gid].iteritems() ]
            print "call mergeRep (for PC tables): infiles="+",".join(pcfiles)+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n"
            futures_profcor_merged[gid] = mergeRep.nonblocking( ex,
                                                                ",".join(pcfiles),
                                                                mergefile, regToExclude[gid], name=titleName,
                                                                script_path=script_path, via=via, memory=8 )
            processed['4cseq']['profileCorrection_grp'][gid] = mergefile
        else:
            futures_profcor_merged[gid] = None
            processed['4cseq']['profileCorrection_grp'][gid] = processed['4cseq']['profileCorrection'][gid][0]  ## if no replicates, the file we want is the first one

    print "***** smooth data"
    futures_smoothed = {}
    for gid, group in job.groups.iteritems():
        file1 = unique_filename_in()
        touch(ex, file1)
        file2 = unique_filename_in()
        touch(ex, file2)
        file3 = unique_filename_in()
        touch(ex, file3)
        nFragsPerWin = group['window_size']
        futures_merged_raw[gid].wait()  ## wait for merging of raw_grp to be completed
        futures_smoothed[gid] = ( smoothFragFile.nonblocking( ex, processed['4cseq']['countsPerFrag_grp'][gid],
                                                              nFragsPerWin, group['name'],
                                                              file1, regToExclude[gid],
                                                              script_path=script_path, via=via, memory=6 ), )
        futures_merged[gid].wait()  ## wait for merging of norm_grp to be completed
        futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['norm_grp'][gid],
                                                               nFragsPerWin, group['name']+"_norm",
                                                               file2, regToExclude[gid],
                                                               script_path=script_path, via=via, memory=6 ), )
        futures_profcor_merged[gid].wait()  # wait for the merging of profile corrected data to be done
        futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid],
                                                               nFragsPerWin, group['name']+"_fromProfileCorrected",
                                                               file3, regToExclude[gid],
                                                               script_path=script_path, via=via, memory=6 ), )
        processed['4cseq']['smooth_grp'][gid] = [file1, file2, file3]  # [smoothed before norm, smoothed before PC, smoothed after PC]

    print "***** Domainograms"
    futures_domainograms = {}
    for gid, group in job.groups.iteritems():
        grName = job.groups[gid]['name']
        if run_domainogram[gid]:
            regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord']
            if before_profile_correction[gid]:
                futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['norm_grp'][gid],
                                                                        grName, regCoord=regCoord, skip=1,
                                                                        script_path=script_path, via=via, memory=15 )
            else:
                futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid],
                                                                        grName, regCoord=regCoord.split(':')[0], skip=1,
                                                                        script_path=script_path, via=via, memory=15 )

    ## prepare tar files for domainogram results (if any)
    ## and create "BRICKS to frags" files
    print "***** BRICKS to Frags"
    futures_BRICKS2Frags = {}
    for gid, f in futures_domainograms.iteritems():
        if run_domainogram[gid]:  # if the domainogram has been run
            resFiles = []
            logFile = f.wait()
            start = False
            tarname = job.groups[gid]['name']+"_domainogram.tar.gz"
            res_tar = tarfile.open(tarname, "w:gz")
            futures_BRICKS2Frags[gid] = []
            processed['4cseq']['bricks2frags'][gid] = []
            if logFile is None: continue
            with open(logFile) as f:
                for s in f:
                    s = s.strip()
                    if '####resfiles####' in s:
                        start = True
                    elif start and "RData" not in s:
                        resFiles.append(s)
                        res_tar.add(s)
                    if start and "foundBRICKS" in s:
                        bricks2fragsfile = unique_filename_in()+".bedGraph"
                        touch(ex, bricks2fragsfile)
                        futures_BRICKS2Frags[gid] += [ BRICKSToFrag.nonblocking( ex, s,
                                                                                 processed['4cseq']['norm_grp'][gid],
                                                                                 bricks2fragsfile,
                                                                                 script_path=script_path, via=via, memory=4 ) ]
                        processed['4cseq']['bricks2frags'][gid] += [ bricks2fragsfile ]
            res_tar.close()
            processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname]

    ############### prepare tables for global results
    print "***** combine results into tables"
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        for rid, run in group['runs'].iteritems():
            allNames += [ group['name']+"_rep"+str(rid)+"_norm", group['name']+"_rep"+str(rid)+"_fit" ]
            allFiles += [ processed['4cseq']['profileCorrection'][gid][rid][2] ]
            allRegToExclude += [ regToExclude[gid] ]
    tablePC = unique_filename_in()+".txt"
    print("***will call makeTable with:")
    print(",".join(allFiles))
    print("resfile="+tablePC)
    print(",".join(allNames))
    touch(ex, tablePC)
    #regToExclude[gid]
    futures_tables = ( makeTable.nonblocking( ex, ",".join(allFiles), tablePC, ",".join(allNames),
                                              idCols="4,5", all_regToExclude=','.join(allRegToExclude),
                                              script_path=script_path, via=via, memory=8 ), )

    # wait for all smoothing to be done
    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()

    ## make Table raw/smoothed_raw
    print("** make Table raw/smoothed_raw")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        futures_merged_raw[gid].wait()
        allNames += [ group['name']+"_raw", group['name']+"_rawSmoothed" ]
        allFiles += [ processed['4cseq']['countsPerFrag_grp'][gid], processed['4cseq']['smooth_grp'][gid][0] ]
        allRegToExclude += [ 'NA', regToExclude[gid] ]
    tableSmoothedRaw_grp = unique_filename_in()+".txt"
    touch(ex, tableSmoothedRaw_grp)
    futures_tables += ( makeTable.nonblocking( ex, ",".join(allFiles), tableSmoothedRaw_grp, ",".join(allNames),
                                               idCols="4", out_chromosomes=out_chromosomes,
                                               all_regToExclude=','.join(allRegToExclude),
                                               script_path=script_path, via=via, memory=8 ), )

    ## make Table norm/smoothed_norm before PC
    print("** make Table norm/smoothed_norm before PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [ group['name']+"_norm", group['name']+"_smoothed" ]
        allFiles += [ processed['4cseq']['norm_grp'][gid], processed['4cseq']['smooth_grp'][gid][1] ]
        allRegToExclude += [ regToExclude[gid], regToExclude[gid] ]
    tableSmoothed_grp = unique_filename_in()+".txt"
    touch(ex, tableSmoothed_grp)
    futures_tables += ( makeTable.nonblocking( ex, ",".join(allFiles), tableSmoothed_grp, ",".join(allNames),
                                               idCols="4", out_chromosomes=out_chromosomes,
                                               all_regToExclude=','.join(allRegToExclude),
                                               script_path=script_path, via=via, memory=8 ), )

    ## make Table norm/smoothed_norm after PC
    print("** make Table norm/smoothed_norm after PC")
    allNames = []
    allFiles = []
    allRegToExclude = []
    for gid, group in job.groups.iteritems():
        allNames += [ group['name']+"_normPC", group['name']+"_smoothedPC" ]
        allFiles += [ processed['4cseq']['profileCorrection_grp'][gid], processed['4cseq']['smooth_grp'][gid][2] ]
        allRegToExclude += [ regToExclude[gid], regToExclude[gid] ]
    tableSmoothedPC_grp = unique_filename_in()+".txt"
    touch(ex, tableSmoothedPC_grp)
    futures_tables += ( makeTable.nonblocking( ex, ",".join(allFiles), tableSmoothedPC_grp, ",".join(allNames),
                                               idCols="4", out_chromosomes=out_chromosomes,
                                               all_regToExclude=','.join(allRegToExclude),
                                               script_path=script_path, via=via, memory=8 ), )

    ## combine BRICKS2Frags files
    allNames = []
    allFiles = []
    for gid, fg in futures_BRICKS2Frags.iteritems():
        for f in fg:
            f.wait()
        allNames += [ job.groups[gid]['name']+"_BRICKSpval" ]
        cat_bricks2frags = unique_filename_in()+".txt"
        print ','.join(processed['4cseq']['bricks2frags'][gid])
        cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid], out=cat_bricks2frags)
        allFiles += [ cat_bricks2frags ]
    for gid, fg in futures_smoothed.iteritems():
        for f in fg:
            f.wait()
    tableBRICKS2Frags = unique_filename_in()+".txt"
    touch(ex, tableBRICKS2Frags)
    futures_tables += ( makeTable.nonblocking( ex, ",".join(allFiles), tableBRICKS2Frags, ",".join(allNames),
                                               idCols="4", out_chromosomes=out_chromosomes, defVal="NA",
                                               script_path=script_path, via=via, memory=8 ), )

    for f in futures_tables:
        f.wait()

    ################ Add everything to minilims below!
step = "density" for gid in processed['4cseq']['density_files'].keys(): for rid, sql in processed['4cseq']['density_files'][gid].iteritems(): fname = "density_file_"+job.groups[gid]['name']+"_merged_rep"+str(rid) ex.add( sql, description=set_file_descr( fname+".sql", groupId=gid,step=step,type="sql",gdv="1" ) ) wig = unique_filename_in()+".bw" convert( sql, wig ) ex.add( wig, description=set_file_descr( fname+".bw", groupId=gid,step=step,type="bigWig",ucsc="1") ) step = "counts_per_frag" #was _norm_counts_per_frags # before normalisation process, per replicate for gid in processed['4cseq']['countsPerFrag'].keys(): for rid, resfiles in processed['4cseq']['countsPerFrag'][gid].iteritems(): fname = "meanScorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid) ex.add( resfiles[1], description=set_file_descr( fname+".sql", groupId=gid,step=step,type="sql",view="admin",gdv='1')) #gzipfile(ex,resfiles[0]) #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz", # groupId=gid,step=step,type="bed",view="admin" )) fname = "segToFrag_"+job.groups[gid]['name']+"_rep"+str(rid) ex.add( resfiles[3], description=set_file_descr( fname+"_all.sql", groupId=gid,step=step,type="sql", comment="all informative frags - null included" )) trsql = track(resfiles[3]) bwig = unique_filename_in()+".bw" trwig = track(bwig,chrmeta=trsql.chrmeta) trwig.write(trsql.read(fields=['chr','start','end','score'], selection={'score':(0.01,sys.maxint)})) trwig.close() ex.add( bwig, set_file_descr(fname+".bw",groupId=gid,step=step,type="bigWig",ucsc='1')) ## add segToFrags before normalisation futures_merged_raw[gid].wait() trbedgraph = track(removeNA(processed['4cseq']['countsPerFrag_grp'][gid]),format='bedgraph') bwig = unique_filename_in()+".bw" trwig = track(bwig,chrmeta=assembly.chrmeta) trwig.write(trbedgraph.read(fields=['chr','start','end','score'], selection={'score':(0.01,sys.maxint)})) trwig.close() fname = "segToFrag_"+job.groups[gid]['name'] ex.add( bwig, description=set_file_descr( fname+".bw", groupId=gid,step=step,type="bigWig", comment="segToFrag file before normalisation" )) step = "norm_counts_per_frags" # after new normalisation process, combined replicates for gid, resfile in processed['4cseq']['norm_grp'].iteritems(): fname = "normalised_scorePerFeature_"+job.groups[gid]['name'] gzipfile(ex,resfile) ex.add( resfile+".gz", description=set_file_descr( fname+".bedGraph.gz", groupId=gid,step=step, type="bedGraph",ucsc='1')) # norm files, per replicates (might be removed) for gid, dict_gid in processed['4cseq']['norm'].iteritems(): for rid, resfile in dict_gid.iteritems(): fname = "normalised_scorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid) gzipfile(ex,resfile) ex.add(resfile+".gz", description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) step = "profile_correction" # Profile corrected data, combined replicates for gid, profileCorrectedFile in processed['4cseq']['profileCorrection_grp'].iteritems(): fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected" gzipfile(ex,profileCorrectedFile) ex.add( profileCorrectedFile+".gz", description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) # Profile corrected, per replicate (might be removed) for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems(): for rid, resfiles in dict_gid.iteritems(): # profileCorrectedFile = resfiles[0] reportProfileCorrection = resfiles[1] fname = 
"segToFrag_"+job.groups[gid]['name']+"_profileCorrected_rep"+str(rid) # gzipfile(ex,profileCorrectedFile) # ex.add( profileCorrectedFile+".gz", # description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) ex.add( reportProfileCorrection, description=set_file_descr(fname+".pdf", groupId=gid,step=step,type="pdf")) step = "smoothing" for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems(): rawSmoothFile = resfiles[0] smoothFile = resfiles[1] afterProfileCorrection = resfiles[2] nFrags = str(job.groups[gid]['window_size']) ## smoothed file before normalisation fname = "segToFrag_"+job.groups[gid]['name']+"_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz" gzipfile(ex,rawSmoothFile) ex.add(rawSmoothFile+".gz", description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) ## smoothed file after normalisation, before Profile correction fname = "segToFrag_"+job.groups[gid]['name']+"_norm_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz" gzipfile(ex,smoothFile) ex.add(smoothFile+".gz", description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) ## smoothed file after normalisation, after Profile correction fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz" gzipfile(ex,afterProfileCorrection) ex.add(afterProfileCorrection+".gz", description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) step = "domainograms" for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems(): tarFile = resfiles.pop() fname = job.groups[gid]['name']+"_domainogram.tar.gz" ex.add(tarFile, description=set_file_descr(fname, groupId=gid,step=step,type="tgz")) for s in resfiles: if s[-8:] == "bedGraph": gzipfile(ex,s) s += ".gz" ex.add( s, description=set_file_descr( s, groupId=gid,step=step,type="bedGraph",ucsc="1",gdv="1")) step = "combined_results" gzipfile(ex,tableSmoothedRaw_grp) ex.add(tableSmoothedRaw_grp+".gz", description=set_file_descr("table_segToFrags_smoothed_combined_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tableSmoothed_grp) ex.add(tableSmoothed_grp+".gz", description=set_file_descr("table_normalised_smoothed_combined_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tableSmoothedPC_grp) ex.add(tableSmoothedPC_grp+".gz", description=set_file_descr("table_profileCorrected_smoothed_combined_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tablePC) ex.add(tablePC+".gz", description=set_file_descr("table_normalised_fit_per_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tableBRICKS2Frags) ex.add(tableBRICKS2Frags+".gz", description=set_file_descr("table_frags_in_BRICKS_combined_replicates.txt.gz",step=step,type="txt")) return processed
def chipseq_workflow( ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf' ):
    """Runs a chipseq workflow over bam files obtained by mapseq.
    Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,

    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with keys 'groups',
        'files' and 'options' if applicable,

    :param assembly: a genrep.Assembly object,

    :param script_path: only needed if 'run_deconv' is in the job options; must point to
        the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwidth')

    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available,
    as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of a dictionary with keys *group_id* from the job groups, *macs* and
    *deconv* if applicable, with file description dictionaries as values, and a dictionary
    of *group_ids* to *names* used in file descriptions.
    """
    options = {}
    if logfile is None:
        logfile = sys.stdout
    if isinstance(job_or_dict, frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict, dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files', {})
    else:
        raise TypeError("job_or_dict must be a frontend.Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands', -1))
    suffixes = ["fwd", "rev"]
    peak_deconvolution = options.get('peak_deconvolution', False)
    if isinstance(peak_deconvolution, basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1', 'true', 't']
    run_meme = options.get('run_meme', False)
    if isinstance(run_meme, basestring):
        run_meme = run_meme.lower() in ['1', 'true', 't']
    macs_args = options.get('macs_args', ["--bw", "200"])
    b2w_args = options.get('b2w_args', [])
    if not(isinstance(mapseq_files, dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid, mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not(isinstance(mapped, dict)):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name+"_"+str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking( ex, mapped[k]["bam"], via=via )
            if mapped[k].get('poisson_threshold', -1) > 0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns) > 0:
            p_thresh[group_name] = sum(ptruns)/len(ptruns)
        for k in futures.keys():
            mapped[k]['stats'] = futures[k].wait()
        if len(mapped) > 1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid, group_name))
        read_length.append(mapped.values()[0]['stats']['read_length'])
        genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    logfile.write("Starting MACS.\n");logfile.flush()
    processed = {'macs': add_macs_results( ex, read_length, genome_size,
                                           tests, ctrlbam=controls, name=names,
                                           poisson_threshold=p_thresh,
                                           macs_args=macs_args, via=via ) }
    logfile.write("Done MACS.\n");logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6
    _select = {'score': (6, sys.maxint)}
    _fields = ['chr', 'start', 'end', 'name', 'score']
    for i, name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name, names['controls'][0])
            macsbed = track(processed['macs'][ctrl]+"_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([apply(track(processed['macs'][(name,x)]+"_summits.bed",
                                               chrmeta=chrlist, fields=_fields).read(selection=_select),
                                         'name', lambda __n, _n=xn: "%s:%i" % (__n,_n))
                                   for xn, x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood( macsbed, before_start=150, after_end=150 )
        peak_list[name] = unique_filename_in()+".sql"
        macs_final = track( peak_list[name], chrmeta=chrlist,
                            info={'datatype': 'qualitative'},
                            fields=['start', 'end', 'name', 'score'] )
        macs_final.write(fusion(macs_neighb), clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1:
        options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension'] > 100
    if options['read_extension'] > 100:
        options['read_extension'] = 50
    for gid, mapped in mapseq_files.iteritems():
        #if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not('wig' in m) or len(m['wig']) < 2:
                output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                      nreads=m["stats"]["total"],
                                                      merge=-1,
                                                      read_extension=options['read_extension'],
                                                      convert=False,
                                                      b2w_args=b2w_args, via=via )
                wig.append(dict((s, output+s+'.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv( stream, pval ):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream( ((x[0],)+((x[2]+x[1])/2-150, (x[2]+x[1])/2+150)+x[3:]
                                   for x in stream
                                   if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval),
                                  fields=stream.fields )
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1]+" deconvolution.\n");logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name, names['controls'][0])
                macsbed = processed['macs'][ctrl]+"_peaks.bed"
            else:
                macsbed = intersect_many_bed( ex, [processed['macs'][(name,x)]+"_peaks.bed"
                                                   for x in names['controls']], via=via )
            deconv = run_deconv( ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                 options['read_extension'], script_path, via=via )
            peak_list[name] = unique_filename_in()+".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed, 0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1]+'_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1]+'_deconv.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'], (bigwig, "bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1]+'_deconv.bw', type='bigWig',
                                                  ucsc='1', step='deconvolution', groupId=name[0]))
            except OSError as e:
                logfile.write(str(e));logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1]+'_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs( stream, xlsl, _f ):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p+xlsl[0][nb-1][1:]
                    else:
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p+xlsl[int(nb[1])][int(nb[0])-1][1:]
        return FeatureStream( _macs_row(stream), fields=_f )
    ##############################

    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist, chrmeta=chrlist, fields=["chr", "start", "end", "name", "score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name,_c)]+"_peaks.xls"
                                     for _c in names['controls']])
        try:
            ###### if assembly doesn't have annotations, we skip the "getNearestFeature"
            ###### but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                      +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height',
                                               'gene(s)','location_type','distance']+_fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom), _feat),
                                         xlsl, _fields), mode='append')
        except ValueError:
            _fields = ['chr','start','end','name','score']+["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height']+_fields[8:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields), mode='append')
        peakout.close()
        gzipfile(ex, peakfile)
        peakfile_list.append(track(peakfile+".gz", format='txt', fields=_fields))
        ex.add(peakfile+".gz",
               description=set_file_descr(name[1]+'_annotated_peaks.txt.gz', type='text',
                                          step='annotation', groupId=name[0]))
    stracks = [track(wig, info={'name': name+"_"+st})
               for name, wigdict in merged_wig.iteritems()
               for st, wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile, "w") as _tf:
        _pnames = ["MACS_%s_vs_%s" % (_s[1],_c[1]) if _c[1] else "MACS_%s" % _s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome','start','end',]+_pnames+[s.name for s in stracks])+"\n")
    #### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [apply(pt.read(chrom, fields=['chr','start','end','name']),
                        'name', lambda __n, _n=npt: "%s:%i" % (__n,_n))
                  for npt, pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr','start','end','name'],
                                      remove_duplicates=True, group_by=['chr','start','end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile, "a") as _tf:
            for row in quantifs:
                pcols = ['']*_ns*_nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while ( _k < len(_rnsplit)-1-int(_nc>1) ):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k-1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])*_nc+int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in row[:nidx]+tuple(pcols)+row[nidx+1:])+"\n")
    gzipfile(ex, tablefile)
    ex.add(tablefile+".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz',
                                      type='text', step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n");logfile.flush()
        processed['meme'] = parallel_meme( ex, assembly, peak_list.values(), name=peak_list.keys(),
                                           chip=True, meme_args=['-meme-nmotifs','4','-meme-mod','zoops'],
                                           via=via )
    return processed
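
# Hedged usage sketch for chipseq_workflow with the dictionary form of job_or_dict
# described in its docstring. Group ids, bam file names and option values are
# illustrative assumptions; when a run has no 'stats' entry, the workflow computes
# them itself with mapseq.bamstats, as shown above.
def _example_chipseq_call(ex, assembly):
    job_dict = {
        'groups': {1: {'name': 'IP',    'control': False},
                   2: {'name': 'Input', 'control': True}},
        'files':  {1: {1: {'bam': 'ip.bam'}},        # {group_id: {run_id: {'bam': ...}}}
                   2: {1: {'bam': 'input.bam'}}},
        'options': {'peak_deconvolution': False, 'macs_args': ["--bw", "200"]},
    }
    return chipseq_workflow(ex, job_dict, assembly, via='local')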
def parallel_meme( ex, assembly, regions, name=None, chip=False, meme_args=None, via='lsf' ):
    """Fetches sequences, then calls ``meme`` on them and finally saves the results in the repository.
    """
    if meme_args is None:
        meme_args = []
    if not(isinstance(regions, list)):
        regions = [regions]
    if not(isinstance(name, list)):
        name = [name or '_']
    futures = {}
    fasta_files = {}
    background = assembly.statistics(unique_filename_in(), frequency=True)
#    genomeRef = assembly.untar_genome_fasta()
    for i, n in enumerate(name):
        (fasta, size) = assembly.fasta_from_regions( regions[i], ex=ex )
        tmpfile = unique_filename_in()
        outdir = unique_filename_in()
        if chip:
            futures[n] = (outdir, memechip.nonblocking( ex, fasta, outdir, background,
                                                        args=meme_args, via=via,
                                                        stderr=tmpfile, memory=6 ))
        else:
            futures[n] = (outdir, meme.nonblocking( ex, fasta, outdir, background,
                                                    maxsize=(size*3)/2, args=meme_args,
                                                    via=via, stderr=tmpfile, memory=6 ))
        fasta_files[n] = fasta
    all_res = {}
    for n, f in futures.iteritems():
        f[1].wait()
        meme_out = f[0]
        archive = unique_filename_in()
        tgz = tarfile.open(archive, "w:gz")
        tgz.add( meme_out, arcname=n[1]+"_meme",
                 exclude=lambda x: os.path.basename(x) in [fasta_files[n], background] )
        tgz.close()
        ex.add( archive, description=set_file_descr(n[1]+"_meme.tgz", step='meme',
                                                    type='tar', groupId=n[0]) )
        gzipfile(ex, fasta_files[n], args=["-f"])
        ex.add( fasta_files[n]+".gz", description=set_file_descr(n[1]+"_sites.fa.gz", step='meme',
                                                                 type='fasta', groupId=n[0]) )
        if not(chip) and os.path.exists(os.path.join(meme_out, "meme.xml")):
            meme_res = parse_meme_xml( ex, os.path.join(meme_out, "meme.xml"), assembly.chrmeta )
            if os.path.exists(os.path.join(meme_out, "meme.html")):
                ex.add( os.path.join(meme_out, "meme.html"),
                        description=set_file_descr(n[1]+"_meme.html", step='meme',
                                                   type='html', groupId=n[0]) )
            ex.add( meme_res['sql'], description=set_file_descr(n[1]+"_meme_sites.sql", step='meme',
                                                                type='sql', groupId=n[0]) )
            for i, motif in enumerate(meme_res['matrices'].keys()):
                ex.add( meme_res['matrices'][motif],
                        description=set_file_descr(n[1]+"_meme_"+motif+".txt", step='meme',
                                                   type='txt', groupId=n[0]) )
                ex.add( os.path.join(meme_out, "logo"+str(i+1)+".png"),
                        description=set_file_descr(n[1]+"_meme_"+motif+".png", step='meme',
                                                   type='png', groupId=n[0]) )
            all_res[n] = meme_res
    return all_res
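
# Hedged usage sketch for parallel_meme. It mirrors how chipseq_workflow calls it
# above: 'regions' holds one peak track per entry of 'name', and each name entry is
# a (groupId, label) tuple, since the code uses n[0] as groupId and n[1] in file
# names. The paths and labels below are illustrative assumptions.
def _example_parallel_meme_call(ex, assembly):
    regions = ['peaks_groupA.sql']               # one region track per name entry
    names = [(1, 'groupA')]                      # (groupId, label) tuples
    return parallel_meme(ex, assembly, regions, name=names, chip=True,
                         meme_args=['-meme-nmotifs', '4'], via='local')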