def get_libForGrp(ex, group, fasta_or_assembly, new_libraries, grpId,
                  url=None, lib_dir=None, via='lsf'):
    #wd_archive="/archive/epfl/bbcf/mleleu/pipeline_vMarion/pipeline_3Cseq/vWebServer_Bein/"
    def _libfile(id_lib):
        libs_list = json.load(urllib2.urlopen(url + "/libraries.json"))
        for lib in libs_list:
            if lib['library']['id'] == int(id_lib):
                return lib['library']['filename']
        return None

    def _paramsFile(paramsfile):
        """Returns a dictionary with the parameters required for the creation of a new library"""
        paramslib = {'name': 'myLibrary', 'length': '30', 'type': 'typeI'}
        with open(paramsfile) as f:
            for s in f:
                s = s.strip().split('=')
                key = None
                if re.search('Library name', s[0], re.I) and len(s[1]) > 1:
                    key = 'name'
                elif re.search('Genome name', s[0], re.I):
                    key = 'species'
                elif re.search('Primary', s[0], re.I):
                    key = 'primary'
                elif re.search('Secondary', s[0], re.I):
                    key = 'secondary'
                elif re.search('Segment length', s[0], re.I) and len(s[1]) > 0:
                    key = 'length'
                elif re.search('Type', s[0], re.I) and len(s[1]) > 1:
                    key = 'type'
                if key:
                    paramslib[key] = s[1].strip()
        return paramslib

    if url is None:
        url = GlobalHtsUrl
    if lib_dir is None:
        lib_dir = os.path.split(ex.remote_working_directory)[0]
    if not (group.get('library_param_file', 'null') in ["null", '', None]):
        library_filename = os.path.join(lib_dir, 'group_' + group['name'] + "_paramsFileLibrary.txt")
        paramslib = _paramsFile(library_filename)
        lib_id, ex_libfile = lib_exists(paramslib, new_libraries, url)
        if lib_id == 0 and ex_libfile == None:
            libfiles = createLibrary(ex, fasta_or_assembly, paramslib, url, via=via)
            reffile = libfiles[2]
            ex.add(libfiles[2] + ".bed.gz",
                   description=set_file_descr(group['name'] + "_new_library.bed.gz",
                                              groupId=grpId, step="library", type="bed"))
            # ex.add(reffile,description=set_file_descr("new_library.sql",groupId=grpId,step="library",type="sql",view='admin'))
            new_libraries.append({'library': libfiles[3]})
        elif lib_id > 0:
            reffile = _libfile(lib_id)
        else:
            reffile = ex_libfile
    elif 'library_id' in group and group['library_id'] > 0 and not str(group['library_id']) == "":
        reffile = _libfile(group['library_id'])
        if reffile is None:
            raise TypeError("No valid parameter passed for the library.")
        if not (os.path.exists(reffile) or os.path.exists(reffile + '.bed.gz')):
            raise TypeError("library file (" + reffile + ") is not valid")
        if not os.path.exists(reffile):
            reffile += '.bed.gz'
    elif 'library_file_url' in group and group['library_file_url'] != "":
        reffile = group['library_file_url']
    else:
        reffile = None
    return reffile
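# --- Illustrative sketch (not part of the pipeline) ---------------------------------
# _paramsFile() above reads one "Key=Value" pair per line and only keeps the keys matched
# by its regular expressions. With a made-up parameter file such as
#
#     Library name=myNlaLib
#     Genome name=mm9
#     Primary=NlaIII
#     Secondary=DpnII
#     Segment length=30
#     Type=typeI
#
# it would return {'name': 'myNlaLib', 'species': 'mm9', 'primary': 'NlaIII',
#                  'secondary': 'DpnII', 'length': '30', 'type': 'typeI'};
# missing or empty values keep the defaults declared in `paramslib`.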
def gdv_create(self, ex):
    from bbcflib import gdv
    project = gdv.get_project(mail=self.globals['gdv']['email'],
                              key=self.globals['gdv']['key'],
                              project_key=self.job.options['gdv_key'])
    if 'error' in project:
        self.log_write("Creating GDV project.")
        project = gdv.new_project(self.globals['gdv']['email'],
                                  self.globals['gdv']['key'],
                                  self.job.description,
                                  self.job.assembly.id,
                                  self.globals['gdv']['url'])
        self.debug_write("\nGDV project: " + json.dumps(project))
        add_pickle(ex, project,
                   description=set_file_descr("gdv_json", step='gdv', type='py', view='admin'))
    self.job.options['gdv_project'] = project
    return True
def pca_rnaseq(self, counts_table_file):
    @program
    def pca(counts_table_file):
        outprefix = unique_filename_in()
        args = ['pca.R', counts_table_file, outprefix, "rpkm"]
        return {"arguments": args, "return_value": outprefix}

    if not program_exists('pca.R'):
        self.write_debug("Skipped PCA: pca.R not found.")
        return
    try:
        self.write_log("* PCA")
        outprefix = pca.nonblocking(self.ex, counts_table_file, via=self.via).wait()
    except Exception as err:
        self.write_debug("PCA failed: %s." % str(err))
        return
    if outprefix is None:
        self.write_debug("PCA failed.")
        return
    pca_descr_pdf = set_file_descr('pca.pdf', type='pdf', step='pca', ucsc=0)
    self.ex.add(outprefix + '.pdf', description=pca_descr_pdf)
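# --- Illustrative sketch (not part of the pipeline) ---------------------------------
# The inner `pca` function follows bein's @program pattern: the decorated function does
# not run anything itself, it only returns the external command's argument list and the
# value to hand back once the job finishes; pca.nonblocking(ex, ..., via=...) then submits
# it locally or to LSF. The same pattern for a hypothetical 'my_script.R' (the script
# name and its arguments are assumptions for illustration only) would read:
#
#     @program
#     def my_script(counts_table_file):
#         outprefix = unique_filename_in()
#         return {"arguments": ['my_script.R', counts_table_file, outprefix],
#                 "return_value": outprefix}
#
#     outprefix = my_script.nonblocking(self.ex, counts_table_file, via=self.via).wait()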
def motif_scan(ex, bedlist, assembly, groups, via, logfile):
    logfile.write("Scanning motifs\n")
    logfile.flush()
    motifbeds = {}
    supdir = os.path.split(ex.remote_working_directory)[0]
    for gid, bedfile in bedlist.iteritems():
        logfile.write("\n%i: " % gid)
        logfile.flush()
        group = groups[gid]
        motifs = {}
        for mot in group.get('motif', []):
            if os.path.exists(mot):
                mname = os.path.basename(os.path.splitext(mot)[0])
                motifs[mname] = mot
            elif os.path.exists(os.path.join(supdir, mot)):
                mname = os.path.basename(os.path.splitext(mot)[0])
                motifs[mname] = os.path.join(supdir, mot)
            else:
                _gnid, mname = mot.split(' ')
                motifs[mname] = _gnrp.get_motif_PWM(int(_gnid), mname, output=unique_filename_in())
            logfile.write(mname + ", ")
            logfile.flush()
        _descr = set_file_descr(group['name'] + '_motifs.bed',
                                type='bed', ucsc='1', step='motifs', groupId=gid)
        _out = unique_filename_in()
        _hd = "track name='%s_motifs'" % group['name']
        motifbeds[gid] = save_motif_profile(ex, motifs, assembly, bedfile,
                                            keep_max_only=True, output=_out,
                                            header=_hd, description=_descr, via=via)
    return motifbeds
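# --- Illustrative sketch (not part of the pipeline) ---------------------------------
# Each entry of group['motif'] is either a path to an existing PWM file (absolute, or
# relative to the directory above ex.remote_working_directory) or a "<genrep_id> <name>"
# pair resolved through _gnrp.get_motif_PWM(). The values below are placeholders:
#
#     group['motif'] = ['/path/to/CTCF_motif.mat',   # existing PWM file
#                       '7 GATA1']                   # GenRep id + motif name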
def create_tracks(ex, outall, sample_names, assembly):
    """Write BED tracks showing SNPs found in each sample."""
    infields = ['chromosome', 'position', 'reference'] + sample_names + ['gene', 'location_type', 'distance']
    intrack = track(outall, format='text', fields=infields,
                    chrmeta=assembly.chrmeta, intypes={'position': int})
    instream = intrack.read(fields=infields[:-3])
    outtracks = {}
    for sample_name in sample_names:
        out = unique_filename_in() + '.bed.gz'
        t = track(out, fields=['name'])
        t.make_header(name=sample_name + "_SNPs")
        outtracks[sample_name] = (t, out)

    def _row_to_annot(x, ref, n):
        if x[3 + n][0] == ref:
            return None
        else:
            return "%s>%s" % (ref, x[3 + n][0])

    for x in instream:
        coord = (x[0], x[1] - 1, x[1])
        ref = x[2]
        snp = dict((name, _row_to_annot(x, ref, n)) for n, name in enumerate(sample_names))
        for name, tr in outtracks.iteritems():
            if snp[name]:
                tr[0].write([coord + (snp[name],)], mode='append')
    for name, tr in outtracks.iteritems():
        tr[0].close()
        description = set_file_descr(name + "_SNPs.bed.gz", type='bed',
                                     step='tracks', gdv='1', ucsc='1')
        ex.add(tr[1], description=description)
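# --- Illustrative sketch (not part of the pipeline) ---------------------------------
# create_tracks() turns each 1-based SNP position into a 0-based, half-open BED interval
# and labels it "<ref>><alt>", using only the first character of the per-sample column.
# Assuming a made-up input row laid out as in `infields` above:
#
#     row = ('chrI', 1203, 'A', 'G')           # chromosome, position, reference, sample_1
#     coord = (row[0], row[1] - 1, row[1])     # -> ('chrI', 1202, 1203)
#     annot = "%s>%s" % (row[2], row[3][0])    # -> 'A>G'; rows matching the reference are skipped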
def c4seq_workflow(ex, job, primers_dict, assembly, c4_url=None, script_path='', logfile=sys.stdout, via='lsf'): ''' Main * open the 4C-seq minilims and create execution * 0. get/create the library * 1. if necessary, calculate the density file from the bam file (mapseq.parallel_density_sql) * 2. calculate the count per fragment for each denstiy file with gfminer:score_by_feature to calculate) ''' mapseq_files = job.files ### outputs processed = {'lib': {}, 'density': {}, '4cseq': {}} processed['4cseq'] = { 'density_files': {}, 'countsPerFrag': {}, 'countsPerFrag_grp': {}, 'norm': {}, 'norm_grp': {}, 'profileCorrection': {}, 'profileCorrection_grp': {}, 'smooth_grp': {}, 'domainogram_grp': {}, 'bricks2frags': {} } # was 'smoothFrag': {}, 'domainogram': {}} regToExclude = {} new_libs = [] ### options run_domainogram = {} before_profile_correction = {} if not job.options.get('viewpoints_chrs', False): out_chromosomes = ','.join([ch for ch in assembly.chrnames]) else: out_chromosomes = ','.join([ primers_dict.get(group['name'], {}).get('baitcoord').split(':')[0] for gid, group in job.groups.iteritems() ]) print "out_chromosomes=" + out_chromosomes + "\n" sizeExt = job.options.get('norm_reg', 1000000) print "region considered for normalisation: mid viewpoint +/-" + str( sizeExt) + 'bps' ### do it for gid, group in job.groups.iteritems(): run_domainogram[gid] = group.get('run_domainogram', False) if isinstance(run_domainogram[gid], basestring): run_domainogram[gid] = (run_domainogram[gid].lower() in ['1', 'true', 'on', 't']) before_profile_correction[gid] = group.get('before_profile_correction', False) if isinstance(before_profile_correction[gid], basestring): before_profile_correction[gid] = ( before_profile_correction[gid].lower() in ['1', 'true', 'on', 't']) processed['lib'][gid] = get_libForGrp(ex, group, assembly, new_libs, gid, c4_url, via=via) #reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed' processed['4cseq']['density_files'][gid] = {} regToExclude[gid] = primers_dict.get(group['name'], {}).get('regToExclude', "").replace('\r', '') # if no regToExclude defined, set it as mid_baitCoord +/-5kb if len(regToExclude[gid]) == 0: baitcoord_mid = int(0.5 * (int( primers_dict.get(group['name'], {}).get('baitcoord').split(':') [1].split('-')[0]) + int( primers_dict.get(group['name'], {}).get('baitcoord').split( ':')[1].split('-')[1]))) regToExclude[gid] = primers_dict.get( group['name'], {}).get('baitcoord').split(':')[0] + ':' + str( baitcoord_mid - 5000) + '-' + str(baitcoord_mid + 5000) #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()])) print(primers_dict.get(group['name'], {})) print "regToExclude[" + str(gid) + "]=" + regToExclude[gid] for rid, run in group['runs'].iteritems(): libname = mapseq_files[gid][rid]['libname'] if job.options.get('merge_strands') != 0 or not ( 'wig' in mapseq_files[gid][rid]): density_file = parallel_density_sql( ex, mapseq_files[gid][rid]['bam'], assembly.chrmeta, nreads=mapseq_files[gid][rid]['stats']["total"], merge=0, read_extension=mapseq_files[gid][rid]['stats'] ['read_length'], convert=False, via=via) density_file += "merged.sql" ex.add(density_file, description=set_file_descr("density_file_" + libname + ".sql", groupId=gid, step="density", type="sql", view='admin', gdv="1")) else: density_file = mapseq_files[gid][rid]['wig']['merged'] #density_files.append(density_file) processed['4cseq']['density_files'][gid][rid] = density_file # back to grp level! 
# not anymore: # processed['density'][gid] = merge_sql(ex, density_files, via=via) processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag( ex, processed, job.groups, assembly, regToExclude, script_path, via) ## access per gid+rid futures_norm = {} countsPerFrags_bedGraph = {} futures_merged_raw = {} for gid, group in job.groups.iteritems(): futures_norm[gid] = {} countsPerFrags_bedGraph[gid] = {} processed['4cseq']['norm'][gid] = {} for rid, run in group['runs'].iteritems(): normfile = unique_filename_in() touch(ex, normfile) resfile = unique_filename_in() + ".bedGraph" resfiles = processed['4cseq']['countsPerFrag'][gid][ rid] # _all.sql convert(resfiles[3], resfile) countsPerFrags_bedGraph[gid][rid] = resfile print "call normFrags: infiles=" + resfile + ", normfile=" + normfile + "baitCoord=" + primers_dict[ group['name']][ 'baitcoord'] + ", sizeExt=sizeExt, name=" + group[ 'name'] + "rep_" + str( rid) + "regToExclude=" + regToExclude[gid] + "\n" futures_norm[gid][rid] = normFrags.nonblocking( ex, resfile, normfile, baitCoord=primers_dict[group['name']]['baitcoord'], sizeExt=sizeExt, name=group['name'] + "rep_" + str(rid), regToExclude=regToExclude[gid], script_path=script_path, via=via) processed['4cseq']['norm'][gid][rid] = normfile if len(group) > 1: ## merge replicates before normalisation. mergefile = unique_filename_in() touch(ex, mergefile) titleName = group['name'] + "_raw_mergedRep" print "gid=" + group['name'] print "call mergeRep for replicates before normalisation: infiles=" + ",".join( [ res_rid for rid, res_rid in countsPerFrags_bedGraph[gid].iteritems() ] ) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[ gid] + "\n" futures_merged_raw[gid] = mergeRep.nonblocking( ex, ",".join([ res_rid for rid, res_rid in countsPerFrags_bedGraph[gid].iteritems() ]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via, memory=8) processed['4cseq']['countsPerFrag_grp'][gid] = mergefile else: futures_merged_raw[gid] = None processed['4cseq']['countsPerFrag_grp'][ gid] = countsPerFrags_bedGraph[gid][ 0] #if no replicates, then the file we want is the 1st one print "***** profile correction / sample + merge normalised data" futures_merged = {} # per gid futures_profcor = {} # per gid, per rid for gid, group in job.groups.iteritems(): ## run profile correction per run then merge them futures_profcor[gid] = {} processed['4cseq']['profileCorrection'][gid] = {} for rid, run in group['runs'].iteritems(): # wait for normalisation of all replicates to be finished futures_norm[gid][rid].wait( ) ## normalised files, per grp, per rep normfile = processed['4cseq']['norm'][gid][rid] file1 = unique_filename_in() #track file touch(ex, file1) file2 = unique_filename_in() #report file touch(ex, file2) file3 = unique_filename_in() #table file touch(ex, file3) print "call profileCorrection: normfile=" + normfile + ", baitCoord=" + primers_dict[ group['name']]['baitcoord'] + ", name=" + group[ 'name'] + ", file1=" + file1 + ", file2=" + file2 + ", file3= " + file3 + "\n" futures_profcor[gid][rid] = profileCorrection.nonblocking( ex, normfile, primers_dict[group['name']]['baitcoord'], group['name'], file1, file2, file3, script_path, via=via) processed['4cseq']['profileCorrection'][gid][rid] = [ file1, file2, file3 ] ## merge replicates before profile correction. Needs all normalisation for the given grp to be finished, this is why it comes after the rid loop. 
if len(group) > 1: mergefile = unique_filename_in() touch(ex, mergefile) titleName = group['name'] + "_norm_mergedRep" print "gid=" + group['name'] print "call mergeRep: infiles=" + ",".join([ res_rid for rid, res_rid in processed['4cseq']['norm'] [gid].iteritems() ]) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[ gid] + "\n" futures_merged[gid] = mergeRep.nonblocking( ex, ",".join([ res_rid for rid, res_rid in processed['4cseq']['norm'] [gid].iteritems() ]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via, memory=8) processed['4cseq']['norm_grp'][gid] = mergefile else: futures_merged[gid] = None processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][ gid][ 0] ##if no replicates, then the file we want is the 1st one print "***** merge profile corrected data" futures_profcor_merged = {} # per gid for gid, group in job.groups.iteritems(): processed['4cseq']['profileCorrection_grp'][gid] = {} for rid, run in group['runs'].iteritems(): futures_profcor[gid][rid].wait( ) ## wait for ProfileCorrection to be finished ## merge replicates after profile correction if len(group) > 1: mergefile = unique_filename_in() touch(ex, mergefile) titleName = group['name'] + "_ProfCor_mergedRep" pcfiles = [ processed['4cseq']['profileCorrection'][gid][rid][0] for rid, res_rid in processed['4cseq']['profileCorrection'] [gid].iteritems() ] print "call mergeRep (for PC tables): infiles=" + ",".join( pcfiles ) + ", mergedfile=" + mergefile + ", regToExclude=" + regToExclude[ gid] + "\n" futures_profcor_merged[gid] = mergeRep.nonblocking( ex, ",".join(pcfiles), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via, memory=8) processed['4cseq']['profileCorrection_grp'][gid] = mergefile else: futures_profcor_merged[gid] = None processed['4cseq']['profileCorrection_grp'][gid] = processed[ '4cseq']['profileCorrection'][gid][ 0] ##if no replicates, then the file we want is the 1st one print "***** smooth data" futures_smoothed = {} for gid, group in job.groups.iteritems(): file1 = unique_filename_in() touch(ex, file1) file2 = unique_filename_in() touch(ex, file2) file3 = unique_filename_in() touch(ex, file3) nFragsPerWin = group['window_size'] futures_merged_raw[gid].wait( ) ## wait for merging of raw_grp to be completed futures_smoothed[gid] = (smoothFragFile.nonblocking( ex, processed['4cseq']['countsPerFrag_grp'][gid], nFragsPerWin, group['name'], file1, regToExclude[gid], script_path=script_path, via=via, memory=6), ) futures_merged[gid].wait( ) ## wait for merging of norm_grp to be completed futures_smoothed[gid] += (smoothFragFile.nonblocking( ex, processed['4cseq']['norm_grp'][gid], nFragsPerWin, group['name'] + "_norm", file2, regToExclude[gid], script_path=script_path, via=via, memory=6), ) futures_profcor_merged[gid].wait( ) # wait for the merging of profile corrected data to be done futures_smoothed[gid] += (smoothFragFile.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid], nFragsPerWin, group['name'] + "_fromProfileCorrected", file3, regToExclude[gid], script_path=script_path, via=via, memory=6), ) processed['4cseq']['smooth_grp'][gid] = [ file1, file2, file3 ] #[smoothed_file_before_Norm, smoothed file before PC, smoothed file after PC] print "***** Domainograms" futures_domainograms = {} for gid, group in job.groups.iteritems(): grName = job.groups[gid]['name'] if run_domainogram[gid]: regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord'] if before_profile_correction[gid]: futures_domainograms[gid] 
= runDomainogram.nonblocking( ex, processed['4cseq']['norm_grp'][gid], grName, regCoord=regCoord, skip=1, script_path=script_path, via=via, memory=15) else: futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid], grName, regCoord=regCoord.split(':')[0], skip=1, script_path=script_path, via=via, memory=15) ## prepare tar files for domainogram results (if any) ## and create "BRICKS to frags" files print "***** BRICKS to Frags" futures_BRICKS2Frags = {} for gid, f in futures_domainograms.iteritems(): if run_domainogram[gid]: # if domainogram has been run resFiles = [] logFile = f.wait() start = False tarname = job.groups[gid]['name'] + "_domainogram.tar.gz" res_tar = tarfile.open(tarname, "w:gz") futures_BRICKS2Frags[gid] = [] processed['4cseq']['bricks2frags'][gid] = [] if logFile is None: continue with open(logFile) as f: for s in f: s = s.strip() if '####resfiles####' in s: start = True elif start and "RData" not in s: resFiles.append(s) res_tar.add(s) if start and "foundBRICKS" in s: bricks2fragsfile = unique_filename_in() + ".bedGraph" touch(ex, bricks2fragsfile) futures_BRICKS2Frags[gid] += [ BRICKSToFrag.nonblocking( ex, s, processed['4cseq']['norm_grp'][gid], bricks2fragsfile, script_path=script_path, via=via, memory=4) ] processed['4cseq']['bricks2frags'][gid] += [ bricks2fragsfile ] res_tar.close() processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname] ############### prepare tables for global results print "***** combine results into tables " allNames = [] allFiles = [] allRegToExclude = [] for gid, group in job.groups.iteritems(): for rid, run in group['runs'].iteritems(): allNames += [ group['name'] + "_rep" + str(rid) + "_norm", group['name'] + "_rep" + str(rid) + "_fit" ] allFiles += [processed['4cseq']['profileCorrection'][gid][rid][2]] allRegToExclude += [regToExclude[gid]] tablePC = unique_filename_in() + ".txt" print("***will call makeTable with:") print(",".join(allFiles)) print("resfile=" + tablePC) print(",".join(allNames)) touch(ex, tablePC) #regToExclude[gid] futures_tables = (makeTable.nonblocking( ex, ",".join(allFiles), tablePC, ",".join(allNames), idCols="4,5", all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8), ) # wait for all smoothing to be done for gid, fg in futures_smoothed.iteritems(): for f in fg: f.wait() ## make Table raw/smoothed_raw print("** make Table raw/smoothed_raw") allNames = [] allFiles = [] allRegToExclude = [] for gid, group in job.groups.iteritems(): futures_merged_raw[gid].wait() allNames += [group['name'] + "_raw", group['name'] + "_rawSmoothed"] allFiles += [ processed['4cseq']['countsPerFrag_grp'][gid], processed['4cseq']['smooth_grp'][gid][0] ] allRegToExclude += ['NA', regToExclude[gid]] tableSmoothedRaw_grp = unique_filename_in() + ".txt" touch(ex, tableSmoothedRaw_grp) futures_tables += (makeTable.nonblocking( ex, ",".join(allFiles), tableSmoothedRaw_grp, ",".join(allNames), idCols="4", out_chromosomes=out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8), ) ## make Table norm/smoothed_norm before PC print("** make Table norm/smoothed_norm befor PC") allNames = [] allFiles = [] allRegToExclude = [] for gid, group in job.groups.iteritems(): allNames += [group['name'] + "_norm", group['name'] + "_smoothed"] allFiles += [ processed['4cseq']['norm_grp'][gid], processed['4cseq']['smooth_grp'][gid][1] ] allRegToExclude += [regToExclude[gid], regToExclude[gid]] tableSmoothed_grp = 
unique_filename_in() + ".txt" touch(ex, tableSmoothed_grp) futures_tables += (makeTable.nonblocking( ex, ",".join(allFiles), tableSmoothed_grp, ",".join(allNames), idCols="4", out_chromosomes=out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8), ) ## make Table norm/smoothed_norm after PC print("** make Table norm/smoothed_norm after PC") allNames = [] allFiles = [] allRegToExclude = [] for gid, group in job.groups.iteritems(): allNames += [group['name'] + "_normPC", group['name'] + "_smoothedPC"] allFiles += [ processed['4cseq']['profileCorrection_grp'][gid], processed['4cseq']['smooth_grp'][gid][2] ] allRegToExclude += [regToExclude[gid], regToExclude[gid]] tableSmoothedPC_grp = unique_filename_in() + ".txt" touch(ex, tableSmoothedPC_grp) futures_tables += (makeTable.nonblocking( ex, ",".join(allFiles), tableSmoothedPC_grp, ",".join(allNames), idCols="4", out_chromosomes=out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8), ) ## combine BRICKS2Frags files allNames = [] allFiles = [] for gid, fg in futures_BRICKS2Frags.iteritems(): for f in fg: f.wait() allNames += [job.groups[gid]['name'] + "_BRICKSpval"] cat_bricks2frags = unique_filename_in() + ".txt" print ','.join(processed['4cseq']['bricks2frags'][gid]) cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid], out=cat_bricks2frags) allFiles += [cat_bricks2frags] for gid, fg in futures_smoothed.iteritems(): for f in fg: f.wait() tableBRICKS2Frags = unique_filename_in() + ".txt" touch(ex, tableBRICKS2Frags) futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableBRICKS2Frags, ",".join(allNames), idCols="4", out_chromosomes=out_chromosomes, defVal="NA", script_path=script_path, via=via, memory=8), ) for f in futures_tables: f.wait() ################ Add everything to minilims below! 
step = "density" for gid in processed['4cseq']['density_files'].keys(): for rid, sql in processed['4cseq']['density_files'][gid].iteritems(): fname = "density_file_" + job.groups[gid][ 'name'] + "_merged_rep" + str(rid) ex.add(sql, description=set_file_descr(fname + ".sql", groupId=gid, step=step, type="sql", gdv="1")) wig = unique_filename_in() + ".bw" convert(sql, wig) ex.add(wig, description=set_file_descr(fname + ".bw", groupId=gid, step=step, type="bigWig", ucsc="1")) step = "counts_per_frag" #was _norm_counts_per_frags # before normalisation process, per replicate for gid in processed['4cseq']['countsPerFrag'].keys(): for rid, resfiles in processed['4cseq']['countsPerFrag'][ gid].iteritems(): fname = "meanScorePerFeature_" + job.groups[gid][ 'name'] + "_rep" + str(rid) ex.add(resfiles[1], description=set_file_descr(fname + ".sql", groupId=gid, step=step, type="sql", view="admin", gdv='1')) #gzipfile(ex,resfiles[0]) #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz", # groupId=gid,step=step,type="bed",view="admin" )) fname = "segToFrag_" + job.groups[gid]['name'] + "_rep" + str(rid) ex.add(resfiles[3], description=set_file_descr( fname + "_all.sql", groupId=gid, step=step, type="sql", comment="all informative frags - null included")) trsql = track(resfiles[3]) bwig = unique_filename_in() + ".bw" trwig = track(bwig, chrmeta=trsql.chrmeta) trwig.write( trsql.read(fields=['chr', 'start', 'end', 'score'], selection={'score': (0.01, sys.maxint)})) trwig.close() ex.add( bwig, set_file_descr(fname + ".bw", groupId=gid, step=step, type="bigWig", ucsc='1')) ## add segToFrags before normalisation futures_merged_raw[gid].wait() trbedgraph = track(removeNA( processed['4cseq']['countsPerFrag_grp'][gid]), format='bedgraph') bwig = unique_filename_in() + ".bw" trwig = track(bwig, chrmeta=assembly.chrmeta) trwig.write( trbedgraph.read(fields=['chr', 'start', 'end', 'score'], selection={'score': (0.01, sys.maxint)})) trwig.close() fname = "segToFrag_" + job.groups[gid]['name'] ex.add(bwig, description=set_file_descr( fname + ".bw", groupId=gid, step=step, type="bigWig", comment="segToFrag file before normalisation")) step = "norm_counts_per_frags" # after new normalisation process, combined replicates for gid, resfile in processed['4cseq']['norm_grp'].iteritems(): fname = "normalised_scorePerFeature_" + job.groups[gid]['name'] gzipfile(ex, resfile) ex.add(resfile + ".gz", description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step, type="bedGraph", ucsc='1')) # norm files, per replicates (might be removed) for gid, dict_gid in processed['4cseq']['norm'].iteritems(): for rid, resfile in dict_gid.iteritems(): fname = "normalised_scorePerFeature_" + job.groups[gid][ 'name'] + "_rep" + str(rid) gzipfile(ex, resfile) ex.add(resfile + ".gz", description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1')) step = "profile_correction" # Profile corrected data, combined replicates for gid, profileCorrectedFile in processed['4cseq'][ 'profileCorrection_grp'].iteritems(): fname = "segToFrag_" + job.groups[gid]['name'] + "_profileCorrected" gzipfile(ex, profileCorrectedFile) ex.add(profileCorrectedFile + ".gz", description=set_file_descr(fname + ".bedGraph.gz", groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1')) # Profile corrected, per replicate (might be removed) for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems(): for rid, resfiles in dict_gid.iteritems(): # profileCorrectedFile = resfiles[0] 
reportProfileCorrection = resfiles[1] fname = "segToFrag_" + job.groups[gid][ 'name'] + "_profileCorrected_rep" + str(rid) # gzipfile(ex,profileCorrectedFile) # ex.add( profileCorrectedFile+".gz", # description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) ex.add(reportProfileCorrection, description=set_file_descr(fname + ".pdf", groupId=gid, step=step, type="pdf")) step = "smoothing" for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems(): rawSmoothFile = resfiles[0] smoothFile = resfiles[1] afterProfileCorrection = resfiles[2] nFrags = str(job.groups[gid]['window_size']) ## smoothed file before normalisation fname = "segToFrag_" + job.groups[gid][ 'name'] + "_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz" gzipfile(ex, rawSmoothFile) ex.add(rawSmoothFile + ".gz", description=set_file_descr(fname, groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1')) ## smoothed file after normalisation, before Profile correction fname = "segToFrag_" + job.groups[gid][ 'name'] + "_norm_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz" gzipfile(ex, smoothFile) ex.add(smoothFile + ".gz", description=set_file_descr(fname, groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1')) ## smoothed file after normalisation, after Profile correction fname = "segToFrag_" + job.groups[gid][ 'name'] + "_profileCorrected_smoothed_" + nFrags + "FragsPerWin.bedGraph.gz" gzipfile(ex, afterProfileCorrection) ex.add(afterProfileCorrection + ".gz", description=set_file_descr(fname, groupId=gid, step=step, type="bedGraph", ucsc='1', gdv='1')) step = "domainograms" for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems(): tarFile = resfiles.pop() fname = job.groups[gid]['name'] + "_domainogram.tar.gz" ex.add(tarFile, description=set_file_descr(fname, groupId=gid, step=step, type="tgz")) for s in resfiles: if s[-8:] == "bedGraph": gzipfile(ex, s) s += ".gz" ex.add(s, description=set_file_descr(s, groupId=gid, step=step, type="bedGraph", ucsc="1", gdv="1")) step = "combined_results" gzipfile(ex, tableSmoothedRaw_grp) ex.add(tableSmoothedRaw_grp + ".gz", description=set_file_descr( "table_segToFrags_smoothed_combined_replicates.txt.gz", step=step, type="txt")) gzipfile(ex, tableSmoothed_grp) ex.add(tableSmoothed_grp + ".gz", description=set_file_descr( "table_normalised_smoothed_combined_replicates.txt.gz", step=step, type="txt")) gzipfile(ex, tableSmoothedPC_grp) ex.add(tableSmoothedPC_grp + ".gz", description=set_file_descr( "table_profileCorrected_smoothed_combined_replicates.txt.gz", step=step, type="txt")) gzipfile(ex, tablePC) ex.add(tablePC + ".gz", description=set_file_descr( "table_normalised_fit_per_replicates.txt.gz", step=step, type="txt")) gzipfile(ex, tableBRICKS2Frags) ex.add(tableBRICKS2Frags + ".gz", description=set_file_descr( "table_frags_in_BRICKS_combined_replicates.txt.gz", step=step, type="txt")) return processed
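# --- Illustrative usage sketch (not part of the pipeline) ----------------------------
# c4seq_workflow() is meant to be called inside a bein execution, with a Frontend job
# whose runs were mapped by mapseq (job.files), a genrep.Assembly, and a primers_dict
# keyed by group name providing at least 'baitcoord' ("chr:start-end") and optionally
# 'regToExclude'. All values below (MiniLIMS path, key, coordinates) are placeholders:
#
#     M = MiniLIMS("/path/to/c4seq_minilims")
#     with execution(M, description=hts_key, remote_working_directory=wd) as ex:
#         primers_dict = {'myGroup': {'baitcoord': 'chr2:1000000-1000500',
#                                     'regToExclude': 'chr2:995000-1005000'}}
#         processed = c4seq_workflow(ex, job, primers_dict, assembly,
#                                    c4_url=c4_url, script_path=script_path, via='lsf')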
def dnaseseq_workflow(ex, job, assembly, logfile=sys.stdout, via='lsf'): """ This workflow performs the following steps: * BAM files from replicates within the same group are merged * MACS is called to identify enriched regions (only peak summit +- 300 will be used), this can be by-passed by provinding a bed file to any group * Wellington is called to identify footprints within these enriched regions * If a list of motifs is provided (by group), footprints are scanned and motif occurences (log-likelihood ratio > 0) are recorded in a bed file * Average DNAse profiles around motifs are plotted """ tests = [] controls = [] names = {'tests': [], 'controls': []} supdir = os.path.split(ex.remote_working_directory)[0] for gid, mapped in job.files.iteritems(): group_name = job.groups[gid]['name'] if not isinstance(mapped, dict): raise TypeError( "Files values must be dictionaries with keys *run_ids* or 'bam'." ) if 'bam' in mapped: mapped = {'_': mapped} if len(mapped) > 1: bamfile = merge_bam(ex, [m['bam'] for m in mapped.values()]) index = index_bam(ex, bamfile) else: bamfile = mapped.values()[0]['bam'] if job.groups[gid]['control']: controls.append(bamfile) names['controls'].append((gid, group_name)) else: if os.path.exists(job.groups[gid].get('bedfile', 'null')): bedfile = job.groups[gid]['bedfile'] elif os.path.exists( os.path.join(supdir, job.groups[gid].get('bedfile', 'null'))): bedfile = os.path.join(supdir, job.groups[gid]['bedfile']) else: bedfile = None tests.append((bedfile, bamfile)) names['tests'].append((gid, group_name)) if len(controls) < 1: controls = [None] names['controls'] = [(0, None)] tests = macs_bedfiles(ex, assembly.chrmeta, tests, controls, names, job.options.get('macs_args', ["--keep-dup", "10"]), via, logfile) bedlist = run_wellington(ex, tests, names, assembly, via, logfile) ######################### Motif scanning / plotting if any([ gr.get('motif') != 'null' and gr.get('motif') for gr in job.groups.values() ]): motifbeds = motif_scan(ex, bedlist, assembly, job.groups, via, logfile) siglist = dict((gid[0], []) for gid in names['tests']) for gid, mapped in job.files.iteritems(): wig = [] suffixes = ["fwd", "rev"] merge_strands = int(job.options.get('merge_strands', -1)) read_extension = int(job.options.get('read_extension') or -1) make_wigs = merge_strands >= 0 or read_extension != 1 for m in mapped.values(): if make_wigs or not ('wig' in m) or len(m['wig']) < 2: output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta, nreads=m["stats"]["total"], merge=-1, read_extension=1, convert=False, b2w_args=[], via=via) wig.append(dict( (s, output + s + '.sql') for s in suffixes)) else: wig.append(m['wig']) if len(wig) > 1: wig[0] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via)) for s in suffixes) _trn = job.groups[gid]['name'] + "_%s" if job.groups[gid]['control']: for s, w in wig[0].iteritems(): for _g in siglist.keys(): siglist[_g].append(track(w, info={'name': _trn % s})) else: siglist[gid].extend([ track(w, info={'name': _trn % s}) for s, w in wig[0].iteritems() ]) plot_files = plot_footprint_profile(ex, motifbeds, siglist, assembly.chrnames, job.groups, logfile) for gid, flist in plot_files.iteritems(): gname = job.groups[gid]['name'] plotall = unique_filename_in() touch(ex, plotall) ex.add(plotall, description=set_file_descr(gname + '_footprints_plots', type='none', view='admin', step='motifs', groupId=gid)) ex.add(flist['pdf'], description=set_file_descr(gname + '_footprints_plots.pdf', type='pdf', step='motifs', groupId=gid), 
associate_to_filename=plotall, template='%s.pdf') tarname = unique_filename_in() tarfh = tarfile.open(tarname, "w:gz") for mname, matf in flist['mat']: tarfh.add(matf, arcname="%s_%s.txt" % (gname, mname)) tarfh.close() ex.add(tarname, description=set_file_descr(gname + '_footprints_plots.tar.gz', type='tar', step='motifs', groupId=gid), associate_to_filename=plotall, template='%s.tar.gz') logfile.write("\nDone.\n ") logfile.flush() return 0
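# --- Illustrative usage sketch (not part of the pipeline) ----------------------------
# dnaseseq_workflow() expects a bein execution, a mapseq-style job (job.files with a
# 'bam' per run and 'stats' from bamstats; job.groups carrying 'control' and optional
# 'bedfile' and 'motif' entries) plus a genrep.Assembly. Paths and key are placeholders:
#
#     M = MiniLIMS("/path/to/dnaseseq_minilims")
#     with execution(M, description=hts_key, remote_working_directory=wd) as ex:
#         dnaseseq_workflow(ex, job, assembly, logfile=logfile, via='lsf')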
def save_wellington(ex, wellout, chrmeta):
    bedlist = {}
    for name, wlist in wellout.iteritems():
        wellall = unique_filename_in()
        #### Dummy file
        touch(ex, wellall)
        ex.add(wellall,
               description=set_file_descr(name[1] + '_wellington_files', type='none',
                                          view='admin', step='footprints', groupId=name[0]))
        #### BED at FDR 1%
        bedlist[name[0]] = wellall + "FDR01.bed.gz"
        bedzip = gzip.open(bedlist[name[0]], 'wb')
        bedzip.write("track name='" + name[1] + "_WellingtonFootprints_FDR_0.01'\n")
        for x in wlist:
            with open(os.path.join(*x) + ".WellingtonFootprints.FDR.0.01.bed") as _bed:
                [bedzip.write(l) for l in _bed]
        bedzip.close()
        ex.add(wellall + "FDR01.bed.gz",
               description=set_file_descr(name[1] + '_WellingtonFootprintsFDR01.bed.gz',
                                          type='bed', ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprintsFDR01.bed.gz')
        #### BED at p-values [...]
        bedzip = gzip.open(wellall + "PvalCutoffs.bed.gz", 'wb')
        for bfile in os.listdir(os.path.join(wlist[0][0], "p_value_cutoffs")):
            cut = os.path.splitext(bfile[:-4])[1][1:]  # between . ([1:]) and .bed ([:-4])
            bedzip.write("track name='" + name[1] + "_WellingtonFootprints_Pval_%s'\n" % cut)
            for wdir, wpref in wlist:
                _bedpath = os.path.join(wdir, "p_value_cutoffs",
                                        wpref + ".WellingtonFootprints." + cut + ".bed")
                with open(_bedpath) as _bed:
                    [bedzip.write(l) for l in _bed]
        bedzip.close()
        ex.add(wellall + "PvalCutoffs.bed.gz",
               description=set_file_descr(name[1] + '_WellingtonFootprintsPvalCutoffs.bed.gz',
                                          type='bed', ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprintsPvalCutoffs.bed.gz')
        #### WIG
        cat([os.path.join(*x) + ".WellingtonFootprints.wig" for x in wlist], wellall + ".wig")
        #convert(wellall+".wig", wellall+".bw", chrmeta=chrmeta)
        #ex.add(wellall+".bw",
        #       description=set_file_descr(name[1]+'_WellingtonFootprints.bw',
        #                                  type='bigWig', ucsc='1', step='footprints', groupId=name[0]),
        #       associate_to_filename=wellall, template='%s_WellingtonFootprints.bw')
        ex.add(wellall + ".wig",
               description=set_file_descr(name[1] + '_WellingtonFootprints.wig', type='wig',
                                          ucsc='1', step='footprints', groupId=name[0]),
               associate_to_filename=wellall, template='%s_WellingtonFootprints.wig')
    return bedlist
def rnaseq_workflow(ex, job, pileup_level=["genes", "transcripts"], via="lsf", junctions=False, stranded=False, logfile=sys.stdout, debugfile=sys.stderr): """Main function of the workflow. :rtype: None :param ex: a bein execution. :param job: a Frontend.Job object (or a dictionary of the same form). :param assembly: a genrep.Assembly object :param junctions: (bool) whether to search for splice junctions using SOAPsplice. [False] :param via: (str) send job via 'local' or 'lsf'. ["lsf"] """ group_names = {} conditions = [] groups = job.groups assembly = job.assembly assert len(groups) > 0, "No groups/runs were given." for gid, group in groups.iteritems(): gname = str(group['name']) group_names[gid] = gname if isinstance(pileup_level, basestring): pileup_level = [pileup_level] # Define conditions as 'group_name.run_id' and store bamfiles in the same order bamfiles = [] for gid, files in job.files.iteritems(): k = 0 for rid, f in files.iteritems(): k += 1 cond = group_names[gid] + '.' + str(k) conditions.append(cond) bamfiles.append(f['bam']) ncond = len(conditions) # Get the assembly's GTF # ...from fasta origin logfile.write("* Prepare GTF\n") logfile.flush() if hasattr(assembly, "fasta_origin"): logfile.write(" ... from fasta origin\n") logfile.flush() gtf = gtf_from_bam_header(bamfiles[0]) descr = set_file_descr(gtf, type='txt', step='pileup', view='admin') ex.add(gtf, description=descr) pileup_level = ["transcripts"] if stranded: stranded = False logfile.write( " ... Cannot exploit strand information from custom fasta reference.\n" ) logfile.flush() # ... or from (wrong) mapping on the transcriptome elif assembly.intype == 2: logfile.write(" ... from mapping on the transcriptome\n") logfile.flush() gtf = transcriptome_gtf_from_genrep(assembly) # ... or from config file else: gtf = job.options.get('annot_file') if gtf and os.path.exists(os.path.join('..', gtf)): gtf = os.path.join('..', gtf) logfile.write(" ... from config file: %s\n" % gtf) logfile.flush() elif gtf and os.path.exists(gtf): gtf = os.path.abspath(gtf) logfile.write(" ... from config file: %s\n" % gtf) logfile.flush() # ... or from GenRep else: logfile.write(" ... 
from GenRep\n") logfile.flush() gtf = assembly.create_exome_gtf() #shutil.copy(gtf,"../") # Build controllers rnaseq_args = (ex, via, job, assembly, conditions, debugfile, logfile, pileup_level, junctions, stranded) CNT = Counter(*rnaseq_args) DE = DE_Analysis(*rnaseq_args) PCA = Pca(*rnaseq_args) JN = Junctions(*rnaseq_args) # Count reads on genes, transcripts with "rnacounter" count_files = CNT.count_reads(bamfiles, gtf) def differential_analysis(counts_file, feature_type): #shutil.copy(counts_file, "../") diff_files = DE.differential_analysis(counts_file) if diff_files is not None: for diff in diff_files: # Remove first line diff_nohead = unique_filename_in() with open(diff) as f: head = f.readline().strip() with open(diff_nohead, "wb") as g: for line in f: g.write(line) oname = feature_type + "_differential_" + head + ".txt" desc = set_file_descr(oname, step='stats', type='txt', ucsc=0) ex.add(diff_nohead, description=desc) # DE and PCA if "genes" in pileup_level: # PCA of groups ~ gene expression description = set_file_descr("genes_expression.txt", step="pileup", type="txt", ucsc=0) ex.add(count_files['genes'], description=description) differential_analysis(count_files['genes'], "genes") if stranded: description = set_file_descr("genes_antisense_expression.txt", step="pileup", type="txt", ucsc=0) ex.add(count_files['genes_anti'], description=description) differential_analysis(count_files['genes_anti'], "genes_antisense") if ncond > 2: PCA.pca_rnaseq(count_files['genes']) if "transcripts" in pileup_level: description = set_file_descr("transcripts_expression.txt", step="pileup", type="txt", ucsc=0) ex.add(count_files['transcripts'], description=description) differential_analysis(count_files['transcripts'], "transcripts") if stranded: description = set_file_descr( "transcripts_antisense_expression.txt", step="pileup", type="txt", ucsc=0) ex.add(count_files['transcripts_anti'], description=description) differential_analysis(count_files['transcripts_anti'], "transcripts_antisense") # Find splice junctions if junctions: logfile.write("* Search for splice junctions\n") logfile.flush() JN.find_junctions() return 0
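# --- Illustrative usage sketch (not part of the pipeline) ----------------------------
# rnaseq_workflow() drives counting (rnacounter), differential analysis, PCA and
# junction discovery from a bein execution and a mapseq job (job.files providing one
# 'bam' per run, job.assembly the genrep.Assembly). Paths and the key are placeholders:
#
#     M = MiniLIMS("/path/to/rnaseq_minilims")
#     with execution(M, description=hts_key, remote_working_directory=wd) as ex:
#         rnaseq_workflow(ex, job, pileup_level=["genes", "transcripts"], via='lsf',
#                         junctions=False, stranded=False,
#                         logfile=logfile, debugfile=debugfile)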
def chipseq_workflow( ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf' ): """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'. :param ex: a 'bein' execution environment to run jobs in, :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with key 'groups', 'files' and 'options' if applicable, :param assembly: a genrep.Assembly object, :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts. Defaults ``macs`` parameters (overriden by ``job_or_dict['options']['macs_args']``) are set as follows: * ``'-bw'``: 200 ('bandwith') * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control') The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*. Returns a tuple of a dictionary with keys *group_id* from the job groups, *macs* and *deconv* if applicable and values file description dictionaries and a dictionary of *group_ids* to *names* used in file descriptions. """ options = {} if logfile is None: logfile = sys.stdout if isinstance(job_or_dict,frontend.Job): options = job_or_dict.options groups = job_or_dict.groups mapseq_files = job_or_dict.files elif isinstance(job_or_dict,dict) and 'groups' in job_or_dict: if 'options' in job_or_dict: options = job_or_dict['options'] groups = job_or_dict['groups'] for gid in groups.keys(): if not('name' in groups[gid]): groups[gid]['name'] = gid mapseq_files = job_or_dict.get('files',{}) else: raise TypeError("job_or_dict must be a frontend. Job object or a dictionary with key 'groups'.") merge_strands = int(options.get('merge_strands',-1)) suffixes = ["fwd","rev"] peak_deconvolution = options.get('peak_deconvolution',False) if isinstance(peak_deconvolution,basestring): peak_deconvolution = peak_deconvolution.lower() in ['1','true','t'] run_meme = options.get('run_meme',False) if isinstance(run_meme,basestring): run_meme = run_meme.lower() in ['1','true','t'] macs_args = options.get('macs_args',["--bw","200"]) b2w_args = options.get('b2w_args',[]) if not(isinstance(mapseq_files,dict)): raise TypeError("Mapseq_files must be a dictionary.") tests = [] controls = [] names = {'tests': [], 'controls': []} read_length = [] p_thresh = {} for gid,mapped in mapseq_files.iteritems(): group_name = groups[gid]['name'] if not(isinstance(mapped,dict)): raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.") if 'bam' in mapped: mapped = {'_': mapped} futures = {} ptruns = [] for k in mapped.keys(): if not 'libname' in mapped[k]: mapped[k]['libname'] = group_name+"_"+str(k) if not 'stats' in mapped[k]: futures[k] = mapseq.bamstats.nonblocking( ex, mapped[k]["bam"], via=via ) if mapped[k].get('poisson_threshold',-1)>0: ptruns.append(mapped[k]['poisson_threshold']) if len(ptruns)>0: p_thresh['group_name'] = sum(ptruns)/len(ptruns) for k in futures.keys(): mapped[k]['stats'] = f.wait() if len(mapped)>1: bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()]) else: bamfile = mapped.values()[0]['bam'] if groups[gid]['control']: controls.append(bamfile) names['controls'].append((gid,group_name)) else: tests.append(bamfile) names['tests'].append((gid,group_name)) read_length.append(mapped.values()[0]['stats']['read_length']) genome_size = mapped.values()[0]['stats']['genome_size'] if len(controls)<1: controls = [None] names['controls'] = [(0,None)] logfile.write("Starting MACS.\n");logfile.flush() processed 
= {'macs': add_macs_results( ex, read_length, genome_size, tests, ctrlbam=controls, name=names, poisson_threshold=p_thresh, macs_args=macs_args, via=via ) } logfile.write("Done MACS.\n");logfile.flush() peak_list = {} chrlist = assembly.chrmeta ## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6 _select = {'score':(6,sys.maxint)} _fields = ['chr','start','end','name','score'] for i,name in enumerate(names['tests']): if len(names['controls']) < 2: ctrl = (name,names['controls'][0]) macsbed = track(processed['macs'][ctrl]+"_summits.bed", chrmeta=chrlist, fields=_fields).read(selection=_select) else: macsbed = concatenate([apply(track(processed['macs'][(name,x)]+"_summits.bed", chrmeta=chrlist, fields=_fields).read(selection=_select), 'name', lambda __n,_n=xn: "%s:%i" %(__n,_n)) for xn,x in enumerate(names['controls'])]) ############################## macs_neighb = neighborhood( macsbed, before_start=150, after_end=150 ) peak_list[name] = unique_filename_in()+".sql" macs_final = track( peak_list[name], chrmeta=chrlist, info={'datatype':'qualitative'}, fields=['start','end','name','score'] ) macs_final.write(fusion(macs_neighb),clip=True) macs_final.close() ############################## merged_wig = {} options['read_extension'] = int(options.get('read_extension') or read_length[0]) if options['read_extension'] < 1: options['read_extension'] = read_length[0] make_wigs = merge_strands >= 0 or options['read_extension']>100 if options['read_extension'] > 100: options['read_extension'] = 50 for gid,mapped in mapseq_files.iteritems(): # if groups[gid]['control']: continue group_name = groups[gid]['name'] wig = [] for m in mapped.values(): if make_wigs or not('wig' in m) or len(m['wig'])<2: output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta, nreads=m["stats"]["total"], merge=-1, read_extension=options['read_extension'], convert=False, b2w_args=b2w_args, via=via ) wig.append(dict((s,output+s+'.sql') for s in suffixes)) else: wig.append(m['wig']) if len(wig) > 1: merged_wig[group_name] = dict((s,merge_sql(ex, [x[s] for x in wig], via=via)) for s in suffixes) else: merged_wig[group_name] = wig[0] if peak_deconvolution: ############################## def _filter_deconv( stream, pval ): ferr = re.compile(r';FERR=([\d\.]+)$') return FeatureStream( ((x[0],)+((x[2]+x[1])/2-150,(x[2]+x[1])/2+150)+x[3:] for x in stream if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval), fields=stream.fields ) ############################## processed['deconv'] = {} for name in names['tests']: logfile.write(name[1]+" deconvolution.\n");logfile.flush() if len(names['controls']) < 2: ctrl = (name,names['controls'][0]) macsbed = processed['macs'][ctrl]+"_peaks.bed" else: macsbed = intersect_many_bed( ex, [processed['macs'][(name,x)]+"_peaks.bed" for x in names['controls']], via=via ) deconv = run_deconv( ex, merged_wig[name[1]], macsbed, assembly.chrmeta, options['read_extension'], script_path, via=via ) peak_list[name] = unique_filename_in()+".bed" trbed = track(deconv['peaks']).read() with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile: bedfile.write(fusion(_filter_deconv(trbed,0.65))) ex.add(deconv['peaks'], description=set_file_descr(name[1]+'_peaks.sql', type='sql', step='deconvolution', groupId=name[0])) ex.add(deconv['profile'], description=set_file_descr(name[1]+'_deconv.sql', type='sql', step='deconvolution', groupId=name[0])) bigwig = unique_filename_in() try: convert(deconv['profile'],(bigwig,"bigWig")) ex.add(bigwig, 
description=set_file_descr(name[1]+'_deconv.bw', type='bigWig', ucsc='1', step='deconvolution', groupId=name[0])) except OSError as e: logfile.write(str(e));logfile.flush() ex.add(deconv['pdf'], description=set_file_descr(name[1]+'_deconv.pdf', type='pdf', step='deconvolution', groupId=name[0])) processed['deconv'][name] = deconv ############################## def _join_macs( stream, xlsl, _f ): def _macs_row(_s): for _p in _s: for _n in _p[3].split("|"): if len(xlsl) == 1: nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:]) yield _p+xlsl[0][nb-1][1:] else: nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:] nb = nb.split(":") yield _p+xlsl[int(nb[1])][int(nb[0])-1][1:] return FeatureStream( _macs_row(stream), fields=_f ) ############################## peakfile_list = [] for name, plist in peak_list.iteritems(): ptrack = track(plist,chrmeta=chrlist,fields=["chr","start","end","name","score"]) peakfile = unique_filename_in() xlsh, xlsl = parse_MACS_xls([processed['macs'][(name,_c)]+"_peaks.xls" for _c in names['controls']]) try: ###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs" assembly.gene_track() _fields = ['chr','start','end','name','score','gene','location_type','distance']\ +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:] peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields) peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height','gene(s)','location_type','distance']+_fields[8:])) for chrom in assembly.chrnames: _feat = assembly.gene_track(chrom) peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom),_feat), xlsl, _fields), mode='append') except ValueError: _fields = ['chr','start','end','name','score']+["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:] peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields) peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height']+_fields[8:])) for chrom in assembly.chrnames: peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields), mode='append') peakout.close() gzipfile(ex,peakfile) peakfile_list.append(track(peakfile+".gz", format='txt', fields=_fields)) ex.add(peakfile+".gz", description=set_file_descr(name[1]+'_annotated_peaks.txt.gz',type='text', step='annotation',groupId=name[0])) stracks = [track(wig,info={'name':name+"_"+st}) for name,wigdict in merged_wig.iteritems() for st,wig in wigdict.iteritems()] tablefile = unique_filename_in() with open(tablefile,"w") as _tf: _pnames = ["MACS_%s_vs_%s" %(_s[1],_c[1]) if _c[1] else "MACS_%s" %_s[1] for _s in names['tests'] for _c in names['controls']] _tf.write("\t".join(['#chromosome','start','end',]+_pnames+[s.name for s in stracks])+"\n") #### need to do something about peak origin (split names, write to separate columns?) 
for chrom in assembly.chrnames: pk_lst = [apply(pt.read(chrom,fields=['chr','start','end','name']), 'name', lambda __n,_n=npt: "%s:%i" %(__n,_n)) for npt,pt in enumerate(peakfile_list)] features = fusion(concatenate(pk_lst, fields=['chr','start','end','name'], remove_duplicates=True, group_by=['chr','start','end'])) sread = [sig.read(chrom) for sig in stracks] quantifs = score_by_feature(sread, features, method='sum') nidx = quantifs.fields.index('name') _ns = len(tests) _nc = len(controls) with open(tablefile,"a") as _tf: for row in quantifs: pcols = ['']*_ns*_nc _rnsplit = row[nidx].split(":") _n1 = _rnsplit[0] _k = 0 while ( _k < len(_rnsplit)-1-int(_nc>1) ): if _nc > 1: _k += 2 _n2 = _rnsplit[_k-1] _n = _rnsplit[_k].split("|") pcols[int(_n[0])*_nc+int(_n2)] = _n1 else: _k += 1 _n = _rnsplit[_k].split("|") pcols[int(_n[0])] = _n1 _n1 = "|".join(_n[1:]) _tf.write("\t".join(str(tt) for tt in row[:nidx]+tuple(pcols)+row[nidx+1:])+"\n") gzipfile(ex,tablefile) ex.add(tablefile+".gz", description=set_file_descr('Combined_peak_quantifications.txt.gz',type='text', step='summary')) if run_meme: from bbcflib.motif import parallel_meme logfile.write("Starting MEME.\n");logfile.flush() processed['meme'] = parallel_meme( ex, assembly, peak_list.values(), name=peak_list.keys(), chip=True, meme_args=['-meme-nmotifs','4','-meme-mod','zoops'], via=via ) return processed
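# --- Illustrative usage sketch (not part of the pipeline) ----------------------------
# chipseq_workflow() accepts either a frontend.Job or, as the docstring above notes, a
# plain dictionary with keys 'groups', 'files' and optionally 'options'. Group ids,
# names, file paths and the stats dictionaries (bamstats output with 'total',
# 'read_length', 'genome_size') below are placeholders:
#
#     job_or_dict = {
#         'groups':  {1: {'name': 'IP',    'control': False},
#                     2: {'name': 'Input', 'control': True}},
#         'files':   {1: {'bam': 'ip.bam',    'stats': ip_stats},
#                     2: {'bam': 'input.bam', 'stats': input_stats}},
#         'options': {'peak_deconvolution': True, 'macs_args': ["--bw", "200"]},
#     }
#     processed = chipseq_workflow(ex, job_or_dict, assembly,
#                                  script_path=script_path, via='lsf')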
def count_reads(self, bamfiles, gtf):
    self.write_log("* Counting reads")
    # Count reads on genes, transcripts with "rnacounter"
    ncond = len(self.conditions)
    tablenames = [None] * ncond
    futures = [None] * ncond
    max_rlen = 0
    counter_options = ["--nh"]
    for bam in bamfiles:
        sam = pysam.Samfile(bam, 'rb')
        max_rlen = max(max_rlen, sam.next().rlen)
    counter_options += ["--exon_cutoff", str(max_rlen)]
    bwt_args = self.job.options.get('map_args', {}).get('bwt_args', [])
    # if not "--local" in bwt_args:
    #     counter_options += ["--nh"]
    if hasattr(self.assembly, "fasta_origin") or self.assembly.intype == 2:
        counter_options += ["--type", "transcripts", "--method", "raw"]
    else:
        counter_options += ["--type", "genes,transcripts", "--method", "raw,nnls"]
    if self.stranded:
        counter_options += ["--stranded"]
    for i, c in enumerate(self.conditions):
        tablenames[i] = unique_filename_in()
        futures[i] = rnacounter.nonblocking(self.ex, bamfiles[i], gtf, stdout=tablenames[i],
                                            via=self.via, options=counter_options)
    # Put samples together
    for i, c in enumerate(self.conditions):
        try:
            futures[i].wait()
        except Exception as err:
            self.write_debug("Counting failed: %s." % str(err))
            raise err
        if futures[i] is None:
            self.write_debug("Counting failed.")
            raise ValueError("Counting failed.")
    if len(tablenames) > 1:
        joined = unique_filename_in()
        rnacounter_join.nonblocking(self.ex, tablenames, stdout=joined, via=self.via).wait()
    else:
        joined = tablenames[0]
    # Split genes and transcripts into separate files
    genes_filename = unique_filename_in()
    trans_filename = unique_filename_in()
    genes_file = open(genes_filename, "wb")
    trans_file = open(trans_filename, "wb")
    if self.stranded:
        genes_anti_filename = unique_filename_in()
        trans_anti_filename = unique_filename_in()
        genes_anti_file = open(genes_anti_filename, "wb")
        trans_anti_file = open(trans_anti_filename, "wb")
    with open(joined) as jfile:
        header = jfile.readline()
        hconds = ["counts." + c for c in self.conditions] + ["rpkm." + c for c in self.conditions]
        hinfo = header.strip().split('\t')[2 * ncond + 1:]
        header = '\t'.join(["ID"] + hconds + hinfo) + '\n'
        genes_file.write(header)
        trans_file.write(header)
        type_idx = header.split('\t').index("Type")
        if self.stranded:
            genes_anti_file.write(header)
            trans_anti_file.write(header)
            sense_idx = header.split('\t').index("Sense")
            for line in jfile:
                L = line.split('\t')
                ftype = L[type_idx].lower()
                sense = L[sense_idx].lower()
                if ftype == 'gene':
                    if sense == 'antisense':
                        genes_anti_file.write(line)
                    else:
                        genes_file.write(line)
                elif ftype == 'transcript':
                    if sense == 'antisense':
                        trans_anti_file.write(line)
                    else:
                        trans_file.write(line)
        else:
            for line in jfile:
                L = line.split('\t')
                ftype = L[type_idx].lower()
                if ftype == 'gene':
                    genes_file.write(line)
                elif ftype == 'transcript':
                    trans_file.write(line)
    genes_file.close()
    trans_file.close()
    # Keep intermediate tables
    for i, c in enumerate(self.conditions):
        #shutil.copy(tablenames[i], "../counts%d.txt"%i)
        descr = set_file_descr(self.conditions[i] + '_' + tablenames[i] + '.gz',
                               type='txt', step='pileup', view='admin')
        gzipfile(self.ex, tablenames[i])
        self.ex.add(tablenames[i] + '.gz', description=descr)
    if self.stranded:
        count_files = {'genes': genes_filename, 'transcripts': trans_filename,
                       'genes_anti': genes_anti_filename, 'transcripts_anti': trans_anti_filename}
    else:
        count_files = {'genes': genes_filename, 'transcripts': trans_filename}
    return count_files
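# --- Illustrative sketch (not part of the pipeline) ----------------------------------
# count_reads() rewrites the joined rnacounter table with a header of the form
#     ID  counts.<cond_1> ... counts.<cond_N>  rpkm.<cond_1> ... rpkm.<cond_N>  <info columns>
# where conditions are named "<group>.<run>", then routes each row to the genes or
# transcripts file according to its 'Type' column (and to the *_anti files according to
# 'Sense' when --stranded is set). Placeholder row for two conditions:
#
#     ID               counts.KO.1  counts.WT.1  rpkm.KO.1  rpkm.WT.1  Type  Sense
#     ENSMUSG00000001  123          98           4.2        3.5        gene  sense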
def main(argv = None): via = "lsf" limspath = None hts_key = '' working_dir = None config_file = None if argv is None: argv = sys.argv try: try: opts,args = getopt.getopt(sys.argv[1:],"hu:k:d:w:c:", ["help","via=","key=","minilims=", "working-directory=","config="]) except getopt.error, msg: raise Usage(msg) for o, a in opts: if o in ("-h", "--help"): print __doc__ print usage return 0 elif o in ("-u", "--via"): if a=="local": via = "local" elif a=="lsf": via = "lsf" else: raise Usage("Via (-u) can only be \"local\" or \"lsf\", got %s." % (a,)) elif o in ("-w", "--working-directory"): if os.path.exists(a): os.chdir(a) working_dir = a else: raise Usage("Working directory '%s' does not exist." % a) elif o in ("-d", "--minilims"): limspath = a elif o in ("-k", "--key"): hts_key = a elif o in ("-c", "--config"): config_file = a else: raise Usage("Unhandled option: " + o) if not(limspath and os.path.exists(limspath) and (hts_key != None or (config_file and os.path.exists(config_file)))): raise Usage("Need a minilims and a job key or a configuration file") M = MiniLIMS( limspath ) if len(hts_key)>1: gl = use_pickle(M, "global variables") htss = frontend.Frontend( url=gl['hts_mapseq']['url'] ) job = htss.job( hts_key ) [M.delete_execution(x) for x in M.search_executions(with_description=hts_key,fails=True)] elif os.path.exists(config_file): (job,gl) = frontend.parseConfig( config_file ) hts_key = job.description else: raise ValueError("Need either a job key (-k) or a configuration file (-c).") g_rep = genrep.GenRep( url=gl["genrep_url"], root=gl["bwt_root"], intype=job.options.get('input_type_id') or 0 ) assembly = g_rep.assembly( job.assembly_id ) if 'lims' in gl: dafl = dict((loc,daflims.DAFLIMS( username=gl['lims']['user'], password=pwd )) for loc,pwd in gl['lims']['passwd'].iteritems()) else: dafl = None if not('compute_densities' in job.options): job.options['compute_densities'] = True elif isinstance(job.options['compute_densities'],str): job.options['compute_densities'] = job.options['compute_densities'].lower() in ['1','true','t'] if not('ucsc_bigwig' in job.options): job.options['ucsc_bigwig'] = True elif isinstance(job.options['ucsc_bigwig'],str): job.options['ucsc_bigwig'] = job.options['ucsc_bigwig'].lower() in ['1','true','t'] job.options['ucsc_bigwig'] = job.options['ucsc_bigwig'] and job.options['compute_densities'] if not('create_gdv_project' in job.options): job.options['create_gdv_project'] = False elif isinstance(job.options['create_gdv_project'],str): job.options['create_gdv_project'] = job.options['create_gdv_project'].lower() in ['1','true','t'] if job.options.get('read_extension'): job.options['read_extension'] = int(job.options['read_extension']) if job.options.get('merge_strands'): job.options['merge_strands'] = int(job.options['merge_strands']) logfile = open(hts_key+".log",'w') with execution( M, description=hts_key, remote_working_directory=working_dir ) as ex: logfile.write("Enter execution, fetch fastq files.\n");logfile.flush() job = get_fastq_files( job, ex.working_directory, dafl ) logfile.write("Map reads.\n");logfile.flush() mapped_files = map_groups( ex, job, ex.working_directory, assembly, {'via': via} ) logfile.write("Make stats:\n");logfile.flush() for k,v in job.groups.iteritems(): logfile.write(str(k)+str(v['name'])+"\t");logfile.flush() pdf = add_pdf_stats( ex, mapped_files, {k:v['name']}, gl.get('script_path') or '', description=set_file_descr(v['name']+"_mapping_report.pdf",groupId=k,step='stats',type='pdf') ) if job.options['compute_densities']: 
logfile.write("computing densities.\n");logfile.flush() if not(job.options.get('read_extension')>0): job.options['read_extension'] = mapped_files.values()[0].values()[0]['stats']['read_length'] density_files = densities_groups( ex, job, mapped_files, assembly.chromosomes, via=via ) logfile.write("Finished computing densities.\n");logfile.flush() if job.options['create_gdv_project']: logfile.write("Creating GDV project.\n");logfile.flush() gdv_project = gdv.create_gdv_project( gl['gdv']['key'], gl['gdv']['email'], job.description, assembly.nr_assembly_id, gdv_url=gl['gdv']['url'], public=True ) logfile.write("GDV project: "+str(gdv_project['project_id']+"\n"));logfile.flush() add_pickle( ex, gdv_project, description=set_file_descr("gdv_json",step='gdv',type='py',view='admin') ) allfiles = get_files( ex.id, M ) if 'ucsc_bigwig' and g_rep.intype == 0: ucscfiles = get_files( ex.id, M, select_param={'ucsc':'1'} ) with open(hts_key+".bed",'w') as ucscbed: for ftype,fset in ucscfiles.iteritems(): for ffile,descr in fset.iteritems(): if re.search(r' \(.*\)',descr): continue ucscbed.write(track_header(descr,ftype,gl['hts_mapseq']['download'],ffile)) if job.options['create_gdv_project']: allfiles['url'] = {gdv_project['public_url']: 'GDV view'} download_url = gl['hts_mapseq']['download'] [gdv.add_gdv_track( gl['gdv']['key'], gl['gdv']['email'], gdv_project['project_id'], url=download_url+str(k), name = re.sub('\.sql','',str(f)), gdv_url=gl['gdv']['url'] ) for k,f in allfiles['sql'].iteritems()] logfile.close() print json.dumps(allfiles) with open(hts_key+".done",'w') as done: json.dump(allfiles,done) if 'email' in gl: r = email.EmailReport( sender=gl['email']['sender'], to=str(job.email), subject="Mapseq job "+str(job.description), smtp_server=gl['email']['smtp'] ) r.appendBody(''' Your mapseq job has finished. The description was: '''+str(job.description)+''' and its unique key is '''+hts_key+'''. You can now retrieve the results at this url: '''+gl['hts_mapseq']['url']+"jobs/"+hts_key+"/get_results") r.send() return 0
def __call__(self,opts): self.opts = opts if os.path.exists(self.opts.wdir): os.chdir(self.opts.wdir) else: raise Usage("Working directory '%s' does not exist." %self.opts.wdir) ##### Connect to Minilims, recover global variables, fetch job info self.minilims = os.path.join(self.opts.basepath,self.name+"_minilims") M = MiniLIMS(self.minilims) if not((self.opts.key != None or (self.opts.config and os.path.exists(self.opts.config)))): raise Usage("Need a job key or a configuration file") if self.opts.key: self.globals = use_pickle(M, "global variables") htss = frontend.Frontend( url=self.globals['hts_mapseq']['url'] ) self.job = htss.job( self.opts.key ) [M.delete_execution(x) for x in \ M.search_executions(with_description=self.opts.key,fails=True)] if self.job.options.get("config_file"): if os.path.exists(self.job.options["config_file"]): self.opts.config = os.path.abspath(self.job.options["config_file"]) elif os.path.exists("config.txt"): self.opts.config = os.path.abspath("config.txt") if self.opts.config and os.path.exists(self.opts.config): (self.job,self.globals) = frontend.parseConfig( self.opts.config, self.job, self.globals ) elif os.path.exists(self.opts.config): (self.job,self.globals) = frontend.parseConfig( self.opts.config ) self.opts.key = self.job.description else: raise Usage("Need either a job key (-k) or a configuration file (-c).") ##### Genrep instance if 'fasta_file' in self.job.options: if os.path.exists(self.job.options['fasta_file']): self.job.options['fasta_file'] = os.path.abspath(self.job.options['fasta_path']) else: for ext in (".fa",".fa.gz",".tar.gz"): if os.path.exists("ref_sequence"+ext): self.job.options['fasta_file'] = os.path.abspath("ref_sequence"+ext) if not os.path.exists(self.job.options['fasta_file']): raise Usage("Don't know where to find fasta file %s." %self.job.options["fasta_file"]) g_rep = genrep.GenRep( url=self.globals.get("genrep_url"), root=self.globals.get("bwt_root") ) ##### Configure facility LIMS if 'lims' in self.globals: from bbcflib import daflims self.job.dafl = dict((loc,daflims.DAFLIMS( username=self.globals['lims']['user'], password=pwd )) for loc,pwd in self.globals['lims']['passwd'].iteritems()) ######################################################################## ########################## EXECUTION ################################# ######################################################################## ##### Logging logfile_name = os.path.abspath(self.opts.key+".log") debugfile_name = os.path.abspath(self.opts.key+".debug") self.logfile = open(logfile_name,'w') self.debugfile = open(debugfile_name,'w') self.debug_write(json.dumps(self.globals)+"\n") with execution( M, description=self.opts.key, remote_working_directory=self.opts.wdir ) as ex: self.log_write("Enter execution. 
Current working directory: %s" %ex.working_directory) self.job.assembly = genrep.Assembly( assembly=self.job.assembly_id, genrep=g_rep, fasta=self.job.options.get('fasta_file'), annot=self.job.options.get('annot_file'), intype=self.job.options.get('input_type_id',0), ex=ex, via=self.opts.via, bowtie2=self.job.options.get("bowtie2",True) ) ##### Check all the options if not self.check_options(): raise Usage("Problem with options %s" %self.opts) self.debug_write(json.dumps(self.job.options)) self.init_files( ex ) ##### Run workflow self.log_write("Starting workflow.") self.main_func(ex,**self.main_args) ##### Add logs to the LIMS in admin mode self.logfile.flush() self.debugfile.flush() log_desc = set_file_descr('logfile.txt', step='log', type='txt', view="admin") debug_desc = set_file_descr('debug.txt', step='log', type='txt', view="admin") ex.add(os.path.join(logfile_name), description=log_desc) ex.add(os.path.join(debugfile_name), description=debug_desc) ##### Create GDV project if self.job.options['create_gdv_project']: self.gdv_create(ex) ######################################################################## ######################## POSTPROCESSING ############################## ######################################################################## allfiles = get_files( ex.id, M ) if self.job.options['create_gdv_project'] and \ self.job.options['gdv_project'].get('project',{}).get('id',0)>0: allfiles['url'] = self.gdv_upload(allfiles.get('sql',{})) self.logfile.close() self.debugfile.close() print json.dumps(allfiles) with open(self.opts.key+".done",'w') as done: json.dump(allfiles,done) self.send_email() return 0
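# The fasta_file option above is resolved either from the path given in the job
# options or from a conventionally named "ref_sequence" archive in the working
# directory. The same lookup as a small standalone helper (helper and default names
# are ours, shown only to make the fallback order explicit):
import os

def resolve_fasta(path, fallback_prefix="ref_sequence", exts=(".fa", ".fa.gz", ".tar.gz")):
    """Return an absolute path to the reference fasta, or None if nothing is found."""
    if path and os.path.exists(path):
        return os.path.abspath(path)
    for ext in exts:
        candidate = fallback_prefix + ext
        if os.path.exists(candidate):
            return os.path.abspath(candidate)
    return None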
def microbiome_workflow(ex, job, assembly, logfile=sys.stdout, via='lsf'): ''' Main: * 0. retrieve bam files from mapseq job * 0.a. merge bam files (=> 1 bam file per group) * 1. for each group: * 1.a get counts per group (=> 1 file per group) * 1.b get counts per Level (Kingdom, Phylum, Class, Order, Family, Genus and Species) (=> 1 file per level / per group) * 2. combine counts * 2.a combine counts for all groups (=> 1 combined file) * 2.b combine counts per level for all groups (=> 1 combined file per Level) * 3. generate barplots (=> 1 plot per group + per level + per combined files) ''' ### params levels = [ 'Kingdom', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species' ] infosCols = { 'Kingdom': [0, [1, 2]], 'Phylum': [[0, 1], [2, 3]], 'Class': [[0, 1, 2], [3, 4]], 'Order': [[0, 1, 2, 3], [4, 5]], 'Family': [[0, 1, 2, 3, 4], [5, 6]], 'Genus': [[0, 1, 2, 3, 4, 5], [6, 7]], 'Species': [[0, 1, 2, 3, 4, 5, 6], [7, 8]] } ### outputs processed = {'cnts': {}, 'cnts_level': {}, 'plots': {}} ### do it mapseq_files = job.files # 1.a get counts per group (=> 1 file per group) futures = {} for gid, group in job.groups.iteritems(): group_name = group['name'] bamfiles = [m['bam'] for m in mapseq_files[gid].values()] futures[gid] = run_microbiome.nonblocking(ex, [ "bam_to_annot_counts", bamfiles, assembly.annotations_path, group_name ], via=via, memory=8) # 1.b get counts per Level (Kingdom, Phylum, Class, Order, Family, Genus and Species) (=> 1 file per level / per group) step = 'counts' for gid, future in futures.iteritems(): res = future.wait() processed['cnts'][gid] = res # group_name + "_counts_annot.txt" fname = job.groups[gid]['name'] + "_counts_annot.txt" ex.add(res, description=set_file_descr(fname, groupId=gid, step=step, type="txt")) processed['cnts_level'][gid] = [ run_microbiome.nonblocking(ex, ["getCountsPerLevel", res, level], via=via, memory=8) for level in levels ] # 2.a combine counts for all groups (=> 1 combined file) files = processed['cnts'].values() combined_out = [ run_microbiome.nonblocking(ex, ["combine_counts", files, 0, [1, 2]], via=via, memory=8) ] # 2.b combine counts per level for all groups (=> 1 combined file per Level) for n, level in enumerate(levels): files = dict([(gid, f[n].wait()) for gid, f in processed['cnts_level'].iteritems()]) combined_out.append( run_microbiome.nonblocking( ex, ["combine_counts", files.values()] + infosCols.get(level, [0, [1, 2]]), via=via, memory=8)) for gid, f in files.iteritems(): fname = job.groups[gid]['name'] + "_counts_annot_" + level + ".txt" ex.add(f, description=set_file_descr(fname, groupId=gid, step=step, type="txt")) step = 'combined' ex.add(combined_out[0].wait(), description=set_file_descr("combined_counts.txt", step=step, type="txt")) for nl, level in enumerate(levels): ex.add(combined_out[nl + 1].wait(), description=set_file_descr("combined_counts" + level + ".txt", step=step, type="txt")) return 0
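# combine_counts merges the per-group count files on their annotation columns
# (infosCols gives, per taxonomic level, the key column indices and the count column
# indices passed to the run_microbiome script). A simplified stand-in that merges
# plain tab-separated files on the key columns; the actual script may differ in
# header handling and formatting:
def combine_counts_sketch(files, key_cols, count_cols):
    combined = {}  # key tuple -> one list of count fields per input file
    for i, path in enumerate(files):
        with open(path) as fh:
            for line in fh:
                fields = line.rstrip("\n").split("\t")
                key = tuple(fields[k] for k in key_cols)
                row = combined.setdefault(key, [["0"] * len(count_cols) for _ in files])
                row[i] = [fields[c] for c in count_cols]
    lines = []
    for key, rows in sorted(combined.items()):
        flat = [c for row in rows for c in row]
        lines.append("\t".join(list(key) + flat))
    return lines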
def parallel_meme( ex, assembly, regions, name=None, chip=False, meme_args=None, via='lsf' ): """Fetches sequences, then calls ``meme`` on them and finally saves the results in the repository. """ if meme_args is None: meme_args = [] if not(isinstance(regions,list)): regions = [regions] if not(isinstance(name,list)): name = [name or '_'] futures = {} fasta_files = {} background = assembly.statistics(unique_filename_in(),frequency=True) # genomeRef = assembly.untar_genome_fasta() for i,n in enumerate(name): (fasta, size) = assembly.fasta_from_regions( regions[i], ex=ex ) tmpfile = unique_filename_in() outdir = unique_filename_in() if chip: futures[n] = (outdir, memechip.nonblocking( ex, fasta, outdir, background, args=meme_args, via=via, stderr=tmpfile, memory=6 )) else: futures[n] = (outdir, meme.nonblocking( ex, fasta, outdir, background, maxsize=(size*3)/2, args=meme_args, via=via, stderr=tmpfile, memory=6 )) fasta_files[n] = fasta all_res = {} for n,f in futures.iteritems(): f[1].wait() meme_out = f[0] archive = unique_filename_in() tgz = tarfile.open(archive, "w:gz") tgz.add( meme_out, arcname=n[1]+"_meme", exclude=lambda x: os.path.basename(x) in [fasta_files[n],background] ) tgz.close() ex.add( archive, description=set_file_descr(n[1]+"_meme.tgz", step='meme', type='tar', groupId=n[0]) ) gzipfile(ex,fasta_files[n],args=["-f"]) ex.add( fasta_files[n]+".gz", description=set_file_descr(n[1]+"_sites.fa.gz", step='meme', type='fasta', groupId=n[0]) ) if not(chip) and os.path.exists(os.path.join(meme_out, "meme.xml")): meme_res = parse_meme_xml( ex, os.path.join(meme_out, "meme.xml"), assembly.chrmeta ) if os.path.exists(os.path.join(meme_out, "meme.html")): ex.add( os.path.join(meme_out, "meme.html"), description=set_file_descr(n[1]+"_meme.html", step='meme', type='html', groupId=n[0]) ) ex.add( meme_res['sql'], description=set_file_descr(n[1]+"_meme_sites.sql", step='meme', type='sql', groupId=n[0]) ) for i,motif in enumerate(meme_res['matrices'].keys()): ex.add( meme_res['matrices'][motif], description=set_file_descr(n[1]+"_meme_"+motif+".txt", step='meme', type='txt', groupId=n[0]) ) ex.add( os.path.join(meme_out, "logo"+str(i+1)+".png"), description=set_file_descr(n[1]+"_meme_"+motif+".png", step='meme', type='png', groupId=n[0]) ) all_res[n] = meme_res return all_res
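# The archive step above excludes the input fasta and the background file from the
# MEME results tarball via TarFile.add(exclude=...), a keyword that is deprecated in
# recent Python versions. An equivalent sketch using the filter= callback, assuming
# (as in the workflow) that the names to skip are bare filenames produced by
# unique_filename_in():
import os
import tarfile

def tar_results(meme_out, archive, arcname, skip_basenames):
    def _filter(tarinfo):
        if os.path.basename(tarinfo.name) in skip_basenames:
            return None  # drop this member from the archive
        return tarinfo
    with tarfile.open(archive, "w:gz") as tgz:
        tgz.add(meme_out, arcname=arcname, filter=_filter)

# e.g. tar_results(outdir, "results.tgz", name + "_meme", {fasta, background})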
def dnaseseq_workflow( ex, job, assembly, logfile=sys.stdout, via='lsf' ): """ This workflow performs the following steps: * BAM files from replicates within the same group are merged * MACS is called to identify enriched regions (only peak summit +- 300 will be used), this can be by-passed by provinding a bed file to any group * Wellington is called to identify footprints within these enriched regions * If a list of motifs is provided (by group), footprints are scanned and motif occurences (log-likelihood ratio > 0) are recorded in a bed file * Average DNAse profiles around motifs are plotted """ tests = [] controls = [] names = {'tests': [], 'controls': []} supdir = os.path.split(ex.remote_working_directory)[0] for gid,mapped in job.files.iteritems(): group_name = job.groups[gid]['name'] if not isinstance(mapped,dict): raise TypeError("Files values must be dictionaries with keys *run_ids* or 'bam'.") if 'bam' in mapped: mapped = {'_': mapped} if len(mapped)>1: bamfile = merge_bam(ex, [m['bam'] for m in mapped.values()]) index = index_bam(ex, bamfile) else: bamfile = mapped.values()[0]['bam'] if job.groups[gid]['control']: controls.append(bamfile) names['controls'].append((gid,group_name)) else: if os.path.exists(job.groups[gid].get('bedfile','null')): bedfile = job.groups[gid]['bedfile'] elif os.path.exists(os.path.join(supdir,job.groups[gid].get('bedfile','null'))): bedfile = os.path.join(supdir,job.groups[gid]['bedfile']) else: bedfile = None tests.append((bedfile,bamfile)) names['tests'].append((gid,group_name)) if len(controls)<1: controls = [None] names['controls'] = [(0,None)] tests = macs_bedfiles( ex, assembly.chrmeta, tests, controls, names, job.options.get('macs_args',["--keep-dup","10"]), via, logfile ) bedlist = run_wellington(ex, tests, names, assembly, via, logfile) ######################### Motif scanning / plotting if any([gr.get('motif') != 'null' and gr.get('motif') for gr in job.groups.values()]): motifbeds = motif_scan( ex, bedlist, assembly, job.groups, via, logfile ) siglist = dict((gid[0],[]) for gid in names['tests']) for gid,mapped in job.files.iteritems(): wig = [] suffixes = ["fwd","rev"] merge_strands = int(job.options.get('merge_strands',-1)) read_extension = int(job.options.get('read_extension') or -1) make_wigs = merge_strands >= 0 or read_extension != 1 for m in mapped.values(): if make_wigs or not('wig' in m) or len(m['wig'])<2: output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta, nreads=m["stats"]["total"], merge=-1, read_extension=1, convert=False, b2w_args=[], via=via ) wig.append(dict((s,output+s+'.sql') for s in suffixes)) else: wig.append(m['wig']) if len(wig) > 1: wig[0] = dict((s,merge_sql(ex, [x[s] for x in wig], via=via)) for s in suffixes) _trn = job.groups[gid]['name']+"_%s" if job.groups[gid]['control']: for s,w in wig[0].iteritems(): for _g in siglist.keys(): siglist[_g].append(track(w,info={'name': _trn%s})) else: siglist[gid].extend([track(w,info={'name': _trn%s}) for s,w in wig[0].iteritems()]) plot_files = plot_footprint_profile( ex, motifbeds, siglist, assembly.chrnames, job.groups, logfile ) for gid, flist in plot_files.iteritems(): gname = job.groups[gid]['name'] plotall = unique_filename_in() touch( ex, plotall ) ex.add(plotall, description=set_file_descr(gname+'_footprints_plots', type='none', view='admin', step='motifs', groupId=gid)) ex.add(flist['pdf'], description=set_file_descr(gname+'_footprints_plots.pdf', type='pdf', step='motifs', groupId=gid), associate_to_filename=plotall, template='%s.pdf') tarname = 
unique_filename_in() tarfh = tarfile.open(tarname, "w:gz") for mname,matf in flist['mat']: tarfh.add(matf, arcname="%s_%s.txt" % (gname,mname)) tarfh.close() ex.add( tarname, description=set_file_descr(gname+'_footprints_plots.tar.gz', type='tar', step='motifs', groupId=gid), associate_to_filename=plotall, template='%s.tar.gz') logfile.write("\nDone.\n ");logfile.flush() return 0
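# plot_footprint_profile aggregates the DNase signal around each motif occurrence to
# draw an average footprint. A conceptual, self-contained sketch of that aggregation
# (uniform motif window, plain lists instead of track/sql streams; an illustration of
# the idea rather than the plotting routine itself):
def average_profile(motif_centers, coverage, flank=100):
    """motif_centers: list of (chrom, center); coverage: dict chrom -> list of
    per-base scores. Returns the mean score at each offset in [-flank, +flank]."""
    width = 2 * flank + 1
    totals = [0.0] * width
    n = 0
    for chrom, center in motif_centers:
        scores = coverage.get(chrom)
        if scores is None or center - flank < 0 or center + flank >= len(scores):
            continue  # skip occurrences too close to the chromosome ends
        for i in range(width):
            totals[i] += scores[center - flank + i]
        n += 1
    return [t / n for t in totals] if n else totals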
def rnaseq_workflow(ex, job, pileup_level=["genes","transcripts"], via="lsf", junctions=False, stranded=False, logfile=sys.stdout, debugfile=sys.stderr): """Main function of the workflow. :rtype: None :param ex: a bein execution. :param job: a Frontend.Job object (or a dictionary of the same form). :param assembly: a genrep.Assembly object :param junctions: (bool) whether to search for splice junctions using SOAPsplice. [False] :param via: (str) send job via 'local' or 'lsf'. ["lsf"] """ group_names={}; conditions=[] groups = job.groups assembly = job.assembly assert len(groups) > 0, "No groups/runs were given." for gid,group in groups.iteritems(): gname = str(group['name']) group_names[gid] = gname if isinstance(pileup_level,basestring): pileup_level=[pileup_level] # Define conditions as 'group_name.run_id' and store bamfiles in the same order bamfiles = [] for gid,files in job.files.iteritems(): k = 0 for rid,f in files.iteritems(): k+=1 cond = group_names[gid]+'.'+str(k) conditions.append(cond) bamfiles.append(f['bam']) ncond = len(conditions) # Get the assembly's GTF # ...from fasta origin logfile.write("* Prepare GTF\n"); logfile.flush() if hasattr(assembly,"fasta_origin"): logfile.write(" ... from fasta origin\n"); logfile.flush() gtf = gtf_from_bam_header(bamfiles[0]) descr = set_file_descr(gtf, type='txt', step='pileup', view='admin') ex.add(gtf, description=descr) pileup_level = ["transcripts"] if stranded: stranded=False logfile.write(" ... Cannot exploit strand information from custom fasta reference.\n"); logfile.flush() # ... or from (wrong) mapping on the transcriptome elif assembly.intype==2: logfile.write(" ... from mapping on the transcriptome\n"); logfile.flush() gtf = transcriptome_gtf_from_genrep(assembly) # ... or from config file else: gtf = job.options.get('annot_file') if gtf and os.path.exists(os.path.join('..', gtf)): gtf = os.path.join('..', gtf) logfile.write(" ... from config file: %s\n" % gtf); logfile.flush() elif gtf and os.path.exists(gtf): gtf = os.path.abspath(gtf) logfile.write(" ... from config file: %s\n" % gtf); logfile.flush() # ... or from GenRep else: logfile.write(" ... 
from GenRep\n"); logfile.flush() gtf = assembly.create_exome_gtf() #shutil.copy(gtf,"../") # Build controllers rnaseq_args = (ex,via,job,assembly,conditions,debugfile,logfile, pileup_level,junctions,stranded) CNT = Counter(*rnaseq_args) DE = DE_Analysis(*rnaseq_args) PCA = Pca(*rnaseq_args) JN = Junctions(*rnaseq_args) # Count reads on genes, transcripts with "rnacounter" count_files = CNT.count_reads(bamfiles, gtf) def differential_analysis(counts_file, feature_type): #shutil.copy(counts_file, "../") diff_files = DE.differential_analysis(counts_file) if diff_files is not None: for diff in diff_files: # Remove first line diff_nohead = unique_filename_in() with open(diff) as f: head = f.readline().strip() with open(diff_nohead, "wb") as g: for line in f: g.write(line) oname = feature_type + "_differential_"+ head + ".txt" desc = set_file_descr(oname, step='stats', type='txt', ucsc=0) ex.add(diff_nohead, description=desc) # DE and PCA if "genes" in pileup_level: # PCA of groups ~ gene expression description = set_file_descr("genes_expression.txt", step="pileup", type="txt", ucsc=0) ex.add(count_files['genes'], description=description) differential_analysis(count_files['genes'], "genes") if stranded: description = set_file_descr("genes_antisense_expression.txt", step="pileup", type="txt", ucsc=0) ex.add(count_files['genes_anti'], description=description) differential_analysis(count_files['genes_anti'], "genes_antisense") if ncond > 2: PCA.pca_rnaseq(count_files['genes']) if "transcripts" in pileup_level: description = set_file_descr("transcripts_expression.txt", step="pileup", type="txt", ucsc=0) ex.add(count_files['transcripts'], description=description) differential_analysis(count_files['transcripts'], "transcripts") if stranded: description = set_file_descr("transcripts_antisense_expression.txt", step="pileup", type="txt", ucsc=0) ex.add(count_files['transcripts_anti'], description=description) differential_analysis(count_files['transcripts_anti'], "transcripts_antisense") # Find splice junctions if junctions: logfile.write("* Search for splice junctions\n"); logfile.flush() JN.find_junctions() return 0
def add_macs_results( ex, read_length, genome_size, bamfile, ctrlbam=None, name=None, poisson_threshold=None, alias=None, macs_args=None, via='lsf' ): """Calls the ``macs`` function on each possible pair of test and control bam files and adds the respective outputs to the execution repository. ``macs`` options can be controlled with `macs_args`. If a dictionary of Poisson thresholds for each sample is given, then the enrichment bounds ('-m' option) are computed from them otherwise the default is '-m 10,100'. Returns the set of file prefixes. """ if not(isinstance(bamfile,list)): bamfile = [bamfile] if not(isinstance(ctrlbam,list)): ctrlbam = [ctrlbam] if poisson_threshold is None: poisson_threshold = {} if macs_args is None: macs_args = [] futures = {} rl = read_length for i,bam in enumerate(bamfile): n = name['tests'][i] if poisson_threshold.get(n)>0: low = (poisson_threshold.get(n)+1)*5 enrich_bounds = str(min(30,low))+","+str(10*low) else: enrich_bounds = "10,100" if not("-m" in macs_args): macs_args += ["-m",enrich_bounds] if isinstance(read_length,list): rl = read_length[i] for j,cam in enumerate(ctrlbam): m = name['controls'][j] nm = (n,m) futures[nm] = macs.nonblocking( ex, rl, genome_size, bam, cam, args=macs_args, via=via, memory=12 ) prefixes = {} for n,f in futures.iteritems(): p = f.wait() prefixes[n] = p macs_descr0 = {'step':'macs','type':'none','view':'admin','groupId':n[0][0]} macs_descr1 = {'step':'macs','type':'xls','groupId':n[0][0]} macs_descr2 = {'step':'macs','type':'bed','groupId':n[0][0],'ucsc':'1'} filename = "_vs_".join([x[1] for x in n if x[0]]) touch( ex, p ) ex.add( p, description=set_file_descr(filename,**macs_descr0), alias=alias ) ex.add( p+"_peaks.xls", description=set_file_descr(filename+"_peaks.xls",**macs_descr1), associate_to_filename=p, template='%s_peaks.xls' ) bedzip = gzip.open(p+"_peaks.bed.gz",'wb') bedzip.write("track name='"+filename+"_macs_peaks'\n") with open(p+"_peaks.bed") as bedinf: [bedzip.write(l) for l in bedinf] bedzip.close() ex.add( p+"_peaks.bed.gz", description=set_file_descr(filename+"_peaks.bed.gz",**macs_descr2), associate_to_filename=p, template='%s_peaks.bed.gz' ) bedzip = gzip.open(p+"_summits.bed.gz",'wb') bedzip.write("track name='"+filename+"_macs_summits'\n") with open(p+"_summits.bed") as bedinf: [bedzip.write(l) for l in bedinf] bedzip.close() ex.add( p+"_summits.bed.gz", description=set_file_descr(filename+"_summits.bed.gz",**macs_descr2), associate_to_filename=p, template='%s_summits.bed.gz' ) if n[1][0]: ex.add( p+"_negative_peaks.xls", description=set_file_descr(filename+"_negative_peaks.xls",**macs_descr0), associate_to_filename=p, template='%s_negative_peaks.xls' ) return prefixes
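# The '-m' enrichment bounds passed to macs are derived from the per-sample Poisson
# threshold exactly as coded above; the same arithmetic as a small standalone helper:
def macs_enrich_bounds(poisson_threshold):
    """Return the MACS '-m' bounds string for a given Poisson threshold (or None)."""
    if poisson_threshold is not None and poisson_threshold > 0:
        low = (poisson_threshold + 1) * 5
        return "%i,%i" % (min(30, low), 10 * low)
    return "10,100"

# e.g. macs_enrich_bounds(3) -> '20,200'; macs_enrich_bounds(None) -> '10,100'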
def find_junctions(self, soapsplice_index=None, path_to_soapsplice=None, soapsplice_options={}): """ Retrieve unmapped reads from a precedent mapping and runs SOAPsplice on them. Return the names of a .bed track indicating the junctions positions, as well as of a bam file of the alignments attesting the junctions. :param soapsplice_index: (str) path to the SOAPsplice index. :param path_to_soapsplice: (str) specify the path to the program if it is not in your $PATH. :param soapsplice_options: (dict) SOAPsplice options, e.g. {'-m':2}. :rtype: str, str """ @program def soapsplice(unmapped_R1, unmapped_R2, index, output=None, path_to_soapsplice=None, options={}): """Bind 'soapsplice'. Return a text file containing the list of junctions. :param unmapped_R1: (str) path to the fastq file containing the 'left' reads. :param unmapped_R2: (str) path to the fastq file containing the 'right' reads. :param index: (str) path to the SOAPsplice index. :param output: (str) output file name. :param path_to_soapsplice: (str) path to the SOAPsplice executable. If not specified, the program must be in your $PATH. :param options: (dict) SOAPsplice options, given as {opt: value}. :rtype: str Main options:: -p: number of threads, <= 20. [1] -S: 1: forward strand, 2: reverse strand, 3: both. [3] -m: maximum mismatch for one-segment alignment, <= 5. [3] -g: maximum indel for one-segment alignment, <= 2. [2] -i: length of tail that can be ignored in one-segment alignment. [7] -t: longest gap between two segments in two-segment alignment. [500000] -a: shortest length of a segment in two-segment alignment. [8] -q: input quality type in FASTQ file (0: old Illumina, 1: Sanger). [0] -L: maximum distance between paired-end reads. [500000] -l: minimum distance between paired-end reads. [50] -I: insert length of paired-end reads. """ if not output: output = unique_filename_in() path_to_soapsplice = path_to_soapsplice or 'soapsplice' args = [path_to_soapsplice,'-d',index,'-1',unmapped_R1,'-2',unmapped_R2,'-o',output,'-f','2'] opts = [] for k,v in options.iteritems(): opts.extend([str(k),str(v)]) return {"arguments": args+opts, "return_value": output} if not program_exists('soapsplice'): self.write_debug("Skipped junctions search: soapsplice not found.") return self.assembly.set_index_path(intype=3) soapsplice_index = soapsplice_index or self.assembly.index_path soapsplice_options.update(self.job.options.get('soapsplice_options',{})) soapsplice_options.setdefault('-p',16) # number of threads soapsplice_options.setdefault('-q',1) # Sanger format unmapped_fastq = {} for gid, group in self.job.groups.iteritems(): unmapped_fastq[gid] = [] for rid, run in group['runs'].iteritems(): unmapped = self.job.files[gid][rid].get('unmapped_fastq') if not unmapped: self.write_log("No unmapped reads found for group %s, run %d. Skip." % (gid,rid)) continue elif not isinstance(unmapped,tuple): self.write_log("Pair-end reads required. Skip.") continue unmapped_fastq[gid].append(unmapped) if len(unmapped_fastq[gid]) == 0: continue R1 = cat(zip(*unmapped_fastq[gid])[0]) R2 = cat(zip(*unmapped_fastq[gid])[1]) future = soapsplice.nonblocking(self.ex,R1,R2,soapsplice_index, path_to_soapsplice=path_to_soapsplice, options=soapsplice_options, via=self.via, memory=8, threads=soapsplice_options['-p']) try: template = future.wait() except Exception as err: self.write_debug("SOAPsplice failed: %s." 
% str(err)) return if template is None: self.write_debug("SOAPsplice failed.") return junc_file = template+'.junc' bed = self.convert_junc_file(junc_file,self.assembly) bed_descr = set_file_descr('junctions_%s.bed' % group['name'], groupId=gid,type='bed',step='junctions', ucsc=1) bam_descr = set_file_descr('junctions_%s.bam' % group['name'], groupId=gid,type='bam',step='junctions', ucsc=0) sam = template+'.sam' bam = None try: bam = sam_to_bam(self.ex,sam,reheader=self.assembly.name) add_and_index_bam(self.ex, bam, description=bam_descr) self.ex.add(bam, description=bam_descr) except Exception as e: self.write_debug("%s\n(Qualities may be in the wrong format, try with '-q 0'.)" %str(e)) self.ex.add(bed, description=bed_descr) return bed, bam
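# Unmapped reads from all runs of a group are pooled before SOAPsplice: the list of
# (R1, R2) tuples is transposed so all R1 files are concatenated together and all R2
# files likewise. The same pattern in isolation (helper name is ours):
def pool_paired_fastq(pairs):
    """pairs: list of (R1_path, R2_path) tuples, one per run.
    Returns ([all R1 paths], [all R2 paths]), ready to be concatenated (cf. cat())."""
    if not pairs:
        return [], []
    r1_list, r2_list = zip(*pairs)
    return list(r1_list), list(r2_list)

# e.g. pool_paired_fastq([("a_R1.fq", "a_R2.fq"), ("b_R1.fq", "b_R2.fq")])
#      -> (['a_R1.fq', 'b_R1.fq'], ['a_R2.fq', 'b_R2.fq'])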
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None, via='local', logfile=sys.stdout, debugfile=sys.stderr): """Main function of the workflow""" ref_genome = assembly.fasta_by_chrom sample_names = [job.groups[gid]['name'] for gid in sorted(job.files.keys())] logfile.write("\n* Generate vcfs for each chrom/group\n"); logfile.flush() vcfs = dict((chrom,{}) for chrom in ref_genome.keys()) # {chr: {}} bams = {} # Launch the jobs for gid in sorted(job.files.keys()): # Merge all bams belonging to the same group runs = [r['bam'] for r in job.files[gid].itervalues()] bam = Samfile(runs[0]) header = bam.header headerfile = unique_filename_in() for h in header["SQ"]: if h["SN"] in assembly.chrmeta: h["SN"] = assembly.chrmeta[h["SN"]]["ac"] head = Samfile( headerfile, "wh", header=header ) head.close() if len(runs) > 1: _b = merge_bam(ex,runs) index_bam(ex,_b) bams[gid] = _b else: bams[gid] = runs[0] # Samtools mpileup + bcftools + vcfutils.pl for chrom,ref in ref_genome.iteritems(): vcf = unique_filename_in() vcfs[chrom][gid] = (vcf, pileup.nonblocking(ex, bams[gid], ref, header=headerfile, via=via, stdout=vcf)) logfile.write(" ...Group %s running.\n" %job.groups[gid]['name']); logfile.flush() # Wait for vcfs to finish and store them in *vcfs[chrom][gid]* for gid in sorted(job.files.keys()): for chrom,ref in ref_genome.iteritems(): vcfs[chrom][gid][1].wait() vcfs[chrom][gid] = vcfs[chrom][gid][0] logfile.write(" ...Group %s done.\n" %job.groups[gid]['name']); logfile.flush() # Targz the pileup files (vcf) tarname = unique_filename_in() tarfh = tarfile.open(tarname, "w:gz") for chrom,v in vcfs.iteritems(): for gid,vcf in v.iteritems(): tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'],chrom)) tarfh.close() ex.add( tarname, description=set_file_descr("vcfs_files.tar.gz",step="pileup",type="tar",view='admin') ) logfile.write("\n* Merge info from vcf files\n"); logfile.flush() outall = unique_filename_in() outexons = unique_filename_in() with open(outall,"w") as fout: fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+ \ ['gene','location_type','distance'])+'\n') with open(outexons,"w") as fout: fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa'] \ + ['new_aa_'+s for s in sample_names])+'\n') msa_table = dict((s,'') for s in [assembly.name]+sample_names) for chrom,v in vcfs.iteritems(): logfile.write(" > Chromosome '%s'\n" % chrom); logfile.flush() # Put together info from all vcf files logfile.write(" - All SNPs\n"); logfile.flush() allsnps = all_snps(ex,chrom,vcfs[chrom],bams,outall,assembly, sample_names,mincov,float(minsnp),logfile,debugfile) # Annotate SNPs and check synonymy logfile.write(" - Exonic SNPs\n"); logfile.flush() exon_snps(chrom,outexons,allsnps,assembly,sample_names,ref_genome,logfile,debugfile) for snprow in allsnps: for n,k in enumerate([assembly.name]+sample_names): msa_table[k] += snprow[3+n][0] description = set_file_descr("allSNP.txt",step="SNPs",type="txt") ex.add(outall,description=description) description = set_file_descr("exonsSNP.txt",step="SNPs",type="txt") ex.add(outexons,description=description) msafile = unique_filename_in() with open(msafile,"w") as msa: msa.write(" %i %i\n"%(len(msa_table),len(msa_table.values()[0]))) for name,seq in msa_table.iteritems(): msa.write("%s\t%s\n" %(name,seq)) msa_table = {} description = set_file_descr("SNPalignment.txt",step="SNPs",type="txt") ex.add(msafile,description=description) # Create UCSC bed tracks logfile.write("\n* 
Create tracks\n"); logfile.flush() create_tracks(ex,outall,sample_names,assembly) # Create quantitative tracks logfile.write("\n* Create heteroz. and quality tracks\n"); logfile.flush() def _process_pileup(pileups, seq, startpos, endpos): atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3} vectors = ([],[],[]) for pileupcolumn in pileups: position = pileupcolumn.pos if position < startpos: continue if position >= endpos: break coverage = pileupcolumn.n ref_symbol = seq[position-startpos] ref = atoi.get(ref_symbol, 4) symbols = [0,0,0,0,0] quality = 0 for pileupread in pileupcolumn.pileups: symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos], 4)] += 1 quality += ord(pileupread.alignment.qual[pileupread.qpos])-33 quality = float(quality)/coverage info = heterozygosity(ref, symbols[0:4]) if coverage > 0: vectors[0].append((position, position+1, coverage)) if info > 0: vectors[1].append((position, position+1, info)) if quality > 0: vectors[2].append((position, position+1, quality)) # yield (position, position+1, coverage, info, quality) return vectors if job.options.get('make_bigwigs',False): _descr = {'groupId':0,'step':"tracks",'type':"bigWig",'ucsc':'1'} for gid,bamfile in bams.iteritems(): _descr['groupId'] = gid bamtr = track(bamfile,format="bam") covname = unique_filename_in()+".bw" out_cov = track(covname, chrmeta=assembly.chrmeta) hetname = unique_filename_in()+".bw" out_het = track(hetname, chrmeta=assembly.chrmeta) qualname = unique_filename_in()+".bw" out_qual = track(qualname, chrmeta=assembly.chrmeta) for chrom, cinfo in assembly.chrmeta.iteritems(): fasta = Fastafile(ref_genome[chrom]) #process fasta and bam by 10Mb chunks for chunk in range(0,cinfo["length"],10**7): fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk+10**7) vecs = _process_pileup(bamtr.pileup(chrom, chunk, chunk+10**7), fastaseq, chunk, chunk+10**7) out_cov.write(vecs[0], fields=['start','end','score'], chrom=chrom) out_het.write(vecs[1], fields=['start','end','score'], chrom=chrom) out_qual.write(vecs[2], fields=['start','end','score'], chrom=chrom) out_cov.close() out_het.close() out_qual.close() description = set_file_descr(job.groups[gid]['name']+"_coverage.bw",**_descr) ex.add(covname,description=description) description = set_file_descr(job.groups[gid]['name']+"_heterozygosity.bw",**_descr) ex.add(hetname,description=description) description = set_file_descr(job.groups[gid]['name']+"_quality.bw",**_descr) ex.add(qualname,description=description) return 0
def c4seq_workflow( ex, job, primers_dict, assembly, c4_url=None, script_path='', logfile=sys.stdout, via='lsf' ): ''' Main * open the 4C-seq minilims and create execution * 0. get/create the library * 1. if necessary, calculate the density file from the bam file (mapseq.parallel_density_sql) * 2. calculate the count per fragment for each denstiy file with gfminer:score_by_feature to calculate) ''' mapseq_files = job.files ### outputs processed = {'lib': {}, 'density': {}, '4cseq': {}} processed['4cseq'] = {'density_files' : {}, 'countsPerFrag' : {}, 'countsPerFrag_grp' : {}, 'norm' : {}, 'norm_grp' : {}, 'profileCorrection': {}, 'profileCorrection_grp' : {}, 'smooth_grp' : {}, 'domainogram_grp' : {}, 'bricks2frags' : {}} # was 'smoothFrag': {}, 'domainogram': {}} regToExclude = {} new_libs=[] ### options run_domainogram = {} before_profile_correction = {} if not job.options.get('viewpoints_chrs',False): out_chromosomes = ','.join([ch for ch in assembly.chrnames]) else: out_chromosomes = ','.join([primers_dict.get(group['name'],{}).get('baitcoord').split(':')[0] for gid,group in job.groups.iteritems()]) print "out_chromosomes=" + out_chromosomes + "\n" sizeExt = job.options.get('norm_reg',1000000) print "region considered for normalisation: mid viewpoint +/-" + str(sizeExt) + 'bps' ### do it for gid, group in job.groups.iteritems(): run_domainogram[gid] = group.get('run_domainogram',False) if isinstance(run_domainogram[gid],basestring): run_domainogram[gid] = (run_domainogram[gid].lower() in ['1','true','on','t']) before_profile_correction[gid] = group.get('before_profile_correction',False) if isinstance(before_profile_correction[gid],basestring): before_profile_correction[gid] = (before_profile_correction[gid].lower() in ['1','true','on','t']) processed['lib'][gid] = get_libForGrp(ex, group, assembly, new_libs, gid, c4_url, via=via) #reffile='/archive/epfl/bbcf/data/DubouleDaan/library_Nla_30bps/library_Nla_30bps_segmentInfos.bed' processed['4cseq']['density_files'][gid] = {} regToExclude[gid] = primers_dict.get(group['name'],{}).get('regToExclude',"").replace('\r','') # if no regToExclude defined, set it as mid_baitCoord +/-5kb if len(regToExclude[gid])==0 : baitcoord_mid = int(0.5 * (int(primers_dict.get(group['name'],{}).get('baitcoord').split(':')[1].split('-')[0]) + int(primers_dict.get(group['name'],{}).get('baitcoord').split(':')[1].split('-')[1]) )) regToExclude[gid] = primers_dict.get(group['name'],{}).get('baitcoord').split(':')[0] + ':' + str(baitcoord_mid-5000) + '-' + str(baitcoord_mid+5000) #print(';'.join([k+"="+v for k,v in primers_dict.get(group['name'],{}).iteritems()])) print(primers_dict.get(group['name'],{})) print "regToExclude["+str(gid)+"]="+regToExclude[gid] for rid,run in group['runs'].iteritems(): libname = mapseq_files[gid][rid]['libname'] if job.options.get('merge_strands') != 0 or not('wig' in mapseq_files[gid][rid]): density_file=parallel_density_sql( ex, mapseq_files[gid][rid]['bam'], assembly.chrmeta, nreads=mapseq_files[gid][rid]['stats']["total"], merge=0, read_extension=mapseq_files[gid][rid]['stats']['read_length'], convert=False, via=via ) density_file += "merged.sql" ex.add( density_file, description=set_file_descr("density_file_"+libname+".sql", groupId=gid,step="density",type="sql",view='admin',gdv="1") ) else: density_file = mapseq_files[gid][rid]['wig']['merged'] #density_files.append(density_file) processed['4cseq']['density_files'][gid][rid]=density_file # back to grp level! 
# not anymore: # processed['density'][gid] = merge_sql(ex, density_files, via=via) processed['4cseq']['countsPerFrag'] = density_to_countsPerFrag( ex, processed, job.groups, assembly, regToExclude, script_path, via ) ## access per gid+rid futures_norm = {} countsPerFrags_bedGraph = {} futures_merged_raw = {} for gid, group in job.groups.iteritems(): futures_norm[gid] = {} countsPerFrags_bedGraph[gid] = {} processed['4cseq']['norm'][gid] = {} for rid,run in group['runs'].iteritems(): normfile = unique_filename_in() touch(ex, normfile) resfile = unique_filename_in()+".bedGraph" resfiles = processed['4cseq']['countsPerFrag'][gid][rid] # _all.sql convert(resfiles[3],resfile) countsPerFrags_bedGraph[gid][rid] = resfile print "call normFrags: infiles="+resfile+", normfile="+normfile+"baitCoord="+primers_dict[group['name']]['baitcoord']+", sizeExt=sizeExt, name="+ group['name']+"rep_"+str(rid) + "regToExclude="+regToExclude[gid]+"\n" futures_norm[gid][rid] = normFrags.nonblocking( ex, resfile, normfile, baitCoord=primers_dict[group['name']]['baitcoord'], sizeExt=sizeExt, name=group['name']+"rep_"+str(rid) ,regToExclude=regToExclude[gid], script_path=script_path, via=via ) processed['4cseq']['norm'][gid][rid] = normfile if len(group) > 1: ## merge replicates before normalisation. mergefile = unique_filename_in() touch(ex, mergefile) titleName=group['name']+"_raw_mergedRep" print "gid="+group['name'] print "call mergeRep for replicates before normalisation: infiles="+",".join([res_rid for rid,res_rid in countsPerFrags_bedGraph[gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n" futures_merged_raw[gid] = mergeRep.nonblocking( ex, ",".join([res_rid for rid,res_rid in countsPerFrags_bedGraph[gid].iteritems()]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8) processed['4cseq']['countsPerFrag_grp'][gid] = mergefile else: futures_merged_raw[gid] = None processed['4cseq']['countsPerFrag_grp'][gid] = countsPerFrags_bedGraph[gid][0] #if no replicates, then the file we want is the 1st one print "***** profile correction / sample + merge normalised data" futures_merged = {} # per gid futures_profcor = {} # per gid, per rid for gid, group in job.groups.iteritems(): ## run profile correction per run then merge them futures_profcor[gid] = {} processed['4cseq']['profileCorrection'][gid] = {} for rid, run in group['runs'].iteritems(): # wait for normalisation of all replicates to be finished futures_norm[gid][rid].wait() ## normalised files, per grp, per rep normfile = processed['4cseq']['norm'][gid][rid] file1 = unique_filename_in() #track file touch(ex,file1) file2 = unique_filename_in() #report file touch(ex,file2) file3 = unique_filename_in() #table file touch(ex, file3) print "call profileCorrection: normfile="+normfile+", baitCoord="+primers_dict[group['name']]['baitcoord']+", name="+group['name']+", file1="+file1+", file2="+file2+", file3= "+file3+"\n" futures_profcor[gid][rid] = profileCorrection.nonblocking( ex, normfile, primers_dict[group['name']]['baitcoord'], group['name'], file1, file2, file3, script_path, via=via ) processed['4cseq']['profileCorrection'][gid][rid] = [file1, file2, file3] ## merge replicates before profile correction. Needs all normalisation for the given grp to be finished, this is why it comes after the rid loop. 
if len(group)>1: mergefile = unique_filename_in() touch(ex, mergefile) titleName=group['name']+"_norm_mergedRep" print "gid="+group['name'] print "call mergeRep: infiles="+",".join([res_rid for rid,res_rid in processed['4cseq']['norm'][gid].iteritems()])+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n" futures_merged[gid] = mergeRep.nonblocking( ex, ",".join([res_rid for rid,res_rid in processed['4cseq']['norm'][gid].iteritems()]), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8) processed['4cseq']['norm_grp'][gid] = mergefile else: futures_merged[gid] = None processed['4cseq']['norm_grp'][gid] = processed['4cseq']['norm'][gid][0] ##if no replicates, then the file we want is the 1st one print "***** merge profile corrected data" futures_profcor_merged = {} # per gid for gid, group in job.groups.iteritems(): processed['4cseq']['profileCorrection_grp'][gid] = {} for rid, run in group['runs'].iteritems(): futures_profcor[gid][rid].wait() ## wait for ProfileCorrection to be finished ## merge replicates after profile correction if len(group)>1: mergefile = unique_filename_in() touch(ex, mergefile) titleName=group['name']+"_ProfCor_mergedRep" pcfiles = [ processed['4cseq']['profileCorrection'][gid][rid][0] for rid,res_rid in processed['4cseq']['profileCorrection'][gid].iteritems()] print "call mergeRep (for PC tables): infiles="+",".join(pcfiles)+", mergedfile="+mergefile+", regToExclude="+regToExclude[gid]+"\n" futures_profcor_merged[gid] = mergeRep.nonblocking( ex, ",".join(pcfiles), mergefile, regToExclude[gid], name=titleName, script_path=script_path, via=via , memory= 8) processed['4cseq']['profileCorrection_grp'][gid] = mergefile else: futures_profcor_merged[gid] = None processed['4cseq']['profileCorrection_grp'][gid] = processed['4cseq']['profileCorrection'][gid][0] ##if no replicates, then the file we want is the 1st one print "***** smooth data" futures_smoothed = {} for gid, group in job.groups.iteritems(): file1 = unique_filename_in() touch(ex,file1) file2 = unique_filename_in() touch(ex, file2) file3 = unique_filename_in() touch(ex, file3) nFragsPerWin = group['window_size'] futures_merged_raw[gid].wait() ## wait for merging of raw_grp to be completed futures_smoothed[gid] = ( smoothFragFile.nonblocking( ex, processed['4cseq']['countsPerFrag_grp'][gid], nFragsPerWin, group['name'], file1, regToExclude[gid], script_path=script_path, via=via, memory=6 ), ) futures_merged[gid].wait() ## wait for merging of norm_grp to be completed futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['norm_grp'][gid], nFragsPerWin, group['name']+"_norm", file2, regToExclude[gid], script_path=script_path, via=via, memory=6 ), ) futures_profcor_merged[gid].wait() # wait for the merging of profile corrected data to be done futures_smoothed[gid] += ( smoothFragFile.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid], nFragsPerWin, group['name']+"_fromProfileCorrected", file3, regToExclude[gid], script_path=script_path, via=via, memory=6 ), ) processed['4cseq']['smooth_grp'][gid] = [file1,file2,file3] #[smoothed_file_before_Norm, smoothed file before PC, smoothed file after PC] print "***** Domainograms" futures_domainograms = {} for gid, group in job.groups.iteritems(): grName = job.groups[gid]['name'] if run_domainogram[gid]: regCoord = regToExclude[gid] or primers_dict[grName]['baitcoord'] if before_profile_correction[gid]: futures_domainograms[gid] = runDomainogram.nonblocking( ex, 
processed['4cseq']['norm_grp'][gid], grName, regCoord=regCoord, skip=1, script_path=script_path, via=via, memory=15 ) else: futures_domainograms[gid] = runDomainogram.nonblocking( ex, processed['4cseq']['profileCorrection_grp'][gid], grName, regCoord=regCoord.split(':')[0], skip=1, script_path=script_path, via=via, memory=15 ) ## prepare tar files for domainogram results (if any) ## and create "BRICKS to frags" files print "***** BRICKS to Frags" futures_BRICKS2Frags = {} for gid, f in futures_domainograms.iteritems(): if run_domainogram[gid]: # if domainogram has been run resFiles = [] logFile = f.wait() start = False tarname = job.groups[gid]['name']+"_domainogram.tar.gz" res_tar = tarfile.open(tarname, "w:gz") futures_BRICKS2Frags[gid] = [] processed['4cseq']['bricks2frags'][gid] = [] if logFile is None: continue with open(logFile) as f: for s in f: s = s.strip() if '####resfiles####' in s: start = True elif start and "RData" not in s: resFiles.append(s) res_tar.add(s) if start and "foundBRICKS" in s: bricks2fragsfile = unique_filename_in()+".bedGraph" touch(ex, bricks2fragsfile) futures_BRICKS2Frags[gid] += [ BRICKSToFrag.nonblocking(ex, s, processed['4cseq']['norm_grp'][gid], bricks2fragsfile, script_path=script_path, via=via, memory=4 ) ] processed['4cseq']['bricks2frags'][gid] += [ bricks2fragsfile ] res_tar.close() processed['4cseq']['domainogram_grp'][gid] = resFiles + [tarname] ############### prepare tables for global results print "***** combine results into tables " allNames=[] allFiles=[] allRegToExclude=[] for gid, group in job.groups.iteritems(): for rid,run in group['runs'].iteritems(): allNames += [ group['name']+"_rep"+str(rid)+"_norm", group['name']+"_rep"+str(rid)+"_fit" ] allFiles += [ processed['4cseq']['profileCorrection'][gid][rid][2] ] allRegToExclude += [ regToExclude[gid] ] tablePC=unique_filename_in()+".txt" print("***will call makeTable with:") print(",".join(allFiles)) print("resfile="+tablePC) print(",".join(allNames)) touch(ex,tablePC) #regToExclude[gid] futures_tables = (makeTable.nonblocking(ex, ",".join(allFiles), tablePC, ",".join(allNames), idCols="4,5", all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), ) # wait for all smoothing to be done for gid, fg in futures_smoothed.iteritems(): for f in fg: f.wait() ## make Table raw/smoothed_raw print("** make Table raw/smoothed_raw") allNames=[] allFiles=[] allRegToExclude=[] for gid, group in job.groups.iteritems(): futures_merged_raw[gid].wait() allNames += [ group['name']+"_raw", group['name']+"_rawSmoothed" ] allFiles += [ processed['4cseq']['countsPerFrag_grp'][gid], processed['4cseq']['smooth_grp'][gid][0] ] allRegToExclude += [ 'NA', regToExclude[gid] ] tableSmoothedRaw_grp=unique_filename_in()+".txt" touch(ex,tableSmoothedRaw_grp) futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedRaw_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), ) ## make Table norm/smoothed_norm before PC print("** make Table norm/smoothed_norm befor PC") allNames=[] allFiles=[] allRegToExclude=[] for gid, group in job.groups.iteritems(): allNames += [ group['name']+"_norm", group['name']+"_smoothed" ] allFiles += [ processed['4cseq']['norm_grp'][gid], processed['4cseq']['smooth_grp'][gid][1] ] allRegToExclude += [ regToExclude[gid], regToExclude[gid] ] tableSmoothed_grp=unique_filename_in()+".txt" touch(ex,tableSmoothed_grp) futures_tables += 
(makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothed_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), ) ## make Table norm/smoothed_norm after PC print("** make Table norm/smoothed_norm after PC") allNames=[] allFiles=[] allRegToExclude=[] for gid, group in job.groups.iteritems(): allNames += [ group['name']+"_normPC", group['name']+"_smoothedPC" ] allFiles += [ processed['4cseq']['profileCorrection_grp'][gid], processed['4cseq']['smooth_grp'][gid][2] ] allRegToExclude += [ regToExclude[gid], regToExclude[gid] ] tableSmoothedPC_grp=unique_filename_in()+".txt" touch(ex,tableSmoothedPC_grp) futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableSmoothedPC_grp, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, all_regToExclude=','.join(allRegToExclude), script_path=script_path, via=via, memory=8 ), ) ## combine BRICKS2Frags files allNames=[] allFiles=[] for gid, fg in futures_BRICKS2Frags.iteritems(): for f in fg: f.wait() allNames += [ job.groups[gid]['name']+"_BRICKSpval" ] cat_bricks2frags = unique_filename_in()+".txt" print ','.join(processed['4cseq']['bricks2frags'][gid]) cat_bricks2frags = cat(processed['4cseq']['bricks2frags'][gid],out=cat_bricks2frags) allFiles += [ cat_bricks2frags ] for gid, fg in futures_smoothed.iteritems(): for f in fg: f.wait() tableBRICKS2Frags = unique_filename_in()+".txt" touch(ex,tableBRICKS2Frags) futures_tables += (makeTable.nonblocking(ex, ",".join(allFiles), tableBRICKS2Frags, ",".join(allNames), idCols="4", out_chromosomes = out_chromosomes, defVal="NA", script_path=script_path, via=via, memory=8 ), ) for f in futures_tables: f.wait() ################ Add everything to minilims below! 
step = "density" for gid in processed['4cseq']['density_files'].keys(): for rid, sql in processed['4cseq']['density_files'][gid].iteritems(): fname = "density_file_"+job.groups[gid]['name']+"_merged_rep"+str(rid) ex.add( sql, description=set_file_descr( fname+".sql", groupId=gid,step=step,type="sql",gdv="1" ) ) wig = unique_filename_in()+".bw" convert( sql, wig ) ex.add( wig, description=set_file_descr( fname+".bw", groupId=gid,step=step,type="bigWig",ucsc="1") ) step = "counts_per_frag" #was _norm_counts_per_frags # before normalisation process, per replicate for gid in processed['4cseq']['countsPerFrag'].keys(): for rid, resfiles in processed['4cseq']['countsPerFrag'][gid].iteritems(): fname = "meanScorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid) ex.add( resfiles[1], description=set_file_descr( fname+".sql", groupId=gid,step=step,type="sql",view="admin",gdv='1')) #gzipfile(ex,resfiles[0]) #ex.add( resfiles[0]+".gz", description=set_file_descr( fname+".bed.gz", # groupId=gid,step=step,type="bed",view="admin" )) fname = "segToFrag_"+job.groups[gid]['name']+"_rep"+str(rid) ex.add( resfiles[3], description=set_file_descr( fname+"_all.sql", groupId=gid,step=step,type="sql", comment="all informative frags - null included" )) trsql = track(resfiles[3]) bwig = unique_filename_in()+".bw" trwig = track(bwig,chrmeta=trsql.chrmeta) trwig.write(trsql.read(fields=['chr','start','end','score'], selection={'score':(0.01,sys.maxint)})) trwig.close() ex.add( bwig, set_file_descr(fname+".bw",groupId=gid,step=step,type="bigWig",ucsc='1')) ## add segToFrags before normalisation futures_merged_raw[gid].wait() trbedgraph = track(removeNA(processed['4cseq']['countsPerFrag_grp'][gid]),format='bedgraph') bwig = unique_filename_in()+".bw" trwig = track(bwig,chrmeta=assembly.chrmeta) trwig.write(trbedgraph.read(fields=['chr','start','end','score'], selection={'score':(0.01,sys.maxint)})) trwig.close() fname = "segToFrag_"+job.groups[gid]['name'] ex.add( bwig, description=set_file_descr( fname+".bw", groupId=gid,step=step,type="bigWig", comment="segToFrag file before normalisation" )) step = "norm_counts_per_frags" # after new normalisation process, combined replicates for gid, resfile in processed['4cseq']['norm_grp'].iteritems(): fname = "normalised_scorePerFeature_"+job.groups[gid]['name'] gzipfile(ex,resfile) ex.add( resfile+".gz", description=set_file_descr( fname+".bedGraph.gz", groupId=gid,step=step, type="bedGraph",ucsc='1')) # norm files, per replicates (might be removed) for gid, dict_gid in processed['4cseq']['norm'].iteritems(): for rid, resfile in dict_gid.iteritems(): fname = "normalised_scorePerFeature_"+job.groups[gid]['name']+"_rep"+str(rid) gzipfile(ex,resfile) ex.add(resfile+".gz", description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) step = "profile_correction" # Profile corrected data, combined replicates for gid, profileCorrectedFile in processed['4cseq']['profileCorrection_grp'].iteritems(): fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected" gzipfile(ex,profileCorrectedFile) ex.add( profileCorrectedFile+".gz", description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) # Profile corrected, per replicate (might be removed) for gid, dict_gid in processed['4cseq']['profileCorrection'].iteritems(): for rid, resfiles in dict_gid.iteritems(): # profileCorrectedFile = resfiles[0] reportProfileCorrection = resfiles[1] fname = 
"segToFrag_"+job.groups[gid]['name']+"_profileCorrected_rep"+str(rid) # gzipfile(ex,profileCorrectedFile) # ex.add( profileCorrectedFile+".gz", # description=set_file_descr(fname+".bedGraph.gz",groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) ex.add( reportProfileCorrection, description=set_file_descr(fname+".pdf", groupId=gid,step=step,type="pdf")) step = "smoothing" for gid, resfiles in processed['4cseq']['smooth_grp'].iteritems(): rawSmoothFile = resfiles[0] smoothFile = resfiles[1] afterProfileCorrection = resfiles[2] nFrags = str(job.groups[gid]['window_size']) ## smoothed file before normalisation fname = "segToFrag_"+job.groups[gid]['name']+"_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz" gzipfile(ex,rawSmoothFile) ex.add(rawSmoothFile+".gz", description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) ## smoothed file after normalisation, before Profile correction fname = "segToFrag_"+job.groups[gid]['name']+"_norm_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz" gzipfile(ex,smoothFile) ex.add(smoothFile+".gz", description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) ## smoothed file after normalisation, after Profile correction fname = "segToFrag_"+job.groups[gid]['name']+"_profileCorrected_smoothed_"+nFrags+"FragsPerWin.bedGraph.gz" gzipfile(ex,afterProfileCorrection) ex.add(afterProfileCorrection+".gz", description=set_file_descr(fname,groupId=gid,step=step,type="bedGraph",ucsc='1',gdv='1')) step = "domainograms" for gid, resfiles in processed['4cseq']['domainogram_grp'].iteritems(): tarFile = resfiles.pop() fname = job.groups[gid]['name']+"_domainogram.tar.gz" ex.add(tarFile, description=set_file_descr(fname, groupId=gid,step=step,type="tgz")) for s in resfiles: if s[-8:] == "bedGraph": gzipfile(ex,s) s += ".gz" ex.add( s, description=set_file_descr( s, groupId=gid,step=step,type="bedGraph",ucsc="1",gdv="1")) step = "combined_results" gzipfile(ex,tableSmoothedRaw_grp) ex.add(tableSmoothedRaw_grp+".gz", description=set_file_descr("table_segToFrags_smoothed_combined_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tableSmoothed_grp) ex.add(tableSmoothed_grp+".gz", description=set_file_descr("table_normalised_smoothed_combined_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tableSmoothedPC_grp) ex.add(tableSmoothedPC_grp+".gz", description=set_file_descr("table_profileCorrected_smoothed_combined_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tablePC) ex.add(tablePC+".gz", description=set_file_descr("table_normalised_fit_per_replicates.txt.gz",step=step,type="txt")) gzipfile(ex,tableBRICKS2Frags) ex.add(tableBRICKS2Frags+".gz", description=set_file_descr("table_frags_in_BRICKS_combined_replicates.txt.gz",step=step,type="txt")) return processed
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None, via='local', logfile=sys.stdout, debugfile=sys.stderr): """Main function of the workflow""" ref_genome = assembly.fasta_by_chrom sample_names = [ job.groups[gid]['name'] for gid in sorted(job.files.keys()) ] logfile.write("\n* Generate vcfs for each chrom/group\n") logfile.flush() vcfs = dict((chrom, {}) for chrom in ref_genome.keys()) # {chr: {}} bams = {} # Launch the jobs bam = Samfile(job.files.values()[0].values()[0]['bam']) header = bam.header headerfile = unique_filename_in() for h in header["SQ"]: if h["SN"] in assembly.chrmeta: h["SN"] = assembly.chrmeta[h["SN"]]["ac"] head = Samfile(headerfile, "wh", header=header) head.close() for gid in job.files.keys(): # Merge all bams belonging to the same group runs = [r['bam'] for r in job.files[gid].itervalues()] if len(runs) > 1: _b = merge_bam(ex, runs) index_bam(ex, _b) bams[gid] = _b else: index_bam(ex, runs[0]) bams[gid] = runs[0] # Samtools mpileup + bcftools + vcfutils.pl for chrom, ref in ref_genome.iteritems(): vcf = unique_filename_in() vcfs[chrom][gid] = (vcf, pileup.nonblocking(ex, bams[gid], ref, header=headerfile, via=via, stdout=vcf)) logfile.write(" ...Group %s running.\n" % job.groups[gid]['name']) logfile.flush() # Wait for vcfs to finish and store them in *vcfs[chrom][gid]* for gid in job.files.keys(): for chrom, ref in ref_genome.iteritems(): vcfs[chrom][gid][1].wait() vcfs[chrom][gid] = vcfs[chrom][gid][0] logfile.write(" ...Group %s done.\n" % job.groups[gid]['name']) logfile.flush() # Targz the pileup files (vcf) tarname = unique_filename_in() tarfh = tarfile.open(tarname, "w:gz") for chrom, v in vcfs.iteritems(): for gid, vcf in v.iteritems(): tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'], chrom)) tarfh.close() ex.add(tarname, description=set_file_descr("vcf_files.tar.gz", step="pileup", type="tar", view='admin')) logfile.write("\n* Merge info from vcf files\n") logfile.flush() outall = unique_filename_in() outexons = unique_filename_in() with open(outall, "w") as fout: fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+ \ ['gene','location_type','distance'])+'\n') with open(outexons, "w") as fout: fout.write('#'+'\t'.join(['chromosome','position','reference']+sample_names+['exon','strand','ref_aa'] \ + ['new_aa_'+s for s in sample_names])+'\n') msa_table = dict((s, '') for s in [assembly.name] + sample_names) for chrom, v in vcfs.iteritems(): logfile.write(" > Chromosome '%s'\n" % chrom) logfile.flush() # Put together info from all vcf files logfile.write(" - All SNPs\n") logfile.flush() allsnps = all_snps(ex, chrom, vcfs[chrom], bams, outall, assembly, headerfile, sample_names, mincov, float(minsnp), logfile, debugfile, via) # Annotate SNPs and check synonymy logfile.write(" - Exonic SNPs\n") logfile.flush() exon_snps(chrom, outexons, allsnps, assembly, sample_names, ref_genome, logfile, debugfile) for snprow in allsnps: for n, k in enumerate([assembly.name] + sample_names): base = snprow[3 + n][0] if base == "-": base = snprow[3][0] if base not in 'ACGTacgt': base = "N" msa_table[k] += base description = set_file_descr("allSNP.txt", step="SNPs", type="txt") ex.add(outall, description=description) description = set_file_descr("exonsSNP.txt", step="SNPs", type="txt") ex.add(outexons, description=description) msafile = unique_filename_in() with open(msafile, "w") as msa: msa.write(" %i %i\n" % (len(msa_table), len(msa_table.values()[0]))) for name, seq in msa_table.iteritems(): msa.write("%s\t%s\n" % 
(name, seq)) msa_table = {} description = set_file_descr("SNPalignment.txt", step="SNPs", type="txt") ex.add(msafile, description=description) # Create UCSC bed tracks logfile.write("\n* Create tracks\n") logfile.flush() create_tracks(ex, outall, sample_names, assembly) # Create quantitative tracks logfile.write("\n* Create heteroz. and quality tracks\n") logfile.flush() def _process_pileup(pileups, seq, startpos, endpos): atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3} vectors = ([], [], []) for pileupcolumn in pileups: position = pileupcolumn.pos if position < startpos: continue if position >= endpos: break coverage = pileupcolumn.n ref_symbol = seq[position - startpos] ref = atoi.get(ref_symbol, 4) symbols = [0, 0, 0, 0, 0] quality = 0 for pileupread in pileupcolumn.pileups: if pileupread.qpos >= len(pileupread.alignment.seq): coverage -= 1 else: symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos], 4)] += 1 quality += ord( pileupread.alignment.qual[pileupread.qpos]) - 33 quality = float(quality) / coverage info = heterozygosity(ref, symbols[0:4]) if coverage > 0: vectors[0].append((position, position + 1, coverage)) if info > 0: vectors[1].append((position, position + 1, info)) if quality > 0: vectors[2].append((position, position + 1, quality)) # yield (position, position+1, coverage, info, quality) return vectors if job.options.get('make_bigwigs', False): _descr = { 'groupId': 0, 'step': "tracks", 'type': "bigWig", 'ucsc': '1' } for gid, bamfile in bams.iteritems(): _descr['groupId'] = gid bamtr = track(bamfile, format="bam") covname = unique_filename_in() + ".bw" out_cov = track(covname, chrmeta=assembly.chrmeta) hetname = unique_filename_in() + ".bw" out_het = track(hetname, chrmeta=assembly.chrmeta) qualname = unique_filename_in() + ".bw" out_qual = track(qualname, chrmeta=assembly.chrmeta) for chrom, cinfo in assembly.chrmeta.iteritems(): fasta = Fastafile(ref_genome[chrom]) #process fasta and bam by 10Mb chunks for chunk in range(0, cinfo["length"], 10**7): fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk + 10**7) vecs = _process_pileup( bamtr.pileup(chrom, chunk, chunk + 10**7), fastaseq, chunk, chunk + 10**7) out_cov.write(vecs[0], fields=['start', 'end', 'score'], chrom=chrom) out_het.write(vecs[1], fields=['start', 'end', 'score'], chrom=chrom) out_qual.write(vecs[2], fields=['start', 'end', 'score'], chrom=chrom) out_cov.close() out_het.close() out_qual.close() description = set_file_descr( job.groups[gid]['name'] + "_coverage.bw", **_descr) ex.add(covname, description=description) description = set_file_descr( job.groups[gid]['name'] + "_heterozygosity.bw", **_descr) ex.add(hetname, description=description) description = set_file_descr( job.groups[gid]['name'] + "_quality.bw", **_descr) ex.add(qualname, description=description) return 0
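## Editor's note (hedged sketch): the SNPalignment.txt written above is a
## PHYLIP-like file: a header line giving the number of sequences and their common
## length, then one "name<TAB>sequence" row per sample.  The same writer as a
## self-contained helper (helper name and example values are the editor's):
def _write_snp_alignment(msa_table, path):
    """Write *msa_table* ({sample_name: sequence}) in the format produced above."""
    with open(path, "w") as msa:
        msa.write(" %i %i\n" % (len(msa_table), len(list(msa_table.values())[0])))
        for name, seq in msa_table.items():
            msa.write("%s\t%s\n" % (name, seq))
## e.g. _write_snp_alignment({"assembly": "ACGT", "sample1": "ACTT"}, "SNPalignment.txt")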
def chipseq_workflow(ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf'):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and 'run_deconv'.

    :param ex: a 'bein' execution environment to run jobs in,
    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with keys 'groups', 'files' and 'options' if applicable,
    :param assembly: a genrep.Assembly object,
    :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'-bw'``: 200 ('bandwidth')
    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a tuple of a dictionary with keys *group_id* from the job groups, *macs* and *deconv* if applicable, and values file description dictionaries, and a dictionary of *group_ids* to *names* used in file descriptions.
    """
    options = {}
    if logfile is None:
        logfile = sys.stdout
    if isinstance(job_or_dict, frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict, dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not ('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files', {})
    else:
        raise TypeError("job_or_dict must be a frontend.Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands', -1))
    suffixes = ["fwd", "rev"]
    peak_deconvolution = options.get('peak_deconvolution', False)
    if isinstance(peak_deconvolution, basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1', 'true', 't']
    run_meme = options.get('run_meme', False)
    if isinstance(run_meme, basestring):
        run_meme = run_meme.lower() in ['1', 'true', 't']
    macs_args = options.get('macs_args', ["--bw", "200"])
    b2w_args = options.get('b2w_args', [])
    if not (isinstance(mapseq_files, dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid, mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not (isinstance(mapped, dict)):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name + "_" + str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking(ex, mapped[k]["bam"], via=via)
            if mapped[k].get('poisson_threshold', -1) > 0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns) > 0:
            # key by the same (gid, name) tuple stored in names['tests'],
            # so that add_macs_results can look the threshold up
            p_thresh[(gid, group_name)] = sum(ptruns) / len(ptruns)
        for k in futures.keys():
            mapped[k]['stats'] = futures[k].wait()
        if len(mapped) > 1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid, group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
            genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    logfile.write("Starting MACS.\n")
    logfile.flush()
    processed = {'macs': add_macs_results(ex, read_length, genome_size, tests,
                                          ctrlbam=controls, name=names,
                                          poisson_threshold=p_thresh,
                                          macs_args=macs_args, via=via)}
    logfile.write("Done MACS.\n")
    logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6
    _select = {'score': (6, sys.maxint)}
    _fields = ['chr', 'start', 'end', 'name', 'score']
    for i, name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name, names['controls'][0])
            macsbed = track(processed['macs'][ctrl] + "_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate([apply(track(processed['macs'][(name, x)] + "_summits.bed",
                                               chrmeta=chrlist, fields=_fields).read(selection=_select),
                                         'name', lambda __n, _n=xn: "%s:%i" % (__n, _n))
                                   for xn, x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood(macsbed, before_start=150, after_end=150)
        peak_list[name] = unique_filename_in() + ".sql"
        macs_final = track(peak_list[name], chrmeta=chrlist,
                           info={'datatype': 'qualitative'},
                           fields=['start', 'end', 'name', 'score'])
        macs_final.write(fusion(macs_neighb), clip=True)
        macs_final.close()
    ##############################
    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1:
        options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension'] > 100
    if options['read_extension'] > 100:
        options['read_extension'] = 50
    for gid, mapped in mapseq_files.iteritems():
        # if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not ('wig' in m) or len(m['wig']) < 2:
                output = mapseq.parallel_density_sql(ex, m["bam"], assembly.chrmeta,
                                                     nreads=m["stats"]["total"],
                                                     merge=-1,
                                                     read_extension=options['read_extension'],
                                                     convert=False,
                                                     b2w_args=b2w_args, via=via)
                wig.append(dict((s, output + s + '.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]
    if peak_deconvolution:
        ##############################
        def _filter_deconv(stream, pval):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream(((x[0],) + ((x[2] + x[1]) / 2 - 150, (x[2] + x[1]) / 2 + 150) + x[3:]
                                  for x in stream
                                  if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval),
fields=stream.fields) ############################## processed['deconv'] = {} for name in names['tests']: logfile.write(name[1] + " deconvolution.\n") logfile.flush() if len(names['controls']) < 2: ctrl = (name, names['controls'][0]) macsbed = processed['macs'][ctrl] + "_peaks.bed" else: macsbed = intersect_many_bed(ex, [ processed['macs'][(name, x)] + "_peaks.bed" for x in names['controls'] ], via=via) deconv = run_deconv(ex, merged_wig[name[1]], macsbed, assembly.chrmeta, options['read_extension'], script_path, via=via) peak_list[name] = unique_filename_in() + ".bed" trbed = track(deconv['peaks']).read() with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile: bedfile.write(fusion(_filter_deconv(trbed, 0.65))) ex.add(deconv['peaks'], description=set_file_descr(name[1] + '_peaks.sql', type='sql', step='deconvolution', groupId=name[0])) ex.add(deconv['profile'], description=set_file_descr(name[1] + '_deconv.sql', type='sql', step='deconvolution', groupId=name[0])) bigwig = unique_filename_in() try: convert(deconv['profile'], (bigwig, "bigWig")) ex.add(bigwig, description=set_file_descr(name[1] + '_deconv.bw', type='bigWig', ucsc='1', step='deconvolution', groupId=name[0])) except OSError as e: logfile.write(str(e)) logfile.flush() ex.add(deconv['pdf'], description=set_file_descr(name[1] + '_deconv.pdf', type='pdf', step='deconvolution', groupId=name[0])) processed['deconv'][name] = deconv ############################## def _join_macs(stream, xlsl, _f): def _macs_row(_s): for _p in _s: for _n in _p[3].split("|"): if len(xlsl) == 1: nb = int( _n.split(";")[0][13:]) if _n[:3] == "ID=" else int( _n[10:]) yield _p + xlsl[0][nb - 1][1:] else: nb = _n.split( ";")[0][13:] if _n[:3] == "ID=" else _n[10:] nb = nb.split(":") yield _p + xlsl[int(nb[1])][int(nb[0]) - 1][1:] return FeatureStream(_macs_row(stream), fields=_f) ############################## peakfile_list = [] for name, plist in peak_list.iteritems(): ptrack = track(plist, chrmeta=chrlist, fields=["chr", "start", "end", "name", "score"]) peakfile = unique_filename_in() xlsh, xlsl = parse_MACS_xls([ processed['macs'][(name, _c)] + "_peaks.xls" for _c in names['controls'] ]) try: ###### if assembly doesn't have annotations, we skip the "getNearestFeature" but still go through "_join_macs" assembly.gene_track() _fields = ['chr','start','end','name','score','gene','location_type','distance']\ +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:] peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields) peakout.make_header("#" + "\t".join([ 'chromosome', 'start', 'end', 'info', 'peak_height', 'gene(s)', 'location_type', 'distance' ] + _fields[8:])) for chrom in assembly.chrnames: _feat = assembly.gene_track(chrom) peakout.write(_join_macs( getNearestFeature(ptrack.read(selection=chrom), _feat), xlsl, _fields), mode='append') except ValueError: _fields = ['chr', 'start', 'end', 'name', 'score' ] + ["MACS_%s" % h for h in xlsh[1:5]] + xlsh[5:] peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields) peakout.make_header("#" + "\t".join( ['chromosome', 'start', 'end', 'info', 'peak_height'] + _fields[8:])) for chrom in assembly.chrnames: peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields), mode='append') peakout.close() gzipfile(ex, peakfile) peakfile_list.append( track(peakfile + ".gz", format='txt', fields=_fields)) ex.add(peakfile + ".gz", description=set_file_descr(name[1] + '_annotated_peaks.txt.gz', type='text', step='annotation', groupId=name[0])) stracks = [ track(wig, info={'name': 
name + "_" + st}) for name, wigdict in merged_wig.iteritems() for st, wig in wigdict.iteritems() ] tablefile = unique_filename_in() with open(tablefile, "w") as _tf: _pnames = [ "MACS_%s_vs_%s" % (_s[1], _c[1]) if _c[1] else "MACS_%s" % _s[1] for _s in names['tests'] for _c in names['controls'] ] _tf.write("\t".join([ '#chromosome', 'start', 'end', ] + _pnames + [s.name for s in stracks]) + "\n") #### need to do something about peak origin (split names, write to separate columns?) for chrom in assembly.chrnames: pk_lst = [ apply(pt.read(chrom, fields=['chr', 'start', 'end', 'name']), 'name', lambda __n, _n=npt: "%s:%i" % (__n, _n)) for npt, pt in enumerate(peakfile_list) ] features = fusion( concatenate(pk_lst, fields=['chr', 'start', 'end', 'name'], remove_duplicates=True, group_by=['chr', 'start', 'end'])) sread = [sig.read(chrom) for sig in stracks] quantifs = score_by_feature(sread, features, method='sum') nidx = quantifs.fields.index('name') _ns = len(tests) _nc = len(controls) with open(tablefile, "a") as _tf: for row in quantifs: pcols = [''] * _ns * _nc _rnsplit = row[nidx].split(":") _n1 = _rnsplit[0] _k = 0 while (_k < len(_rnsplit) - 1 - int(_nc > 1)): if _nc > 1: _k += 2 _n2 = _rnsplit[_k - 1] _n = _rnsplit[_k].split("|") pcols[int(_n[0]) * _nc + int(_n2)] = _n1 else: _k += 1 _n = _rnsplit[_k].split("|") pcols[int(_n[0])] = _n1 _n1 = "|".join(_n[1:]) _tf.write("\t".join( str(tt) for tt in row[:nidx] + tuple(pcols) + row[nidx + 1:]) + "\n") gzipfile(ex, tablefile) ex.add(tablefile + ".gz", description=set_file_descr('Combined_peak_quantifications.txt.gz', type='text', step='summary')) if run_meme: from bbcflib.motif import parallel_meme logfile.write("Starting MEME.\n") logfile.flush() processed['meme'] = parallel_meme( ex, assembly, peak_list.values(), name=peak_list.keys(), chip=True, meme_args=['-meme-nmotifs', '4', '-meme-mod', 'zoops'], via=via) return processed
def microbiome_workflow(ex, job, assembly, logfile=sys.stdout, via="lsf"): """ Main: * 0. retrieve bam files from mapseq job * 0.a. merge bam files (=> 1 bam file per group) * 1. for each group: * 1.a get counts per group (=> 1 file per group) * 1.b get counts per Level (Kingdom, Phylum, Class, Order, Family, Genus and Species) (=> 1 file per level / per group) * 2. combine counts * 2.a combine counts for all groups (=> 1 combined file) * 2.b combine counts per level for all groups (=> 1 combined file per Level) * 3. generate barplots (=> 1 plot per group + per level + per combined files) """ ### params levels = ["Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"] infosCols = { "Kingdom": [0, [1, 2]], "Phylum": [[0, 1], [2, 3]], "Class": [[0, 1, 2], [3, 4]], "Order": [[0, 1, 2, 3], [4, 5]], "Family": [[0, 1, 2, 3, 4], [5, 6]], "Genus": [[0, 1, 2, 3, 4, 5], [6, 7]], "Species": [[0, 1, 2, 3, 4, 5, 6], [7, 8]], } ### outputs processed = {"cnts": {}, "cnts_level": {}, "plots": {}} ### do it mapseq_files = job.files # 1.a get counts per group (=> 1 file per group) futures = {} for gid, group in job.groups.iteritems(): group_name = group["name"] bamfiles = [m["bam"] for m in mapseq_files[gid].values()] futures[gid] = run_microbiome.nonblocking( ex, ["bam_to_annot_counts", bamfiles, assembly.annotations_path, group_name], via=via, memory=8 ) # 1.b get counts per Level (Kingdom, Phylum, Class, Order, Family, Genus and Species) (=> 1 file per level / per group) step = "counts" for gid, future in futures.iteritems(): res = future.wait() processed["cnts"][gid] = res # group_name + "_counts_annot.txt" fname = job.groups[gid]["name"] + "_counts_annot.txt" ex.add(res, description=set_file_descr(fname, groupId=gid, step=step, type="txt")) processed["cnts_level"][gid] = [ run_microbiome.nonblocking(ex, ["getCountsPerLevel", res, level], via=via, memory=8) for level in levels ] # 2.a combine counts for all groups (=> 1 combined file) files = processed["cnts"].values() combined_out = [run_microbiome.nonblocking(ex, ["combine_counts", files, 0, [1, 2]], via=via, memory=8)] # 2.b combine counts per level for all groups (=> 1 combined file per Level) for n, level in enumerate(levels): files = dict([(gid, f[n].wait()) for gid, f in processed["cnts_level"].iteritems()]) combined_out.append( run_microbiome.nonblocking( ex, ["combine_counts", files.values()] + infosCols.get(level, [0, [1, 2]]), via=via, memory=8 ) ) for gid, f in files.iteritems(): fname = job.groups[gid]["name"] + "_counts_annot_" + level + ".txt" ex.add(f, description=set_file_descr(fname, groupId=gid, step=step, type="txt")) step = "combined" ex.add(combined_out[0].wait(), description=set_file_descr("combined_counts.txt", step=step, type="txt")) for nl, level in enumerate(levels): ex.add( combined_out[nl + 1].wait(), description=set_file_descr("combined_counts" + level + ".txt", step=step, type="txt"), ) return 0
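## Editor's note (heavily hedged sketch): the combine_counts calls above receive,
## for each level, key-column indices and count-column indices (see infosCols);
## the actual merge is implemented in the external run_microbiome script.  The toy
## helper below (editor's name, assumed tab-separated layout) only illustrates the
## idea of joining several per-group count files on a shared key column.
def _combine_counts_sketch(paths, key_col=0, count_col=1):
    combined = {}
    for n, path in enumerate(paths):
        with open(path) as handle:
            for line in handle:
                fields = line.rstrip("\n").split("\t")
                row = combined.setdefault(fields[key_col], ["0"] * len(paths))
                row[n] = fields[count_col]
    return combined  # {key: [count_in_group0, count_in_group1, ...]}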
def count_reads(self, bamfiles, gtf): self.write_log("* Counting reads") # Count reads on genes, transcripts with "rnacounter" ncond = len(self.conditions) tablenames = [None] * ncond futures = [None] * ncond max_rlen = 0 counter_options = ["--nh"] for bam in bamfiles: sam = pysam.Samfile(bam, 'rb') max_rlen = max(max_rlen, sam.next().rlen) counter_options += ["--exon_cutoff", str(max_rlen)] bwt_args = self.job.options.get('map_args', {}).get('bwt_args', []) # if not "--local" in bwt_args: # counter_options += ["--nh"] if hasattr(self.assembly, "fasta_origin") or self.assembly.intype == 2: counter_options += ["--type", "transcripts", "--method", "raw"] else: counter_options += [ "--type", "genes,transcripts", "--method", "raw,nnls" ] if self.stranded: counter_options += ["--stranded"] for i, c in enumerate(self.conditions): tablenames[i] = unique_filename_in() futures[i] = rnacounter.nonblocking(self.ex, bamfiles[i], gtf, stdout=tablenames[i], via=self.via, options=counter_options) # Put samples together for i, c in enumerate(self.conditions): try: futures[i].wait() except Exception as err: self.write_debug("Counting failed: %s." % str(err)) raise err if futures[i] is None: self.write_debug("Counting failed.") raise ValueError("Counting failed.") if len(tablenames) > 1: joined = unique_filename_in() rnacounter_join.nonblocking(self.ex, tablenames, stdout=joined, via=self.via).wait() else: joined = tablenames[0] # Split genes and transcripts into separate files genes_filename = unique_filename_in() trans_filename = unique_filename_in() genes_file = open(genes_filename, "wb") trans_file = open(trans_filename, "wb") if self.stranded: genes_anti_filename = unique_filename_in() trans_anti_filename = unique_filename_in() genes_anti_file = open(genes_anti_filename, "wb") trans_anti_file = open(trans_anti_filename, "wb") with open(joined) as jfile: header = jfile.readline() hconds = ["counts." + c for c in self.conditions ] + ["rpkm." + c for c in self.conditions] hinfo = header.strip().split('\t')[2 * ncond + 1:] header = '\t'.join(["ID"] + hconds + hinfo) + '\n' genes_file.write(header) trans_file.write(header) type_idx = header.split('\t').index("Type") if self.stranded: genes_anti_file.write(header) trans_anti_file.write(header) sense_idx = header.split('\t').index("Sense") for line in jfile: L = line.split('\t') ftype = L[type_idx].lower() sense = L[sense_idx].lower() if ftype == 'gene': if sense == 'antisense': genes_anti_file.write(line) else: genes_file.write(line) elif ftype == 'transcript': if sense == 'antisense': trans_anti_file.write(line) else: trans_file.write(line) else: for line in jfile: L = line.split('\t') ftype = L[type_idx].lower() if ftype == 'gene': genes_file.write(line) elif ftype == 'transcript': trans_file.write(line) genes_file.close() trans_file.close() # Keep intermediate tables for i, c in enumerate(self.conditions): #shutil.copy(tablenames[i], "../counts%d.txt"%i) descr = set_file_descr(self.conditions[i] + '_' + tablenames[i] + '.gz', type='txt', step='pileup', view='admin') gzipfile(self.ex, tablenames[i]) self.ex.add(tablenames[i] + '.gz', description=descr) if self.stranded: count_files = { 'genes': genes_filename, 'transcripts': trans_filename, 'genes_anti': genes_anti_filename, 'transcripts_anti': trans_anti_filename } else: count_files = { 'genes': genes_filename, 'transcripts': trans_filename } return count_files
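## Editor's note (hedged sketch): count_reads splits the joined rnacounter table on
## its "Type" column (and additionally on "Sense" for stranded data).  The same
## dispatch on in-memory tab-separated lines, self-contained (helper name is the
## editor's):
def _split_by_type(header, lines):
    """Return (gene_lines, transcript_lines) from tab-separated *lines*."""
    type_idx = header.rstrip("\n").split('\t').index("Type")
    genes, transcripts = [], []
    for line in lines:
        ftype = line.split('\t')[type_idx].lower()
        if ftype == 'gene':
            genes.append(line)
        elif ftype == 'transcript':
            transcripts.append(line)
    return genes, transcripts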
def find_junctions(self, soapsplice_index=None, path_to_soapsplice=None, soapsplice_options={}): """ Retrieve unmapped reads from a precedent mapping and runs SOAPsplice on them. Return the names of a .bed track indicating the junctions positions, as well as of a bam file of the alignments attesting the junctions. :param soapsplice_index: (str) path to the SOAPsplice index. :param path_to_soapsplice: (str) specify the path to the program if it is not in your $PATH. :param soapsplice_options: (dict) SOAPsplice options, e.g. {'-m':2}. :rtype: str, str """ @program def soapsplice(unmapped_R1, unmapped_R2, index, output=None, path_to_soapsplice=None, options={}): """Bind 'soapsplice'. Return a text file containing the list of junctions. :param unmapped_R1: (str) path to the fastq file containing the 'left' reads. :param unmapped_R2: (str) path to the fastq file containing the 'right' reads. :param index: (str) path to the SOAPsplice index. :param output: (str) output file name. :param path_to_soapsplice: (str) path to the SOAPsplice executable. If not specified, the program must be in your $PATH. :param options: (dict) SOAPsplice options, given as {opt: value}. :rtype: str Main options:: -p: number of threads, <= 20. [1] -S: 1: forward strand, 2: reverse strand, 3: both. [3] -m: maximum mismatch for one-segment alignment, <= 5. [3] -g: maximum indel for one-segment alignment, <= 2. [2] -i: length of tail that can be ignored in one-segment alignment. [7] -t: longest gap between two segments in two-segment alignment. [500000] -a: shortest length of a segment in two-segment alignment. [8] -q: input quality type in FASTQ file (0: old Illumina, 1: Sanger). [0] -L: maximum distance between paired-end reads. [500000] -l: minimum distance between paired-end reads. [50] -I: insert length of paired-end reads. """ if not output: output = unique_filename_in() path_to_soapsplice = path_to_soapsplice or 'soapsplice' args = [ path_to_soapsplice, '-d', index, '-1', unmapped_R1, '-2', unmapped_R2, '-o', output, '-f', '2' ] opts = [] for k, v in options.iteritems(): opts.extend([str(k), str(v)]) return {"arguments": args + opts, "return_value": output} if not program_exists('soapsplice'): self.write_debug("Skipped junctions search: soapsplice not found.") return self.assembly.set_index_path(intype=3) soapsplice_index = soapsplice_index or self.assembly.index_path soapsplice_options.update( self.job.options.get('soapsplice_options', {})) soapsplice_options.setdefault('-p', 16) # number of threads soapsplice_options.setdefault('-q', 1) # Sanger format unmapped_fastq = {} for gid, group in self.job.groups.iteritems(): unmapped_fastq[gid] = [] for rid, run in group['runs'].iteritems(): unmapped = self.job.files[gid][rid].get('unmapped_fastq') if not unmapped: self.write_log( "No unmapped reads found for group %s, run %d. Skip." % (gid, rid)) continue elif not isinstance(unmapped, tuple): self.write_log("Pair-end reads required. Skip.") continue unmapped_fastq[gid].append(unmapped) if len(unmapped_fastq[gid]) == 0: continue R1 = cat(zip(*unmapped_fastq[gid])[0]) R2 = cat(zip(*unmapped_fastq[gid])[1]) future = soapsplice.nonblocking( self.ex, R1, R2, soapsplice_index, path_to_soapsplice=path_to_soapsplice, options=soapsplice_options, via=self.via, memory=8, threads=soapsplice_options['-p']) try: template = future.wait() except Exception as err: self.write_debug("SOAPsplice failed: %s." 
                                  % str(err))
                return
            if template is None:
                self.write_debug("SOAPsplice failed.")
                return
            junc_file = template + '.junc'
            bed = self.convert_junc_file(junc_file, self.assembly)
            bed_descr = set_file_descr('junctions_%s.bed' % group['name'],
                                       groupId=gid, type='bed', step='junctions', ucsc=1)
            bam_descr = set_file_descr('junctions_%s.bam' % group['name'],
                                       groupId=gid, type='bam', step='junctions', ucsc=0)
            sam = template + '.sam'
            bam = None  # stays None if the SAM conversion below fails, so the return cannot raise
            try:
                bam = sam_to_bam(self.ex, sam, reheader=self.assembly.name)
                add_and_index_bam(self.ex, bam, description=bam_descr)
                self.ex.add(bam, description=bam_descr)
            except Exception as e:
                self.write_debug("%s\n(Qualities may be in the wrong format, try with '-q 0'.)" % str(e))
            self.ex.add(bed, description=bed_descr)
            return bed, bam
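## Editor's note (hedged sketch): the soapsplice wrapper above flattens its options
## dictionary into extra command-line tokens appended after the fixed arguments.
## A stand-alone version of that argument construction (function name is the
## editor's; paths in the example are hypothetical):
def _soapsplice_argv(index, unmapped_R1, unmapped_R2, output,
                     options=None, path_to_soapsplice='soapsplice'):
    args = [path_to_soapsplice, '-d', index, '-1', unmapped_R1, '-2', unmapped_R2,
            '-o', output, '-f', '2']
    for k, v in (options or {}).items():
        args.extend([str(k), str(v)])
    return args
## e.g. _soapsplice_argv('idx', 'R1.fastq', 'R2.fastq', 'out', {'-p': 16})
## ends with [..., '-o', 'out', '-f', '2', '-p', '16']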
def add_macs_results(ex, read_length, genome_size, bamfile, ctrlbam=None, name=None, poisson_threshold=None, alias=None, macs_args=None, via='lsf'): """Calls the ``macs`` function on each possible pair of test and control bam files and adds the respective outputs to the execution repository. ``macs`` options can be controlled with `macs_args`. If a dictionary of Poisson thresholds for each sample is given, then the enrichment bounds ('-m' option) are computed from them otherwise the default is '-m 10,100'. Returns the set of file prefixes. """ if not (isinstance(bamfile, list)): bamfile = [bamfile] if not (isinstance(ctrlbam, list)): ctrlbam = [ctrlbam] if poisson_threshold is None: poisson_threshold = {} if macs_args is None: macs_args = [] futures = {} rl = read_length for i, bam in enumerate(bamfile): n = name['tests'][i] if poisson_threshold.get(n) > 0: low = (poisson_threshold.get(n) + 1) * 5 enrich_bounds = str(min(30, low)) + "," + str(10 * low) else: enrich_bounds = "10,100" if not ("-m" in macs_args): macs_args += ["-m", enrich_bounds] if isinstance(read_length, list): rl = read_length[i] for j, cam in enumerate(ctrlbam): m = name['controls'][j] nm = (n, m) futures[nm] = macs.nonblocking(ex, rl, genome_size, bam, cam, args=macs_args, via=via, memory=12) prefixes = {} for n, f in futures.iteritems(): p = f.wait() prefixes[n] = p macs_descr0 = { 'step': 'macs', 'type': 'none', 'view': 'admin', 'groupId': n[0][0] } macs_descr1 = {'step': 'macs', 'type': 'xls', 'groupId': n[0][0]} macs_descr2 = { 'step': 'macs', 'type': 'bed', 'groupId': n[0][0], 'ucsc': '1' } filename = "_vs_".join([x[1] for x in n if x[0]]) touch(ex, p) ex.add(p, description=set_file_descr(filename, **macs_descr0), alias=alias) ex.add(p + "_peaks.xls", description=set_file_descr(filename + "_peaks.xls", **macs_descr1), associate_to_filename=p, template='%s_peaks.xls') bedzip = gzip.open(p + "_peaks.bed.gz", 'wb') bedzip.write("track name='" + filename + "_macs_peaks'\n") with open(p + "_peaks.bed") as bedinf: [bedzip.write(l) for l in bedinf] bedzip.close() ex.add(p + "_peaks.bed.gz", description=set_file_descr(filename + "_peaks.bed.gz", **macs_descr2), associate_to_filename=p, template='%s_peaks.bed.gz') bedzip = gzip.open(p + "_summits.bed.gz", 'wb') bedzip.write("track name='" + filename + "_macs_summits'\n") with open(p + "_summits.bed") as bedinf: [bedzip.write(l) for l in bedinf] bedzip.close() ex.add(p + "_summits.bed.gz", description=set_file_descr(filename + "_summits.bed.gz", **macs_descr2), associate_to_filename=p, template='%s_summits.bed.gz') if n[1][0]: ex.add(p + "_negative_peaks.xls", description=set_file_descr(filename + "_negative_peaks.xls", **macs_descr0), associate_to_filename=p, template='%s_negative_peaks.xls') return prefixes
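## Editor's note (hedged sketch): add_macs_results repacks each MACS BED into a
## gzipped copy prefixed with a UCSC "track name=..." line before registering it.
## The same repacking with only the standard library (Python 2 text mode, as in the
## surrounding code; helper name and the example file names are the editor's):
def _gzip_bed_with_track_header(bed_path, track_name):
    import gzip
    gz_path = bed_path + ".gz"
    bedzip = gzip.open(gz_path, 'wb')
    bedzip.write("track name='%s'\n" % track_name)
    with open(bed_path) as bedinf:
        for line in bedinf:
            bedzip.write(line)
    bedzip.close()
    return gz_path
## e.g. _gzip_bed_with_track_header(p + "_peaks.bed", filename + "_macs_peaks")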