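# --- Assumed imports for this section (sketch) -----------------------------
# The exact import paths below are assumptions and may differ from the real
# module layout. Helpers used below but not imported here (all_snps,
# exon_snps, create_tracks, heterozygosity, pileup, macs_bedfiles,
# run_wellington, motif_scan, plot_footprint_profile) are expected to be
# defined elsewhere in this package.
import sys
import os
import tarfile
from pysam import Samfile, Fastafile
from bein import unique_filename_in                    # assumed location
from bein.util import touch                            # assumed location
from bbcflib import mapseq                             # assumed location
from bbcflib.common import set_file_descr, merge_sql   # assumed location
from bbcflib.mapseq import merge_bam, index_bam        # assumed location
from bbcflib.track import track                        # assumed location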
def dnaseseq_workflow(ex, job, assembly, logfile=sys.stdout, via='lsf'):
    """
    This workflow performs the following steps:

      * BAM files from replicates within the same group are merged
      * MACS is called to identify enriched regions (only the peak summit
        +-300bp will be used); this can be bypassed by providing a bed file
        to any group
      * Wellington is called to identify footprints within these enriched
        regions
      * If a list of motifs is provided (by group), footprints are scanned
        and motif occurrences (log-likelihood ratio > 0) are recorded in a
        bed file
      * Average DNase profiles around motifs are plotted
    """
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    supdir = os.path.split(ex.remote_working_directory)[0]
    for gid, mapped in job.files.iteritems():
        group_name = job.groups[gid]['name']
        if not isinstance(mapped, dict):
            raise TypeError("Files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        if len(mapped) > 1:
            # Merge all replicate BAMs of the group into one file
            bamfile = merge_bam(ex, [m['bam'] for m in mapped.values()])
            index = index_bam(ex, bamfile)
        else:
            bamfile = mapped.values()[0]['bam']
        if job.groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            # A user-provided bed file bypasses the MACS step for this group
            if os.path.exists(job.groups[gid].get('bedfile', 'null')):
                bedfile = job.groups[gid]['bedfile']
            elif os.path.exists(os.path.join(supdir, job.groups[gid].get('bedfile', 'null'))):
                bedfile = os.path.join(supdir, job.groups[gid]['bedfile'])
            else:
                bedfile = None
            tests.append((bedfile, bamfile))
            names['tests'].append((gid, group_name))
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]
    tests = macs_bedfiles(ex, assembly.chrmeta, tests, controls, names,
                          job.options.get('macs_args', ["--keep-dup", "10"]),
                          via, logfile)
    bedlist = run_wellington(ex, tests, names, assembly, via, logfile)
    ######################### Motif scanning / plotting
    if any(gr.get('motif') != 'null' and gr.get('motif')
           for gr in job.groups.values()):
        motifbeds = motif_scan(ex, bedlist, assembly, job.groups, via, logfile)
        siglist = dict((gid[0], []) for gid in names['tests'])
        for gid, mapped in job.files.iteritems():
            wig = []
            suffixes = ["fwd", "rev"]
            merge_strands = int(job.options.get('merge_strands', -1))
            read_extension = int(job.options.get('read_extension') or -1)
            make_wigs = merge_strands >= 0 or read_extension != 1
            for m in mapped.values():
                if make_wigs or 'wig' not in m or len(m['wig']) < 2:
                    # Compute strand-specific density tracks from the BAM
                    output = mapseq.parallel_density_sql(
                        ex, m["bam"], assembly.chrmeta,
                        nreads=m["stats"]["total"],
                        merge=-1, read_extension=1,
                        convert=False, b2w_args=[], via=via)
                    wig.append(dict((s, output + s + '.sql') for s in suffixes))
                else:
                    wig.append(m['wig'])
            if len(wig) > 1:
                wig[0] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via))
                              for s in suffixes)
            _trn = job.groups[gid]['name'] + "_%s"
            if job.groups[gid]['control']:
                # Control signals are added to every test group's list
                for s, w in wig[0].iteritems():
                    for _g in siglist.keys():
                        siglist[_g].append(track(w, info={'name': _trn % s}))
            else:
                siglist[gid].extend([track(w, info={'name': _trn % s})
                                     for s, w in wig[0].iteritems()])
        plot_files = plot_footprint_profile(ex, motifbeds, siglist,
                                            assembly.chrnames, job.groups,
                                            logfile)
        for gid, flist in plot_files.iteritems():
            gname = job.groups[gid]['name']
            plotall = unique_filename_in()
            touch(ex, plotall)
            ex.add(plotall,
                   description=set_file_descr(gname + '_footprints_plots',
                                              type='none', view='admin',
                                              step='motifs', groupId=gid))
            ex.add(flist['pdf'],
                   description=set_file_descr(gname + '_footprints_plots.pdf',
                                              type='pdf', step='motifs',
                                              groupId=gid),
                   associate_to_filename=plotall, template='%s.pdf')
            tarname = unique_filename_in()
            tarfh = tarfile.open(tarname, "w:gz")
            for mname, matf in flist['mat']:
                tarfh.add(matf, arcname="%s_%s.txt" % (gname, mname))
            tarfh.close()
            ex.add(tarname,
                   description=set_file_descr(gname + '_footprints_plots.tar.gz',
                                              type='tar', step='motifs',
                                              groupId=gid),
                   associate_to_filename=plotall, template='%s.tar.gz')
    logfile.write("\nDone.\n")
    logfile.flush()
    return 0
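# Minimal invocation sketch for dnaseseq_workflow (illustrative, not a tested
# entry point): assumes a bein-style execution context plus pre-built `job`
# (with .files, .groups, .options) and `assembly` objects supplied by the
# surrounding framework.
#
#   from bein import execution
#   with execution(None) as ex:
#       dnaseseq_workflow(ex, job, assembly, logfile=sys.stdout, via='lsf')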
def snp_workflow(ex, job, assembly, minsnp=40., mincov=5, path_to_ref=None,
                 via='local', logfile=sys.stdout, debugfile=sys.stderr):
    """Main function of the workflow."""
    ref_genome = assembly.fasta_by_chrom
    sample_names = [job.groups[gid]['name'] for gid in sorted(job.files.keys())]

    logfile.write("\n* Generate vcfs for each chrom/group\n")
    logfile.flush()
    vcfs = dict((chrom, {}) for chrom in ref_genome.keys())  # {chrom: {gid: vcf}}
    bams = {}
    # Launch the jobs
    # Rewrite the BAM header so chromosome names match assembly accessions
    bam = Samfile(job.files.values()[0].values()[0]['bam'])
    header = bam.header
    headerfile = unique_filename_in()
    for h in header["SQ"]:
        if h["SN"] in assembly.chrmeta:
            h["SN"] = assembly.chrmeta[h["SN"]]["ac"]
    head = Samfile(headerfile, "wh", header=header)
    head.close()
    for gid in job.files.keys():
        # Merge all bams belonging to the same group
        runs = [r['bam'] for r in job.files[gid].itervalues()]
        if len(runs) > 1:
            _b = merge_bam(ex, runs)
            index_bam(ex, _b)
            bams[gid] = _b
        else:
            index_bam(ex, runs[0])
            bams[gid] = runs[0]
        # Samtools mpileup + bcftools + vcfutils.pl
        for chrom, ref in ref_genome.iteritems():
            vcf = unique_filename_in()
            vcfs[chrom][gid] = (vcf,
                                pileup.nonblocking(ex, bams[gid], ref,
                                                   header=headerfile,
                                                   via=via, stdout=vcf))
        logfile.write(" ...Group %s running.\n" % job.groups[gid]['name'])
        logfile.flush()
    # Wait for vcfs to finish and store them in *vcfs[chrom][gid]*
    for gid in job.files.keys():
        for chrom, ref in ref_genome.iteritems():
            vcfs[chrom][gid][1].wait()
            vcfs[chrom][gid] = vcfs[chrom][gid][0]
        logfile.write(" ...Group %s done.\n" % job.groups[gid]['name'])
        logfile.flush()
    # Targz the pileup files (vcf)
    tarname = unique_filename_in()
    tarfh = tarfile.open(tarname, "w:gz")
    for chrom, v in vcfs.iteritems():
        for gid, vcf in v.iteritems():
            tarfh.add(vcf, arcname="%s_%s.vcf" % (job.groups[gid]['name'], chrom))
    tarfh.close()
    ex.add(tarname,
           description=set_file_descr("vcf_files.tar.gz", step="pileup",
                                      type="tar", view='admin'))

    logfile.write("\n* Merge info from vcf files\n")
    logfile.flush()
    outall = unique_filename_in()
    outexons = unique_filename_in()
    with open(outall, "w") as fout:
        fout.write('#' + '\t'.join(['chromosome', 'position', 'reference']
                                   + sample_names
                                   + ['gene', 'location_type', 'distance']) + '\n')
    with open(outexons, "w") as fout:
        fout.write('#' + '\t'.join(['chromosome', 'position', 'reference']
                                   + sample_names
                                   + ['exon', 'strand', 'ref_aa']
                                   + ['new_aa_' + s for s in sample_names]) + '\n')
    msa_table = dict((s, '') for s in [assembly.name] + sample_names)
    for chrom, v in vcfs.iteritems():
        logfile.write("  > Chromosome '%s'\n" % chrom)
        logfile.flush()
        # Put together info from all vcf files
        logfile.write("  - All SNPs\n")
        logfile.flush()
        allsnps = all_snps(ex, chrom, vcfs[chrom], bams, outall, assembly,
                           headerfile, sample_names, mincov, float(minsnp),
                           logfile, debugfile, via)
        # Annotate SNPs and check synonymy
        logfile.write("  - Exonic SNPs\n")
        logfile.flush()
        exon_snps(chrom, outexons, allsnps, assembly, sample_names, ref_genome,
                  logfile, debugfile)
        # Accumulate one aligned base per sample and per SNP row; deletions
        # fall back to the reference base, anything else becomes 'N'
        for snprow in allsnps:
            for n, k in enumerate([assembly.name] + sample_names):
                base = snprow[3 + n][0]
                if base == "-":
                    base = snprow[3][0]
                if base not in 'ACGTacgt':
                    base = "N"
                msa_table[k] += base
    description = set_file_descr("allSNP.txt", step="SNPs", type="txt")
    ex.add(outall, description=description)
    description = set_file_descr("exonsSNP.txt", step="SNPs", type="txt")
    ex.add(outexons, description=description)
    msafile = unique_filename_in()
    with open(msafile, "w") as msa:
        # PHYLIP-like header: <number of sequences> <alignment length>
        msa.write(" %i %i\n" % (len(msa_table), len(msa_table.values()[0])))
        for name, seq in msa_table.iteritems():
            msa.write("%s\t%s\n" % (name, seq))
    msa_table = {}
    description = set_file_descr("SNPalignment.txt", step="SNPs", type="txt")
    ex.add(msafile, description=description)

    # Create UCSC bed tracks
    logfile.write("\n* Create tracks\n")
    logfile.flush()
    create_tracks(ex, outall, sample_names, assembly)

    # Create quantitative tracks
    logfile.write("\n* Create heteroz. and quality tracks\n")
    logfile.flush()

    def _process_pileup(pileups, seq, startpos, endpos):
        """Collect per-position (coverage, heterozygosity, mean base quality)
        vectors over the pileup columns in [startpos, endpos)."""
        atoi = {'A': 0, 'C': 1, 'G': 2, 'T': 3}  # anything else maps to index 4
        vectors = ([], [], [])
        for pileupcolumn in pileups:
            position = pileupcolumn.pos
            if position < startpos:
                continue
            if position >= endpos:
                break
            coverage = pileupcolumn.n
            ref_symbol = seq[position - startpos]
            ref = atoi.get(ref_symbol, 4)
            symbols = [0, 0, 0, 0, 0]
            quality = 0
            for pileupread in pileupcolumn.pileups:
                if pileupread.qpos >= len(pileupread.alignment.seq):
                    coverage -= 1
                else:
                    symbols[atoi.get(pileupread.alignment.seq[pileupread.qpos], 4)] += 1
                    quality += ord(pileupread.alignment.qual[pileupread.qpos]) - 33
            if coverage > 0:  # guard: all reads may have been skipped above
                quality = float(quality) / coverage
            info = heterozygosity(ref, symbols[0:4])
            if coverage > 0:
                vectors[0].append((position, position + 1, coverage))
            if info > 0:
                vectors[1].append((position, position + 1, info))
            if quality > 0:
                vectors[2].append((position, position + 1, quality))
        return vectors

    if job.options.get('make_bigwigs', False):
        _descr = {'groupId': 0, 'step': "tracks", 'type': "bigWig", 'ucsc': '1'}
        for gid, bamfile in bams.iteritems():
            _descr['groupId'] = gid
            bamtr = track(bamfile, format="bam")
            covname = unique_filename_in() + ".bw"
            out_cov = track(covname, chrmeta=assembly.chrmeta)
            hetname = unique_filename_in() + ".bw"
            out_het = track(hetname, chrmeta=assembly.chrmeta)
            qualname = unique_filename_in() + ".bw"
            out_qual = track(qualname, chrmeta=assembly.chrmeta)
            for chrom, cinfo in assembly.chrmeta.iteritems():
                fasta = Fastafile(ref_genome[chrom])
                # Process fasta and bam by 10Mb chunks
                for chunk in range(0, cinfo["length"], 10**7):
                    fastaseq = fasta.fetch(cinfo['ac'], chunk, chunk + 10**7)
                    vecs = _process_pileup(
                        bamtr.pileup(chrom, chunk, chunk + 10**7),
                        fastaseq, chunk, chunk + 10**7)
                    out_cov.write(vecs[0], fields=['start', 'end', 'score'], chrom=chrom)
                    out_het.write(vecs[1], fields=['start', 'end', 'score'], chrom=chrom)
                    out_qual.write(vecs[2], fields=['start', 'end', 'score'], chrom=chrom)
            out_cov.close()
            out_het.close()
            out_qual.close()
            description = set_file_descr(job.groups[gid]['name'] + "_coverage.bw", **_descr)
            ex.add(covname, description=description)
            description = set_file_descr(job.groups[gid]['name'] + "_heterozygosity.bw", **_descr)
            ex.add(hetname, description=description)
            description = set_file_descr(job.groups[gid]['name'] + "_quality.bw", **_descr)
            ex.add(qualname, description=description)
    return 0
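# A hypothetical stand-in for the heterozygosity() score used by
# _process_pileup above. The real implementation lives elsewhere in this
# module and may differ; this sketch (kept commented out so it cannot shadow
# the real definition) computes the expected heterozygosity 1 - sum(p_i^2)
# over the observed A/C/G/T counts and ignores the reference index `ref`.
#
# def heterozygosity(ref, counts):
#     """Illustrative only: 1 minus the sum of squared allele frequencies."""
#     total = float(sum(counts))
#     if total == 0:
#         return 0.0
#     return 1.0 - sum((c / total) ** 2 for c in counts)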