def merge_results(name, sample):
    for r in sample.replicates + [sample.combined_replicate, ]:
        for q_val in sample.conf.Q_VALUE_THRESHOLDS + [0, ]:
            if q_val:
                output = os.path.join(
                    r.results_dir(sample),
                    '%s_%f_hits_filtered.bed' % (r.rep_name(sample), q_val))
            else:
                output = os.path.join(
                    r.results_dir(sample),
                    '%s_hits_filtered.bed' % r.rep_name(sample))
            r.unfiltered_results = output
            cmd = filter_hits_cmd(r.results_dir(sample), r.sgr_dir(sample),
                                  sample.genome, output, q_val)
            sample.add_jobs(name, [
                sjm.Job('merge_' + r.rep_name(sample) + '%g' % q_val, [cmd, ],
                        queue=QUEUE, project=PROJECT, sched_options="-m e"),
            ])
        # Merge Pseudoreplicate Hits
        output = os.path.join(r.results_dir(sample),
                              '%s_hits.bed' % (r.rep_name(sample) + '_PR1'))
        r.unfiltered_results_pr1 = output
        cmd = filter_hits_cmd(r.pr1_results_dir, r.pr1_sgr_dir, sample.genome,
                              output)
        sample.add_jobs(name, [
            sjm.Job('merge_' + r.rep_name(sample) + '_PR1', [cmd, ],
                    queue=QUEUE, project=PROJECT, sched_options="-m e"),
        ])
        output = os.path.join(r.results_dir(sample),
                              '%s_hits.bed' % (r.rep_name(sample) + '_PR2'))
        r.unfiltered_results_pr2 = output
        cmd = filter_hits_cmd(r.pr2_results_dir, r.pr2_sgr_dir, sample.genome,
                              output)
        sample.add_jobs(name, [
            sjm.Job('merge_' + r.rep_name(sample) + '_PR2', [cmd, ],
                    queue=QUEUE, project=PROJECT, sched_options="-m e"),
        ])


def idr_analysis(name, sample):
    jobs = []
    modules = ["r/3.0.1"]
    for i, rep_a in enumerate(sample.replicates):
        for j in range(i + 1, len(sample.replicates)):
            rep_b = sample.replicates[j]
            idr_name = '%s_VS_%s' % (rep_a.rep_name(sample),
                                     rep_b.rep_name(sample))
            cmd = idr.idr_analysis_cmd(rep_a.narrowPeak, rep_b.narrowPeak,
                                       os.path.join(sample.idr_dir, idr_name),
                                       'q.value', sample.genome)
            jobs.append(
                sjm.Job('idr_analysis_' + idr_name, [cmd, ], queue=QUEUE,
                        modules=modules, project=PROJECT,
                        sched_options="-m e"))
        # Pseudoreplicates
        idr_name = '%s_PR1_VS_%s_PR2' % (rep_a.rep_name(sample),
                                         rep_a.rep_name(sample))
        cmd = idr.idr_analysis_cmd(rep_a.narrowPeak_pr1, rep_a.narrowPeak_pr2,
                                   os.path.join(sample.idr_dir,
                                                idr_name + '_PR'),
                                   'q.value', sample.genome)
        jobs.append(
            sjm.Job('idr_analysis_' + idr_name, [cmd, ], queue=QUEUE,
                    modules=modules, project=PROJECT, sched_options="-m e"))
    # Pooled Pseudoreplicates
    idr_name = '%s_PR1_VS_%s_PR2' % (
        sample.combined_replicate.rep_name(sample),
        sample.combined_replicate.rep_name(sample))
    cmd = idr.idr_analysis_cmd(sample.combined_replicate.narrowPeak_pr1,
                               sample.combined_replicate.narrowPeak_pr2,
                               os.path.join(sample.idr_dir, idr_name),
                               'q.value', sample.genome)
    jobs.append(
        sjm.Job('idr_analysis_' + idr_name, [cmd, ], queue=QUEUE,
                modules=modules, project=PROJECT, sched_options="-m e"))
    sample.add_jobs(name, jobs)


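# For a sample with replicates Rep1..Rep3, the loops above emit the
# true-replicate comparisons Rep1_VS_Rep2, Rep1_VS_Rep3 and Rep2_VS_Rep3, one
# self-pseudoreplicate comparison per replicate (RepN_PR1_VS_RepN_PR2), and a
# single pooled-pseudoreplicate comparison for the combined replicate.

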
def main(gvcfs):
    jobs = merge_gvcf(gvcfs)
    jobs = gatk_joint(jobs)
    descout = sys.stdout if jobfile is None else open(jobfile.path, "w")
    descout.write(sjm.Job().depend(*jobs).desc())
    descout.flush()


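# A minimal driver sketch (hypothetical wiring): main() and the helpers below
# are assumed to read module-level globals such as `args` and `jobfile` that
# the surrounding argument-parsing code sets up, e.g.:
#
#   if __name__ == '__main__':
#       main(args.gvcfs)

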
def replicate_scoring(name, sample):
    cmds = []
    # Mapped Read Statistics
    cmd = os.path.join(BIN_DIR, 'read_stats.py')
    cmd += ' %s' % os.path.join(sample.results_dir, 'rep_stats')
    cmd += ' ' + sample.conf.path
    cmds.append(cmd)
    # Replicate Overlap Statistics
    for q in sample.conf.Q_VALUE_THRESHOLDS:
        for r1 in sample.replicates:
            for r2 in sample.replicates:
                if r1 == r2:
                    continue
                cmd = os.path.join(BIN_DIR, 'overlap_stats.py')
                cmd += ' ' + r1.narrowPeak
                cmd += ' ' + r2.narrowPeak
                cmd += ' ' + os.path.join(sample.results_dir, 'rep_stats')
                cmd += ' %f' % q
                cmd += ' %s_VS_%s_%f' % (r1.rep_name(sample),
                                         r2.rep_name(sample), q)
                cmds.append(cmd)
    j = sjm.Job('replicate_stats', cmds, queue=QUEUE, project=PROJECT,
                sched_options="-m e")
    sample.add_jobs(name, [j, ])


def mail_results(sample, control, run_name, emails):
    cmds = []
    cmd = os.path.join(BIN_DIR, 'build_report_text.py')
    cmd += ' %s' % sample.run_name
    cmd += ' %s' % sample.archive_file_download
    cmd += ' %s' % control.archive_file_download
    cmd += ' %s' % os.path.join(sample.results_dir, 'rep_stats')
    cmd += ' %s' % os.path.join(sample.results_dir, 'spp_stats.txt')
    cmd += ' %s' % os.path.join(sample.results_dir, 'idr_results.txt')
    cmd += ' %s' % os.path.join(sample.results_dir, 'pbc_stats.txt')
    cmd += ' %s' % os.path.join(sample.results_dir, 'full_report.txt')
    cmds.append(cmd)
    cmd = os.path.join(BIN_DIR, 'mail_wrapper.py')
    cmd += ' "%s Scoring Results"' % sample.run_name
    cmd += ' %s' % os.path.join(sample.results_dir, 'full_report.txt')
    for email in emails:
        cmd += ' %s' % email
    cmds.append(cmd)
    return sjm.Job('mail_results', cmds, queue=QUEUE, project=PROJECT,
                   host='localhost', sched_options="-m e",
                   dependencies=sample.all_jobs() + control.all_jobs())


def form_control_files(name, control):
    print " ******* form control files ****** "
    cmds = []
    control.merged_file_location = os.path.join(
        control.temp_dir, '%s_merged_eland.txt' % control.run_name)
    print " merged control files ", control.merged_file_location
    # Merge eland files
    cmd = os.path.join(BIN_DIR, 'merge_and_filter_reads.py')
    cmd += ' %s' % control.merged_file_location
    print " merged control files ", cmd
    for mr in control.mapped_read_files:
        cmd += ' %s' % mr
    cmds.append(cmd)
    # Divide merged file by chr
    cmd = os.path.join(BIN_DIR, 'divide_eland.py')
    cmd += " %s %s %s" % (control.merged_file_location, control.genome,
                          control.results_dir)
    cmds.append(cmd)
    # Create Signal Map
    cmd = os.path.join(BIN_DIR, 'create_signal_map.py')
    cmd += ' %s %s' % (control.sgr_dir, control.results_dir)
    cmds.append(cmd)
    control.add_jobs(name, [
        sjm.Job(control.run_name, cmds, modules=["samtools/1.2"], queue=QUEUE,
                project=PROJECT, sched_options="-m e"),
    ])


def form_idr_inputs(name, sample):
    os.makedirs(os.path.join(sample.results_dir, 'idr'))
    jobs = []
    for rep in sample.replicates + [sample.combined_replicate, ]:
        rep.narrowPeak = os.path.join(
            rep.results_dir(sample),
            rep.rep_name(sample) + '_unfiltered_narrowPeak.bed')
        cmd = os.path.join(SUBMISSION_BIN_DIR, 'normalhits2narrowPeak')
        cmd += ' %s > %s' % (rep.unfiltered_results, rep.narrowPeak)
        jobs.append(
            sjm.Job(rep.rep_name(sample) + '_hits2narrowPeak', [cmd, ],
                    queue=QUEUE, project=PROJECT, sched_options="-m e"))
        # Pseudoreplicates
        rep.narrowPeak_pr1 = os.path.join(
            rep.results_dir(sample),
            rep.rep_name(sample) + '_PR1_unfiltered_narrowPeak.bed')
        cmd = os.path.join(SUBMISSION_BIN_DIR, 'normalhits2narrowPeak')
        cmd += ' %s > %s' % (rep.unfiltered_results_pr1, rep.narrowPeak_pr1)
        jobs.append(
            sjm.Job(rep.rep_name(sample) + '_PR1_hits2narrowPeak', [cmd, ],
                    queue=QUEUE, project=PROJECT, sched_options="-m e"))
        rep.narrowPeak_pr2 = os.path.join(
            rep.results_dir(sample),
            rep.rep_name(sample) + '_PR2_unfiltered_narrowPeak.bed')
        cmd = os.path.join(SUBMISSION_BIN_DIR, 'normalhits2narrowPeak')
        cmd += ' %s > %s' % (rep.unfiltered_results_pr2, rep.narrowPeak_pr2)
        jobs.append(
            sjm.Job(rep.rep_name(sample) + '_PR2_hits2narrowPeak', [cmd, ],
                    queue=QUEUE, project=PROJECT, sched_options="-m e"))
    sample.add_jobs(name, jobs)


def gatk_vqsr(pjob):
    vcf = pjob.output
    job = sjm.Job('gatk_VQSR-%s' % vcf.prefix)
    job.memory = "10G"
    job.append('gatk_vqsr.sh %s' % vcf)
    job.depend(pjob)
    return job


def gatk_mvcf(pjobs, vcfout):
    vcfs = [pjob.output for pjob in pjobs]
    job = sjm.Job('gatk_CatVCF-%s' % vcfout)
    job.memory = "10G"
    job.output = util.File(os.path.join(outdir, vcfout))
    job.append('gatk_catvcf.sh %s %s' % (job.output, ' '.join(vcfs)))
    job.depend(*pjobs)
    return job


def form_idr_inputs(name, sample):
    os.makedirs(os.path.join(sample.results_dir, 'idr'))
    for rep in sample.replicates + [sample.combined_replicate, ]:
        rep.narrowPeak = rep.unfiltered_results
        rep.narrowPeak_pr1 = rep.unfiltered_results_pr1
        rep.narrowPeak_pr2 = rep.unfiltered_results_pr2
    sample.add_jobs(name, [
        sjm.Job('form_idr_inputs', ['echo form_idr_inputs', ], queue=QUEUE,
                project=PROJECT, host='localhost', sched_options="-m e"),
    ])


def merge_results(name, sample):
    jobs = []
    for r in sample.replicates + [sample.combined_replicate, ]:
        r.unfiltered_results = os.path.join(
            r.results_dir(sample), '%s_peaks.regionPeak' % r.rep_name(sample))
        r.unfiltered_results_pr1 = os.path.join(
            r.pr1_results_dir, '%s_PR1_peaks.regionPeak' % r.rep_name(sample))
        r.unfiltered_results_pr2 = os.path.join(
            r.pr2_results_dir, '%s_PR2_peaks.regionPeak' % r.rep_name(sample))
        unpack_cmds = [
            'zcat %s > %s' % (r.spp_results, r.unfiltered_results),
            'zcat %s > %s' % (r.spp_results_pr1, r.unfiltered_results_pr1),
            'zcat %s > %s' % (r.spp_results_pr2, r.unfiltered_results_pr2),
        ]
        jobs.append(
            sjm.Job('merge_results_%s' % r.rep_name(sample), unpack_cmds,
                    queue=QUEUE, project=PROJECT, sched_options="-m e"))
    sample.add_jobs(name, jobs)


def form_idr_inputs(name, sample):
    os.makedirs(os.path.join(sample.results_dir, 'idr'))
    jobs = []
    for rep in sample.replicates + [sample.combined_replicate, ]:
        cmds = []
        # Keep the top 300,000 peaks, sorted numerically descending on
        # narrowPeak column 8 (the p-value).
        rep.narrowPeak = rep.unfiltered_results + '.filtered'
        cmd = 'sort -k8nr %s | head -n 300000 > %s.temp && mv %s.temp %s' % (
            rep.unfiltered_results, rep.narrowPeak, rep.narrowPeak,
            rep.narrowPeak)
        cmds.append(cmd)
        jobs.append(
            sjm.Job(rep.rep_name(sample) + '_narrowPeak_filter', cmds,
                    queue=QUEUE, project=PROJECT))
        # Pseudoreplicates
        cmds = []
        rep.narrowPeak_pr1 = rep.unfiltered_results_pr1 + '.filtered'
        cmd = 'sort -k8nr %s | head -n 300000 > %s.temp && mv %s.temp %s' % (
            rep.unfiltered_results_pr1, rep.narrowPeak_pr1,
            rep.narrowPeak_pr1, rep.narrowPeak_pr1)
        cmds.append(cmd)
        jobs.append(
            sjm.Job(rep.rep_name(sample) + '_PR1_narrowPeak_filter', cmds,
                    queue=QUEUE, project=PROJECT))
        cmds = []
        rep.narrowPeak_pr2 = rep.unfiltered_results_pr2 + '.filtered'
        cmd = 'sort -k8nr %s | head -n 300000 > %s.temp && mv %s.temp %s' % (
            rep.unfiltered_results_pr2, rep.narrowPeak_pr2,
            rep.narrowPeak_pr2, rep.narrowPeak_pr2)
        cmds.append(cmd)
        jobs.append(
            sjm.Job(rep.rep_name(sample) + '_PR2_narrowPeak_filter', cmds,
                    queue=QUEUE, project=PROJECT))
    sample.add_jobs(name, jobs)


def gatk_joint(pjobs):
    jobs = []
    gvcfs = [pjob.output.path for pjob in pjobs]
    outvcf = util.File(os.path.join(args.outdir, args.output))
    job = sjm.Job('GATK-joint-gt-%s' % outvcf.name)
    job.memory = "20G"
    job.output = outvcf
    job.append('gatk_gt_joint.sh %s %s' % (job.output, ' '.join(gvcfs)))
    job.depend(*pjobs)
    jobs.append(job)
    return jobs


def idr_filter(name, sample):
    cmd = os.path.join(BIN_DIR, 'idr_filter.py')
    cmd += ' %s' % sample.run_name
    cmd += ' %s' % sample.genome
    cmd += ' %i' % len(sample.replicates)
    cmd += ' %s' % sample.idr_dir
    cmd += ' %s' % os.path.join(sample.results_dir, 'All',
                                sample.combined_replicate.unfiltered_results)
    cmd += ' %s' % sample.results_dir
    cmd += ' 7'  # sort column (signal.value)
    sample.add_jobs(name, [
        sjm.Job('idr_filter_' + sample.run_name, [cmd, ], queue=QUEUE,
                project=PROJECT, sched_options="-m e"),
    ])


def sam_flagstat(pjobs):
    jobs = []
    for pjob in pjobs:
        bam = util.File(pjob.output)
        job = sjm.Job('samtools-flagstat-%s' % bam.prefix)
        job.memory = "10G"
        job.output = bam.chext("flagstat.txt")
        job.append('samtools flagstat %s > %s' % (bam, job.output))
        job.depend(pjob)
        jobs.append(job)
    return jobs


def form_sample_files(name, sample):
    jobs = []
    print " peakseq: form sample files ***"
    for rep in sample.replicates:
        jobs.append(
            sjm.Job(rep.rep_name(sample) + '_merge',
                    form_replicate_files(rep, sample), queue=QUEUE,
                    modules=["samtools/1.2"], project=PROJECT,
                    sched_options="-m e"))
    jobs.append(
        sjm.Job(sample.run_name + '_All_merge',
                form_replicate_files(sample.combined_replicate, sample),
                modules=["samtools/1.2"], queue=QUEUE, project=PROJECT,
                sched_options="-m e"))
    sample.add_jobs(name, jobs)


def archive_results(name, results_dir, archive_file, force=False):
    if os.path.exists(archive_file) and not force:
        raise Exception("Archive file %s already exists" % archive_file)
    archive_cmd = '%s %s %s %s' % (os.path.join(BIN_DIR,
                                                'archive_results.py'),
                                   results_dir, archive_file, force)
    return sjm.Job('Archive_%s' % name, archive_cmd, queue=QUEUE,
                   project=PROJECT, sched_options="-m e")


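# Usage sketch (hypothetical path; `sample` attributes as used elsewhere in
# this script): archive a finished run's results, overwriting any previous
# archive.
#
#   job = archive_results(sample.run_name, sample.results_dir,
#                         '/archive/%s.tar.gz' % sample.run_name, force=True)

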
def dedup_bam(pjobs):
    jobs = []
    for pjob in pjobs:
        bamfile = pjob.output
        job = sjm.Job('picard_mdup-%s' % bamfile.prefix)
        job.memory = "20G"
        job.output = os.path.join(outdir, bamfile.chext("mdup.bam").name)
        job.append('picard_mdup.sh %s %s' % (job.output, bamfile))
        job.depend(pjob)
        jobs.append(job)
    return jobs


def split_species(pjobs):
    jobs_mm9 = []
    jobs_dm3 = []
    for pjob in pjobs:
        bamfile = pjob.output
        job1 = sjm.Job('samtools_mm9-%s' % bamfile.prefix)
        job1.memory = "12G"
        job1.output = bamfile.chext('mm9.bam')
        job1.append('samtools view -hb %s %s > %s' % (bamfile, mm9_chrs,
                                                      job1.output))
        job1.depend(*pjobs)
        jobs_mm9.append(job1)
        job2 = sjm.Job('samtools_dm3-%s' % bamfile.prefix)
        job2.memory = "12G"
        job2.output = bamfile.chext('dm3.bam')
        job2.append('samtools view -hb %s %s > %s' % (bamfile, dm3_chrs,
                                                      job2.output))
        job2.depend(*pjobs)
        jobs_dm3.append(job2)
    return jobs_mm9, jobs_dm3


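# `mm9_chrs` and `dm3_chrs` are module-level globals (not shown here), assumed
# to hold space-separated region lists passed straight through to
# `samtools view`, e.g. (hypothetical values):
#
#   mm9_chrs = 'chr1 chr2 chr3 ... chrX chrY'
#   dm3_chrs = 'dm3_chr2L dm3_chr2R dm3_chr3L ...'

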
def dedup_merge(pjobs, outbam):
    jobs = []
    bams = []
    for pjob in pjobs:
        bams.append(pjob.output.path)
    job = sjm.Job('picard_mdup-%s' % outbam)
    job.memory = "20G"
    job.output = util.File(os.path.join(outdir, outbam))
    job.append('picard_mdup.sh %s %s' % (job.output, ' '.join(bams)))
    job.depend(*pjobs)
    jobs.append(job)
    return jobs


def gatk_recal(pjobs):
    jobs = []
    for pjob in pjobs:
        bamfile = util.File(pjob.output)
        job = sjm.Job('gatk_recalibrate-%s' % bamfile.prefix)
        job.memory = "20G"
        job.output = os.path.join(tmpdir,
                                  '%s.%s' % (bamfile.prefix, 'recal.bam'))
        job.regions = pjob.regions
        job.append('gatk_recal.sh %s %s' % (job.output, bamfile.path))
        job.depend(pjob)
        jobs.append(job)
    return jobs


def gatk_hc(pjobs):
    jobs = []
    for pjob in pjobs:
        bamfile = util.File(pjob.output)
        job = sjm.Job('gatk_haplotypecaller-%s' % bamfile.prefix)
        job.memory = "40G"
        job.output = os.path.join(tmpdir,
                                  '%s.%s' % (bamfile.prefix, 'g.vcf.gz'))
        job.regions = pjob.regions
        job.append('gatk_hc.sh %s %s %s' % (job.output, bamfile.path,
                                            pjob.regions))
        job.depend(pjob)
        jobs.append(job)
    return jobs


def gatk_gt(pjobs):
    jobs = []
    for pjob in pjobs:
        gvcffile = util.File(pjob.output, iszipfile=True)
        job = sjm.Job('gatk_genotypeGVCFs-%s' % gvcffile.prefix)
        job.memory = "15G"
        job.output = os.path.join(tmpdir,
                                  '%s.%s' % (gvcffile.prefix, 'gt.vcf.gz'))
        job.regions = pjob.regions
        job.append('gatk_gt.sh %s %s %s' % (job.output, gvcffile.path,
                                            pjob.regions))
        job.depend(pjob)
        jobs.append(job)
    return jobs


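# Sketch of the per-region calling chain these helpers are assumed to form
# (hypothetical wiring; `dedup_jobs` is a placeholder name):
#
#   recal_jobs = gatk_recal(dedup_jobs)          # recalibrated BAMs
#   hc_jobs = gatk_hc(recal_jobs)                # per-region gVCFs
#   gt_jobs = gatk_gt(hc_jobs)                   # per-region genotyped VCFs
#   final = gatk_mvcf(gt_jobs, 'sample.vcf.gz')  # concatenated VCF

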
def merge_aln(pjobs):
    jobs = []
    for pjob in pjobs:
        alnbam = pjob.output
        ubam = pjob.input
        job = sjm.Job('picard_mergeBam-%s' % alnbam.name)
        job.memory = "10G"
        # Strip the literal '.aln.bam' suffix (str.rstrip would strip a
        # trailing character set, not the suffix).
        job.output = util.File(re.sub(r'\.aln\.bam$', '', alnbam.path)
                               + '.sort.bam')
        job.append('picard_mergeBam.sh %s %s %s' % (job.output, alnbam, ubam))
        job.depend(pjob)
        jobs.append(job)
    return jobs


def gatk_hc_batch(bamfile, regions_file):
    jobs = []
    for region_line in open(regions_file):
        region_line = region_line.rstrip('\n')
        if region_line.startswith("#"):
            continue
        region_name, regions = region_line.split(' ', 1)
        job = sjm.Job('gatk_hc_bam-%s-%s' % (bamfile.prefix, region_name))
        job.memory = "40G"
        job.output = os.path.join(
            tmpdir, '%s.%s.%s' % (bamfile.prefix, region_name, 'g.vcf.gz'))
        job.regions = regions
        job.append('gatk_hc.sh %s %s %s' % (job.output, bamfile.path,
                                            regions))
        jobs.append(job)
    return jobs


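# The regions file parsed above is assumed to hold one batch per line as
# "<region_name> <intervals>", splitting on the first space, with '#' lines
# treated as comments. A hypothetical example:
#
#   # name intervals
#   chr1a chr1:1-124000000
#   chr1b chr1:124000001-249250621
#   chr2 chr2

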
def align_pe(pjobs):
    jobs = []
    for pjob in pjobs:
        inbam = pjob.output
        obam = inbam.chext('aln.bam')
        job = sjm.Job('bwa_aln_pe-%s' % inbam.prefix)
        job.memory = "20G"
        job.input = inbam
        job.output = obam
        job.append('bwa_aln_pe_qn.sh %s %s %s' % (job.output, inbam, inbam))
        job.depend(pjob)
        jobs.append(job)
    return jobs


def form_sample_files_nodups(name, sample):
    jobs = []
    for rep in sample.replicates:
        jobs.append(
            sjm.Job(rep.rep_name(sample) + '_merge',
                    form_replicate_files(rep, sample, rmdups=True),
                    modules=["samtools/1.2"], queue=QUEUE, project=PROJECT,
                    memory='16G', sched_options="-m e"))
    jobs.append(
        sjm.Job(sample.run_name + '_All_merge',
                form_replicate_files(sample.combined_replicate, sample,
                                     rmdups=True),
                modules=["samtools/1.2"], queue=QUEUE, project=PROJECT,
                memory='16G', sched_options="-m e"))
    sample.add_jobs(name, jobs)


def sort_ubam(ubams):
    jobs = []
    for ubam in ubams:
        ubam = util.File(ubam)
        # Strip the literal 'u.bam' suffix (str.rstrip would strip a trailing
        # character set, not the suffix).
        obam = util.File(
            os.path.join(tmpdir,
                         os.path.basename(re.sub(r'u\.bam$', '', ubam.path)
                                          + '.bam')))
        job = sjm.Job('picard_sortUbam-%s' % ubam.prefix)
        job.memory = "20G"
        job.input = ubam
        job.output = obam
        job.append('picard_sortUbam.sh %s %s' % (job.input, job.output))
        jobs.append(job)
    return jobs


def align_se(reads1, reads2):
    jobs = []
    for i in range(0, len(reads1)):
        read1 = reads1[i]
        read2 = reads2[i]
        readfile1 = util.File(read1)
        readfile2 = util.File(read2)
        bamname = re.sub(r'[._][Rr]1', '', readfile1.prefix) + '.sorted.bam'
        bam = util.File(os.path.join(tmpdir, bamname))
        job = sjm.Job('bwa_aln_se-%s' % readfile1.prefix)
        job.output = bam
        job.append('bwa_aln_se.sh %s %s %s %s' % (job.output, read1, read2,
                                                  readgroup))
        jobs.append(job)
    return jobs


def merge_bam(pjobs, out_prefix, suffix=None):
    '''Caveat: if the output BAM already exists, `samtools merge` must be run
    with "-f" to overwrite it, or the task will abort.'''
    bams = []
    for pjob in pjobs:
        bams.append(pjob.output.path)
    job = sjm.Job('samtools_merge-%s' % suffix)
    job.memory = "5G"
    outname = os.path.join(tmpdir, '%s.%s.bam' % (out_prefix, suffix))
    job.output = util.File(outname)
    job.append('samtools merge %s %s && samtools index %s' %
               (job.output, ' '.join(bams), job.output))
    job.depend(*pjobs)
    return job


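# Usage sketch (hypothetical wiring; `ubams` is a placeholder list of
# unmapped-BAM paths): sort, align, merge with the unmapped reads, then
# produce one indexed BAM per sample and emit the sjm job description.
#
#   sort_jobs = sort_ubam(ubams)
#   aln_jobs = align_pe(sort_jobs)
#   merged = merge_bam(merge_aln(aln_jobs), 'sample1', suffix='sorted')
#   print sjm.Job().depend(merged).desc()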