def buildCoverageStats(infile, outfile):
    '''Generate coverage statistics for regions of interest from a bed
    file using Picard'''
    # TS check whether this is always required or specific to current
    # baits file
    # baits file requires modification to make picard accept it
    # this is performed before CalculateHsMetrics
    to_cluster = USECLUSTER
    baits = PARAMS["roi_baits"]
    modified_baits = infile + "_temp_baits_final.bed"
    regions = PARAMS["roi_regions"]

    statement = '''samtools view -H %(infile)s > %(infile)s_temp_header.txt;
                   awk 'NR>2' %(baits)s |
                   awk -F '\\t' 'BEGIN { OFS="\\t" } {print $1,$2,$3,"+",$4;}'
                   > %(infile)s_temp_baits.bed;
                   cat %(infile)s_temp_header.txt %(infile)s_temp_baits.bed
                   > %(modified_baits)s;
                   checkpoint ;
                   rm -rf %(infile)s_temp_baits.bed %(infile)s_temp_header.txt
                   '''
    P.run()

    PipelineMappingQC.buildPicardCoverageStats(
        infile, outfile, modified_baits, modified_baits)

    IOTools.zapFile(modified_baits)
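
# Illustrative only: a pure-Python sketch of the awk transformation above,
# assuming (as the awk does) that the baits bed carries two header lines to
# skip and at least four tab-separated columns (chrom, start, end, name).
# The helper name is hypothetical and is not used by the pipeline.
def _reformat_baits_sketch(baits_bed, sam_header, out_bed):
    with open(out_bed, "w") as out:
        # prepend the SAM header extracted with 'samtools view -H'
        with open(sam_header) as header:
            out.write(header.read())
        with open(baits_bed) as bed:
            for n, line in enumerate(bed):
                if n < 2:  # 'NR>2' in the awk skips the first two lines
                    continue
                chrom, start, end, name = line.rstrip("\n").split("\t")[:4]
                # insert a dummy '+' strand so picard accepts the
                # five-column interval format: chrom start end strand name
                out.write("\t".join((chrom, start, end, "+", name)) + "\n")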
def GATKpreprocessing(infile, outfile):
    '''Reorders BAM according to reference fasta and adds read groups using
    SAMtools, realigns around indels and recalibrates base quality scores
    using GATK'''
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir()
    job_memory = PARAMS["gatk_memory"]
    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                           PARAMS["genome"])

    outfile1 = outfile.replace(".bqsr", ".readgroups.bqsr")
    outfile2 = outfile.replace(".bqsr", ".realign.bqsr")

    PipelineExome.GATKReadGroups(infile, outfile1, genome,
                                 PARAMS["readgroup_library"],
                                 PARAMS["readgroup_platform"],
                                 PARAMS["readgroup_platform_unit"])

    PipelineExome.GATKIndelRealign(outfile1, outfile2, genome,
                                   PARAMS["gatk_threads"])
    IOTools.zapFile(outfile1)

    PipelineExome.GATKBaseRecal(outfile2, outfile, genome,
                                PARAMS["gatk_dbsnp"],
                                PARAMS["gatk_solid_options"])
    IOTools.zapFile(outfile2)
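
# For illustration (hypothetical file names): with outfile = "sample1.bqsr"
# the chain above produces
#   sample1.readgroups.bqsr   after GATKReadGroups
#   sample1.realign.bqsr      after GATKIndelRealign
#   sample1.bqsr              after GATKBaseRecal
# and each intermediate is zapped once its successor has been written.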
def mergeSampleBams(infile, outfile):
    '''merge control and tumor bams'''
    # Note: need to change readgroup headers for merge and subsequent
    # splitting of bam files
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]

    tmpdir_gatk = P.getTempDir(shared=True)

    outfile_tumor = outfile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])
    infile_tumor = infile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    infile_base = os.path.basename(infile)
    infile_tumor_base = infile_base.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]

    control_id = "Control.bam"
    tumor_id = control_id.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    statement = '''picard AddOrReplaceReadGroups
                   INPUT=%(infile)s
                   OUTPUT=%(tmpdir_gatk)s/%(infile_base)s
                   RGLB=%(library)s RGPL=%(platform)s
                   RGPU=%(platform_unit)s RGSM=%(track)s
                   RGID=%(track)s VALIDATION_STRINGENCY=SILENT ;
                   checkpoint ;'''
    statement += '''picard AddOrReplaceReadGroups
                    INPUT=%(infile_tumor)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_tumor_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track_tumor)s
                    RGID=%(track_tumor)s VALIDATION_STRINGENCY=SILENT ;
                    checkpoint ;'''
    # samtools merge: -f overwrites the output if it exists; -r attaches an
    # RG tag to each alignment, inferred from its source file name, so the
    # merged bam can be split by read group downstream
    statement += '''samtools merge -rf %(outfile)s
                    %(tmpdir_gatk)s/%(infile_base)s
                    %(tmpdir_gatk)s/%(infile_tumor_base)s ;
                    checkpoint ;'''
    statement += '''samtools index %(outfile)s ;
                    checkpoint ;'''
    statement += '''rm -rf %(tmpdir_gatk)s ;
                    checkpoint ; '''
    P.run()

    IOTools.zapFile(infile)
    IOTools.zapFile(infile_tumor)
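
# Illustrative only (hypothetical file names): 'samtools merge -r' infers
# the RG tag value from each source file name, e.g. a read merged in from
# S1-Control.bqsr.bam ends up tagged RG:Z:S1-Control.bqsr.
# splitMergedRealigned() below relies on these tags to recover the
# per-sample reads from the merged bam.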
def realignMatchedSample(infile, outfile):
    '''repeat realignments with merged bam of control and tumor;
    this should help avoid problems with sample-specific realignments'''

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    PipelineExome.GATKIndelRealign(infile, outfile, genome)

    IOTools.zapFile(infile)
def splitMergedRealigned(infile, outfile):
    '''split realignment file and truncate intermediate bams'''
    track = P.snip(os.path.basename(infile), ".realigned.bqsr.bam") + ".bqsr"
    track_tumor = track.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])
    outfile_tumor = outfile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    # samtools view -r: only output alignments in the given read group,
    # i.e. the RG tags attached during mergeSampleBams
    statement = '''samtools view -hb %(infile)s -r %(track)s > %(outfile)s;
                   samtools view -hb %(infile)s
                   -r %(track_tumor)s > %(outfile_tumor)s; checkpoint ;
                   samtools index %(outfile)s;
                   samtools index %(outfile_tumor)s; checkpoint;'''
    P.run()

    IOTools.zapFile(infile)
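
# Worked example of the track naming (hypothetical sample ids, assuming
# sample_control="Control" and sample_tumour="Tumour"): for the merged
# input "S1-Control.realigned.bqsr.bam",
#   track       -> "S1-Control.bqsr"
#   track_tumor -> "S1-Tumour.bqsr"
# which match the read-group ids attached in mergeSampleBams(), so the two
# 'samtools view -r' calls partition the merged bam back into control and
# tumour samples.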
def clean(files, logfile):
    '''clean up files given by glob expressions.

    Files are cleaned up by zapping, i.e. the files are set to size
    0. Links to files are replaced with place-holders.

    Information about the original file is written to `logfile`.

    Arguments
    ---------
    files : list
        List of glob expressions of files to clean up.
    logfile : string
        Filename of logfile.
    '''
    fields = ('st_atime', 'st_blksize', 'st_blocks',
              'st_ctime', 'st_dev', 'st_gid', 'st_ino',
              'st_mode', 'st_mtime', 'st_nlink',
              'st_rdev', 'st_size', 'st_uid')

    dry_run = PARAMS.get("dryrun", False)

    if not dry_run:
        if not os.path.exists(logfile):
            outfile = IOTools.openFile(logfile, "w")
            outfile.write("filename\tzapped\tlinkdest\t%s\n" %
                          "\t".join(fields))
        else:
            outfile = IOTools.openFile(logfile, "a")

    c = E.Counter()
    for fn in files:
        c.files += 1
        if not dry_run:
            stat, linkdest = IOTools.zapFile(fn)
            if stat is not None:
                c.zapped += 1
                if linkdest is not None:
                    c.links += 1
                outfile.write("%s\t%s\t%s\t%s\n" % (
                    fn,
                    time.asctime(time.localtime(time.time())),
                    linkdest,
                    "\t".join([str(getattr(stat, x)) for x in fields])))

    E.info("zapped: %s" % (c))
    # the logfile is only opened when not in dry-run mode
    if not dry_run:
        outfile.close()

    return c
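
# Illustrative usage (hypothetical patterns): zap realigned intermediates
# once the pipeline has finished with them, recording their original stat
# fields in "clean.log".
#
#   import glob
#   counts = clean(glob.glob("*.realign.bqsr.bam"), "clean.log")
#   E.info("cleaned %i files" % counts.files)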