def buildBAMforPeakCalling(infiles, outfile, dedup, mask):
    '''Make a BAM file suitable for peak calling.

    A chain of shell commands is assembled and handed to ``P.run()``:

    1. If several input files are given they are merged with
       ``samtools merge`` and the result is sorted with ``samtools sort``.
    2. If `dedup` is set, duplicate reads are removed with Picard
       ``MarkDuplicates``.
    3. If `mask` is set, reads overlapping the mask are filtered out
       with ``intersectBed -v``.
    4. The last intermediate file is moved to `outfile` and indexed
       with ``samtools index``.

    Note that no step in this function removes unmapped reads.

    Arguments
    ---------
    infiles : str or sequence of str
        A single filename, or several filenames to be merged.
    outfile : str
        Filename of the BAM file to create. Picard metrics and log are
        written to ``<outfile>.picardmetrics`` and
        ``<outfile>.picardlog``.
    dedup : bool
        If true, remove duplicate reads.
    mask : str
        If true-ish, its value is inserted as the ``-b`` argument of
        ``intersectBed``, so it has to render as something that tool
        accepts (presumably a filename of intervals - confirm with
        callers).
    '''

    # NOTE: P.run() is called without arguments. Presumably it picks up
    # the local variable `statement` and fills in %(outfile)s and
    # %(mask)s from this function's locals, so these names must not be
    # renamed - confirm against the P module.
    statement = []

    # NOTE(review): this temporary filename is never used below. It is
    # kept because P.getTempFilename may have side effects; confirm and
    # remove if it merely leaves a stray file behind.
    tmpfile = P.getTempFilename(".")

    # A plain string also has a length > 1, hence the explicit type check.
    if len(infiles) > 1 and not isinstance(infiles, str):
        # assume: samtools merge output is sorted
        # assume: sam files are sorted already
        # The separator is joined with the filenames (sequences have no
        # .join() method).
        statement.append('''samtools merge @OUT@ %s''' % " ".join(infiles))
        statement.append('''samtools sort @IN@ @OUT@''')

    if dedup:
        statement.append(
            "MarkDuplicates"
            " INPUT=@IN@"
            " ASSUME_SORTED=true"
            " REMOVE_DUPLICATES=true"
            " QUIET=true"
            " OUTPUT=@OUT@"
            " METRICS_FILE=%(outfile)s.picardmetrics"
            " VALIDATION_STRINGENCY=SILENT"
            " > %(outfile)s.picardlog ")

    if mask:
        # -v keeps only reads that do NOT overlap the mask
        statement.append(
            '''intersectBed -abam @IN@ -b %(mask)s -wa -v > @OUT@''')

    statement.append('''mv @IN@ %(outfile)s''')
    statement.append('''samtools index %(outfile)s''')

    # NOTE(review): `infiles` is passed on unchanged, even when it is a
    # sequence; verify that P.joinStatements copes with a non-string
    # value here.
    statement = P.joinStatements(statement, infiles)
    P.run()
def filterBamfiles(infile, sentinel):
    """Pre-process bamfiles prior to peak calling.

    i) sort bamfiles
    ii) remove unmapped reads with bam2bam.py
    iii) remove non-uniquely mapping reads with bam2bam.py (optional)
    iv) remove duplicates with Picards MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index

    The optional steps are switched on by the configuration values
    ``filter_remove_non_unique``, ``filter_remove_duplicates`` and
    ``filter_mask_intervals`` in ``PARAMS``. A scratch directory for
    Picard is created below ``PARAMS["scratchdir"]`` and removed again
    once the commands have run, whether or not they succeeded.

    Arguments
    ---------
    infile : str
        Input BAM file; handed to ``P.joinStatements`` as the input of
        the command chain.
    sentinel : str
        File that is touched after the commands have completed. The
        output BAM filename is derived from it by swapping the
        ``.sentinel`` suffix for ``.bam`` (via ``P.snip``).
    """

    # NOTE: P.run() is called without arguments. Presumably it reads
    # `statement` and `job_memory` and fills %(outfile)s, %(mask)s and
    # %(picard_tmp)s from this function's locals, so these local names
    # must be kept - confirm against the P module.
    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    # ensure bamfile is sorted; the sorted reads go to "@OUT@.bam" ...
    statement = ["samtools sort @IN@ -o @OUT@.bam"]

    # remove unmapped reads
    # ... and are picked up here as "@IN@.bam", the same file as seen
    # from the next link of the chain.
    statement.append("cgat bam2bam"
                     " --method=filter --filter-method=mapped"
                     " --log=%(outfile)s.log"
                     " < @IN@.bam"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("cgat bam2bam"
                         " --method=filter --filter-method=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    # NOTE(review): stderr is redirected to the same %(outfile)s.log
    # that the bam2bam steps are told to log to.
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=false"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"
                         " VALIDATION_STRINGENCY=SILENT"
                         " TMP_DIR=%(picard_tmp)s"
                         " 2> %(outfile)s.log")

    # mask regions, if intervals supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        # -v keeps only reads that do NOT overlap the mask intervals
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"
                         " > @OUT@")

    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")

    job_memory = "5G"

    statement = P.joinStatements(statement, infile)

    # create the scratch directory for Picard's MarkDuplicates only now,
    # so that it is always paired with the clean-up below
    picard_tmp = P.getTempDir(PARAMS["scratchdir"])
    try:
        P.run()
        # only reached when the commands succeeded
        P.touch(sentinel)
    finally:
        # do not leave the scratch directory behind if the run fails
        shutil.rmtree(picard_tmp)
def filterBamfiles(infile, sentinel):
    """Pre-process bamfiles prior to peak calling.

    i) sort bamfiles
    ii) remove unmapped reads with bam2bam.py
    iii) remove non-uniquely mapping reads with bam2bam.py (optional)
    iv) remove duplicates with Picards MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index

    The optional steps are switched on by the configuration values
    ``filter_remove_non_unique``, ``filter_remove_duplicates`` and
    ``filter_mask_intervals`` in ``PARAMS``. A scratch directory for
    Picard is created below ``PARAMS["scratchdir"]`` and removed again
    once the commands have run, whether or not they succeeded.

    Arguments
    ---------
    infile : str
        Input BAM file; handed to ``P.joinStatements`` as the input of
        the command chain.
    sentinel : str
        File that is touched after the commands have completed. The
        output BAM filename is derived from it by swapping the
        ``.sentinel`` suffix for ``.bam`` (via ``P.snip``).
    """

    # NOTE: P.run() is called without arguments. Presumably it reads
    # `statement` and `job_options` and fills %(outfile)s, %(mask)s,
    # %(picard_tmp)s and %(scriptsdir)s from this function's locals
    # and/or the pipeline configuration, so the local names must be
    # kept - confirm against the P module.
    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    # ensure bamfile is sorted,
    # NOTE(review): this is the legacy "samtools sort <in> <prefix>"
    # call; the next step reads "@IN@.bam", i.e. it relies on samtools
    # appending ".bam" to the prefix - confirm the samtools version.
    statement = ["samtools sort @IN@ @OUT@"]

    # remove unmapped reads
    statement.append("python %(scriptsdir)s/bam2bam.py"
                     " --method=filter --filter-method=mapped"
                     " --log=%(outfile)s.log"
                     " < @IN@.bam"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("python %(scriptsdir)s/bam2bam.py"
                         " --method=filter --filter-method=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    # NOTE(review): stderr is redirected to the same %(outfile)s.log
    # that the bam2bam steps are told to log to.
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=false"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"
                         " VALIDATION_STRINGENCY=SILENT"
                         " TMP_DIR=%(picard_tmp)s"
                         " 2> %(outfile)s.log")

    # mask regions, if intervals supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        # -v keeps only reads that do NOT overlap the mask intervals
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"
                         " > @OUT@")

    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")

    job_options = "-l mem_free=10G"

    statement = P.joinStatements(statement, infile)

    # create the scratch directory for Picard's MarkDuplicates only now,
    # so that it is always paired with the clean-up below
    picard_tmp = P.getTempDir(PARAMS["scratchdir"])
    try:
        P.run()
        # only reached when the commands succeeded
        P.touch(sentinel)
    finally:
        # do not leave the scratch directory behind if the run fails
        shutil.rmtree(picard_tmp)