def buildGeneLevelReadCounts(infiles, outfile):
    '''Tabulate per-gene read counts and exon read coverage.

    *infiles* unpacks to ``(bamfile, exons)``. The exons file is
    decompressed with ``zcat`` and piped through ``gtf2table.py`` using
    the ``genes`` reporter; the table is gzip-compressed into *outfile*
    and a log is written to ``<outfile>.log``.

    The command template is stored in the local ``statement`` and
    ``P.run()`` is called without arguments; the placeholders are
    presumably resolved from this function's local variables and the
    pipeline configuration, so the local names must stay as they are.
    '''
    bamfile, exons = infiles

    # count read pairs for paired-end data, single reads otherwise
    counter = 'readpair-counts' if BamTools.isPaired(bamfile) else 'read-counts'

    # ignore multi-mapping reads (--multi-mapping=ignore)
    statement = (
        ' zcat %(exons)s '
        '| python %(scriptsdir)s/gtf2table.py '
        '--reporter=genes '
        '--bam-file=%(bamfile)s '
        '--counter=length '
        '--prefix="exons_" '
        '--counter=%(counter)s '
        '--prefix="" '
        '--counter=read-coverage '
        '--prefix=coverage_ '
        '--min-mapping-quality=%(counting_min_mapping_quality)i '
        '--multi-mapping=ignore '
        '--log=%(outfile)s.log '
        '| gzip > %(outfile)s '
    )
    P.run()
def buildGeneLevelReadCounts(infiles, outfile):
    '''Tabulate per-gene read counts and exon read coverage.

    *infiles* unpacks to ``(bamfile, exons)``. The exons file is
    decompressed with ``zcat`` and piped through ``gtf2table.py`` using
    the ``genes`` reporter; the table is gzip-compressed into *outfile*
    and a log is written to ``<outfile>.log``.

    The command template is stored in the local ``statement`` and
    ``P.run()`` is called without arguments; the placeholders are
    presumably resolved from this function's local variables and the
    pipeline configuration, so the local names must stay as they are.
    '''
    bamfile, exons = infiles

    # count read pairs for paired-end data, single reads otherwise
    counter = 'readpair-counts' if BamTools.isPaired(bamfile) else 'read-counts'

    # ignore multi-mapping reads (--multi-mapping=ignore)
    statement = (
        ' zcat %(exons)s '
        '| python %(scriptsdir)s/gtf2table.py '
        '--reporter=genes '
        '--bam-file=%(bamfile)s '
        '--counter=length '
        '--prefix="exons_" '
        '--counter=%(counter)s '
        '--prefix="" '
        '--counter=read-coverage '
        '--prefix=coverage_ '
        '--min-mapping-quality=%(counting_min_mapping_quality)i '
        '--multi-mapping=ignore '
        '--log=%(outfile)s.log '
        '| gzip > %(outfile)s '
    )
    P.run()
def runFeatureCounts(annotations_file, bamfile, outfile, nthreads=4, strand=2, options=""):
    '''Count reads per feature with featureCounts.

    The gzipped *annotations_file* is unpacked into a temporary
    directory and featureCounts is run on *bamfile*. For paired-end
    data the bam file is first sorted by read name into the temporary
    directory and paired-end counting (``-p -B``) is enabled. The
    ``.gz`` suffix is stripped from *outfile* for the featureCounts run
    and the result is gzip-compressed afterwards.

    Arguments
    ---------
    annotations_file : string
        Gzipped annotation file passed to featureCounts via ``-a``.
    bamfile : string
        Input file in bam format.
    outfile : string
        Output filename, ending in ``.gz``.
    nthreads : int
        Threads for ``samtools sort`` and featureCounts; also assigned
        to the local ``job_threads``.
    strand : int
        Value for the featureCounts ``-s`` option.
    options : string
        Additional options handed to featureCounts verbatim.
    '''
    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")

    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # single-end defaults, overridden below for paired-end input
    paired_options = ""
    paired_processing = ""

    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # -p -B: count fragments rather than reads, and both reads
        # must map to the feature
        paired_options = "-p -B"
        # samtools sort is given an output prefix: bam_tmp without .bam
        bam_prefix = P.snip(bam_tmp, ".bam")
        # sort by read name
        paired_processing = (
            "samtools sort -@ %(nthreads)i -n %(bamfile)s %(bam_prefix)s; "
            "checkpoint; " % locals())
        bamfile = bam_tmp

    # presumably read by P.run() to request threads for the job
    job_threads = nthreads

    # AH: what is the -b option doing?
    statement = (
        'mkdir %(tmpdir)s; '
        'zcat %(annotations_file)s > %(annotations_tmp)s; '
        'checkpoint; '
        '%(paired_processing)s '
        'featureCounts %(options)s '
        '-T %(nthreads)i '
        '-s %(strand)s '
        '-b '
        '-a %(annotations_tmp)s '
        '%(paired_options)s '
        '-o %(outfile)s '
        '%(bamfile)s '
        '>& %(outfile)s.log; '
        'checkpoint; '
        'gzip -f %(outfile)s; '
        'checkpoint; '
        'rm -rf %(tmpdir)s '
    )
    P.run()
def buildTranscriptLevelReadCounts(infiles, outfile):
    '''Tabulate per-transcript read counts and read coverage.

    *infiles* unpacks to ``(bamfile, geneset)``. The gene set is
    decompressed with ``zcat`` and piped through ``gtf2table.py`` using
    the ``transcripts`` reporter; the table is gzip-compressed into
    *outfile* and a log is written to ``<outfile>.log``.

    .. note::
       For paired-end bam files the ``readpair-counts`` counter is
       selected, otherwise ``read-counts``.
       NOTE(review): an earlier note here said that each mate is
       counted in paired-end data, giving roughly twice the fragment
       count - confirm against the ``readpair-counts`` counter.

    The command template is stored in the local ``statement`` and
    ``P.run()`` is called without arguments; the placeholders are
    presumably resolved from this function's local variables and the
    pipeline configuration, so the local names must stay as they are.
    '''
    bamfile, geneset = infiles

    counter = 'readpair-counts' if BamTools.isPaired(bamfile) else 'read-counts'

    statement = (
        ' zcat %(geneset)s '
        '| python %(scriptsdir)s/gtf2table.py '
        '--reporter=transcripts '
        '--bam-file=%(bamfile)s '
        '--counter=length '
        '--prefix="exons_" '
        '--counter=%(counter)s '
        '--prefix="" '
        '--counter=read-coverage '
        '--prefix=coverage_ '
        '--min-mapping-quality=%(counting_min_mapping_quality)i '
        '--multi-mapping=ignore '
        '--log=%(outfile)s.log '
        '| gzip > %(outfile)s '
    )
    P.run()
def bamToBed(infile, outfile, min_insert_size=0, max_insert_size=1000,
             scriptsdir="/ifs/devel/andreas/cgat/scripts"):
    """Convert a bam file to bed format.

    Paired-end files are converted with ``bam2bed.py`` (pairs merged,
    filtered by insert size, six-column bed including the strand);
    all other files are converted with the ``bamToBed`` executable.

    Arguments
    ---------
    infile : string
        Input file in bam format.
    outfile : string
        Output file in bed format. For paired-end input a log is
        written to ``<outfile>.log``.
    min_insert_size : int
        Passed as ``--min-insert-size`` (paired-end input only).
    max_insert_size : int
        Passed as ``--max-insert-size`` (paired-end input only).
    scriptsdir : string
        Directory containing ``bam2bed.py``. Defaults to the location
        that used to be hard-coded in this function.

    Returns
    -------
    outfile : string
        The *outfile* argument.

    Raises
    ------
    OSError
        If the child process was terminated by a signal.
    """
    if BamTools.isPaired(infile):
        # output strand as well
        # Built as one plain string: with shell=True a string is the
        # recommended argument form, and the log/error messages show
        # the command itself rather than a list repr.
        statement = (
            "cat %(infile)s "
            "| python %(scriptsdir)s/bam2bed.py "
            "--merge-pairs "
            "--min-insert-size=%(min_insert_size)i "
            "--max-insert-size=%(max_insert_size)i "
            "--log=%(outfile)s.log "
            "--bed-format=6 "
            "> %(outfile)s" % locals()
        )
    else:
        statement = "bamToBed -i %(infile)s > %(outfile)s" % locals()

    E.debug("executing statement '%s'" % statement)

    retcode = subprocess.call(statement, cwd=os.getcwd(), shell=True)
    # NOTE(review): only termination by signal (negative return code)
    # is treated as an error; a positive exit status is not checked.
    if retcode < 0:
        raise OSError("Child was terminated by signal %i: \n%s\n" %
                      (-retcode, statement))

    return outfile
def isPaired(filename):
    '''Report as a flag string whether *filename* holds paired-end reads.

    Returns ``"T"`` if ``BamTools.isPaired(filename)`` is true and
    ``"F"`` otherwise.
    '''
    return "T" if BamTools.isPaired(filename) else "F"
def bamToBed(infile, outfile, min_insert_size=0, max_insert_size=1000,
             scriptsdir="/ifs/devel/andreas/cgat/scripts"):
    '''Convert a bam file to bed format.

    Paired-end files are converted with ``bam2bed.py`` (pairs merged,
    filtered by insert size, six-column bed including the strand);
    all other files are converted with the ``bamToBed`` executable.

    Arguments
    ---------
    infile : string
        Input file in bam format.
    outfile : string
        Output file in bed format. For paired-end input a log is
        written to ``<outfile>.log``.
    min_insert_size : int
        Passed as ``--min-insert-size`` (paired-end input only).
    max_insert_size : int
        Passed as ``--max-insert-size`` (paired-end input only).
    scriptsdir : string
        Directory containing ``bam2bed.py``. Defaults to the location
        that used to be hard-coded in this function.

    Returns
    -------
    outfile : string
        The *outfile* argument.

    Raises
    ------
    OSError
        If the child process was terminated by a signal.
    '''
    if BamTools.isPaired(infile):
        # output strand as well
        # Built as one plain string: with shell=True a string is the
        # recommended argument form, and the log/error messages show
        # the command itself rather than a list repr.
        statement = (
            'cat %(infile)s '
            '| python %(scriptsdir)s/bam2bed.py '
            '--merge-pairs '
            '--min-insert-size=%(min_insert_size)i '
            '--max-insert-size=%(max_insert_size)i '
            '--log=%(outfile)s.log '
            '--bed-format=6 '
            '> %(outfile)s' % locals()
        )
    else:
        statement = "bamToBed -i %(infile)s > %(outfile)s" % locals()

    E.debug("executing statement '%s'" % statement)

    retcode = subprocess.call(statement, cwd=os.getcwd(), shell=True)
    # NOTE(review): only termination by signal (negative return code)
    # is treated as an error; a positive exit status is not checked.
    if retcode < 0:
        raise OSError("Child was terminated by signal %i: \n%s\n" %
                      (-retcode, statement))

    return outfile
def isPaired(filename):
    '''Report as a flag string whether *filename* holds paired-end reads.

    Returns ``"T"`` if ``BamTools.isPaired(filename)`` is true and
    ``"F"`` otherwise.
    '''
    return "T" if BamTools.isPaired(filename) else "F"
def buildTranscriptLevelReadCounts(infiles, outfile):
    '''Tabulate per-transcript read counts and read coverage.

    *infiles* unpacks to ``(bamfile, geneset)``. The gene set is
    decompressed with ``zcat`` and piped through ``gtf2table.py`` using
    the ``transcripts`` reporter; the table is gzip-compressed into
    *outfile* and a log is written to ``<outfile>.log``.

    .. note::
       For paired-end bam files the ``readpair-counts`` counter is
       selected, otherwise ``read-counts``.
       NOTE(review): an earlier note here said that each mate is
       counted in paired-end data, giving roughly twice the fragment
       count - confirm against the ``readpair-counts`` counter.

    The command template is stored in the local ``statement`` and
    ``P.run()`` is called without arguments; the placeholders are
    presumably resolved from this function's local variables and the
    pipeline configuration, so the local names must stay as they are.
    '''
    bamfile, geneset = infiles

    counter = 'readpair-counts' if BamTools.isPaired(bamfile) else 'read-counts'

    statement = (
        ' zcat %(geneset)s '
        '| python %(scriptsdir)s/gtf2table.py '
        '--reporter=transcripts '
        '--bam-file=%(bamfile)s '
        '--counter=length '
        '--prefix="exons_" '
        '--counter=%(counter)s '
        '--prefix="" '
        '--counter=read-coverage '
        '--prefix=coverage_ '
        '--min-mapping-quality=%(counting_min_mapping_quality)i '
        '--multi-mapping=ignore '
        '--log=%(outfile)s.log '
        '| gzip > %(outfile)s '
    )
    P.run()
def runRMATS(gtffile, designfile, pvalue, strand, outdir, permute=0):
    '''Module to generate rMATS statement

    Writes the comma-separated bam file lists of the first two groups
    of the design to ``<outdir>/b1.txt`` and ``<outdir>/b2.txt`` and
    runs rMATS on them. The module offers the option to permute the
    group labels. The read length handed to rMATS is estimated from
    the bam file of the first sample in the design only, and
    paired-end mode is likewise decided from that first bam file.

    Arguments
    ---------
    gtffile: string
        path to :term:`gtf` file
    designfile: string
        path to design file
    pvalue: string
        threshold for FDR testing
    strand: string
        strandedness option: can be 'fr-unstranded', 'fr-firststrand',
        or 'fr-secondstrand'
    outdir: string
        directory path for rMATS results
    permute : 1 or 0
        option to activate random shuffling of sample groups
    '''
    design = Expression.ExperimentalDesign(designfile)

    if permute == 1:
        # Draw one uniformly random reordering of the group labels.
        # random.sample() does this in O(n); enumerating every
        # permutation just to pick one needs n! tuples in memory.
        labels = list(design.table.group)
        design.table.group = random.sample(labels, len(labels))

    # one file listing per group: b1.txt <- groups[0], b2.txt <- groups[1]
    for index, listing in enumerate(("b1.txt", "b2.txt")):
        bamfiles = ",".join(
            ["%s.bam" % x
             for x in design.getSamplesInGroup(design.groups[index])])
        with open(outdir + "/" + listing, "w") as f:
            f.write(bamfiles)

    first_bam = design.samples[0] + ".bam"
    readlength = BamTools.estimateTagSize(first_bam)

    statement = (
        'rMATS '
        '--b1 %(outdir)s/b1.txt '
        '--b2 %(outdir)s/b2.txt '
        '--gtf <(gunzip -c %(gtffile)s) '
        '--od %(outdir)s '
        '--readLength %(readlength)s '
        '--cstat %(pvalue)s '
        '--libType %(strand)s ' % locals()
    )

    # if Paired End Reads
    if BamTools.isPaired(first_bam):
        statement += '-t paired'

    # left un-interpolated here; presumably filled in by P.run()
    statement += ''' > %(outdir)s/%(designfile)s.log '''

    P.run()
def SPMRWithMACS2(infile, outfile):
    '''Calculate signal per million reads with MACS2, output bedGraph

    Runs ``macs2 callpeak`` with ``--bdg --SPMR`` on *infile*, using
    as control the file whose name is obtained by replacing
    ``-sample`` with ``-WCE`` in *infile*. Results go to ``macs2.dir``
    and the combined stdout/stderr of MACS2 is written to *outfile*.

    Paired-end bam files are run with ``--format=BAMPE``; all others
    with ``--format=BAM`` and ``--tsize`` set from
    ``PARAMS["macs2_fragment_size"]``.

    Arguments
    ---------
    infile : string
        Treatment file in bam format.
    outfile : string
        Log file, expected to end in ``.Macs2SPMR.log``; the basename
        without this suffix is used as the MACS2 ``--name``.
    '''
    # --SPMR ask MACS2 to generate pileup signal file of
    # 'fragment pileup per million reads'
    sample = infile
    WCE = sample.replace("-sample", "-WCE")
    name = P.snip(outfile, ".Macs2SPMR.log").split("/")[-1]
    fragment_size = PARAMS["macs2_fragment_size"]

    # presumably read by P.run() as the job's memory request
    job_memory = "10G"

    if BamTools.isPaired(sample):
        statement = (
            'macs2 callpeak '
            '--format=BAMPE '
            '--treatment %(sample)s '
            '--verbose=10 '
            '--name=%(name)s '
            '--outdir=macs2.dir '
            '--qvalue=0.1 '
            '--bdg '
            '--SPMR '
            '--control %(WCE)s '
            '--mfold 5 50 '
            '--gsize 1.87e9 '
            '>& %(outfile)s' % locals()
        )
    else:
        statement = (
            'macs2 callpeak '
            '--format=BAM '
            '--treatment %(sample)s '
            '--verbose=10 '
            '--name=%(name)s '
            '--outdir=macs2.dir '
            '--qvalue=0.1 '
            '--bdg '
            '--SPMR '
            '--control %(WCE)s '
            '--tsize %(fragment_size)s '
            '--mfold 5 50 '
            '--gsize 1.87e9 '
            '>& %(outfile)s' % locals()
        )

    # single-argument call form: valid in both Python 2 and Python 3
    print(statement)
    P.run()
def countDEXSeq(infiles, outfile):
    '''Create counts for DEXSeq.

    Counts bam reads against exon features in a flattened gtf by
    running ``dexseq_count.py`` from ``PYTHONSCRIPTSDIR``. The script
    is provided by DEXSeq and uses HTSeqCounts.

    Parameters
    ----------
    infiles[0] : string
        :term:`bam` file input
    infiles[1] : string
        :term:`gff` output from buildGff function
    outfile : string
        A :term:`txt` file containing results
    DEXSeq_strandedness : string
        :term:`PARAMS`. Specifies strandedness, options
        are 'yes', 'no' and 'reverse'
    '''
    infile, gfffile = infiles
    ps = PYTHONSCRIPTSDIR

    # value for the -p (paired) option of dexseq_count.py
    paired = "yes" if BamTools.isPaired(infile) else "no"

    strandedness = PARAMS["DEXSeq_strandedness"]

    # placeholders are presumably filled in by P.run() from the locals
    statement = (
        'python %(ps)s/dexseq_count.py '
        '-p %(paired)s '
        '-s %(strandedness)s '
        '-r pos '
        '-f bam '
        '%(gfffile)s '
        '%(infile)s '
        '%(outfile)s'
    )
    P.run()
def countDEXSeq(infiles, outfile):
    '''Create counts for DEXSeq.

    Counts bam reads against exon features in a flattened gtf by
    running ``dexseq_count.py`` from ``PYTHONSCRIPTSDIR``. The script
    is provided by DEXSeq and uses HTSeqCounts.

    Parameters
    ----------
    infiles[0] : string
        :term:`bam` file input
    infiles[1] : string
        :term:`gff` output from buildGff function
    outfile : string
        A :term:`txt` file containing results
    DEXSeq_strandedness : string
        :term:`PARAMS`. Specifies strandedness, options
        are 'yes', 'no' and 'reverse'
    '''
    infile, gfffile = infiles
    ps = PYTHONSCRIPTSDIR

    # value for the -p (paired) option of dexseq_count.py
    paired = "yes" if BamTools.isPaired(infile) else "no"

    strandedness = PARAMS["DEXSeq_strandedness"]

    # placeholders are presumably filled in by P.run() from the locals
    statement = (
        'python %(ps)s/dexseq_count.py '
        '-p %(paired)s '
        '-s %(strandedness)s '
        '-r pos '
        '-f bam '
        '%(gfffile)s '
        '%(infile)s '
        '%(outfile)s'
    )
    P.run()
def convertReadsToIntervals(bamfile, bedfile, filtering_quality=None, filtering_dedup=None, filtering_dedup_method='picard'):
    '''convert reads in *bamfile* to *intervals*.

    This method converts read data into intervals for counting based
    methods. This method is not appropriate for RNA-Seq.

    The filtering steps below are chained through intermediate bam
    files in a temporary directory; the final bed file is sorted,
    compressed with bgzip and indexed with tabix.

    Optional steps include:

    * quality score filtering - remove reads below a certain
      quality score.
    * deduplication - remove duplicate reads
    * paired ended data - merge pairs
    * paired ended data - filter by insert size

    Arguments
    ---------
    bamfile : string
        Filename of input file in :term:`bam` format.
    bedfile : string
        Filename of output file in :term:`bed` format.
    filtering_quality : int
        If set to a value above 0, remove reads with a quality score
        below this threshold.
    filtering_dedup : object
        Deduplication is performed whenever this is not None.
    filtering_dedup_method : string
        Deduplication method, ``picard`` or ``samtools``.

    Raises
    ------
    ValueError
        If deduplication is requested with an unknown method.
    '''
    # Fail early: an unknown method would otherwise add no command
    # but still make the next step read a file that is never written.
    if filtering_dedup is not None and \
            filtering_dedup_method not in ('samtools', 'picard'):
        raise ValueError(
            "unknown deduplication method '%s'" % filtering_dedup_method)

    # not used below; kept because P.snip is applied to the suffix
    track = P.snip(bedfile, ".bed.gz")

    is_paired = BamTools.isPaired(bamfile)
    current_file = bamfile
    tmpdir = P.getTempFilename()
    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    # explicit None test: ``None > 0`` raises TypeError on Python 3
    if filtering_quality is not None and filtering_quality > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        # %%(bedfile)s survives this interpolation as %(bedfile)s
        statement.append(
            "samtools view -q %(filtering_quality)i -b %(current_file)s "
            "2>> %%(bedfile)s.log "
            "> %(next_file)s " % locals())
        nfiles += 1
        current_file = next_file

    if filtering_dedup is not None:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        if filtering_dedup_method == 'samtools':
            # Explicit input/output files: with "- -" nothing was ever
            # written to next_file. -s selects single-end mode.
            rmdup_options = "" if is_paired else "-s"
            statement.append(
                "samtools rmdup %(rmdup_options)s "
                "%(current_file)s %(next_file)s "
                "2>> %%(bedfile)s.log " % locals())
        elif filtering_dedup_method == 'picard':
            statement.append(
                "MarkDuplicates INPUT=%(current_file)s "
                "OUTPUT=%(next_file)s "
                "ASSUME_SORTED=TRUE "
                "METRICS_FILE=%(bedfile)s.duplicate_metrics "
                "REMOVE_DUPLICATES=TRUE "
                "VALIDATION_STRINGENCY=SILENT "
                "2>> %%(bedfile)s.log " % locals())
        nfiles += 1
        current_file = next_file

    # paired-end data: merge pairs and filter by insert size
    if is_paired:
        bam2bed_options = (
            "--merge-pairs "
            "--min-insert-size=%(filtering_min_insert_size)i "
            "--max-insert-size=%(filtering_max_insert_size)i ")
    else:
        bam2bed_options = ""

    # not interpolated here; placeholders are presumably resolved by
    # P.run() from the locals and the pipeline configuration
    statement.append(
        "cat %(current_file)s "
        "| python %(scriptsdir)s/bam2bed.py "
        + bam2bed_options +
        "--log=%(bedfile)s.log "
        "- "
        "| python %(scriptsdir)s/bed2bed.py "
        "--method=sanitize-genome "
        "--genome-file=%(genome_dir)s/%(genome)s "
        "--log=%(bedfile)s.log "
        "| cut -f 1,2,3,4 "
        "| sort -k1,1 -k2,2n "
        "| bgzip > %(bedfile)s")

    statement.append("tabix -p bed %(bedfile)s")
    statement.append("rm -rf %(tmpdir)s")
    statement = " ; ".join(statement)
    P.run()

    # The statement ends with ``rm -rf`` on tmpdir, so the path is
    # normally gone already; an unconditional unlink would then raise.
    if os.path.exists(tmpdir):
        os.unlink(tmpdir)
def runFeatureCounts(annotations_file, bamfile, outfile, job_threads=4, strand=0, options=""):
    '''Collect read counts per feature with featureCounts.

    The gzipped *annotations_file* is unpacked into a temporary
    directory and featureCounts is run on *bamfile*. If *bamfile* is
    paired, it is first sorted by read name into the temporary
    directory and paired-end counting (``-p -B``) is enabled. The
    ``.gz`` suffix is stripped from *outfile* for the featureCounts
    run and the result is gzip-compressed afterwards.

    Arguments
    ---------
    annotations_file : string
        Filename with gene set in :term:`gtf` format.
    bamfile : string
        Filename with short reads in :term:`bam` format.
    outfile : string
        Output filename in :term:`tsv` format.
    job_threads : int
        Number of threads to use.
    strand : int
        Strand option in FeatureCounts.
    options : string
        Options to pass on to FeatureCounts.
    '''
    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")

    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # single-end defaults, overridden below for paired-end input
    paired_options = ""
    paired_processing = ""

    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # -p -B: count fragments rather than reads, and both reads
        # must map to the feature
        paired_options = "-p -B"
        # sort by read name, writing the sorted copy to bam_tmp
        paired_processing = (
            "samtools sort -@ %(job_threads)i -n -o %(bam_tmp)s %(bamfile)s; "
            "checkpoint; " % locals())
        bamfile = bam_tmp

    # AH: what is the -b option doing?
    statement = (
        'mkdir %(tmpdir)s; '
        'zcat %(annotations_file)s > %(annotations_tmp)s; '
        'checkpoint; '
        '%(paired_processing)s '
        'featureCounts %(options)s '
        '-T %(job_threads)i '
        '-s %(strand)s '
        '-a %(annotations_tmp)s '
        '%(paired_options)s '
        '-o %(outfile)s '
        '%(bamfile)s '
        '>& %(outfile)s.log; '
        'checkpoint; '
        'gzip -f %(outfile)s; '
        'checkpoint; '
        'rm -rf %(tmpdir)s '
    )
    P.run()
def runFeatureCounts(annotations_file, bamfile, outfile, nthreads=4, strand=2, options=""):
    '''run feature counts on *annotations_file* with *bam_file*.

    If the bam-file is paired, paired-end counting is enabled and the
    bam file automatically sorted by read name into a temporary
    directory. The ``.gz`` suffix is stripped from *outfile* for the
    featureCounts run and the result is gzip-compressed afterwards.

    Arguments
    ---------
    annotations_file : string
        Gzipped annotation file passed to featureCounts via ``-a``.
    bamfile : string
        Input file in bam format.
    outfile : string
        Output filename, ending in ``.gz``.
    nthreads : int
        Threads for ``samtools sort`` and featureCounts; also used to
        build the local ``job_options``.
    strand : int
        Value for the featureCounts ``-s`` option.
    options : string
        Additional options handed to featureCounts verbatim.
    '''
    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")

    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    # Use only the file name: joining the full path would point into
    # non-existent sub-directories of tmpdir, and for an absolute path
    # os.path.join() would return the input file itself.
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # select paired end mode, additional options
        paired_options = "-p -B"
        # remove .bam extension
        bam_prefix = P.snip(bam_tmp, ".bam")
        # sort by read name
        paired_processing = (
            "samtools sort -@ %(nthreads)i -n %(bamfile)s %(bam_prefix)s; "
            "checkpoint; " % locals())
        bamfile = bam_tmp
    else:
        paired_options = ""
        paired_processing = ""

    # presumably read by P.run() as options for job submission
    job_options = "-pe dedicated %i" % nthreads

    # AH: what is the -b option doing?
    statement = (
        'mkdir %(tmpdir)s; '
        'zcat %(annotations_file)s > %(annotations_tmp)s; '
        'checkpoint; '
        '%(paired_processing)s '
        'featureCounts %(options)s '
        '-T %(nthreads)i '
        '-s %(strand)s '
        '-b '
        '-a %(annotations_tmp)s '
        '%(paired_options)s '
        '-o %(outfile)s '
        '%(bamfile)s '
        '>& %(outfile)s.log; '
        'checkpoint; '
        'gzip -f %(outfile)s; '
        'checkpoint; '
        'rm -rf %(tmpdir)s '
    )
    P.run()
def runFeatureCounts(annotations_file, bamfile, outfile, job_threads=4, strand=0, options=""):
    '''Collect read counts per feature with featureCounts.

    The gzipped *annotations_file* is unpacked into a temporary
    directory and featureCounts is run on *bamfile*. If *bamfile* is
    paired, it is first sorted by read name into the temporary
    directory and paired-end counting (``-p -B``) is enabled. The
    ``.gz`` suffix is stripped from *outfile* for the featureCounts
    run and the result is gzip-compressed afterwards.

    Arguments
    ---------
    annotations_file : string
        Filename with gene set in :term:`gtf` format.
    bamfile : string
        Filename with short reads in :term:`bam` format.
    outfile : string
        Output filename in :term:`tsv` format.
    job_threads : int
        Number of threads to use.
    strand : int
        Strand option in FeatureCounts.
    options : string
        Options to pass on to FeatureCounts.
    '''
    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")

    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # single-end defaults, overridden below for paired-end input
    paired_options = ""
    paired_processing = ""

    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # -p -B: count fragments rather than reads, and both reads
        # must map to the feature
        paired_options = "-p -B"
        # samtools sort is given an output prefix: bam_tmp without .bam
        bam_prefix = P.snip(bam_tmp, ".bam")
        # sort by read name
        paired_processing = (
            "samtools sort -@ %(job_threads)i -n %(bamfile)s %(bam_prefix)s; "
            "checkpoint; " % locals())
        bamfile = bam_tmp

    # AH: what is the -b option doing?
    statement = (
        'mkdir %(tmpdir)s; '
        'zcat %(annotations_file)s > %(annotations_tmp)s; '
        'checkpoint; '
        '%(paired_processing)s '
        'featureCounts %(options)s '
        '-T %(job_threads)i '
        '-s %(strand)s '
        '-a %(annotations_tmp)s '
        '%(paired_options)s '
        '-o %(outfile)s '
        '%(bamfile)s '
        '>& %(outfile)s.log; '
        'checkpoint; '
        'gzip -f %(outfile)s; '
        'checkpoint; '
        'rm -rf %(tmpdir)s '
    )
    P.run()
def convertReadsToIntervals(bamfile, bedfile, filtering_quality=None, filtering_dedup=None, filtering_dedup_method='picard', filtering_nonunique=False):
    '''convert reads in *bamfile* to *intervals*.

    This method converts read data into intervals for counting based
    methods. This method is not appropriate for RNA-Seq.

    The optional filtering steps are chained through intermediate bam
    files in a temporary directory; the final bed file is sorted,
    compressed with bgzip and indexed with tabix.

    For paired end data, pairs are merged and filtered by insert size.

    Arguments
    ---------
    bamfile : string
        Filename of input file in :term:`bam` format.
    bedfile : string
        Filename of output file in :term:`bed` format.
    filtering_quality : int
        If set to a value above 0, remove reads with a quality score
        below this threshold.
    filtering_dedup : object
        Deduplication is performed whenever this is not None
        (note that this includes False).
    filtering_dedup_method : string
        Deduplication method. Possible options are ``picard`` and
        ``samtools``.
    filtering_nonunique : bool
        If True, remove non-uniquely matching reads.

    Raises
    ------
    ValueError
        If deduplication is requested with an unknown method.
    '''
    # Fail early: an unknown method would otherwise add no command
    # but still make the next step read a file that is never written.
    if filtering_dedup is not None and \
            filtering_dedup_method not in ('samtools', 'picard'):
        raise ValueError(
            "unknown deduplication method '%s'" % filtering_dedup_method)

    # not used below; kept because P.snip is applied to the suffix
    track = P.snip(bedfile, ".bed.gz")

    is_paired = BamTools.isPaired(bamfile)
    current_file = bamfile
    tmpdir = P.getTempFilename()
    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    # explicit None test: ``None > 0`` raises TypeError on Python 3
    if filtering_quality is not None and filtering_quality > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        # %%(bedfile)s survives this interpolation as %(bedfile)s
        statement.append(
            "samtools view -q %(filtering_quality)i -b %(current_file)s "
            "2>> %%(bedfile)s.log "
            "> %(next_file)s " % locals())
        nfiles += 1
        current_file = next_file

    if filtering_nonunique:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append(
            "cat %(current_file)s "
            "| python %%(scriptsdir)s/bam2bam.py "
            "--method=filter "
            "--filter-method=unique,mapped "
            "--log=%%(bedfile)s.log "
            "> %(next_file)s " % locals())
        nfiles += 1
        current_file = next_file

    if filtering_dedup is not None:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        if filtering_dedup_method == 'samtools':
            # Explicit input/output files: with "- -" nothing was ever
            # written to next_file. -s selects single-end mode.
            rmdup_options = "" if is_paired else "-s"
            statement.append(
                "samtools rmdup %(rmdup_options)s "
                "%(current_file)s %(next_file)s "
                "2>> %%(bedfile)s.log " % locals())
        elif filtering_dedup_method == 'picard':
            statement.append(
                "MarkDuplicates INPUT=%(current_file)s "
                "OUTPUT=%(next_file)s "
                "ASSUME_SORTED=TRUE "
                "METRICS_FILE=%(bedfile)s.duplicate_metrics "
                "REMOVE_DUPLICATES=TRUE "
                "VALIDATION_STRINGENCY=SILENT "
                "2>> %%(bedfile)s.log " % locals())
        nfiles += 1
        current_file = next_file

    # paired-end data: merge pairs and filter by insert size
    if is_paired:
        bam2bed_options = (
            "--merge-pairs "
            "--min-insert-size=%(filtering_min_insert_size)i "
            "--max-insert-size=%(filtering_max_insert_size)i ")
    else:
        bam2bed_options = ""

    # not interpolated here; placeholders are presumably resolved by
    # P.run() from the locals and the pipeline configuration
    statement.append(
        "cat %(current_file)s "
        "| python %(scriptsdir)s/bam2bed.py "
        + bam2bed_options +
        "--log=%(bedfile)s.log "
        "- "
        "| python %(scriptsdir)s/bed2bed.py "
        "--method=sanitize-genome "
        "--genome-file=%(genome_dir)s/%(genome)s "
        "--log=%(bedfile)s.log "
        "| cut -f 1,2,3,4 "
        "| sort -k1,1 -k2,2n "
        "| bgzip > %(bedfile)s")

    statement.append("tabix -p bed %(bedfile)s")
    statement.append("rm -rf %(tmpdir)s")
    statement = " ; ".join(statement)
    P.run()

    # The statement ends with ``rm -rf`` on tmpdir, so the path is
    # normally gone already; an unconditional unlink would then raise.
    if os.path.exists(tmpdir):
        os.unlink(tmpdir)
def convertReadsToIntervals(bamfile, bedfile, filtering_quality=None, filtering_dedup=None, filtering_dedup_method='picard'):
    '''convert reads in *bamfile* to *intervals*.

    This method converts read data into intervals for counting based
    methods. This method is not appropriate for RNA-Seq.

    The filtering steps below are chained through intermediate bam
    files in a temporary directory; the final bed file is sorted,
    compressed with bgzip and indexed with tabix.

    Optional steps include:

    * quality score filtering - remove reads below a certain
      quality score.
    * deduplication - remove duplicate reads
    * paired ended data - merge pairs
    * paired ended data - filter by insert size

    Arguments
    ---------
    bamfile : string
        Filename of input file in :term:`bam` format.
    bedfile : string
        Filename of output file in :term:`bed` format.
    filtering_quality : int
        If set to a value above 0, remove reads with a quality score
        below this threshold.
    filtering_dedup : object
        Deduplication is performed whenever this is not None.
    filtering_dedup_method : string
        Deduplication method, ``picard`` or ``samtools``.

    Raises
    ------
    ValueError
        If deduplication is requested with an unknown method.
    '''
    # Fail early: an unknown method would otherwise add no command
    # but still make the next step read a file that is never written.
    if filtering_dedup is not None and \
            filtering_dedup_method not in ('samtools', 'picard'):
        raise ValueError(
            "unknown deduplication method '%s'" % filtering_dedup_method)

    # not used below; kept because P.snip is applied to the suffix
    track = P.snip(bedfile, ".bed.gz")

    is_paired = BamTools.isPaired(bamfile)
    current_file = bamfile
    tmpdir = P.getTempFilename()
    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    # explicit None test: ``None > 0`` raises TypeError on Python 3
    if filtering_quality is not None and filtering_quality > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        # %%(bedfile)s survives this interpolation as %(bedfile)s
        statement.append(
            "samtools view -q %(filtering_quality)i -b %(current_file)s "
            "2>> %%(bedfile)s.log "
            "> %(next_file)s " % locals())
        nfiles += 1
        current_file = next_file

    if filtering_dedup is not None:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        if filtering_dedup_method == 'samtools':
            # Explicit input/output files: with "- -" nothing was ever
            # written to next_file. -s selects single-end mode.
            rmdup_options = "" if is_paired else "-s"
            statement.append(
                "samtools rmdup %(rmdup_options)s "
                "%(current_file)s %(next_file)s "
                "2>> %%(bedfile)s.log " % locals())
        elif filtering_dedup_method == 'picard':
            statement.append(
                "MarkDuplicates INPUT=%(current_file)s "
                "OUTPUT=%(next_file)s "
                "ASSUME_SORTED=TRUE "
                "METRICS_FILE=%(bedfile)s.duplicate_metrics "
                "REMOVE_DUPLICATES=TRUE "
                "VALIDATION_STRINGENCY=SILENT "
                "2>> %%(bedfile)s.log " % locals())
        nfiles += 1
        current_file = next_file

    # paired-end data: merge pairs and filter by insert size
    if is_paired:
        bam2bed_options = (
            "--merge-pairs "
            "--min-insert-size=%(filtering_min_insert_size)i "
            "--max-insert-size=%(filtering_max_insert_size)i ")
    else:
        bam2bed_options = ""

    # not interpolated here; placeholders are presumably resolved by
    # P.run() from the locals and the pipeline configuration
    statement.append(
        "cat %(current_file)s "
        "| python %(scriptsdir)s/bam2bed.py "
        + bam2bed_options +
        "--log=%(bedfile)s.log "
        "- "
        "| python %(scriptsdir)s/bed2bed.py "
        "--method=sanitize-genome "
        "--genome-file=%(genome_dir)s/%(genome)s "
        "--log=%(bedfile)s.log "
        "| cut -f 1,2,3,4 "
        "| sort -k1,1 -k2,2n "
        "| bgzip > %(bedfile)s")

    statement.append("tabix -p bed %(bedfile)s")
    statement.append("rm -rf %(tmpdir)s")
    statement = " ; ".join(statement)
    P.run()

    # The statement ends with ``rm -rf`` on tmpdir, so the path is
    # normally gone already; an unconditional unlink would then raise.
    if os.path.exists(tmpdir):
        os.unlink(tmpdir)
def convertReadsToIntervals(bamfile, bedfile, filtering_quality=None, filtering_dedup=None, filtering_dedup_method='picard', filtering_nonunique=False):
    '''convert reads in *bamfile* to *intervals*.

    This method converts read data into intervals for counting based
    methods. This method is not appropriate for RNA-Seq.

    The optional filtering steps are chained through intermediate bam
    files in a temporary directory, which the generated statement
    removes again; the final bed file is sorted, compressed with bgzip
    and indexed with tabix.

    For paired end data, pairs are merged and filtered by insert size.

    Arguments
    ---------
    bamfile : string
        Filename of input file in :term:`bam` format.
    bedfile : string
        Filename of output file in :term:`bed` format.
    filtering_quality : int
        If set to a value above 0, remove reads with a quality score
        below this threshold.
    filtering_dedup : object
        Deduplication is performed whenever this is not None
        (note that this includes False).
    filtering_dedup_method : string
        Deduplication method. Possible options are ``picard`` and
        ``samtools``.
    filtering_nonunique : bool
        If True, remove non-uniquely matching reads.

    Raises
    ------
    ValueError
        If deduplication is requested with an unknown method.
    '''
    # Fail early: an unknown method would otherwise add no command
    # but still make the next step read a file that is never written.
    if filtering_dedup is not None and \
            filtering_dedup_method not in ('samtools', 'picard'):
        raise ValueError(
            "unknown deduplication method '%s'" % filtering_dedup_method)

    # not used below; kept because P.snip is applied to the suffix
    track = P.snip(bedfile, ".bed.gz")

    is_paired = BamTools.isPaired(bamfile)
    current_file = bamfile
    tmpdir = P.getTempFilename()
    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    # explicit None test: ``None > 0`` raises TypeError on Python 3
    if filtering_quality is not None and filtering_quality > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        # %%(bedfile)s survives this interpolation as %(bedfile)s
        statement.append(
            "samtools view -q %(filtering_quality)i -b %(current_file)s "
            "2>> %%(bedfile)s.log "
            "> %(next_file)s " % locals())
        nfiles += 1
        current_file = next_file

    if filtering_nonunique:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append(
            "cat %(current_file)s "
            "| python %%(scriptsdir)s/bam2bam.py "
            "--method=filter "
            "--filter-method=unique,mapped "
            "--log=%%(bedfile)s.log "
            "> %(next_file)s " % locals())
        nfiles += 1
        current_file = next_file

    if filtering_dedup is not None:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        if filtering_dedup_method == 'samtools':
            # Explicit input/output files: with "- -" nothing was ever
            # written to next_file. -s selects single-end mode.
            rmdup_options = "" if is_paired else "-s"
            statement.append(
                "samtools rmdup %(rmdup_options)s "
                "%(current_file)s %(next_file)s "
                "2>> %%(bedfile)s.log " % locals())
        elif filtering_dedup_method == 'picard':
            statement.append(
                "MarkDuplicates INPUT=%(current_file)s "
                "OUTPUT=%(next_file)s "
                "ASSUME_SORTED=TRUE "
                "METRICS_FILE=%(bedfile)s.duplicate_metrics "
                "REMOVE_DUPLICATES=TRUE "
                "VALIDATION_STRINGENCY=SILENT "
                "2>> %%(bedfile)s.log " % locals())
        nfiles += 1
        current_file = next_file

    # paired-end data: merge pairs and filter by insert size
    if is_paired:
        bam2bed_options = (
            "--merge-pairs "
            "--min-insert-size=%(filtering_min_insert_size)i "
            "--max-insert-size=%(filtering_max_insert_size)i ")
    else:
        bam2bed_options = ""

    # not interpolated here; placeholders are presumably resolved by
    # P.run() from the locals and the pipeline configuration
    statement.append(
        "cat %(current_file)s "
        "| python %(scriptsdir)s/bam2bed.py "
        + bam2bed_options +
        "--log=%(bedfile)s.log "
        "- "
        "| python %(scriptsdir)s/bed2bed.py "
        "--method=sanitize-genome "
        "--genome-file=%(genome_dir)s/%(genome)s "
        "--log=%(bedfile)s.log "
        "| cut -f 1,2,3,4 "
        "| sort -k1,1 -k2,2n "
        "| bgzip > %(bedfile)s")

    statement.append("tabix -p bed %(bedfile)s")
    statement.append("rm -rf %(tmpdir)s")
    statement = " ; ".join(statement)
    P.run()