# Imports assumed from the cgat-core / cgat-apps tooling; the exact module
# paths may differ between installations.
import random

from cgatcore import pipeline as P
import cgat.BamTools.bamtools as BamTools

# PARAMS, PYTHONSCRIPTSDIR and the Expression module are assumed to be
# provided by the enclosing pipeline's configuration machinery.


def bamCoverageRNA(infile, outfile):
    '''Make normalised bigwig tracks with deeptools.'''
    norm_method = PARAMS["deeptools_norm_method"]
    # a STAR MAPQ of 255 indicates a uniquely mapped read
    if len(infile) > 0:
        if BamTools.is_paired(infile):
            # count only the first read of each pair (--samFlagInclude 64)
            # so that fragments are not counted twice
            statement = f'''bamCoverage -b {infile} -o {outfile}
                            --binSize 5
                            --normalizeUsing {norm_method}
                            --samFlagInclude 64
                            --centerReads
                            --minMappingQuality 255
                            --smoothLength 10
                            --skipNAs
                            -p "max" '''
        else:
            # exclude unmapped reads (--samFlagExclude 4)
            statement = f'''bamCoverage -b {infile} -o {outfile}
                            --binSize 5
                            --normalizeUsing {norm_method}
                            --minMappingQuality 255
                            --smoothLength 10
                            --samFlagExclude 4
                            --centerReads
                            -p "max" '''
        P.run(statement, job_memory="2G", job_threads=10)
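# Usage sketch (hypothetical wiring; the upstream task name and file
# suffixes are assumptions, not taken from the source pipeline):
#
# @transform(mapReadsWithSTAR, suffix(".bam"), ".bw")
# def makeNormalisedBigwigs(infile, outfile):
#     bamCoverageRNA(infile, outfile)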
def runRMATS(gtffile, designfile, pvalue, strand, outdir, permute=0):
    '''Generate and run an rMATS statement.

    Offers the option to permute group-name labels and estimates the
    read length, which must be identical across all reads.

    Arguments
    ---------
    gtffile: string
        path to :term:`gtf` file
    designfile: string
        path to design file
    pvalue: string
        threshold for FDR testing
    strand: string
        strandedness option: can be 'fr-unstranded', 'fr-firststrand',
        or 'fr-secondstrand'
    outdir: string
        directory path for rMATS results
    permute : 1 or 0
        option to activate random shuffling of sample groups
    '''

    design = Expression.ExperimentalDesign(designfile)
    if permute == 1:
        # shuffle the group labels to generate a permuted (null) design
        permutelist = design.table.group.tolist()
        random.shuffle(permutelist)
        design.table.group = permutelist
    group1 = ",".join(
        ["%s.bam" % x for x in design.getSamplesInGroup(design.groups[0])])
    with open(outdir + "/b1.txt", "w") as f:
        f.write(group1)
    group2 = ",".join(
        ["%s.bam" % x for x in design.getSamplesInGroup(design.groups[1])])
    with open(outdir + "/b2.txt", "w") as f:
        f.write(group2)
    readlength = BamTools.estimateTagSize(design.samples[0] + ".bam")

    statement = '''rMATS
    --b1 %(outdir)s/b1.txt
    --b2 %(outdir)s/b2.txt
    --gtf <(gunzip -c %(gtffile)s)
    --od %(outdir)s
    --readLength %(readlength)s
    --cstat %(pvalue)s
    --libType %(strand)s
    ''' % locals()

    # add the paired-end flag if the reads are paired
    if BamTools.is_paired(design.samples[0] + ".bam"):
        statement += '''-t paired'''

    statement += ''' > %(outdir)s/%(designfile)s.log '''

    P.run(statement, job_condaenv="splicing")
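# Example call (a sketch; "geneset.gtf.gz", "design.tsv" and "rmats.dir"
# are illustrative names, not from the source pipeline):
#
# runRMATS("geneset.gtf.gz", "design.tsv", pvalue="0.05",
#          strand="fr-unstranded", outdir="rmats.dir")
#
# Setting permute=1 reruns the same comparison with shuffled group
# labels, which can be used to build an empirical null distribution.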
def BAMtotalcounts(infile, outfile):
    '''Count total reads in a BAM file for normalisation.'''
    if BamTools.is_paired(infile):
        # count only reads mapped in proper pairs (-f 2), then divide
        # by two to report fragments rather than reads
        statement = f'''samtools view -f 2 {infile} |
                        wc -l |
                        awk 'BEGIN {{OFS="\\t"}} {{print $0/2}}' > {outfile}'''
    else:
        # exclude unmapped reads (-F 4)
        statement = f'''samtools view -F 4 {infile} |
                        wc -l |
                        awk 'BEGIN {{OFS="\\t"}} {{print $0}}' > {outfile}'''
    P.run(statement)
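# Downstream use (a sketch; the file name is an assumption): read the
# count back and derive a per-million scale factor for normalisation.
#
# with open("sample1.total_counts") as inf:
#     total = float(inf.readline().strip())
# scale_factor = 1e6 / total  # counts-per-million scaling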
def countDEXSeq(infiles, outfile):
    '''Create counts for DEXSeq.

    Counts bam reads against exon features in the flattened gtf.
    The required python script is provided by DEXSeq and uses HTSeq.

    Parameters
    ----------
    infiles[0]: string
        :term:`bam` file input
    infiles[1]: string
        :term:`gff` output from the buildGff function
    outfile : string
        a :term:`txt` file containing results
    DEXSeq_strandedness : string
        :term:`PARAMS`. Specifies strandedness; options are 'yes',
        'no' and 'reverse'
    '''

    infile, gfffile = infiles
    ps = PYTHONSCRIPTSDIR
    if BamTools.is_paired(infile):
        paired = "yes"
    else:
        paired = "no"
    strandedness = PARAMS["DEXSeq_strandedness"]

    statement = '''python %(ps)s/dexseq_count.py
    -p %(paired)s
    -s %(strandedness)s
    -r pos
    -f bam
    %(gfffile)s
    %(infile)s
    %(outfile)s'''
    P.run(statement, job_condaenv="splicing")
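# Usage sketch (hypothetical ruffus wiring; task names and suffixes are
# assumptions for illustration):
#
# @transform(mapReads, suffix(".bam"),
#            add_inputs(buildGff), ".dexseq.txt")
# def runDEXSeqCounts(infiles, outfile):
#     countDEXSeq(infiles, outfile)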
def scoreIntervalsBAM(infiles, outfile):
    '''Count reads in bed intervals.'''
    interval, bam = infiles
    tmp_file = bam.replace(".merge.bam", ".tmp")
    if BamTools.is_paired(bam):
        # -p specifies that only properly paired reads are counted
        options = "-p"
    else:
        options = " "
    statement = f'''bedtools multicov {options} -q 10
                    -bams {bam}
                    -bed <(cut -f1-7 {interval}) > {outfile} &&
                    sed -i '1i \\contig\\tstart\\tend\\tpeak_id\\tpeak_score\\twidth\\tfeature\\ttotal' {outfile}'''
    P.run(statement)
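# Downstream use (a sketch; the file name is an assumption): the scored
# intervals carry the header written by sed above and load directly
# into a table.
#
# import pandas as pd
# counts = pd.read_csv("sample1.interval_counts.tsv", sep="\t")
# counts["cpm"] = counts["total"] / counts["total"].sum() * 1e6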
def convertReadsToIntervals(bamfile,
                            bedfile,
                            filtering_quality=None,
                            filtering_dedup=None,
                            filtering_dedup_method='picard',
                            filtering_nonunique=False):
    '''Convert reads in *bamfile* to intervals in *bedfile*.

    This method converts read data into intervals for counting-based
    methods. It is not appropriate for RNA-seq.

    Optional steps include filtering by mapping quality, deduplication
    and removal of non-uniquely matching reads. For paired-end data,
    pairs are merged and optionally filtered by insert size.

    Arguments
    ---------
    bamfile : string
        Filename of input file in :term:`bam` format.
    bedfile : string
        Filename of output file in :term:`bed` format.
    filtering_quality : int
        If set, remove reads with a quality score below the given
        threshold.
    filtering_dedup : bool
        If True, deduplicate data.
    filtering_dedup_method : string
        Deduplication method. Possible options are ``picard`` and
        ``samtools``.
    filtering_nonunique : bool
        If True, remove non-uniquely matching reads.
    '''
    track = P.snip(bedfile, ".bed.gz")
    is_paired = BamTools.is_paired(bamfile)
    current_file = bamfile
    tmpdir = P.get_temp_filename()
    statement = ["mkdir %(tmpdir)s"]
    nfiles = 0

    # guard against the default of None before comparing
    if filtering_quality is not None and filtering_quality > 0:
        # doubled %% defers interpolation of bedfile to P.run
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''samtools view
        -q %(filtering_quality)i -b %(current_file)s
        2>> %%(bedfile)s.quality.log > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if filtering_nonunique:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''cat %(current_file)s
        | cgat bam2bam
        --method=filter
        --filter-method=unique,mapped
        --log=%%(bedfile)s.nonunique.log
        2> %%(bedfile)s.nonunique.err
        > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if filtering_dedup is not None:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        if filtering_dedup_method == 'samtools':
            # use explicit input/output files so this step fits the
            # file-based chain
            statement.append('''samtools rmdup
            %(current_file)s %(next_file)s ''' % locals())
        elif filtering_dedup_method == 'picard':
            statement.append('''picard MarkDuplicates
            INPUT=%(current_file)s
            OUTPUT=%(next_file)s
            ASSUME_SORTED=TRUE
            METRICS_FILE=%(bedfile)s.duplicate_metrics
            REMOVE_DUPLICATES=TRUE
            VALIDATION_STRINGENCY=SILENT
            >& %%(bedfile)s.markdup.log ''' % locals())
        nfiles += 1
        current_file = next_file

    if is_paired:
        # insert-size limits, genome and genome_dir are resolved from
        # the pipeline's PARAMS by P.run
        statement.append('''cat %(current_file)s
        | cgat bam2bed
        --merge-pairs
        --min-insert-size=%(filtering_min_insert_size)i
        --max-insert-size=%(filtering_max_insert_size)i
        --log=%(bedfile)s.bam2bed.log
        -
        2> %(bedfile)s.bam2bed.err
        | cgat bed2bed
        --method=sanitize-genome
        --genome-file=%(genome_dir)s/%(genome)s
        --log=%(bedfile)s.sanitize.log
        2> %(bedfile)s.sanitize.err
        | cut -f 1,2,3,4
        | sort -k1,1 -k2,2n
        | bgzip > %(bedfile)s''')
    else:
        statement.append('''cat %(current_file)s
        | cgat bam2bed
        --log=%(bedfile)s.bam2bed.log
        -
        2> %(bedfile)s.bam2bed.err
        | cgat bed2bed
        --method=sanitize-genome
        --genome-file=%(genome_dir)s/%(genome)s
        --log=%(bedfile)s.sanitize.log
        2> %(bedfile)s.sanitize.err
        | cut -f 1,2,3,4
        | sort -k1,1 -k2,2n
        | bgzip > %(bedfile)s''')

    statement.append("tabix -p bed %(bedfile)s >& %(bedfile)s.tabix.log")
    statement.append("rm -rf %(tmpdir)s")
    statement = " ; ".join(statement)

    P.run(statement, job_memory="8G")
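# Example call (a sketch; file names are assumptions, and
# filtering_min_insert_size / filtering_max_insert_size, genome and
# genome_dir must be set in the pipeline's PARAMS):
#
# convertReadsToIntervals("sample1.bam", "sample1.bed.gz",
#                         filtering_quality=10,
#                         filtering_dedup=True,
#                         filtering_dedup_method="picard",
#                         filtering_nonunique=True)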