def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # variables required for statement built by FastqScreen()
    tempdir = P.getTempDir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # configure job_threads with fastq_screen_options from PARAMS
    job_threads = re.findall(r'--threads \d+', PARAMS['fastq_screen_options'])
    if len(job_threads) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")

    job_threads = int(re.sub(r'--threads ', '', job_threads[0]))
    job_memory = "8G"

    # Create fastq_screen config file in temp directory
    # using parameters from Pipeline.ini
    with IOTools.openFile(os.path.join(tempdir, "fastq_screen.conf"),
                          "w") as f:
        for i, k in list(PARAMS.items()):
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen()
    statement = m.build((infiles,), outfile)
    P.run()
    shutil.rmtree(tempdir)
    P.touch(outfile)
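# A minimal, hypothetical sketch of the "--threads N" extraction used
# above, pulled out as a standalone helper so the regex logic can be
# tested in isolation. parse_fastq_screen_threads() is not part of the
# pipeline; it only mirrors the re.findall/re.sub combination.
import re


def parse_fastq_screen_threads(options):
    '''return the thread count encoded in a fastq_screen options string.

    Raises ValueError unless exactly one "--threads N" token is present.
    '''
    matches = re.findall(r'--threads \d+', options)
    if len(matches) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")
    return int(re.sub(r'--threads ', '', matches[0]))


# usage: parse_fastq_screen_threads("--subset 100000 --threads 8") == 8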
def GATKpreprocessing(infile, outfile):
    '''Reorders BAM according to reference fasta and adds read groups using
    SAMtools, realigns around indels and recalibrates base quality
    scores using GATK'''

    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir()
    job_memory = PARAMS["gatk_memory"]

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                           PARAMS["genome"])

    outfile1 = outfile.replace(".bqsr", ".readgroups.bqsr")
    outfile2 = outfile.replace(".bqsr", ".realign.bqsr")

    PipelineExome.GATKReadGroups(infile, outfile1, genome,
                                 PARAMS["readgroup_library"],
                                 PARAMS["readgroup_platform"],
                                 PARAMS["readgroup_platform_unit"])

    PipelineExome.GATKIndelRealign(outfile1, outfile2, genome,
                                   PARAMS["gatk_threads"])
    IOTools.zapFile(outfile1)

    PipelineExome.GATKBaseRecal(outfile2, outfile, genome,
                                PARAMS["gatk_dbsnp"],
                                PARAMS["gatk_solid_options"])
    IOTools.zapFile(outfile2)
def GATKReadGroups(infile, outfile, genome,
                   library="unknown", platform="Illumina",
                   platform_unit="1", track="unknown"):
    '''Reorders BAM according to reference fasta and adds read groups'''

    if track == 'unknown':
        track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''ReorderSam
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    REFERENCE=%(genome)s
                    ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                    VALIDATION_STRINGENCY=SILENT ; checkpoint ;''' % locals()

    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ;
                    checkpoint ;''' % locals()

    statement += '''AddOrReplaceReadGroups
                    INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    OUTPUT=%(outfile)s
                    RGLB=%(library)s
                    RGPL=%(platform)s
                    RGPU=%(platform_unit)s
                    RGSM=%(track)s
                    VALIDATION_STRINGENCY=SILENT ; checkpoint ;''' % locals()

    statement += '''samtools index %(outfile)s ;
                    checkpoint ;''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()

    P.run()
def GATKBaseRecal(infile, outfile, genome, dbsnp, solid_options=""):
    '''Recalibrates base quality scores using GATK'''

    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''GenomeAnalysisTK
                    -T BaseRecalibrator
                    --out %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -I %(infile)s
                    --knownSites %(dbsnp)s %(solid_options)s ;
                    checkpoint ;''' % locals()

    statement += '''GenomeAnalysisTK
                    -T PrintReads -o %(outfile)s
                    -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -I %(infile)s ;
                    checkpoint ;''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()
    P.run()
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.getTempDir(".")
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s
    > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
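# A minimal sketch of the publish step used at the end of runTomTom():
# create the export directory if needed, drop any stale copy of the
# results, and move the fresh temporary directory into place.
# publish_results() is a hypothetical helper, not part of the pipeline.
import os
import shutil


def publish_results(tmpdir, target_path):
    '''move a temporary result directory to its export location,
    replacing any previous version.'''
    # exist_ok=True avoids the try/except OSError pattern used above
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)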
def GATKBaseRecal(infile, outfile, genome, intervals, padding,
                  dbsnp, solid_options=""):
    '''Recalibrates base quality scores using GATK'''

    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''GenomeAnalysisTK
                    -T BaseRecalibrator
                    --out %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -L %(intervals)s
                    -ip %(padding)s
                    -I %(infile)s
                    --knownSites %(dbsnp)s %(solid_options)s ;
                    checkpoint ;''' % locals()

    statement += '''GenomeAnalysisTK
                    -T PrintReads -o %(outfile)s
                    -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -I %(infile)s ;
                    checkpoint ;''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()
    P.run()
def mergeSampleBams(infile, outfile):
    '''merge control and tumor bams'''

    # Note: need to change readgroup headers for merge and subsequent
    # splitting of bam files
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]

    tmpdir_gatk = P.getTempDir(shared=True)

    outfile_tumor = outfile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])
    infile_tumor = infile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    infile_base = os.path.basename(infile)
    infile_tumor_base = infile_base.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]

    control_id = "Control.bam"
    tumor_id = control_id.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    statement = '''AddOrReplaceReadGroups
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track)s
                    ID=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;
                    checkpoint ;'''
    statement += '''AddOrReplaceReadGroups
                     INPUT=%(infile_tumor)s
                     OUTPUT=%(tmpdir_gatk)s/%(infile_tumor_base)s
                     RGLB=%(library)s RGPL=%(platform)s
                     RGPU=%(platform_unit)s RGSM=%(track_tumor)s
                     ID=%(track_tumor)s
                     VALIDATION_STRINGENCY=SILENT ;
                     checkpoint ;'''
    statement += '''samtools merge -rf
                    %(outfile)s
                    %(tmpdir_gatk)s/%(infile_base)s
                    %(tmpdir_gatk)s/%(infile_tumor_base)s ;
                    checkpoint ;'''
    statement += '''samtools index %(outfile)s ;
                    checkpoint ;'''
    statement += '''rm -rf %(tmpdir_gatk)s ;
                    checkpoint ; '''
    P.run()

    IOTools.zapFile(infile)
    IOTools.zapFile(infile_tumor)
def mergeSampleBams(infile, outfile):
    '''merge control and tumor bams'''

    # Note: need to change readgroup headers for merge and subsequent
    # splitting of bam files
    to_cluster = USECLUSTER
    job_options = getGATKOptions()
    # TS no multithreading so why 6 threads?
    # job_threads = 6
    # tmpdir_gatk = P.getTempDir('tmpbam')
    tmpdir_gatk = P.getTempDir('/ifs/scratch')
    # threads = PARAMS["gatk_threads"]

    outfile_tumor = outfile.replace("Control", PARAMS["mutect_tumour"])
    infile_tumor = infile.replace("Control", PARAMS["mutect_tumour"])

    infile_base = os.path.basename(infile)
    infile_tumor_base = infile_base.replace("Control",
                                            PARAMS["mutect_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace("Control", PARAMS["mutect_tumour"])

    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]

    control_id = "Control.bam"
    tumor_id = control_id.replace("Control", PARAMS["mutect_tumour"])

    # T.S delete after testing
    # tmpdir_gatk = P.getTempDir('.')

    statement = '''AddOrReplaceReadGroups
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track)s
                    ID=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;
                    checkpoint ;''' % locals()
    statement += '''AddOrReplaceReadGroups
                     INPUT=%(infile_tumor)s
                     OUTPUT=%(tmpdir_gatk)s/%(infile_tumor_base)s
                     RGLB=%(library)s RGPL=%(platform)s
                     RGPU=%(platform_unit)s RGSM=%(track_tumor)s
                     ID=%(track_tumor)s
                     VALIDATION_STRINGENCY=SILENT ;
                     checkpoint ;''' % locals()
    statement += '''samtools merge -rf
                    %(outfile)s
                    %(tmpdir_gatk)s/%(infile_base)s
                    %(tmpdir_gatk)s/%(infile_tumor_base)s ;
                    checkpoint ;''' % locals()
    statement += '''samtools index %(outfile)s ;
                    checkpoint ;'''
    statement += '''rm -rf %(tmpdir_gatk)s ;
                    checkpoint ; ''' % locals()
    P.run()

    IOTools.zapFile(infile)
    IOTools.zapFile(infile_tumor)
def mergeSampleBams(infile, outfile):
    '''merge control and tumor bams'''

    # Note: need to change readgroup headers for merge and subsequent
    # splitting of bam files
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]

    tmpdir_gatk = P.getTempDir(shared=True)

    outfile_tumor = outfile.replace(PARAMS["sample_control"],
                                    PARAMS["sample_tumour"])
    infile_tumor = infile.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    infile_base = os.path.basename(infile)
    infile_tumor_base = infile_base.replace(PARAMS["sample_control"],
                                            PARAMS["sample_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(PARAMS["sample_control"],
                                PARAMS["sample_tumour"])

    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]

    control_id = "Control.bam"
    tumor_id = control_id.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    statement = '''picard AddOrReplaceReadGroups
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_base)s
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track)s
                    ID=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;
                    checkpoint ;'''
    statement += '''picard AddOrReplaceReadGroups
                     INPUT=%(infile_tumor)s
                     OUTPUT=%(tmpdir_gatk)s/%(infile_tumor_base)s
                     RGLB=%(library)s RGPL=%(platform)s
                     RGPU=%(platform_unit)s RGSM=%(track_tumor)s
                     ID=%(track_tumor)s
                     VALIDATION_STRINGENCY=SILENT ;
                     checkpoint ;'''
    statement += '''samtools merge -rf
                    %(outfile)s
                    %(tmpdir_gatk)s/%(infile_base)s
                    %(tmpdir_gatk)s/%(infile_tumor_base)s ;
                    checkpoint ;'''
    statement += '''samtools index %(outfile)s ;
                    checkpoint ;'''
    statement += '''rm -rf %(tmpdir_gatk)s ;
                    checkpoint ; '''
    P.run()

    IOTools.zapFile(infile)
    IOTools.zapFile(infile_tumor)
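# The mergeSampleBams() variants above all derive the tumour-side names
# by substituting the control label inside the control-side names. A
# minimal sketch of that mapping as a pure helper; tumour_name() and
# the example labels ("Control", "Tumour") are hypothetical.
def tumour_name(control_name, control_label, tumour_label):
    '''map a control-sample filename to its tumour counterpart.'''
    return control_name.replace(control_label, tumour_label)


# usage:
#   tumour_name("patient1-Control.realigned.bam", "Control", "Tumour")
#   -> "patient1-Tumour.realigned.bam"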
def __init__(self, save=True, summarize=False, threads=1,
             *args, **kwargs):
    self.save = save
    self.summarize = summarize
    self.threads = threads
    if self.save:
        self.outdir = "processed.dir"
    else:
        self.outdir = P.getTempDir(shared=True)

    self.processors = []
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used.  Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track, tmpfasta, dbhandle,
        full=False,
        masker=P.asList(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp
             -mod %(meme_model)s
             -nmotifs %(meme_nmotifs)s
             -oc %(tmpdir)s
             -maxsize %(meme_max_size)s
             %(meme_options)s
        > %(outfile)s.log
        '''

        P.run()

        collectMEMEResults(tmpdir, target_path, outfile)
def runFastqScreen(infiles, outfile):
    """run FastqScreen on input files."""

    # variables required for statement built by FastqScreen()
    tempdir = P.getTempDir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # Create fastq_screen config file in temp directory
    # using parameters from Pipeline.ini
    with IOTools.openFile(os.path.join(tempdir, "fastq_screen.conf"),
                          "w") as f:
        for i, k in PARAMS.items():
            if i.startswith("fastq_screen_database"):
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = PipelineMapping.FastqScreen()
    statement = m.build((infiles,), outfile)
    P.run()
    shutil.rmtree(tempdir)
    P.touch(outfile)
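# A minimal sketch of the config generation above: every PARAMS key of
# the form "fastq_screen_database_<name>" becomes one DATABASE line,
# where key[22:] strips the "fastq_screen_database_" prefix to recover
# <name>. fastq_screen_config_lines() and the example dict are
# hypothetical.
def fastq_screen_config_lines(params):
    '''yield fastq_screen DATABASE lines from pipeline parameters.'''
    for key, value in params.items():
        if key.startswith("fastq_screen_database"):
            yield "DATABASE\t%s\t%s\n" % (key[22:], value)


# usage:
#   list(fastq_screen_config_lines(
#       {"fastq_screen_database_hg38": "/path/to/hg38"}))
#   -> ["DATABASE\thg38\t/path/to/hg38\n"]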
def runPicardOnRealigned(infile, outfile):
    to_cluster = USECLUSTER
    job_options = getGATKOptions()
    # TS no multithreading so why 6 threads?
    # job_threads = 6

    tmpdir_gatk = P.getTempDir('/ifs/scratch')
    # threads = PARAMS["gatk_threads"]

    outfile_tumor = outfile.replace("Control", PARAMS["mutect_tumour"])
    infile_tumor = infile.replace("Control", PARAMS["mutect_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace("Control", PARAMS["mutect_tumour"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                           PARAMS["genome"])

    PipelineMappingQC.buildPicardAlignmentStats(infile, outfile, genome)
    PipelineMappingQC.buildPicardAlignmentStats(infile_tumor,
                                                outfile_tumor, genome)

    # check above functions then remove statement
    statement = '''
    cat %(infile)s
    | python %%(scriptsdir)s/bam2bam.py -v 0 --method=set-sequence
    | CollectMultipleMetrics
        INPUT=/dev/stdin
        REFERENCE_SEQUENCE=%%(bwa_index_dir)s/%%(genome)s.fa
        ASSUME_SORTED=true
        OUTPUT=%(outfile)s
        VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s;
    cat %(infile_tumor)s
    | python %%(scriptsdir)s/bam2bam.py -v 0 --method=set-sequence
        --output-sam
    | CollectMultipleMetrics
        INPUT=/dev/stdin
        REFERENCE_SEQUENCE=%%(bwa_index_dir)s/%%(genome)s.fa
        ASSUME_SORTED=true
        OUTPUT=%(outfile_tumor)s
        VALIDATION_STRINGENCY=SILENT
    >& %(outfile_tumor)s;''' % locals()
def runPicardOnRealigned(infile, outfile):
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]

    tmpdir_gatk = P.getTempDir()

    outfile_tumor = outfile.replace(PARAMS["sample_control"],
                                    PARAMS["sample_tumour"])
    infile_tumor = infile.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(PARAMS["sample_control"],
                                PARAMS["sample_tumour"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                           PARAMS["genome"])

    PipelineMappingQC.buildPicardAlignmentStats(infile, outfile, genome)
    PipelineMappingQC.buildPicardAlignmentStats(infile_tumor,
                                                outfile_tumor, genome)
def runMemeCHIP(infile, outfile, motifs=None):
    '''Run the MEME-CHiP pipeline on the input files.

    optional motifs files can be supplied as a list'''

    if motifs:
        motifs = " ".join("-db %s" % motif for motif in motifs)
    else:
        motifs = " "

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme-chip skipped" % outfile)
        P.touch(outfile)
        return

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.getTempDir(".")

    statement = '''
    meme-chip %(infile)s
        -p %(memechip_threads)s
        -oc %(tmpdir)s
        -nmeme %(memechip_nmeme)s
        %(memechip_options)s
        %(motifs)s
    > %(outfile)s.log
    '''

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in
    # cluster_options through job_options
    if int(PARAMS["memechip_threads"]) != 1:
        job_options = str(PARAMS["memechip_job_options"])
        job_threads = int(PARAMS["memechip_threads"])
        cluster_parallel_environment = str(
            PARAMS["memechip_cluster_parallel_environment"])

    P.run()

    collectMEMEResults(tmpdir, target_path, outfile, method="memechip")
def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used.  Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker
    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "meme", outfile)
    tmpdir = P.getTempDir(".")

    statement = '''
    meme %(infile)s -dna -revcomp
         -mod %(meme_model)s
         -nmotifs %(meme_nmotifs)s
         -oc %(tmpdir)s
         -maxsize %(motifs_max_size)s
         %(meme_options)s
    > %(outfile)s.log
    '''

    P.run()

    collectMEMEResults(tmpdir, target_path, outfile)
def GATKReadGroups(infile, outfile, genome,
                   library="unknown", platform="Illumina",
                   platform_unit="1", track="unknown"):
    '''Reorders BAM according to reference fasta and adds read groups'''

    if track == 'unknown':
        track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''picard ReorderSam
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    REFERENCE=%(genome)s
                    ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                    VALIDATION_STRINGENCY=SILENT ; checkpoint ;''' % locals()

    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ;
                    checkpoint ;''' % locals()

    statement += '''picard AddOrReplaceReadGroups
                    INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    OUTPUT=%(outfile)s
                    RGLB=%(library)s
                    RGPL=%(platform)s
                    RGPU=%(platform_unit)s
                    RGSM=%(track)s
                    VALIDATION_STRINGENCY=SILENT ; checkpoint ;''' % locals()

    statement += '''samtools index %(outfile)s ;
                    checkpoint ;''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()

    P.run()
def runDREME(infile, outfile, neg_file="", options=""):
    '''Run DREME on fasta file. If a neg_file is passed
    then DREME will use this as the negative set, otherwise
    the default is to shuffle the input'''

    nseqs_pos = int(FastaIterator.count(infile))
    if nseqs_pos < 2:
        E.warn("%s: less than 2 sequences - dreme skipped" % outfile)
        P.touch(outfile)
        return

    if neg_file:
        nseqs_neg = int(FastaIterator.count(neg_file))
        if nseqs_neg < 2:
            E.warn("%s: less than 2 sequences in negatives file - "
                   "dreme skipped" % outfile)
            P.touch(outfile)
            return
        else:
            neg_file = "-n %s" % neg_file

    logfile = outfile + ".log"
    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.getTempDir(".")

    statement = '''
    dreme -p %(infile)s %(neg_file)s -png
        -oc %(tmpdir)s
        %(dreme_options)s
        %(options)s
    > %(logfile)s
    '''

    P.run()

    collectMEMEResults(tmpdir, target_path, outfile, method="dreme")
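# A minimal sketch of the negative-set handling in runDREME() as a pure
# function: an empty negatives file yields no flag, a populated one is
# passed to DREME via "-n". dreme_negative_arg() is hypothetical.
def dreme_negative_arg(neg_file):
    '''return the DREME command-line fragment for an optional
    negative sequence set.'''
    return "-n %s" % neg_file if neg_file else ""


# usage: dreme_negative_arg("")      -> ""
#        dreme_negative_arg("bg.fa") -> "-n bg.fa"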
def __init__(self, save=True, summarise=False, threads=1,
             trimgalore_options=None, trimmomatic_options=None,
             sickle_options=None, flash_options=None,
             fastx_trimmer_options=None, cutadapt_options=None,
             adapter_file=None,
             *args, **kwargs):
    self.save = save
    self.summarise = summarise
    self.threads = threads
    self.trimgalore_opt = trimgalore_options
    self.trimmomatic_opt = trimmomatic_options
    self.sickle_opt = sickle_options
    self.flash_opt = flash_options
    self.fastx_trimmer_opt = fastx_trimmer_options
    self.cutadapt_opt = cutadapt_options
    self.adapters = adapter_file
    if self.save:
        self.outdir = "processed.dir"
    else:
        self.outdir = P.getTempDir("/ifs/scratch")
def buildCodingPotential(infile, outfile):
    '''run CPC analysis as in the cpc script.

    This module runs framefinder and blastx on both strands.
    It seems to work, but I have not thoroughly tested it.
    I expect that the false positive rate increases (i.e.,
    predicting non-coding as coding) in cases where the best
    framefinder match and the best blast match are on opposite
    strands. In the original CPC, these would be separated.
    '''
    try:
        cpc_dir = os.environ["CPC_HOME"]
    except KeyError:
        raise ValueError("CPC_HOME environment variable is not set.")

    tmpdir = P.getTempDir(".")
    track = P.snip(outfile, ".coding.gz")

    # extract features for frame finder
    # replaces extract_framefinder_feats.pl to parse both strands
    with open(os.path.join(tmpdir, "ff.feat"), "w") as outf:
        outf.write("\t".join(("QueryID", "CDSLength", "Score",
                              "Used", "Strict")) + "\n")
        for line in IOTools.openFile("%s.frame.gz" % track):
            if line.startswith(">"):
                try:
                    (id, start, end, score, used, mode, tpe) = \
                        re.match(
                            ">(\S+).*framefinder \((\d+),(\d+)\) score=(\S+) used=(\S+)% \{(\S+),(\w+)\}",
                            line).groups()
                except AttributeError:
                    raise ValueError("parsing error in line %s" % line)
                length = int(end) - int(start) + 1
                strict = int(tpe == "strict")
                outf.write("\t".join(
                    (id, str(length), score, used, str(strict))) + "\n")

    to_cluster = USECLUSTER

    # extract features and prepare svm data
    s = []

    s.append('''
    zcat %(infile)s
    | perl %(cpc_dir)s/libs/blast2table.pl
    | tee %(tmpdir)s/blastx.table
    | perl %(cpc_dir)s/bin/extract_blastx_features.pl
    > %(tmpdir)s/blastx.feat1;
    ''')

    s.append('''
    cat %(track)s_norepeats.fasta
    | perl %(cpc_dir)s/bin/add_missing_entries.pl
       %(tmpdir)s/blastx.feat1
    > %(tmpdir)s/blastx.feat;
    ''')

    # step 2 - prepare data
    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,4,6 NA NA
    %(tmpdir)s/blastx.feat
    > %(tmpdir)s/blastx.lsv;
    ''')

    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,3,4,5 NA NA
    %(tmpdir)s/ff.feat
    > %(tmpdir)s/ff.lsv;
    ''')

    s.append('''
    perl -w %(cpc_dir)s/bin/lsv_cbind.pl
    %(tmpdir)s/blastx.lsv %(tmpdir)s/ff.lsv
    > %(tmpdir)s/test.lsv;
    ''')

    s.append('''
    %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-scale
    -r %(cpc_dir)s/data/libsvm.range
    %(tmpdir)s/test.lsv
    > %(tmpdir)s/test.lsv.scaled;
    ''')

    # step 3: prediction
    m_libsvm_model0 = os.path.join(cpc_dir, "data/libsvm.model0")  # standard
    m_libsvm_model = os.path.join(cpc_dir, "data/libsvm.model")  # Prob
    m_libsvm_model2 = os.path.join(
        cpc_dir, "data/libsvm.model2")  # Prob + weighted version
    m_libsvm_range = os.path.join(cpc_dir, "data/libsvm.range")

    s.append('''
    %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-predict2
    %(tmpdir)s/test.lsv.scaled
    %(m_libsvm_model0)s
    %(tmpdir)s/test.svm0.predict
    > %(tmpdir)s/test.svm0.stdout 2> %(tmpdir)s/test.svm0.stderr;
    ''')

    s.append('''
    printf "gene_id\\tlength\\tresult\\tvalue\\n" | gzip > %(outfile)s;
    cat %(tmpdir)s/test.svm0.predict
    | perl -w %(cpc_dir)s/bin/predict.pl %(track)s_norepeats.fasta
    | gzip
    >> %(outfile)s;
    ''')

    # generate reports
    s.append('''cat %(tmpdir)s/blastx.feat
    | perl -w %(cpc_dir)s/bin/generate_plot_features.pl
        %(tmpdir)s/blastx.table
        <( zcat %(track)s.frame.gz)
    | perl -w %(cpc_dir)s/bin/split_plot_features_by_type.pl
        %(outfile)s.homology %(outfile)s.orf;
    gzip %(outfile)s.orf %(outfile)s.homology;
    ''')

    # now run it all
    statement = " checkpoint; ".join(s)

    P.run()

    # clean up
    shutil.rmtree(tmpdir)
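# A minimal, hypothetical demonstration of the framefinder header
# parsing above. The sample header line is invented to match the
# pattern; only the regex itself comes from buildCodingPotential().
import re

FF_PATTERN = (r">(\S+).*framefinder \((\d+),(\d+)\) "
              r"score=(\S+) used=(\S+)% \{(\S+),(\w+)\}")

# invented example header in framefinder output format
header = (">tx1 framefinder (10,309) score=123.45 "
          "used=87.2% {forward,strict}")

id, start, end, score, used, mode, tpe = re.match(
    FF_PATTERN, header).groups()
length = int(end) - int(start) + 1   # 309 - 10 + 1 == 300
strict = int(tpe == "strict")        # 1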
def filterBamfiles(infile, sentinel):
    """
    Pre-process bamfiles prior to peak calling.
    i) sort bamfiles
    ii) remove unmapped reads with bam2bam.py
    iii) remove non-uniquely mapping reads with bam2bam.py (optional)
    iv) remove duplicates with Picard's MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index
    """

    # create tempfile for Picard's MarkDuplicates
    picard_tmp = P.getTempDir(PARAMS["scratchdir"])

    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    # ensure bamfile is sorted
    statement = [
        "samtools sort @IN@ -o @[email protected]",
    ]

    # remove unmapped reads
    statement.append("cgat bam2bam"
                     " --method=filter --filter-method=mapped"
                     " --log=%(outfile)s.log"
                     " < @[email protected]"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("cgat bam2bam"
                         " --method=filter --filter-method=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=false"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"
                         " VALIDATION_STRINGENCY=SILENT"
                         " TMP_DIR=%(picard_tmp)s"
                         " 2> %(outfile)s.log")

    # mask regions, if intervals supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"
                         " > @OUT@")

    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")

    job_memory = "5G"
    statement = P.joinStatements(statement, infile)

    P.run()
    P.touch(sentinel)
    shutil.rmtree(picard_tmp)
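# A minimal sketch of how @IN@/@OUT@ chaining like P.joinStatements()
# could work: each step's @OUT@ becomes the next step's @IN@ via
# intermediate temporary names. join_statements() below is a
# hypothetical, simplified re-implementation for illustration only,
# not the CGAT code; suffixed placeholders such as @[email protected] still work
# because the @IN@ substring inside them is rewritten too.
def join_statements(statements, infile):
    '''chain shell fragments by rewriting @IN@/@OUT@ placeholders.'''
    joined = []
    current = infile
    for i, step in enumerate(statements):
        out = "%s_step%i" % (infile, i)
        joined.append(
            step.replace("@IN@", current).replace("@OUT@", out))
        current = out
    return "; checkpoint; ".join(joined)


# usage:
#   join_statements(["samtools sort @IN@ -o @[email protected]",
#                    "samtools index @[email protected]"], "a.bam")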
def filterBamfiles(infile, sentinel):
    """
    Pre-process bamfiles prior to peak calling.
    i) sort bamfiles
    ii) remove unmapped reads with bam2bam.py
    iii) remove non-uniquely mapping reads with bam2bam.py (optional)
    iv) remove duplicates with Picard's MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index
    """

    # create tempfile for Picard's MarkDuplicates
    picard_tmp = P.getTempDir(PARAMS["scratchdir"])

    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    # ensure bamfile is sorted
    statement = ["samtools sort @IN@ @OUT@", ]

    # remove unmapped reads
    statement.append("python %(scriptsdir)s/bam2bam.py"
                     " --method=filter --filter-method=mapped"
                     " --log=%(outfile)s.log"
                     " < @[email protected]"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("python %(scriptsdir)s/bam2bam.py"
                         " --method=filter --filter-method=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=false"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"
                         " VALIDATION_STRINGENCY=SILENT"
                         " TMP_DIR=%(picard_tmp)s"
                         " 2> %(outfile)s.log")

    # mask regions, if intervals supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"
                         " > @OUT@")

    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")

    job_options = "-l mem_free=10G"
    statement = P.joinStatements(statement, infile)

    P.run()
    P.touch(sentinel)
    shutil.rmtree(picard_tmp)
def runFeatureCounts(annotations_file, bamfile, outfile,
                     nthreads=4, strand=0, options=""):
    '''run feature counts on *annotations_file* with *bamfile*.

    If the bam-file is paired, paired-end counting is enabled
    and the bam file automatically sorted.
    '''
    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")

    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # select paired end mode, additional options
        paired_options = "-p -B"
        # remove .bam extension
        bam_prefix = P.snip(bam_tmp, ".bam")
        # sort by read name
        paired_processing = \
            """samtools sort -@ %(nthreads)i -n %(bamfile)s %(bam_prefix)s;
            checkpoint; """ % locals()
        bamfile = bam_tmp
    else:
        paired_options = ""
        paired_processing = ""

    job_threads = nthreads

    # AH: what is the -b option doing?
    statement = '''mkdir %(tmpdir)s;
                   zcat %(annotations_file)s > %(annotations_tmp)s;
                   checkpoint;
                   %(paired_processing)s
                   featureCounts %(options)s
                                 -T %(nthreads)i
                                 -s %(strand)s
                                 -a %(annotations_tmp)s
                                 %(paired_options)s
                                 -o %(outfile)s
                                 %(bamfile)s
                   >& %(outfile)s.log;
                   checkpoint;
                   gzip -f %(outfile)s;
                   checkpoint;
                   rm -rf %(tmpdir)s
                   '''

    P.run()
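# BamTools.isPaired() decides above whether to enable "-p -B". A
# minimal, hypothetical equivalent using pysam: inspect the first few
# alignments and report whether any carries the paired flag.
import pysam


def is_paired(bamfile, nreads=100):
    '''return True if any of the first `nreads` alignments is paired.'''
    with pysam.AlignmentFile(bamfile, "rb") as bam:
        for i, read in enumerate(bam):
            if read.is_paired:
                return True
            if i >= nreads:
                break
    return False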
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.
    '''
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.isEmpty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist"
                         % (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.getTempDir(".")
    tmpfile = P.getTempFilename(".")

    for motiffile in motiffiles:
        if IOTools.isEmpty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s
               -ev %(mast_evalue)f %(mast_options)s
          >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s
               -ev %(mast_evalue)f %(mast_options)s
          >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run()

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
def runMEMEOnSequences(infile, outfile, background=None, psp=None):
    '''run MEME on fasta sequences to find motifs

    By default MEME calculates a zero-th order background model from
    the nucleotide frequencies in the input set.

    To use a different background set, a background file created by
    fasta-get-markov must be supplied.

    To perform discriminative analysis a position specific prior
    (psp) file must be provided. This can be generated using
    generatePSP.
    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs < 2:
        E.warn("%s: less than 2 sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    # Get the total length of the sequences to decide the memory
    total_seqs_length = 0
    with IOTools.openFile(infile, "r") as fasta_reader:
        iterator_fasta = FastaIterator.iterate(fasta_reader)
        for fasta_seq in iterator_fasta:
            total_seqs_length += len(fasta_seq.sequence)

    # If the length of all sequences is higher than 160,000bp
    # up the memory
    job_memory = "2G"
    if total_seqs_length > 160000:
        job_memory = "4G"

    if PARAMS.get("meme_revcomp", True):
        revcomp = "-revcomp"
    else:
        revcomp = ""

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), outfile)
    tmpdir = P.getTempDir(".")

    if background:
        background_model = "-bfile %s" % background
    else:
        background_model = ""

    if psp:
        E.info("Running MEME in discriminative mode")
        psp_file = "-psp %s" % psp
    else:
        psp_file = ""

    statement = '''
    meme %(infile)s -dna %(revcomp)s
         -p %(meme_threads)s
         -mod %(meme_model)s
         -nmotifs %(meme_nmotifs)s
         -oc %(tmpdir)s
         -maxsize %(meme_max_size)s
         %(background_model)s
         %(psp_file)s
         %(meme_options)s
    2> %(outfile)s.log
    '''

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in
    # cluster_options through job_options
    if int(PARAMS["meme_threads"]) != 1:
        job_options = str(PARAMS["meme_job_options"])
        job_threads = int(PARAMS["meme_threads"])
        cluster_parallel_environment = str(
            PARAMS["meme_cluster_parallel_environment"])

    P.run()

    collectMEMEResults(tmpdir, target_path, outfile)
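# A minimal sketch of the memory heuristic above as a pure function:
# MEME jobs get 2G by default and 4G once the combined input length
# exceeds 160 kb. meme_job_memory() is hypothetical.
def meme_job_memory(total_seqs_length, threshold=160000):
    '''choose a job memory string from total input sequence length.'''
    return "4G" if total_seqs_length > threshold else "2G"


# usage: meme_job_memory(150000) -> "2G"; meme_job_memory(200000) -> "4G"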
def runCufflinks(gtffile, bamfile, outfile, job_threads=1):
    '''run cufflinks to estimate expression levels.

    See cufflinks manuals for full explanation of
    infiles/outfiles/options
    http://cole-trapnell-lab.github.io/cufflinks/cufflinks/index.html

    Arguments
    ---------
    gtffile : string
        Filename of geneset in :term:`gtf` format.
    bamfile : string
        Filename of reads in :term:`bam` format.
    genome_dir : string
        :term:`PARAMS` - genome directory containing fasta file.
        This is specified in pipeline_ini
    cufflinks_library_type : string
        :term:`PARAMS` - cufflinks library type option.
        This is specified in pipeline_ini
    cufflinks_options : string
        :term:`PARAMS` - cufflinks options (see manual).
        These are specified in pipeline_ini
    outfile : string
        defines naming of 3 output files for each input file
        1. outfile.gtf.gz: transcripts.gtf file in :term:`gtf` format
           produced by cufflinks (see manual). Contains the assembled
           gene isoforms. This is the file used for the downstream
           analysis.
        2. outfile.fpkm_tracking.gz: renamed
           outfile.isoforms.fpkm_tracking file from cufflinks -
           contains estimated isoform-level expression values in
           "FPKM Tracking Format".
        3. outfile.genes_tracking.gz: renamed
           outfile.genes.fpkm_tracking.gz from cufflinks - contains
           estimated gene-level expression values in "FPKM Tracking
           Format".
    job_threads : int
        Number of threads to use
    '''
    track = os.path.basename(P.snip(gtffile, ".gtf.gz"))

    tmpdir = P.getTempDir()

    gtffile = os.path.abspath(gtffile)
    bamfile = os.path.abspath(bamfile)
    outfile = os.path.abspath(outfile)

    # note: cufflinks adds \0 bytes to gtf file - replace with '.'
    # increase max-bundle-length to 4.5Mb due to Galnt-2 in mm9 with a
    # 4.3Mb intron.

    # AH: removed log messages about BAM record error
    # These cause logfiles to grow several Gigs and are
    # frequent for BAM files not created by tophat.
    # Error is:
    # BAM record error: found spliced alignment without XS attribute
    statement = '''mkdir %(tmpdir)s;
    cd %(tmpdir)s;
    cufflinks --label %(track)s
              --GTF <(gunzip < %(gtffile)s)
              --num-threads %(job_threads)i
              --frag-bias-correct %(genome_dir)s/%(genome)s.fa
              --library-type %(cufflinks_library_type)s
              %(cufflinks_options)s
              %(bamfile)s
    | grep -v 'BAM record error'
    >& %(outfile)s;
    perl -p -e "s/\\0/./g" < transcripts.gtf
    | gzip > %(outfile)s.gtf.gz;
    gzip < isoforms.fpkm_tracking > %(outfile)s.fpkm_tracking.gz;
    gzip < genes.fpkm_tracking > %(outfile)s.genes_tracking.gz;
    rm -rf %(tmpdir)s
    '''

    P.run()
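# The perl one-liner above rewrites the \0 bytes that cufflinks can
# emit into transcripts.gtf. A minimal, hypothetical Python equivalent
# of that post-processing step; scrub_nul_bytes() is not part of the
# pipeline.
import gzip


def scrub_nul_bytes(infile, outfile):
    '''copy a text file, replacing NUL bytes with "." and gzipping.'''
    with open(infile, "rb") as inf, gzip.open(outfile, "wb") as outf:
        for line in inf:
            outf.write(line.replace(b"\0", b"."))


# usage: scrub_nul_bytes("transcripts.gtf", "out.gtf.gz")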
def runFeatureCounts(annotations_file, bamfile, outfile,
                     job_threads=4, strand=0, options=""):
    '''run FeatureCounts to collect read counts.

    If `bamfile` is paired, paired-end counting is enabled and the
    bam file automatically sorted.

    Arguments
    ---------
    annotations_file : string
        Filename with gene set in :term:`gtf` format.
    bamfile : string
        Filename with short reads in :term:`bam` format.
    outfile : string
        Output filename in :term:`tsv` format.
    job_threads : int
        Number of threads to use.
    strand : int
        Strand option in FeatureCounts.
    options : string
        Options to pass on to FeatureCounts.
    '''
    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")

    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # select paired end mode, additional options
        paired_options = "-p -B"
        # sort by read name
        paired_processing = \
            """samtools sort -@ %(job_threads)i -n -o %(bam_tmp)s %(bamfile)s;
            checkpoint; """ % locals()
        bamfile = bam_tmp
    else:
        paired_options = ""
        paired_processing = ""

    # AH: what is the -b option doing?
    statement = '''mkdir %(tmpdir)s;
                   zcat %(annotations_file)s > %(annotations_tmp)s;
                   checkpoint;
                   %(paired_processing)s
                   featureCounts %(options)s
                                 -T %(job_threads)i
                                 -s %(strand)s
                                 -a %(annotations_tmp)s
                                 %(paired_options)s
                                 -o %(outfile)s
                                 %(bamfile)s
                   >& %(outfile)s.log;
                   checkpoint;
                   gzip -f %(outfile)s;
                   checkpoint;
                   rm -rf %(tmpdir)s
                   '''

    P.run()
def runFeatureCounts(annotations_file, bamfile, outfile,
                     job_threads=4, strand=0, options=""):
    '''run FeatureCounts to collect read counts.

    If `bamfile` is paired, paired-end counting is enabled and the
    bam file automatically sorted.

    Arguments
    ---------
    annotations_file : string
        Filename with gene set in :term:`gtf` format.
    bamfile : string
        Filename with short reads in :term:`bam` format.
    outfile : string
        Output filename in :term:`tsv` format.
    job_threads : int
        Number of threads to use.
    strand : int
        Strand option in FeatureCounts.
    options : string
        Options to pass on to FeatureCounts.
    '''
    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")

    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # select paired end mode, additional options
        paired_options = "-p -B"
        # remove .bam extension
        bam_prefix = P.snip(bam_tmp, ".bam")
        # sort by read name
        paired_processing = \
            """samtools sort -@ %(job_threads)i -n %(bamfile)s %(bam_prefix)s;
            checkpoint; """ % locals()
        bamfile = bam_tmp
    else:
        paired_options = ""
        paired_processing = ""

    # AH: what is the -b option doing?
    statement = '''mkdir %(tmpdir)s;
                   zcat %(annotations_file)s > %(annotations_tmp)s;
                   checkpoint;
                   %(paired_processing)s
                   featureCounts %(options)s
                                 -T %(job_threads)i
                                 -s %(strand)s
                                 -a %(annotations_tmp)s
                                 %(paired_options)s
                                 -o %(outfile)s
                                 %(bamfile)s
                   >& %(outfile)s.log;
                   checkpoint;
                   gzip -f %(outfile)s;
                   checkpoint;
                   rm -rf %(tmpdir)s
                   '''

    P.run()