def GATKpreprocessing(infile, outfile):
    '''Add read groups, realign around indels and recalibrate a BAM file.

    Three ``PipelineExome`` steps are run in sequence:

    1. ``GATKReadGroups``   : `infile` -> ``*.readgroups.bqsr``
    2. ``GATKIndelRealign`` : -> ``*.realign.bqsr``
    3. ``GATKBaseRecal``    : -> `outfile`

    Each intermediate file is passed to ``IOTools.zap_file`` once the
    step that reads it has returned.

    Arguments
    ---------
    infile : string
        BAM file to process.
    outfile : string
        Name of the final output.  The two intermediate file names are
        derived by replacing ``.bqsr`` in this name.

    Configuration read from ``PARAMS``: ``gatk_memory``,
    ``bwa_index_dir``, ``genome``, ``readgroup_library``,
    ``readgroup_platform``, ``readgroup_platform_unit``,
    ``gatk_threads``, ``gatk_dbsnp`` and ``gatk_solid_options``.
    '''
    # NOTE(review): none of the next three names is referenced in this
    # function and no statement is run from here; they are kept as-is -
    # confirm with the pipeline conventions before removing them.
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    job_memory = PARAMS["gatk_memory"]

    # The original also requested a directory from P.get_temp_dir() here
    # that was never used nor removed; that call has been dropped so no
    # stray directory is left behind on every invocation.

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])
    outfile1 = outfile.replace(".bqsr", ".readgroups.bqsr")
    outfile2 = outfile.replace(".bqsr", ".realign.bqsr")

    PipelineExome.GATKReadGroups(infile, outfile1, genome,
                                 PARAMS["readgroup_library"],
                                 PARAMS["readgroup_platform"],
                                 PARAMS["readgroup_platform_unit"])

    PipelineExome.GATKIndelRealign(outfile1, outfile2, genome,
                                   PARAMS["gatk_threads"])
    IOTools.zap_file(outfile1)

    PipelineExome.GATKBaseRecal(outfile2, outfile, genome,
                                PARAMS["gatk_dbsnp"],
                                PARAMS["gatk_solid_options"])
    IOTools.zap_file(outfile2)
def runFastqScreen(infiles, outfile):
    '''Run fastq_screen on the input files.

    A ``fastq_screen.conf`` file is written into a temporary directory
    with one ``DATABASE`` line per ``fastq_screen_database*`` entry in
    ``PARAMS``.  The statement is built by
    ``PipelineMapping.FastqScreen`` and run with 8G of job memory;
    afterwards the temporary directory is removed and `outfile` is
    touched.

    Arguments
    ---------
    infiles : string or list
        Passed to the statement builder wrapped in a 1-tuple.
    outfile : string
        File touched once the run has completed.

    Raises
    ------
    ValueError
        If ``fastq_screen_options`` does not contain exactly one
        ``--threads N`` option.
    '''
    # ``tempdir`` and ``outdir`` are not used directly below; per the
    # original comment they are required by the statement that
    # FastqScreen builds, so their names must not change.
    tempdir = P.get_temp_dir(".")
    outdir = os.path.join(PARAMS["exportdir"], "fastq_screen")

    # derive job_threads from the --threads option in the configuration
    thread_flags = re.findall(r'--threads \d+',
                              PARAMS['fastq_screen_options'])
    if len(thread_flags) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")
    job_threads = int(re.sub(r'--threads ', '', thread_flags[0]))

    # write the fastq_screen configuration from the pipeline parameters
    conf_path = os.path.join(tempdir, "fastq_screen.conf")
    with IOTools.open_file(conf_path, "w") as conf_handle:
        for param_key, db_location in PARAMS.items():
            if not param_key.startswith("fastq_screen_database"):
                continue
            # the database label is the key minus its first 22 characters
            conf_handle.write(
                "DATABASE\t%s\t%s\n" % (param_key[22:], db_location))

    screener = PipelineMapping.FastqScreen()
    statement = screener.build((infiles, ), outfile)
    P.run(statement, job_memory="8G")

    shutil.rmtree(tempdir)
    IOTools.touch_file(outfile)
def runTomTom(infile, outfile):
    '''Compare ab-initio motifs against the configured tomtom databases.

    ``tomtom`` writes its results into a temporary directory.  That
    directory is then moved to ``<exportdir>/tomtom/<outfile>`` (an
    existing directory of that name is removed first) and the
    ``tomtom.txt`` inside it is copied to `outfile`.

    If `infile` is empty a warning is issued, `outfile` is touched and
    nothing else is done.

    Arguments
    ---------
    infile : string
        Motif file given to tomtom.
    outfile : string
        Destination of ``tomtom.txt``; tomtom's standard output goes to
        ``<outfile>.log``.
    '''
    if IOTools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    # Requested only after the empty-input check: the early return above
    # would otherwise leave an unused temporary directory behind.
    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(PARAMS["tomtom_databases"]))
    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    # NOTE(review): the %(...)s placeholders are presumably filled in by
    # P.run from PARAMS and the local variables of this function - keep
    # the local names stable.
    statement = ''' tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log '''
    P.run(statement)

    # copy over results; only "already exists" is tolerated, any other
    # failure to create the parent directory is now reported
    os.makedirs(os.path.dirname(target_path), exist_ok=True)

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def GATKBaseRecal(infile, outfile, genome, intervals, padding, dbsnp,
                  solid_options=""):
    '''Recalibrate base quality scores with GATK.

    Builds one shell statement consisting of three commands:
    ``BaseRecalibrator`` writes ``<track>.recal.grp`` into a temporary
    directory, ``PrintReads`` applies that table to write `outfile`, and
    the temporary directory is removed.

    Arguments
    ---------
    infile : string
        Input BAM file; the track name is its basename without ``.bam``.
    outfile : string
        Output file written by ``PrintReads``.
    genome : string
        Value for the ``-R`` option of both GATK commands.
    intervals : string
        Value for the ``-L`` option of ``BaseRecalibrator``.
    padding : int or string
        Value for the ``-ip`` option of ``BaseRecalibrator``.
    dbsnp : string
        Value for ``--knownSites``.
    solid_options : string
        Extra options appended to the ``BaseRecalibrator`` command.
    '''
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.get_temp_dir('.')
    # NOTE(review): job_options / job_threads are not referenced below;
    # presumably P.run picks them up from this function's locals.
    job_options = getGATKOptions()
    job_threads = 3

    recalibrate_cmd = (
        "GenomeAnalysisTK -T BaseRecalibrator "
        "--out %(tmpdir_gatk)s/%(track)s.recal.grp "
        "-R %(genome)s -L %(intervals)s -ip %(padding)s "
        "-I %(infile)s --knownSites %(dbsnp)s %(solid_options)s ; ")
    print_reads_cmd = (
        "GenomeAnalysisTK -T PrintReads -o %(outfile)s "
        "-BQSR %(tmpdir_gatk)s/%(track)s.recal.grp "
        "-R %(genome)s -I %(infile)s ; ")
    cleanup_cmd = "rm -rf %(tmpdir_gatk)s ;"

    # interpolate all three templates in one go
    statement = (recalibrate_cmd + print_reads_cmd + cleanup_cmd) % locals()
    P.run(statement)
def mergeSampleBams(infile, outfile):
    '''Merge the control BAM with its matched tumour BAM.

    The tumour file names are obtained by replacing
    ``PARAMS["sample_control"]`` with ``PARAMS["sample_tumour"]`` in the
    control names.  Both BAMs get new read groups (picard
    ``AddOrReplaceReadGroups``) written into a temporary directory, are
    merged with ``samtools merge -rf`` into `outfile`, which is then
    indexed; finally the temporary directory is removed and both input
    files are passed to ``IOTools.zap_file``.

    Arguments
    ---------
    infile : string
        Control BAM file.
    outfile : string
        Merged BAM file.
    '''
    # Note: need to change readgroup headers for merge and subsequent
    # splitting of bam files

    # NOTE(review): the statement placeholders are presumably resolved
    # by P.run from the local variables below - do not rename them.
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]
    tmpdir_gatk = P.get_temp_dir(shared=True)

    control_label = PARAMS["sample_control"]
    tumour_label = PARAMS["sample_tumour"]

    # control side
    infile_base = os.path.basename(infile)
    track = P.snip(os.path.basename(infile), ".bam")
    control_id = "Control.bam"

    # tumour side, derived by label substitution
    outfile_tumor = outfile.replace(control_label, tumour_label)
    infile_tumor = infile.replace(control_label, tumour_label)
    infile_tumor_base = infile_base.replace(control_label, tumour_label)
    track_tumor = track.replace(control_label, tumour_label)
    tumor_id = control_id.replace(control_label, tumour_label)

    # read group fields
    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]

    commands = [
        "picard AddOrReplaceReadGroups INPUT=%(infile)s "
        "OUTPUT=%(tmpdir_gatk)s/%(infile_base)s "
        "RGLB=%(library)s RGPL=%(platform)s RGPU=%(platform_unit)s "
        "RGSM=%(track)s ID=%(track)s VALIDATION_STRINGENCY=SILENT ;",

        "picard AddOrReplaceReadGroups INPUT=%(infile_tumor)s "
        "OUTPUT=%(tmpdir_gatk)s/%(infile_tumor_base)s "
        "RGLB=%(library)s RGPL=%(platform)s RGPU=%(platform_unit)s "
        "RGSM=%(track_tumor)s ID=%(track_tumor)s "
        "VALIDATION_STRINGENCY=SILENT ;",

        "samtools merge -rf %(outfile)s "
        "%(tmpdir_gatk)s/%(infile_base)s "
        "%(tmpdir_gatk)s/%(infile_tumor_base)s;",

        "samtools index %(outfile)s; ",

        "rm -rf %(tmpdir_gatk)s ;",
    ]
    statement = "".join(commands)
    P.run(statement)

    for consumed in (infile, infile_tumor):
        IOTools.zap_file(consumed)
def runMEME(track, outfile, dbhandle):
    '''Run MEME on sequences extracted for the intervals of a track.

    Sequences are written by ``writeSequencesForIntervals`` (with
    ``full=False``) to ``in.fa`` inside a temporary directory.  Masker,
    half-width, maximum size, proportion and minimum number of
    sequences are taken from the configuration (``motifs_masker``,
    ``meme_halfwidth``, ``meme_max_size``, ``meme_proportion``,
    ``meme_min_sequences``).

    If no sequences were written, a warning is issued, `outfile` is
    touched and the temporary directory is removed.  Otherwise MEME is
    run with the temporary directory as its ``-oc`` output directory and
    the results are collected by ``collectMEMEResults``.

    This method is deprecated - use runMEMEOnSequences instead.

    Arguments
    ---------
    track : string
        Track passed to ``writeSequencesForIntervals``.
    outfile : string
        Output file; MEME's standard output goes to ``<outfile>.log``.
    dbhandle : object
        Database handle passed to ``writeSequencesForIntervals``.
    '''
    # function-scope import so that the cleanup below does not depend
    # on a module-level import
    import shutil

    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "meme", outfile)

    # NOTE(review): `fasta` is not referenced again in this function;
    # kept in case building the object is relied upon - confirm.
    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track, tmpfasta, dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
        # Nothing will collect the temporary directory on this path, so
        # remove it here instead of leaving it behind.
        shutil.rmtree(tmpdir, ignore_errors=True)
    else:
        # NOTE(review): placeholders are presumably filled in by P.run
        # from the configuration and this function's locals.
        statement = (
            " meme %(tmpfasta)s -dna -revcomp -mod %(meme_model)s "
            "-nmotifs %(meme_nmotifs)s -oc %(tmpdir)s "
            "-maxsize %(meme_max_size)s %(meme_options)s "
            "> %(outfile)s.log ")
        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)
def __init__(self, save=True, summarize=False, threads=1,
             qual_format='phred64', *args, **kwargs):
    """Store the processing options and choose the output directory.

    Arguments
    ---------
    save : bool
        If true, ``self.outdir`` is ``"processed.dir"``; otherwise it
        is the value returned by ``P.get_temp_dir(shared=True)``.
    summarize : bool
        Stored as ``self.summarize``.
    threads : int
        Stored as ``self.threads``.
    qual_format : string
        Stored as ``self.qual_format``.
    *args, **kwargs
        Accepted but not used here.
    """
    self.save = save
    self.summarize = summarize
    self.threads = threads

    # the temporary directory is only requested when it is needed
    self.outdir = (
        "processed.dir" if self.save else P.get_temp_dir(shared=True))

    self.processors = []
    self.qual_format = qual_format
def runPicardOnRealigned(infile, outfile):
    '''Build Picard alignment statistics for a control/tumour BAM pair.

    The tumour input and output names are derived from the control
    names by replacing ``PARAMS["sample_control"]`` with
    ``PARAMS["sample_tumour"]``.
    ``PipelineMappingQC.buildPicardAlignmentStats`` is then called once
    for the control pair and once for the tumour pair.

    Arguments
    ---------
    infile : string
        Control BAM file.
    outfile : string
        Output file for the control statistics.
    '''
    # NOTE(review): the four names below are not referenced in this
    # function and no statement is run from here; they are kept
    # unchanged - confirm before removing.
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]
    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(PARAMS["sample_control"],
                                PARAMS["sample_tumour"])

    # The original also requested a directory from P.get_temp_dir() that
    # was never used nor removed; that call has been dropped so no stray
    # directory is left behind on every invocation.

    outfile_tumor = outfile.replace(PARAMS["sample_control"],
                                    PARAMS["sample_tumour"])
    infile_tumor = infile.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    PipelineMappingQC.buildPicardAlignmentStats(infile, outfile, genome)
    PipelineMappingQC.buildPicardAlignmentStats(infile_tumor,
                                                outfile_tumor, genome)
def runFastqScreen(infiles, outfile):
    '''Run fastq_screen on the input files.

    The number of job threads is taken from the single ``--threads N``
    option in ``PARAMS['fastq_screen_options']``.  A
    ``fastq_screen.conf`` with one ``DATABASE`` line per
    ``fastq_screen_database*`` parameter is written to a temporary
    directory and handed to ``PipelineMapping.FastqScreen`` through
    ``config_filename``.  The built statement is run with 8G of job
    memory, then the temporary directory is removed and `outfile` is
    touched.

    Arguments
    ---------
    infiles : string or list
        Passed to the statement builder wrapped in a 1-tuple.
    outfile : string
        File touched once the run has completed.

    Raises
    ------
    ValueError
        If the options do not contain exactly one ``--threads N``.
    '''
    # configure job_threads with fastq_screen_options from PARAMS
    thread_flags = re.findall(r'--threads \d+',
                              PARAMS['fastq_screen_options'])
    if len(thread_flags) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")
    job_threads = int(re.sub(r'--threads ', '', thread_flags[0]))

    tempdir = P.get_temp_dir(".")
    conf_fn = os.path.join(tempdir, "fastq_screen.conf")

    # one DATABASE line per configured database; the label is the
    # parameter name minus its first 22 characters
    database_lines = [
        "DATABASE\t%s\t%s\n" % (param_key[22:], db_location)
        for param_key, db_location in PARAMS.items()
        if param_key.startswith("fastq_screen_database")]
    with IOTools.open_file(conf_fn, "w") as conf_handle:
        for database_line in database_lines:
            conf_handle.write(database_line)

    screener = PipelineMapping.FastqScreen(config_filename=conf_fn)
    statement = screener.build((infiles,), outfile)
    P.run(statement, job_memory="8G")

    shutil.rmtree(tempdir)
    IOTools.touch_file(outfile)
def runMemeCHIP(infile, outfile, motifs=None):
    '''Run the MEME-ChIP pipeline on a fasta file.

    If `infile` contains no sequences a warning is issued, `outfile` is
    touched and nothing else is done.  Otherwise ``meme-chip`` is run
    with a temporary directory as its ``-oc`` output directory and the
    results are collected by ``collectMEMEResults`` with
    ``method="memechip"``.

    Arguments
    ---------
    infile : string
        Fasta file with the input sequences.
    outfile : string
        Output file; standard output of meme-chip goes to
        ``<outfile>.log``.
    motifs : list, optional
        Motif database files; each one is passed as ``-db <file>``.
    '''
    if motifs:
        motifs = " ".join("-db %s" % motif for motif in motifs)
    else:
        motifs = " "

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        # the message previously lacked its argument, so a literal "%s"
        # was logged instead of the file name
        E.warn("%s: no sequences - meme-chip skipped" % outfile)
        P.touch(outfile)
        return

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]),
                               outfile)
    tmpdir = P.get_temp_dir(".")

    # NOTE(review): the statement uses %(meme_threads)s whereas the
    # thread check below reads "memechip_threads" - confirm which
    # parameter is intended.  Placeholders are presumably filled in by
    # P.run from PARAMS and this function's local variables.
    statement = (
        " meme-chip %(infile)s -p %(meme_threads)s -oc %(tmpdir)s "
        "-nmeme %(memechip_nmeme)s %(memechip_options)s %(motifs)s "
        "> %(outfile)s.log ")

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in
    # cluster_options through job_options
    if int(PARAMS["memechip_threads"]) != 1:
        job_options = str(PARAMS["memechip_job_options"])
        job_threads = int(PARAMS["memechip_threads"])
        cluster_parallel_environment = str(
            PARAMS["memechip_cluster_parallel_environment"])

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile, method="memechip")
def GATKReadGroups(infile, outfile, genome, library="unknown",
                   platform="Illumina", platform_unit="1",
                   track="unknown"):
    '''Reorder a BAM file by the reference and add read groups.

    One shell statement with five commands is built and run: picard
    ``ReorderSam`` writes ``<track>.reordered.bam`` into a temporary
    directory, that file is indexed, picard ``AddOrReplaceReadGroups``
    writes `outfile`, `outfile` is indexed, and the temporary directory
    is removed.

    Arguments
    ---------
    infile : string
        Input BAM file.
    outfile : string
        Output BAM file with read groups.
    genome : string
        Value for ``REFERENCE=`` of ``ReorderSam``.
    library : string
        Value for ``RGLB``.
    platform : string
        Value for ``RGPL``.
    platform_unit : string
        Value for ``RGPU``.
    track : string
        Value for ``RGSM``.  When left at ``"unknown"`` the basename of
        `infile` without ``.bam`` is used.
    '''
    if track == 'unknown':
        track = P.snip(os.path.basename(infile), ".bam")

    tmpdir_gatk = P.get_temp_dir('.')
    # NOTE(review): job_options / job_threads are not referenced below;
    # presumably P.run picks them up from this function's locals.
    job_options = getGATKOptions()
    job_threads = 3

    command_templates = (
        "picard ReorderSam INPUT=%(infile)s "
        "OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam "
        "REFERENCE=%(genome)s ALLOW_INCOMPLETE_DICT_CONCORDANCE=true "
        "VALIDATION_STRINGENCY=SILENT ;",

        "samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ; ",

        "picard AddOrReplaceReadGroups "
        "INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam "
        "OUTPUT=%(outfile)s RGLB=%(library)s RGPL=%(platform)s "
        "RGPU=%(platform_unit)s RGSM=%(track)s "
        "VALIDATION_STRINGENCY=SILENT ;",

        "samtools index %(outfile)s ; ",

        "rm -rf %(tmpdir_gatk)s ;",
    )
    # interpolate all templates in one go
    statement = "".join(command_templates) % locals()
    P.run(statement)
def runMEMEOnSequences(infile, outfile):
    '''Run MEME on a fasta file to find motifs.

    If `infile` contains no sequences a warning is issued, `outfile` is
    touched and MEME is not run.  Otherwise MEME writes into a temporary
    directory (``-oc``) and the results are collected into
    ``<exportdir>/meme/<outfile>`` by ``collectMEMEResults``.

    Arguments
    ---------
    infile : string
        Fasta file with the sequences to search.
    outfile : string
        Output file; MEME's standard output goes to ``<outfile>.log``.
    '''
    # job_options = "-l mem_free=8000M"

    sequence_count = int(FastaIterator.count(infile))
    if not sequence_count:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
        return

    export_root = os.path.abspath(P.get_params()["exportdir"])
    target_path = os.path.join(export_root, "meme", outfile)
    tmpdir = P.get_temp_dir(".")

    # NOTE(review): placeholders are presumably filled in by P.run from
    # the configuration and this function's local variables.
    statement = (
        " meme %(infile)s -dna -revcomp -mod %(meme_model)s "
        "-nmotifs %(meme_nmotifs)s -oc %(tmpdir)s "
        "-maxsize %(motifs_max_size)s %(meme_options)s "
        "> %(outfile)s.log ")
    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
def runDREME(infile, outfile, neg_file="", options=""):
    '''Run DREME on a fasta file.

    DREME is skipped (warning issued, `outfile` touched) when `infile`
    holds fewer than two sequences, or when a negative file is given
    and it holds fewer than two sequences.  Otherwise DREME writes into
    a temporary directory and the results are collected by
    ``collectMEMEResults`` with ``method="dreme"``.

    Arguments
    ---------
    infile : string
        Fasta file with the positive sequences.
    outfile : string
        Output file; DREME's standard output goes to ``<outfile>.log``.
    neg_file : string
        Optional fasta file used as the negative set (``-n``).  When
        empty, no ``-n`` option is passed; per the original docstring
        DREME then shuffles the input.
    options : string
        Extra command line options appended after ``dreme_options``.
    '''
    positives = int(FastaIterator.count(infile))
    if positives < 2:
        E.warn("%s: less than 2 sequences - dreme skipped" % outfile)
        P.touch(outfile)
        return

    if neg_file:
        negatives = int(FastaIterator.count(neg_file))
        if negatives < 2:
            E.warn(
                "%s: less than 2 sequences in negatives file - dreme skipped"
                % outfile)
            P.touch(outfile)
            return
        # turn the file name into the command line option
        neg_file = "-n %s" % neg_file

    logfile = outfile + ".log"
    export_root = os.path.abspath(PARAMS["exportdir"])
    target_path = os.path.join(export_root, outfile)
    tmpdir = P.get_temp_dir(".")

    # NOTE(review): placeholders are presumably filled in by P.run from
    # PARAMS and this function's local variables.
    statement = (
        " dreme -p %(infile)s %(neg_file)s -png -oc %(tmpdir)s "
        "%(dreme_options)s %(options)s > %(logfile)s ")
    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile, method="dreme")
def runMEMEOnSequences(infile, outfile, background=None, psp=None):
    '''Run MEME on fasta sequences to find motifs.

    By default MEME calculates a zero-th order background model from the
    nucleotide frequencies in the input set.  To use a different
    background set, a background file created by fasta-get-markov must
    be supplied.

    To perform discriminative analysis a position specific prior (psp)
    file must be provided.  This can be generated using generatePSP.

    MEME is skipped (warning issued, `outfile` touched) when `infile`
    holds fewer than two sequences.  The job memory is "2G", raised to
    "4G" when the summed sequence length exceeds 160,000.

    Arguments
    ---------
    infile : string
        Fasta file with the input sequences.
    outfile : string
        Output file; MEME's standard error goes to ``<outfile>.log``.
    background : string, optional
        Background model file, passed as ``-bfile``.
    psp : string, optional
        Position specific prior file, passed as ``-psp``.
    '''
    # job_options = "-l mem_free=8000M"

    if int(FastaIterator.count(infile)) < 2:
        E.warn("%s: less than 2 sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    # total sequence length decides how much memory is requested
    with IOTools.open_file(infile, "r") as fasta_handle:
        total_seqs_length = sum(
            len(record.sequence)
            for record in FastaIterator.iterate(fasta_handle))

    job_memory = "4G" if total_seqs_length > 160000 else "2G"

    revcomp = "-revcomp" if PARAMS.get("meme_revcomp", True) else ""

    target_path = os.path.join(os.path.abspath(PARAMS["exportdir"]),
                               outfile)
    tmpdir = P.get_temp_dir(".")

    background_model = "-bfile %s" % background if background else ""

    psp_file = ""
    if psp:
        E.info("Running MEME in descriminative mode")
        psp_file = "-psp %s" % psp

    # NOTE(review): placeholders are presumably filled in by P.run from
    # PARAMS and this function's local variables.
    statement = (
        " meme %(infile)s -dna %(revcomp)s -p %(meme_threads)s "
        "-mod %(meme_model)s -nmotifs %(meme_nmotifs)s -oc %(tmpdir)s "
        "-maxsize %(meme_max_size)s %(background_model)s %(psp_file)s "
        "%(meme_options)s 2> %(outfile)s.log ")

    # If running with more than one thread
    # http://git.net/ml/clustering.gridengine.users/2007-04/msg00058.html
    # specify "excl=false -w n -pe openmpi-ib num_threads" in
    # cluster_options through job_options
    if int(PARAMS["meme_threads"]) != 1:
        job_options = str(PARAMS["meme_job_options"])
        job_threads = int(PARAMS["meme_threads"])
        cluster_parallel_environment = str(
            PARAMS["meme_cluster_parallel_environment"])

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)
def runMAST(infiles, outfile):
    '''Run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.  10000 is a
    heuristic.

    For every non-empty motif file, mast is run first on the database
    file and then on the control file.  Before each run a
    ``:: motif = ... ::`` header line is appended to a temporary file,
    after each run mast's ``mast.txt`` is appended to it.  The temporary
    file is finally gzipped into `outfile`.

    Arguments
    ---------
    infiles : tuple
        ``(controlfile, dbfile, motiffiles)``.
    outfile : string
        Gzipped output file; mast's output is appended to
        ``<outfile>.log``.

    Raises
    ------
    ValueError
        If the control file does not exist.
    '''
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.is_empty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.get_temp_dir(".")
    tmpfile = P.get_temp_filename(".")

    # mast bails if the number of nucleotides gets larger than
    # 2186800982?
    # To avoid this, run db and control file separately.
    # NOTE(review): placeholders are presumably filled in by P.run from
    # PARAMS and this function's local variables.
    foreground_cmd = (
        " cat %(dbfile)s | mast %(motiffile)s - -nohtml -oc %(tmpdir)s "
        "-ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1; "
        "cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1 ")
    background_cmd = (
        " cat %(controlfile)s | mast %(motiffile)s - -nohtml -oc %(tmpdir)s "
        "-ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1; "
        "cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1 ")
    mast_passes = (
        (":: motif = %s - foreground ::\n", foreground_cmd),
        (":: motif = %s - background ::\n", background_cmd),
    )

    for motiffile in motiffiles:
        if IOTools.is_empty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        motif = os.path.splitext(motiffile)[0]

        for header_template, statement in mast_passes:
            # the header must be on disk before mast output is appended
            with IOTools.open_file(tmpfile, "a") as section_out:
                section_out.write(header_template % motif)
            P.run(statement)

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run(statement)

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
def setUp(self):
    """Store the value returned by ``P.get_temp_dir()`` as ``self.work_dir``."""
    work_dir = P.get_temp_dir()
    self.work_dir = work_dir
def setUp(self):
    """Store the value returned by ``P.get_temp_dir(shared=True)`` as ``self.work_dir``."""
    work_dir = P.get_temp_dir(shared=True)
    self.work_dir = work_dir