def runFeatureCounts(annotations_file, bamfile, outfile, nthreads=4, strand=2, options=""):
    '''run feature counts on *annotations_file* with *bamfile*.

    If the bam-file is paired, paired-end counting is enabled
    and the bam file automatically sorted.
    '''
    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")
    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # select paired end mode, additional options
        paired_options = "-p -B"
        # remove .bam extension
        bam_prefix = P.snip(bam_tmp, ".bam")
        # sort by read name
        paired_processing = \
            """samtools sort -@ %(nthreads)i -n %(bamfile)s %(bam_prefix)s;
               checkpoint; """ % locals()
        bamfile = bam_tmp
    else:
        paired_options = ""
        paired_processing = ""

    job_threads = nthreads

    # AH: what is the -b option doing?
    statement = '''mkdir %(tmpdir)s;
                   zcat %(annotations_file)s > %(annotations_tmp)s;
                   checkpoint;
                   %(paired_processing)s
                   featureCounts %(options)s
                                 -T %(nthreads)i
                                 -s %(strand)s
                                 -b
                                 -a %(annotations_tmp)s
                                 %(paired_options)s
                                 -o %(outfile)s
                                 %(bamfile)s
                   >& %(outfile)s.log;
                   checkpoint;
                   gzip -f %(outfile)s;
                   checkpoint;
                   rm -rf %(tmpdir)s
                   '''
    P.run()
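# Note on the pattern used throughout these snippets: command lines are
# assembled as triple-quoted strings with %(name)s placeholders filled from
# locals(), while %%-escaped placeholders are left for a later substitution
# pass (e.g. against pipeline parameters) when the statement is executed.
# Below is a minimal, self-contained sketch of that convention; the file
# names and temporary directory are made up for illustration only.
import os


def build_sort_statement(bamfile, nthreads=4):
    # hypothetical temporary location, for illustration only
    tmpdir = "/tmp/example"
    bam_prefix = os.path.join(tmpdir, os.path.basename(bamfile))[:-len(".bam")]
    # %(...)s placeholders are resolved here from locals();
    # a %%-escaped placeholder such as %%(scriptsdir)s would survive this pass
    statement = """samtools sort -@ %(nthreads)i -n %(bamfile)s %(bam_prefix)s;
                   checkpoint;""" % locals()
    return statement


print(build_sort_statement("data/sample1.bam"))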
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''
    tmpdir = P.getTempDir(".")

    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''
    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)
def build(self, config):
    '''
    return build statement to be run
    '''
    # output directory
    outdir = "soapdenovo.dir"

    # get track from config file
    for line in open(config).readlines():
        if line.startswith("q2"):
            continue
        elif line.startswith("q") or line.startswith("q1"):
            track = self.getTrack(line[:-1].split("=")[1])

    options = "%(soapdenovo_options)s"
    tempdir = P.getTempDir(".")
    statement = '''%%(soapdenovo_executable)s all
                   -s %%(infile)s
                   -o %(tempdir)s/%(track)s
                   -K %%(kmer)s
                   %(options)s;
                   checkpoint;
                   mv %(tempdir)s/%(track)s* %(outdir)s;
                   mv %(outdir)s/%(track)s.contig %(outdir)s/%(track)s.contigs.fa;
                   cat %(outdir)s/%(track)s.contigs.fa
                   | python %%(scriptsdir)s/rename_contigs.py -a
                     --log=%(outdir)s/%(track)s.contigs.log;
                   rm -rf %(tempdir)s''' % locals()
    return statement
def runPicardOnRealigned(infile, outfile):
    to_cluster = USECLUSTER
    job_options = getGATKOptions()
    tmpdir_gatk = P.getTempDir('/ifs/scratch')
    threads = PARAMS["gatk_threads"]

    outfile_tumor = outfile.replace("Control", PARAMS["mutect_tumour"])
    infile_tumor = infile.replace("Control", PARAMS["mutect_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace("Control", PARAMS["mutect_tumour"])

    statement = '''
    cat %(infile)s
    | python %%(scriptsdir)s/bam2bam.py -v 0 --set-sequence --bam
    | CollectMultipleMetrics
           INPUT=/dev/stdin
           REFERENCE_SEQUENCE=%%(bwa_index_dir)s/%%(genome)s.fa
           ASSUME_SORTED=true
           OUTPUT=%(outfile)s
           VALIDATION_STRINGENCY=SILENT
    >& %(outfile)s;
    cat %(infile_tumor)s
    | python %%(scriptsdir)s/bam2bam.py -v 0 --set-sequence --sam
    | CollectMultipleMetrics
           INPUT=/dev/stdin
           REFERENCE_SEQUENCE=%%(bwa_index_dir)s/%%(genome)s.fa
           ASSUME_SORTED=true
           OUTPUT=%(outfile_tumor)s
           VALIDATION_STRINGENCY=SILENT
    >& %(outfile_tumor)s;''' % locals()
    P.run()
def GATKpreprocessing(infile, outfile):
    '''Reorders BAM according to reference fasta and adds read groups using
    SAMtools, realigns around indels and recalibrates base quality scores
    using GATK'''

    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]
    threads = PARAMS["gatk_threads"]
    dbsnp = PARAMS["gatk_dbsnp"]
    solid_options = PARAMS["gatk_solid_options"]

    statement = '''ReorderSam
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    REFERENCE=%%(bwa_index_dir)s/%%(genome)s.fa
                    ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                    VALIDATION_STRINGENCY=SILENT ;
                    checkpoint ;''' % locals()
    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ;
                    checkpoint ;''' % locals()
    statement += '''AddOrReplaceReadGroups
                    INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.readgroups.bam
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;
                    checkpoint ;''' % locals()
    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.readgroups.bam ;
                    checkpoint ;''' % locals()
    statement += '''GenomeAnalysisTK -T RealignerTargetCreator
                    -o %(tmpdir_gatk)s/%(track)s.indelrealignment.intervals
                    --num_threads %(threads)s
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/%(track)s.readgroups.bam ;
                    checkpoint ;''' % locals()
    statement += '''GenomeAnalysisTK -T IndelRealigner
                    -o %(tmpdir_gatk)s/%(track)s.indelrealigned.bam
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/%(track)s.readgroups.bam
                    -targetIntervals %(tmpdir_gatk)s/%(track)s.indelrealignment.intervals ;
                    checkpoint ;''' % locals()
    statement += '''GenomeAnalysisTK -T BaseRecalibrator
                    --out %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/%(track)s.indelrealigned.bam
                    --knownSites %(dbsnp)s %(solid_options)s ;
                    checkpoint ;''' % locals()
    statement += '''GenomeAnalysisTK -T PrintReads
                    -o %(outfile)s
                    -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/%(track)s.indelrealigned.bam ;
                    checkpoint ;''' % locals()
    statement += '''rm -rf %(tmpdir_gatk)s ;'''
    P.run()
def build(self, infile):
    track = self.getTrack(infile)

    format = self.getFormat(infile)
    if format.endswith(".gz"):
        format = P.snip(format, ".gz")
    format = format.upper()

    # cortex_var only uses paired end information to
    # remove pcr duplicates
    if not self.checkPairs(infile):
        paired = "--se_list"
        reads = os.path.join(os.getcwd(), infile)
    elif len(self.checkPairs(infile)) > 1:
        paired = "--pe_list"
        read1 = infile
        format = P.snip(format, ".1")
        read2 = self.checkPairs(infile)[1]
    elif self.checkPairs(infile) == "interleaved":
        raise ValueError("pipeline does not support file of type 'interleaved'")

    temp = P.getTempDir()

    # NB: the code below assumes paired-end input; read1 and read2 are only
    # set in the --pe_list branch above
    read1_new = os.path.join(temp, P.snip(read1, ".1.gz"))
    read2_new = os.path.join(temp, P.snip(read2, ".2.gz"))

    # paired end list
    list1 = open("cortex_var.dir/read1.txt", "w")
    list2 = open("cortex_var.dir/read2.txt", "w")
    list1.write(read1_new + "\n")
    list2.write(read2_new + "\n")
    list1.close()
    list2.close()

    list1 = os.path.abspath("cortex_var.dir/read1.txt")
    list2 = os.path.abspath("cortex_var.dir/read2.txt")

    reads = ",".join([os.path.join(os.getcwd(), x)
                      for x in [read1_new, read2_new]])

    statement = '''gunzip -c %(read1)s > %(read1_new)s ;
                   gunzip -c %(read2)s > %(read2_new)s ;
                   cd cortex_var.dir ;
                   %%(cortex_var_executable)s
                   %(paired)s %(list1)s,%(list2)s
                   --format %(format)s
                   --mem_height 15
                   --quality_score_threshold %%(cortex_var_qual_threshold)i
                   --remove_pcr_duplicates
                   --remove_low_coverage_supernodes %%(cortex_var_rm_low_coverage_supernodes)i
                   --sample_id %(track)s
                   --kmer_size %%(kmer)s
                   --dump_binary dump_binary.ctx ;
                   rm -rf %(temp)s''' % locals()
    return statement
def preprocess(self, infile):
    '''
    fastq files need to be converted to fasta
    and pairs need to be merged
    '''
    mtype = None

    # check for paired end data either in the same file or in a separate
    # file for each read - will need to be gunzipped
    # check compression status
    if infile.endswith(".gz"):
        temp = P.getTempDir()
        # check for paired data in separate files
        if len(self.checkPairs(infile)) > 1:
            read1 = infile
            read2 = self.checkPairs(infile)[1]
        elif self.checkPairs(infile) == "interleaved":
            infile_new = os.path.join(temp, P.snip(infile, ".gz"))
            zippy = """gunzip -c %(infile)s > %(infile_new)s; """ % locals()
    else:
        zippy = ""

    # only need to convert if the data are in fastq format
    # reads are fastq and paired in separate files
    if self.getFormat(infile).find("fastq") != -1 and len(self.checkPairs(infile)) > 1:
        mtype = "--merge"  # argument for conversion tool
    # reads are fastq and in the same file
    elif self.getFormat(infile).find("fastq") != -1 and self.checkPairs(infile) == "interleaved":
        mtype = "--paired"  # argument for conversion tool

    # requires a merge of the fastq files in to fasta format
    if mtype:  # the reads are paired end
        if mtype == "--merge":
            outf = P.snip(os.path.basename(read1), ".fastq.1.gz") + ".fa"

            # check if file exists - metaphlan also performs this
            # preprocessing step
            if not os.path.exists(outf):
                statement = '''python %%(scriptsdir)s/fastqs2fasta.py
                               -a %(read1)s
                               -b %(read2)s
                               --log=%(read1)s.log
                               > %(outf)s''' % locals()
                P.run()
            else:
                E.info("no need to create file %s - exists" % outf)
        elif mtype == "--paired":
            outf = P.snip(os.path.basename(infile_new), ".fastq") + ".fa"
            statement = '''%(zippy)s'''
            P.run()
            statement = '''fq2fa %(mtype)s %(infile_new)s %(outf)s;
                           rm -rf %(temp)s''' % locals()
            P.run()
    else:
        statement = None

    return statement
def preprocess(self, infile):
    '''
    fastq files need to be converted to fasta
    and pairs need to be merged
    '''
    mtype = None

    # check for paired end data either in the same file or in a separate
    # file for each read - will need to be gunzipped
    # check compression status
    if infile.endswith(".gz"):
        temp = P.getTempDir()
        # check for paired data in separate files
        if len(self.checkPairs(infile)) > 1:
            read1 = infile
            read2 = self.checkPairs(infile)[1]
            read1_new = os.path.join(temp, P.snip(infile, ".gz"))
            read2_new = os.path.join(
                temp, P.snip(self.checkPairs(infile)[1], ".gz"))
            zippy = """gunzip -c %(read1)s > %(read1_new)s ;
                       gunzip -c %(read2)s > %(read2_new)s; """ % locals()
        elif self.checkPairs(infile) == "interleaved":
            infile_new = os.path.join(temp, P.snip(infile, ".gz"))
            zippy = """gunzip -c %(infile)s > %(infile_new)s; """ % locals()
    else:
        zippy = ""

    # only need to convert if the data are in fastq format
    # reads are fastq and paired in separate files
    if self.getFormat(infile).find("fastq") != -1 and len(self.checkPairs(infile)) > 1:
        mtype = "--merge"  # argument for conversion tool
    # reads are fastq and in the same file
    elif self.getFormat(infile).find("fastq") != -1 and self.checkPairs(infile) == "interleaved":
        mtype = "--paired"  # argument for conversion tool

    # build statement
    if mtype:  # the reads are paired end
        if mtype == "--merge":
            outf = P.snip(os.path.basename(read1_new), ".fastq.1") + ".fa"
            statement = '''%(zippy)s
                           fq2fa %(mtype)s %(read1_new)s %(read2_new)s %(outf)s
                        ''' % locals()
        elif mtype == "--paired":
            outf = P.snip(os.path.basename(infile_new), ".fastq") + ".fa"
            statement = '''%(zippy)s
                           fq2fa %(mtype)s %(infile_new)s %(outf)s;
                           rm -rf %(temp)s''' % locals()
    else:
        statement = None

    return statement
def build(self, infile, PARAMS):
    '''
    run velveth and velvetg followed by meta-velvetg
    '''
    outdir = P.getTempDir(".")
    format = self.getFormat(infile)
    paired = self.checkPairs(infile)

    if not paired:
        pair = ""
        files = infile
        read_type = "short"
    else:
        pair = "-%s" % paired[0]
        files = " ".join([infile, paired[1]])
        read_type = "shortPaired"

    if format == "fastq.1.gz":
        format = "fastq.gz"

    metavelvet_dir = os.path.join(os.getcwd(), "metavelvet.dir")
    track = self.getTrack(os.path.basename(infile))

    self.stats_file = track + ".stats.txt"

    if paired:
        insert_length = "-ins_length %i" % PARAMS["velvetg_insert_length"]
    else:
        insert_length = ""

    # velveth and velvetg have to be run to build hash tables and initial
    # de bruijn graphs
    statement = '''%%(velveth_executable)s %(outdir)s %%(kmer)i
                   -%(format)s -%(read_type)s %(pair)s %(files)s
                   >> %(metavelvet_dir)s/%(track)s_velveth.log ;
                   checkpoint ;
                   mv %(outdir)s/Log %(metavelvet_dir)s/%(track)s.velveth.log ;
                   %%(velvetg_executable)s %(outdir)s -exp_cov auto %(insert_length)s ;
                   checkpoint ;
                   %%(metavelvet_executable)s %(outdir)s %(insert_length)s ;
                   mv %(outdir)s/Roadmaps %(metavelvet_dir)s/%(track)s.roadmaps ;
                   gzip %(metavelvet_dir)s/%(track)s.roadmaps ;
                   mv %(outdir)s/Sequences %(metavelvet_dir)s/%(track)s.sequences ;
                   gzip %(metavelvet_dir)s/%(track)s.sequences ;
                   mv %(outdir)s/Graph2 %(metavelvet_dir)s/%(track)s.graph2 ;
                   gzip %(metavelvet_dir)s/%(track)s.graph2 ;
                   cat %(outdir)s/meta-velvetg.contigs.fa
                   | python %%(scriptsdir)s/rename_contigs.py -a metavelvet
                     --log=%(metavelvet_dir)s/%(track)s.contigs.log
                   > %(metavelvet_dir)s/%(track)s.contigs.fa ;
                   sed -i 's/in/_in/g' %(outdir)s/meta-velvetg.Graph2-stats.txt ;
                   mv %(outdir)s/meta-velvetg.Graph2-stats.txt %(metavelvet_dir)s/%(track)s.stats.txt ;
                   rm -rf %(outdir)s''' % locals()
    return statement
def trimReads(infile, outfile):
    '''trim reads with FastX'''
    to_cluster = True

    tmpdir_fastq = P.getTempDir()
    track = P.snip(os.path.basename(infile), ".gz")
    statement = """gunzip < %(infile)s
                   | python %%(scriptsdir)s/fastq2fastq.py
                     --change-format=sanger
                     --guess-format=phred64
                     --log=%(outfile)s.log
                   > %(tmpdir_fastq)s/%(track)s;""" % locals()
    statement += """zcat %(infile)s
                    | fastx_trimmer -f %(first_base)s -l %(last_base)s -z -o %(outfile)s """
    P.run()
def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used. Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))

    tmpdir = P.getTempDir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=False,
        masker=P.asList(PARAMS['motifs_masker']),
        halfwidth=int(PARAMS["meme_halfwidth"]),
        maxsize=int(PARAMS["meme_max_size"]),
        proportion=PARAMS["meme_proportion"],
        min_sequences=PARAMS["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp
             -mod %(meme_model)s
             -nmotifs %(meme_nmotifs)s
             -oc %(tmpdir)s
             -maxsize %(meme_max_size)s
             %(meme_options)s
        > %(outfile)s.log
        '''
        P.run()

        collectMEMEResults(tmpdir, target_path, outfile)
def assignEssentialGenesToContigs(infile, outfile):
    '''
    assign essential genes to contigs
    '''
    dirname = os.path.dirname(infile)
    essential = PARAMS["hmmer_hmm"]
    tempdir = P.getTempDir(".")

    statement = '''zcat %(infile)s > %(tempdir)s/orfs.fa;
                   hmmsearch --tblout %(tempdir)s/hmm.out --cut_tc
                   --notextw %(essential)s %(tempdir)s/orfs.fa;
                   tail -n+4 %(tempdir)s/hmm.out
                   | sed 's/ * / /g'
                   | cut -f 1,4 -d " "
                   | gzip > %(outfile)s'''
    P.run()
    statement = '''rm -rf %(tempdir)s'''
    P.run()
def build(self, infile):
    '''
    build statement for running idba
    input is merged fasta file and output is contigs fasta file
    '''
    track = self.getTrack(infile)
    outdir = "idba.dir"

    # get temporary file for running idba
    tempdir = P.getTempDir()

    # NB at the moment we assume the default maxkmer of 100
    statement = '''%%(idba_executable)s -r %(infile)s -o %(tempdir)s %%(idba_options)s ;
                   mv %(tempdir)s/scaffold.fa idba.dir/%(track)s.scaffolds.fa ;
                   mv %(tempdir)s/contig-%%(idba_maxkmer)s.fa idba.dir/%(track)s.contigs.fa''' % locals()
    shutil.rmtree(tempdir)
    return statement
def buildCoverageOverContigs(infiles, outfile):
    '''
    build histograms of the coverage over each of the contigs
    '''
    bam = infiles[0]

    # genomecoveragebed does not like some of the
    # output from bwa. bwa outputs some reads
    # that map off the end of contigs
    # as having a leftmost position of 0. This is
    # not ideal. Need to use temporary bam
    # files with only mapped reads - this is
    # nasty and needs changing
    tempdir = P.getTempDir(".")
    tempname = P.getTempFilename(tempdir) + ".bam"
    P.submit("CGATPipelines.PipelineMetagenomeAssembly",
             "filterBamOnPos",
             infiles=bam,
             outfiles=tempname)

    # tablename where alignment stats live
    tablename = os.path.dirname(bam)[:-len(".dir")] + \
        "_" + P.snip(os.path.basename(bam), ".bam") + "_alignment_stats"

    # hack to convert to table - add .load
    tablename = P.toTable(tablename + ".load")

    # connect to database
    dbh = connect()
    cc = dbh.cursor()

    # get number of reads aligned from bam2stats
    if PARAMS.get("coverage_scale"):
        scale_factor = cc.execute("""SELECT counts FROM %s
                                     WHERE category == 'reads_mapped'""" % tablename).fetchone()[0]
        scale_factor = 1 / (float(scale_factor) / 1000000)
        scale_options = "-scale %(scale_factor)f"
    else:
        scale_options = ""

    statement = '''genomeCoverageBed -ibam %(tempname)s %(scale_options)s -d
                   | gzip > %(outfile)s;
                   rm -rf %(tempdir)s'''
    P.run()
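# The -scale value above is a simple reads-per-million normalisation: the
# mapped-read count is pulled from the <track>_alignment_stats table and
# inverted per million reads. A small sketch of that arithmetic with a
# made-up count (illustration only, not pipeline code):
reads_mapped = 2500000                                 # hypothetical value from the stats table
scale_factor = 1 / (float(reads_mapped) / 1000000)     # 0.4 reads-per-million multiplier
scale_options = "-scale %(scale_factor)f" % locals()   # option string passed to genomeCoverageBed
print(scale_options)                                   # -> -scale 0.400000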
def build(self, infile):
    '''
    build statement for running spades
    '''
    track = self.getTrack(os.path.basename(infile))
    format = self.getFormat(infile)
    paired = self.checkPairs(infile)

    tempdir = P.getTempDir(".")
    outdir = "spades.dir"

    # defaults for single-end input; overwritten in the paired-end branch
    temp1 = temp2 = ""
    unzip_statement = "checkpoint"

    # input files
    if not paired:
        files = infile
        files_statement = "-s %s" % files
    else:
        # spades doesn't like the fastq.1.gz type format
        temp1 = os.path.join(tempdir, track + ".1.fastq")
        temp2 = os.path.join(tempdir, track + ".2.fastq")
        infile2 = paired[1]
        unzip_statement = "zcat %(infile)s > %(temp1)s; zcat %(infile2)s > %(temp2)s" % locals()
        files_statement = " -1 " + " -2 ".join([temp1, temp2])

    # kmer to use
    k = "-k %(kmer)s"

    # spades options
    spades_options = "%(spades_options)s"

    # deal with spades output
    move_statement = """mv %(tempdir)s/corrected/%(track)s*cor.* %(outdir)s;
                        mv %(tempdir)s/contigs.fasta %(outdir)s/%(track)s.contigs.fa;
                        mv %(tempdir)s/scaffolds.fasta %(outdir)s/%(track)s.scaffolds.fa;
                        mv %(tempdir)s/spades.log %(outdir)s/%(track)s.contigs.log""" % locals()

    # statement - simple and default
    statement = '''%(unzip_statement)s;
                   checkpoint;
                   spades.py %(files_statement)s
                   -o %(tempdir)s
                   %(k)s
                   %(spades_options)s;
                   checkpoint;
                   %(move_statement)s;
                   checkpoint;
                   rm -rf %(tempdir)s %(temp1)s %(temp2)s''' % locals()

    return statement
def build(self, infile):
    '''
    build statement for running idba
    input is merged fasta file and output is contigs fasta file
    '''
    track = self.getTrack(infile)
    outdir = "idba.dir"

    # get temporary file for running idba
    tempdir = P.getTempDir()

    # NB at the moment we assume the default maxkmer of 100
    statement = '''%%(idba_executable)s -r %(infile)s -o %(tempdir)s %%(idba_options)s ;
                   mv %(tempdir)s/scaffold.fa idba.dir/%(track)s.scaffolds.fa ;
                   cat %(tempdir)s/contig-%%(idba_maxkmer)s.fa
                   | python %%(scriptsdir)s/rename_contigs.py -a idba
                     --log=%(outdir)s/%(track)s.contigs.log
                   > idba.dir/%(track)s.contigs.fa''' % locals()
    shutil.rmtree(tempdir)
    return statement
def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used. Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        P.touch(outfile)
        return

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "meme", outfile)
    tmpdir = P.getTempDir(".")

    statement = '''
    meme %(infile)s -dna -revcomp
         -mod %(meme_model)s
         -nmotifs %(meme_nmotifs)s
         -oc %(tmpdir)s
         -maxsize %(motifs_max_size)s
         %(meme_options)s
    > %(outfile)s.log
    '''
    P.run()

    collectMEMEResults(tmpdir, target_path, outfile)
def buildClusters(infiles, outfiles):
    '''run c-means clustering on expression level data.'''

    to_cluster = USECLUSTER
    job_options = "-l mem_free=10G"

    # ignore the background file (why is it included in infiles?)
    infile, _ = infiles
    instructions_filename, centroid_filename, membership_filename = outfiles

    instructions_filename = os.path.abspath(instructions_filename)
    cdt_filename = os.path.abspath(infile)
    kmeans_clusters = PARAMS["kmeans_clusters"]

    # run aerie in a temporary directory
    tmpdir = P.getTempDir(".")

    with open(instructions_filename, "w") as outf:
        outf.write('''load %(cdt_filename)s
fuzzy %(kmeans_clusters)i
%(tmpdir)s/all
exit
''' % locals())

    statement = '''
    aerie < %(instructions_filename)s >& %(instructions_filename)s.log
    '''
    P.run()

    try:
        shutil.move(os.path.join(tmpdir, "all.fct"), centroid_filename)
        shutil.move(os.path.join(tmpdir, "all.mb"), membership_filename)
    except IOError as msg:
        E.warn("no results for %s,%s: %s" %
               (centroid_filename, membership_filename, msg))
        P.touch(centroid_filename)
        P.touch(membership_filename)
def build(self, infile):
    '''
    run velveth and velvetg followed by meta-velvetg
    '''
    outdir = P.getTempDir()
    format = self.getFormat(infile)
    paired = self.checkPairs(infile)

    if len(paired) > 1:
        pair = paired[0]
        files = " ".join([infile, paired[1]])
    else:
        pair = paired
        files = infile

    if format == "fastq.1.gz":
        format = "fastq.gz"

    metavelvet_dir = os.path.join(os.getcwd(), "metavelvet.dir")
    track = self.getTrack(infile)

    self.stats_file = track + ".stats.txt"

    # velveth and velvetg have to be run to build hash tables and
    # initial de bruijn graphs
    statement = '''%%(velveth_executable)s %(outdir)s %%(kmer)i
                   -%(format)s -shortPaired -%(pair)s %(files)s ;
                   cd %(outdir)s;
                   %%(velvetg_executable)s %(outdir)s -exp_cov auto
                   -ins_length %%(velvetg_insert_length)i ;
                   %%(metavelvet_executable)s %(outdir)s
                   -ins_length %%(velvetg_insert_length)i ;
                   mv %(outdir)s/Roadmaps %(metavelvet_dir)s/%(track)s.roadmaps ;
                   gzip %(metavelvet_dir)s/%(track)s.roadmaps ;
                   mv %(outdir)s/Sequences %(metavelvet_dir)s/%(track)s.sequences ;
                   gzip %(metavelvet_dir)s/%(track)s.sequences ;
                   mv %(outdir)s/Graph2 %(metavelvet_dir)s/%(track)s.graph2 ;
                   gzip %(metavelvet_dir)s/%(track)s.graph2 ;
                   mv %(outdir)s/meta-velvetg.contigs.fa %(metavelvet_dir)s/%(track)s.contigs.fa ;
                   sed -i 's/in/_in/g' %(outdir)s/meta-velvetg.Graph2-stats.txt ;
                   mv %(outdir)s/meta-velvetg.Graph2-stats.txt %(metavelvet_dir)s/%(track)s.stats.txt ;
                   rm -rf %(outdir)s''' % locals()
    return statement
def realignMatchedSample(infile, outfile):
    '''repeat realignments with a merged bam of control and tumor;
    this should help avoid problems with sample-specific realignments'''
    # Note: need to change readgroup headers for merge and subsequent
    # splitting of bam files

    to_cluster = USECLUSTER
    job_options = getGATKOptions()
    tmpdir_gatk = P.getTempDir('/ifs/scratch')
    threads = PARAMS["gatk_threads"]

    outfile_tumor = outfile.replace("Control", PARAMS["mutect_tumour"])
    infile_tumor = infile.replace("Control", PARAMS["mutect_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace("Control", PARAMS["mutect_tumour"])

    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]

    statement = '''module unload apps/java/jre1.6.0_26;
                   checkpoint;'''
    statement += '''AddOrReplaceReadGroups
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/control.bam
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track)s
                    ID=Control
                    VALIDATION_STRINGENCY=SILENT ;
                    checkpoint ;''' % locals()
    statement += '''samtools view -H %(tmpdir_gatk)s/control.bam
                    > %(tmpdir_gatk)s/header.sam;
                    samtools view -H %(infile_tumor)s
                    | grep "^@RG" >> %(tmpdir_gatk)s/header.sam;
                    samtools merge -h %(tmpdir_gatk)s/header.sam
                    %(tmpdir_gatk)s/merged.bam
                    %(tmpdir_gatk)s/control.bam
                    %(infile_tumor)s ;
                    checkpoint ;''' % locals()
    statement += '''samtools index %(tmpdir_gatk)s/merged.bam;
                    checkpoint ;''' % locals()
    statement += '''java -Xmx4g -jar /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
                    -T RealignerTargetCreator
                    -o %(tmpdir_gatk)s/merged.indelrealignment.intervals
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/merged.bam ;
                    checkpoint ;''' % locals()
    statement += '''java -Xmx4g -jar /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
                    -T IndelRealigner
                    -o %(tmpdir_gatk)s/merged.indelrealigned.bam
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/merged.bam
                    -targetIntervals %(tmpdir_gatk)s/merged.indelrealignment.intervals;
                    checkpoint ;''' % locals()
    statement += '''samtools view -hb %(tmpdir_gatk)s/merged.indelrealigned.bam
                    -r Control > %(outfile)s;
                    samtools view -hb %(tmpdir_gatk)s/merged.indelrealigned.bam
                    -r 1 > %(outfile_tumor)s;
                    samtools index %(outfile)s;
                    samtools index %(outfile_tumor)s;
                    checkpoint;''' % locals()
    statement += '''rm -rf %(tmpdir_gatk)s ;'''
    print(statement)
    P.run()
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.isEmpty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.getTempDir(".")
    tmpfile = P.getTempFilename(".")

    for motiffile in motiffiles:
        if IOTools.isEmpty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run()

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
def runFeatureCounts(annotations_file, bamfile, outfile, nthreads=4, strand=2, options=""):
    '''run feature counts on *annotations_file* with *bamfile*.

    If the bam-file is paired, paired-end counting is enabled
    and the bam file automatically sorted.
    '''
    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")
    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    bam_tmp = os.path.join(tmpdir, bamfile)

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # select paired end mode, additional options
        paired_options = "-p -B"
        # remove .bam extension
        bam_prefix = P.snip(bam_tmp, ".bam")
        # sort by read name
        paired_processing = \
            """samtools sort -@ %(nthreads)i -n %(bamfile)s %(bam_prefix)s;
               checkpoint; """ % locals()
        bamfile = bam_tmp
    else:
        paired_options = ""
        paired_processing = ""

    job_options = "-pe dedicated %i" % nthreads

    # AH: what is the -b option doing?
    statement = '''mkdir %(tmpdir)s;
                   zcat %(annotations_file)s > %(annotations_tmp)s;
                   checkpoint;
                   %(paired_processing)s
                   featureCounts %(options)s
                                 -T %(nthreads)i
                                 -s %(strand)s
                                 -b
                                 -a %(annotations_tmp)s
                                 %(paired_options)s
                                 -o %(outfile)s
                                 %(bamfile)s
                   >& %(outfile)s.log;
                   checkpoint;
                   gzip -f %(outfile)s;
                   checkpoint;
                   rm -rf %(tmpdir)s
                   '''
    P.run()
def build(self, infile):
    '''
    build statement for running Ray
    '''
    track = self.getTrack(infile)

    format = self.getFormat(infile)
    paired = self.checkPairs(infile)

    tempdir = P.getTempDir()

    # check whether the data are paired-end
    if not paired:
        pair = paired
        files = os.path.join(tempdir, P.snip(infile, ".gz"))
        gunzy = "gunzip -c %(infile)s > %(files)s" % locals()
    else:
        pair = paired[0]
        # Ray doesn't like .fastq.1.gz etc
        read1 = infile
        read2 = paired[1]
        read1_new = os.path.join(tempdir, read1.replace(".fastq.1.gz", ".1.fastq"))
        read2_new = os.path.join(tempdir, read2.replace(".fastq.2.gz", ".2.fastq"))
        files = " ".join([read1_new, read2_new])
        gunzy = """gunzip -c %(read1)s > %(read1_new)s ;
                   gunzip -c %(read2)s > %(read2_new)s""" % locals()

    # ray likes an output directory but needs it not
    # to exist beforehand
    raydir = os.path.join(os.getcwd(), "ray.dir/export_%s" % track)
    raydir_orig = os.path.join(os.getcwd(), "ray.dir")

    # Ray picks up file types so should just have to
    # say whether its paired or not

    # build statement
    common_options = "-k %(kmer)s"
    if pair == "interleaved":
        filetype = "-i"
    elif not pair:
        filetype = "-s"
    elif pair == "separate":
        filetype = "-p"
    else:
        raise IOError("do not support file of this type: %s" % infile)

    # note restrict use to 5 cores
    statement = ''' %(gunzy)s ;
                    mpiexec -n 5 %%(ray_executable)s %(common_options)s %(filetype)s %(files)s -o %(raydir)s ;
                    checkpoint;
                    mv %(raydir)s/Scaffolds.fasta %(raydir_orig)s/%(track)s.scaffolds.fa ;
                    mv %(raydir)s/ScaffoldComponents.txt %(raydir_orig)s/%(track)s.scaffold_components.txt ;
                    mv %(raydir)s/ScaffoldLengths.txt %(raydir_orig)s/%(track)s.scaffold_lengths.txt ;
                    mv %(raydir)s/ScaffoldLinks.txt %(raydir_orig)s/%(track)s.scaffold_links.txt ;
                    mv %(raydir)s/Contigs.fasta %(raydir_orig)s/%(track)s.contigs.fa ;
                    mv %(raydir)s/OutputNumbers.txt %(raydir_orig)s/%(track)s.numbers.txt ;
                    mv %(raydir)s/CoverageDistribution.txt %(raydir_orig)s/graph/%(track)s.coverage_distribution.txt ;
                    mkdir %(raydir)s/graph ;
                    mv %(raydir)s/CoverageDistributionAnalysis.txt %(raydir_orig)s/graph/%(track)s.coverage_distribution_analysis.txt ;
                    mv %(raydir)s/degreeDistribution.txt %(raydir_orig)s/graph/%(track)s.degree_distribution.txt ;
                    mv %(raydir)s/Kmers.txt %(raydir_orig)s/graph/%(track)s.kmers.txt ;
                    mkdir %(raydir)s/assembly ;
                    mv %(raydir)s/SeedLengthDistribution.txt %(raydir_orig)s/assembly/%(track)s.seed_length_distribution.txt ;
                    mv %(raydir)s/LibraryStatistics.txt %(raydir_orig)s/%(track)s.library_statistics.txt ;
                    mv %(raydir)s/LibraryData.xml %(raydir_orig)s/%(track)s.library_data.xml ;
                    rm -rf %(tempdir)s''' % locals()
    return statement
def buildCodingPotential(infile, outfile):
    '''run CPC analysis as in the cpc script.

    This module runs framefinder and blastx on both strands.
    It seems to work, but I have not thoroughly tested it.
    I expect that the false positive rate increases (i.e.,
    predicting non-coding as coding) in cases where the best
    framefinder match and the best blast match are on opposite
    strands. In the original CPC, these would be separated.
    '''
    try:
        cpc_dir = os.environ["CPC_HOME"]
    except KeyError:
        raise ValueError("CPC_HOME environment variable is not set. ")

    tmpdir = P.getTempDir(".")
    track = P.snip(outfile, ".coding.gz")

    # extract features for frame finder
    # replaces extract_framefinder_feats.pl to parse both strands
    with open(os.path.join(tmpdir, "ff.feat"), "w") as outf:
        outf.write("\t".join(("QueryID", "CDSLength", "Score", "Used", "Strict")) + "\n")
        for line in IOTools.openFile("%s.frame.gz" % track):
            if line.startswith(">"):
                try:
                    (id, start, end, score, used, mode, tpe) = \
                        re.match(
                            ">(\S+).*framefinder \((\d+),(\d+)\) score=(\S+) used=(\S+)% \{(\S+),(\w+)\}",
                            line).groups()
                except AttributeError:
                    raise ValueError("parsing error in line %s" % line)
                length = int(end) - int(start) + 1
                strict = int(tpe == "strict")
                # include the framefinder score so the row matches the header
                outf.write("\t".join((id, str(length), score, used, str(strict))) + "\n")

    to_cluster = USECLUSTER

    # extract features and prepare svm data
    s = []

    s.append('''
    zcat %(infile)s
    | perl %(cpc_dir)s/libs/blast2table.pl
    | tee %(tmpdir)s/blastx.table
    | perl %(cpc_dir)s/bin/extract_blastx_features.pl
    > %(tmpdir)s/blastx.feat1;
    ''')

    s.append('''
    cat %(track)s_norepeats.fasta
    | perl %(cpc_dir)s/bin/add_missing_entries.pl %(tmpdir)s/blastx.feat1
    > %(tmpdir)s/blastx.feat;
    ''')

    # step 2 - prepare data
    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,4,6 NA NA %(tmpdir)s/blastx.feat
    > %(tmpdir)s/blastx.lsv;
    ''')
    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,3,4,5 NA NA %(tmpdir)s/ff.feat
    > %(tmpdir)s/ff.lsv;
    ''')
    s.append('''
    perl -w %(cpc_dir)s/bin/lsv_cbind.pl %(tmpdir)s/blastx.lsv %(tmpdir)s/ff.lsv
    > %(tmpdir)s/test.lsv;
    ''')
    s.append('''
    %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-scale
        -r %(cpc_dir)s/data/libsvm.range
        %(tmpdir)s/test.lsv
    > %(tmpdir)s/test.lsv.scaled;
    ''')

    # step 3: prediction
    m_libsvm_model0 = os.path.join(cpc_dir, "data/libsvm.model0")  # standard
    m_libsvm_model = os.path.join(cpc_dir, "data/libsvm.model")  # Prob
    m_libsvm_model2 = os.path.join(
        cpc_dir, "data/libsvm.model2")  # Prob + weighted version
    m_libsvm_range = os.path.join(cpc_dir, "data/libsvm.range")

    s.append('''
    %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-predict2
        %(tmpdir)s/test.lsv.scaled
        %(m_libsvm_model0)s
        %(tmpdir)s/test.svm0.predict
    > %(tmpdir)s/test.svm0.stdout 2> %(tmpdir)s/test.svm0.stderr;
    ''')

    s.append('''
    printf "gene_id\\tlength\\tresult\\tvalue\\n" | gzip > %(outfile)s;
    cat %(tmpdir)s/test.svm0.predict
    | perl -w %(cpc_dir)s/bin/predict.pl %(track)s_norepeats.fasta
    | gzip >> %(outfile)s;
    ''')

    # generate reports
    s.append('''cat %(tmpdir)s/blastx.feat
    | perl -w %(cpc_dir)s/bin/generate_plot_features.pl %(tmpdir)s/blastx.table <( zcat %(track)s.frame.gz)
    | perl -w %(cpc_dir)s/bin/split_plot_features_by_type.pl %(outfile)s.homology %(outfile)s.orf;
    gzip %(outfile)s.orf %(outfile)s.homology;
    ''')

    # now run it all
    statement = " checkpoint; ".join(s)
    P.run()

    # clean up
    shutil.rmtree(tmpdir)
def filterBamfiles(infile, sentinel):
    """
    Pre-process bamfiles prior to peak calling.
    i) sort bamfiles
    ii) remove unmapped reads with bam2bam.py
    iii) remove non-uniquely mapping reads with bam2bam.py (optional)
    iv) remove duplicates with Picards MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index
    """
    # create tempfile for Picard's MarkDuplicates
    picard_tmp = P.getTempDir(PARAMS["scratchdir"])

    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    # ensure bamfile is sorted,
    statement = ["samtools sort @IN@ @OUT@", ]

    # remove unmapped reads
    statement.append("python %(scriptsdir)s/bam2bam.py"
                     " --filter=mapped"
                     " --log=%(outfile)s.log"
                     " < @IN@.bam"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("python %(scriptsdir)s/bam2bam.py"
                         " --filter=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=false"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"
                         " VALIDATION_STRINGENCY=SILENT"
                         " TMP_DIR=%(picard_tmp)s"
                         " 2> %(outfile)s.log")

    # mask regions, if intervals supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"
                         " > @OUT@")

    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")

    job_options = "-l mem_free=10G"
    statement = P.joinStatements(statement, infile)

    P.run()
    P.touch(sentinel)
    shutil.rmtree(picard_tmp)
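# filterBamfiles above builds each processing step with @IN@/@OUT@
# placeholders and relies on P.joinStatements to thread intermediate files
# between the steps. The helper below is a rough, hypothetical sketch of how
# such chaining could work; it is not the CGATPipelines implementation, and
# the intermediate-file naming is invented purely for illustration.
def join_statements_sketch(statements, infile):
    '''chain commands by replacing @IN@/@OUT@ with successive file names.'''
    joined = []
    current_in = infile
    for step, cmd in enumerate(statements):
        current_out = "%s.step%i" % (infile, step)  # hypothetical intermediate name
        joined.append(cmd.replace("@IN@", current_in).replace("@OUT@", current_out))
        current_in = current_out
    return "; ".join(joined)


print(join_statements_sketch(
    ["samtools sort @IN@ @OUT@", "samtools index @IN@"], "input.bam"))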
def build(self, infile):
    '''
    build statement for running Ray
    '''
    track = self.getTrack(os.path.basename(infile))

    format = self.getFormat(infile)
    paired = self.checkPairs(infile)

    tempdir = P.getTempDir(dir=".")

    # check whether the data are paired-end
    if not paired:
        pair = paired
        files = os.path.join(
            tempdir, P.snip(os.path.basename(infile), ".gz"))
        gunzy = "gunzip -c %(infile)s > %(files)s" % locals()
    else:
        pair = paired[0]
        # Ray doesn't like .fastq.1.gz etc
        read1 = infile
        read2 = paired[1]
        read1_new = os.path.join(
            tempdir, read1.replace(".fastq.1.gz", ".1.fastq"))
        read2_new = os.path.join(
            tempdir, read2.replace(".fastq.2.gz", ".2.fastq"))
        files = " ".join([read1_new, read2_new])
        gunzy = """gunzip -c %(read1)s > %(read1_new)s ;
                   gunzip -c %(read2)s > %(read2_new)s""" % locals()

    # ray likes an output directory but needs it not
    # to exist beforehand
    raydir = os.path.join(os.getcwd(), "ray.dir/export_%s" % track)
    raydir_orig = os.path.join(os.getcwd(), "ray.dir")

    # Ray picks up file types so should just have to
    # say whether its paired or not

    # build statement
    common_options = "-k %(kmer)s"
    if pair == "interleaved":
        filetype = "-i"
    elif not pair:
        filetype = "-s"
    elif pair == "separate":
        filetype = "-p"
    else:
        raise IOError("do not support file of this type: %s" % infile)

    # note restrict use to 10 cores
    statement = ''' %(gunzy)s ;
                    mpiexec %%(ray_executable)s %(common_options)s %(filetype)s %(files)s -o %(raydir)s
                    >> %(raydir_orig)s/%(track)s.log ;
                    checkpoint;
                    mv %(raydir)s/Scaffolds.fasta %(raydir_orig)s/%(track)s.scaffolds.fa ;
                    mv %(raydir)s/ScaffoldComponents.txt %(raydir_orig)s/%(track)s.scaffold_components.txt ;
                    mv %(raydir)s/ScaffoldLengths.txt %(raydir_orig)s/%(track)s.scaffold_lengths.txt ;
                    mv %(raydir)s/ScaffoldLinks.txt %(raydir_orig)s/%(track)s.scaffold_links.txt ;
                    cat %(raydir)s/Contigs.fasta
                    | python %%(scriptsdir)s/rename_contigs.py -a ray
                      --log=%(raydir_orig)s/%(track)s.contigs.log
                    > %(raydir_orig)s/%(track)s.contigs.fa ;
                    mv %(raydir)s/OutputNumbers.txt %(raydir_orig)s/%(track)s.numbers.txt ;
                    mv %(raydir)s/CoverageDistribution.txt %(raydir_orig)s/graph/%(track)s.coverage_distribution.txt ;
                    mkdir %(raydir)s/graph ;
                    mv %(raydir)s/CoverageDistributionAnalysis.txt %(raydir_orig)s/graph/%(track)s.coverage_distribution_analysis.txt ;
                    mv %(raydir)s/degreeDistribution.txt %(raydir_orig)s/graph/%(track)s.degree_distribution.txt ;
                    mv %(raydir)s/Kmers.txt %(raydir_orig)s/graph/%(track)s.kmers.txt ;
                    mkdir %(raydir)s/assembly ;
                    mv %(raydir)s/SeedLengthDistribution.txt %(raydir_orig)s/assembly/%(track)s.seed_length_distribution.txt ;
                    mv %(raydir)s/LibraryStatistics.txt %(raydir_orig)s/%(track)s.library_statistics.txt ;
                    mv %(raydir)s/LibraryData.xml %(raydir_orig)s/%(track)s.library_data.xml ;
                    rm -rf %(tempdir)s''' % locals()
    return statement
def preprocess( self, infiles, outfile ):
    '''build preprocessing statement

    Build a command line statement that extracts/converts various input
    formats to fastq formatted files.

    Quality scores are converted to sanger format where necessary.

    returns the statement and the fastq files to map.
    '''

    assert len(infiles) > 0, "no input files for mapping"

    tmpdir_fastq = P.getTempDir()

    # create temporary directory again for nodes
    statement = [ "mkdir -p %s" % tmpdir_fastq ]
    fastqfiles = []

    # get track by extension of outfile
    track = os.path.splitext( os.path.basename( outfile ) )[0]

    if self.compress:
        compress_cmd = "| gzip"
        extension = ".gz"
    else:
        compress_cmd = ""
        extension = ""

    for infile in infiles:

        if infile.endswith( ".export.txt.gz"):
            # single end illumina export
            statement.append( """gunzip < %(infile)s
                 | awk '$11 != "QC" || $10 ~ /(\d+):(\d+):(\d+)/ \
                 { if ($1 != "") { readname=sprintf( "%%%%s_%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$2,$3,$4,$5,$6);}
                 else { readname=sprintf( "%%%%s:%%%%s:%%%%s:%%%%s:%%%%s", $1,$3,$4,$5,$6); }
                 printf("@%%%%s\\n%%%%s\\n+\\n%%%%s\\n",readname,$9,$10);}'
                 %(compress_cmd)s
                 > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
            fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension ),) )

        elif infile.endswith( ".fa.gz" ):
            statement.append( '''gunzip < %(infile)s > %(tmpdir_fastq)s/%(track)s.fa''' % locals() )
            fastqfiles.append( ("%s/%s.fa" % (tmpdir_fastq, track ),) )
            self.datatype = "fasta"

        elif infile.endswith( ".sra"):
            # sneak preview to determine if paired end or single end
            outdir = P.getTempDir()
            # --split-files is present in fastq-dump 2.1.7
            P.execute( "fastq-dump --split-files --gzip -X 1000 --outdir %(outdir)s %(infile)s" % locals() )
            # --split-files will create files called prefix_#.fastq.gz
            # where # is the read number.
            # The following cases are:
            # * file contains paired end data: output = prefix_1.fastq.gz, prefix_2.fastq.gz
            # * special case: unpaired reads in a paired end run end up in prefix.fastq.gz
            # * special case: if paired reads are stored in a single read, fastq-dump will split.
            #   There might be a joining sequence. The output would thus be:
            #   prefix_1.fastq.gz, prefix_2.fastq.gz and prefix_3.fastq.gz
            #   You want files 1 and 3.
            f = sorted(glob.glob( os.path.join( outdir, "*.fastq.gz" ) ))
            ff = [ os.path.basename(x) for x in f ]
            if len(f) == 1:
                # sra file contains one read: output = prefix.fastq.gz
                pass
            elif len(f) == 2:
                # sra file contains read pairs: output = prefix_1.fastq.gz, prefix_2.fastq.gz
                assert ff[0].endswith( "_1.fastq.gz") and ff[1].endswith( "_2.fastq.gz" )
            elif len(f) == 3:
                if ff[2].endswith( "_3.fastq.gz"):
                    # joining sequence present: take the first and third read
                    f = glob.glob( os.path.join( outdir, "*_[13].fastq.gz" ) )
                else:
                    # unpaired reads ended up in prefix.fastq.gz: take the read pair
                    f = glob.glob( os.path.join( outdir, "*_[12].fastq.gz" ) )
            E.info("sra file contains the following files: %s" % f )
            shutil.rmtree( outdir )
            fastqfiles.append( [ "%s/%s" % (tmpdir_fastq, os.path.basename( x )) for x in sorted(f) ] )
            statement.append( "fastq-dump --split-files --gzip --outdir %(tmpdir_fastq)s %(infile)s" % locals() )

        elif infile.endswith( ".fastq.gz" ):
            format = Fastq.guessFormat( IOTools.openFile( infile, "r"), raises = False)
            if 'sanger' not in format and self.convert:
                statement.append( """gunzip < %(infile)s
                     | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log
                     %(compress_cmd)s
                     > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),) )
            else:
                E.debug( "%s: assuming quality score format %s" % (infile, format ) )
                fastqfiles.append( (infile, ) )

        elif infile.endswith( ".csfasta.gz" ):
            # single end SOLiD data
            if self.preserve_colourspace:
                quality = P.snip( infile, ".csfasta.gz" ) + ".qual.gz"
                if not os.path.exists( quality ):
                    raise ValueError( "no quality file for %s" % infile )
                statement.append( """gunzip < %(infile)s > %(tmpdir_fastq)s/%(track)s.csfasta%(extension)s""" % locals() )
                statement.append( """gunzip < %(quality)s > %(tmpdir_fastq)s/%(track)s.qual%(extension)s""" % locals() )
                fastqfiles.append( ("%s/%s.csfasta%s" % (tmpdir_fastq, track, extension ),
                                    "%s/%s.qual%s" % (tmpdir_fastq, track, extension) ) )
                self.datatype = "solid"
            else:
                quality = P.snip( infile, ".csfasta.gz" ) + ".qual.gz"
                statement.append( """solid2fastq <(gunzip < %(infile)s) <(gunzip < %(quality)s)
                     %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),) )

        elif infile.endswith( ".csfasta.F3.gz" ):
            # paired end SOLiD data
            if self.preserve_colourspace:
                bn = P.snip( infile, ".csfasta.F3.gz" )
                # order is important - mirrors tophat reads followed by quals
                f = []
                for suffix in ("csfasta.F3", "csfasta.F5", "qual.F3", "qual.F5" ):
                    fn = "%(bn)s.%(suffix)s" % locals()
                    if not os.path.exists( fn + ".gz"):
                        raise ValueError( "expected file %s.gz missing" % fn )
                    statement.append( """gunzip < %(fn)s.gz
                         %(compress_cmd)s
                         > %(tmpdir_fastq)s/%(track)s.%(suffix)s%(extension)s""" % locals() )
                    f.append( "%(tmpdir_fastq)s/%(track)s.%(suffix)s%(extension)s" % locals() )
                fastqfiles.append( f )
                self.datatype = "solid"
            else:
                # note: P.snip expects a ".csfasta.gz" suffix here, which will
                # not match the ".csfasta.F3.gz" input of this branch
                quality = P.snip( infile, ".csfasta.gz" ) + ".qual.gz"
                statement.append( """solid2fastq <(gunzip < %(infile)s) <(gunzip < %(quality)s)
                     %(compress_cmd)s > %(tmpdir_fastq)s/%(track)s.fastq%(extension)s""" % locals() )
                fastqfiles.append( ("%s/%s.fastq%s" % (tmpdir_fastq, track, extension),) )

        elif infile.endswith( ".fastq.1.gz" ):
            bn = P.snip( infile, ".fastq.1.gz" )
            infile2 = "%s.fastq.2.gz" % bn
            if not os.path.exists( infile2 ):
                raise ValueError("cannot find paired-end file '%s' for '%s'" % (infile2, infile))
            format = Fastq.guessFormat( IOTools.openFile( infile ), raises = False )
            if 'sanger' not in format:
                statement.append( """gunzip < %(infile)s
                     | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log
                     %(compress_cmd)s
                     > %(tmpdir_fastq)s/%(track)s.1.fastq%(extension)s;
                     gunzip < %(infile2)s
                     | python %%(scriptsdir)s/fastq2fastq.py --change-format=sanger --guess-format=phred64 --log=%(outfile)s.log
                     %(compress_cmd)s
                     > %(tmpdir_fastq)s/%(track)s.2.fastq%(extension)s
                     """ % locals() )
                fastqfiles.append( ("%s/%s.1.fastq%s" % (tmpdir_fastq, track, extension),
                                    "%s/%s.2.fastq%s" % (tmpdir_fastq, track, extension) ) )
            else:
                E.debug( "%s: assuming quality score format %s" % (infile, format ) )
                fastqfiles.append( (infile, infile2, ) )

        else:
            raise NotImplementedError( "unknown file format %s" % infile )

    self.tmpdir_fastq = tmpdir_fastq

    assert len(fastqfiles) > 0, "no fastq files for mapping"

    return "; ".join( statement ) + ";", fastqfiles
def GATKpreprocessing(infile, outfile):
    '''Reorders BAM according to the reference fasta and adds read groups
    using Picard, realigns around indels and recalibrates base quality
    scores using GATK'''

    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('/ifs/scratch')
    job_options = getGATKOptions()
    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]
    threads = PARAMS["gatk_threads"]
    dbsnp = PARAMS["gatk_dbsnp"]
    solid_options = PARAMS["gatk_solid_options"]

    # need to unload java before running GATK as it now runs on java version 7
    # full path to .jar file being specified as using module "GenomeAnalysisTK"
    # resulted in error: "Could not find the main class:
    # org.broadinstitute.sting.gatk.CommandLineGATK. Program will exit."
    # This error is seen when java version 6 is used
    # Find out why this error occurs when not specifying full path
    statement = '''module unload apps/java/jre1.6.0_26; checkpoint;'''

    statement += '''ReorderSam
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    REFERENCE=%%(bwa_index_dir)s/%%(genome)s.fa
                    ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                    VALIDATION_STRINGENCY=SILENT ; checkpoint ;''' % locals()

    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ;
                    checkpoint ;''' % locals()

    statement += '''AddOrReplaceReadGroups
                    INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.readgroups.bam
                    RGLB=%(library)s RGPL=%(platform)s
                    RGPU=%(platform_unit)s RGSM=%(track)s
                    VALIDATION_STRINGENCY=SILENT ; checkpoint ;''' % locals()

    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.readgroups.bam ;
                    checkpoint ;''' % locals()

    statement += '''java -Xmx4g -jar /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
                    -T RealignerTargetCreator
                    -o %(tmpdir_gatk)s/%(track)s.indelrealignment.intervals
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/%(track)s.readgroups.bam ; checkpoint ;''' % locals()

    statement += '''java -Xmx4g -jar /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
                    -T IndelRealigner
                    -o %(tmpdir_gatk)s/%(track)s.indelrealigned.bam
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/%(track)s.readgroups.bam
                    -targetIntervals %(tmpdir_gatk)s/%(track)s.indelrealignment.intervals ;
                    checkpoint ;''' % locals()

    statement += '''java -Xmx4g -jar /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
                    -T BaseRecalibrator
                    --out %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/%(track)s.indelrealigned.bam
                    --knownSites %(dbsnp)s %(solid_options)s ; checkpoint ;''' % locals()

    statement += '''java -Xmx4g -jar /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
                    -T PrintReads -o %(outfile)s
                    -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %%(bwa_index_dir)s/%%(genome)s.fa
                    -I %(tmpdir_gatk)s/%(track)s.indelrealigned.bam ; checkpoint ;''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;'''

    P.run()
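# Hedged sketch only: each GATK step above repeats the same
# "java -Xmx4g -jar /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar -T <tool> ..."
# boilerplate.  A hypothetical helper along these lines (the jar path and memory
# setting are taken from the statements above, not from pipeline configuration)
# could build each fragment:
def buildGATKStatement(tool, options,
                       gatk_jar="/ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar"):
    '''return a single GATK command fragment followed by a checkpoint.'''
    return ("java -Xmx4g -jar %(gatk_jar)s -T %(tool)s %(options)s ; "
            "checkpoint ;" % locals())

# example use, mirroring the IndelRealigner call above:
# statement += buildGATKStatement(
#     "IndelRealigner",
#     "-o %(tmpdir_gatk)s/%(track)s.indelrealigned.bam"
#     " -R %%(bwa_index_dir)s/%%(genome)s.fa"
#     " -I %(tmpdir_gatk)s/%(track)s.readgroups.bam"
#     " -targetIntervals %(tmpdir_gatk)s/%(track)s.indelrealignment.intervals" % locals())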
def build(self, infile):
    ''' build statement for running SGA '''
    track = self.getTrack(os.path.basename(infile))

    # decide which algorithm to use based on read length
    # (note: testing the literal string "%(sga_long)s" is always true,
    #  so the ropebwt branch is never taken as written)
    if "%(sga_long)s":
        index_algorithm = "sais"
    else:
        index_algorithm = "ropebwt"

    format = self.getFormat(infile)
    paired = self.checkPairs(infile)

    # directory in which to do the assembly
    tempdir = P.getTempDir(dir=".")

    # check whether the data are paired-end
    if not paired:
        pe_mode = "--pe-mode=0"
        files = os.path.abspath(infile)
    else:
        # DOESN'T DEAL WITH INTERLEAVED FILES YET
        pe_mode = "--pe-mode=1"
        files = " ".join([os.path.abspath(infile),
                          os.path.abspath(paired[1])])

    executable = "%(sga_executable)s"
    outdir = os.path.abspath("sga.dir")

    ###############################################
    # preprocessing step converts missing bases to
    # random bases or removes sequences with
    # missing bases
    ###############################################
    preprocess_options = "%(sga_preprocess_options)s"
    # outputs a merged fastq file
    outf_preprocessed = track + ".fastq"
    preprocess_statement = "cd %(tempdir)s; %(executable)s preprocess %(pe_mode)s %(preprocess_options)s %(files)s \
                            -o %(outf_preprocessed)s 2> %(outdir)s/%(track)s_preprocess.log"

    ###############################################
    # indexing reads with FM index
    ###############################################
    index_options = "%(sga_index_options)s"
    index_statement = "%(executable)s index --algorithm=%(index_algorithm)s \
                       %(outf_preprocessed)s 2> %(outdir)s/%(track)s_index.log"

    ###############################################
    # correct sequencing errors in reads
    ###############################################
    correction_method = "%(sga_correction_method)s"
    # if correction_method == "kmer":
    # ADD WARNING HERE
    correction_options = "%(sga_kmer_correction_options)s"
    # elif correction_method == "hybrid":
    #     correction_options = "%(sga_hybrid_correction_options)s"
    # elif correction_method == "overlap":
    #     correction_options = "%(sga_overlap_correction_options)s"
    # else:
    #     raise ValueError("method %s does not exist: choose one of kmer, hybrid, overlap" % correction_method)

    outf_corrected = track + "_corrected.fa"
    metrics = "--metrics=%(track)s.metrics" % locals()
    correction_prefix = os.path.join(
        tempdir, P.snip(outf_corrected, ".fa"))
    correction_statement = "%(executable)s correct %(metrics)s \
                            --algorithm=%(correction_method)s \
                            %(correction_options)s \
                            %(outf_preprocessed)s \
                            -o %(outf_corrected)s 2> %(outdir)s/%(track)s_corrected.log"

    ###############################################
    # filter low quality reads and low abundance
    # kmers
    ###############################################
    filter_options = "%(sga_filter_options)s"
    outf_filtered = track + "_filtered.fa"
    filter_statement = "sga index %(outf_corrected)s; \
                        %(executable)s filter %(filter_options)s \
                        -o %(outf_filtered)s \
                        %(outf_corrected)s 2> %(outdir)s/%(track)s_filtered.log"

    ###############################################
    # overlap reads
    ###############################################
    # Note "asqg" is the default output from sga
    outf_overlap = track + "_filtered.asqg.gz"
    threads = "%(sga_threads)s"
    overlap_options = "%(sga_overlap_options)s"
    overlap_statement = "%(executable)s overlap %(overlap_options)s \
                         %(outf_filtered)s 2> %(outdir)s/%(track)s_overlap.log"

    ###############################################
    # assemble reads and perform error removal
    ###############################################
    assembly_options = "%(sga_assembly_options)s"
    error_removal_options = "%(sga_error_removal_options)s"
    out_prefix = track
    assembly_statement = "%(executable)s assemble %(assembly_options)s \
                          %(outf_overlap)s \
                          --out-prefix=%(out_prefix)s 2> %(outdir)s/%(track)s_contigs.log"

    ###############################################
    # build statement
    ###############################################
    metrics_file = os.path.basename(metrics.replace("--metrics=", ""))
    contigs_file = os.path.basename(out_prefix + "-contigs.fa")
    move_statement = "mv %(metrics_file)s %(outdir)s/%(metrics_file)s; \
                      cat %(contigs_file)s \
                      | python %%(scriptsdir)s/rename_contigs.py \
                      --log=%(outdir)s/%(track)s.contigs.log \
                      -a sga > %(outdir)s/%(track)s.contigs.fa"

    statement = "; ".join([preprocess_statement,
                           index_statement,
                           correction_statement,
                           filter_statement,
                           overlap_statement,
                           assembly_statement,
                           move_statement,
                           "rm -rf %(tempdir)s"]) % locals()

    return statement
def filterBamfiles(infile, sentinel):
    """
    Pre-process bamfiles prior to peak calling.
    i) sort bamfiles
    ii) remove unmapped reads with bam2bam.py
    iii) remove non-uniquely mapping reads with bam2bam.py (optional)
    iv) remove duplicates with Picard's MarkDuplicates (optional)
    v) remove reads from masked regions with bedtools intersect (optional)
    vi) index
    """

    # create temporary directory for Picard's MarkDuplicates
    picard_tmp = P.getTempDir(PARAMS["scratchdir"])

    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    # ensure bamfile is sorted
    statement = ["samtools sort @IN@ @OUT@", ]

    # remove unmapped reads
    statement.append("python %(scriptsdir)s/bam2bam.py"
                     " --filter=mapped"
                     " --log=%(outfile)s.log"
                     " < @IN@.bam"
                     " > @OUT@")

    # remove non-uniquely mapping reads, if requested
    if PARAMS["filter_remove_non_unique"]:
        statement.append("python %(scriptsdir)s/bam2bam.py"
                         " --filter=unique"
                         " --log=%(outfile)s.log"
                         " < @IN@"
                         " > @OUT@")

    # remove duplicates, if requested
    if PARAMS["filter_remove_duplicates"]:
        statement.append("MarkDuplicates"
                         " INPUT=@IN@"
                         " ASSUME_SORTED=true"
                         " REMOVE_DUPLICATES=true"
                         " QUIET=false"
                         " OUTPUT=@OUT@"
                         " METRICS_FILE=/dev/null"
                         " VALIDATION_STRINGENCY=SILENT"
                         " TMP_DIR=%(picard_tmp)s"
                         " 2> %(outfile)s.log")

    # mask regions, if intervals supplied
    if PARAMS["filter_mask_intervals"]:
        mask = PARAMS["filter_mask_intervals"]
        statement.append("bedtools intersect"
                         " -abam @IN@"
                         " -b %(mask)s"
                         " -wa"
                         " -v"
                         " > @OUT@")

    statement.append("mv @IN@ %(outfile)s")
    statement.append("samtools index %(outfile)s")

    job_options = "-l mem_free=10G"
    statement = P.joinStatements(statement, infile)

    P.run()
    P.touch(sentinel)
    shutil.rmtree(picard_tmp)
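# Hedged illustration only (not the actual P.joinStatements implementation):
# the filtering steps above are chained through @IN@/@OUT@ placeholders, the
# assumption being that each step's output file becomes the next step's input.
# A joiner along these lines shows the idea; note that "@IN@.bam" then resolves
# to the previous step's output plus ".bam", which is how the bam2bam step can
# read the file written by the old-style "samtools sort" prefix above.
def joinStatementsSketch(statements, infile):
    '''chain *statements* by rewriting @IN@/@OUT@ placeholders.'''
    chained = []
    current_in = infile
    for step, stmt in enumerate(statements):
        current_out = "%s.step%i" % (infile, step)
        chained.append(stmt.replace("@IN@", current_in)
                           .replace("@OUT@", current_out))
        current_in = current_out
    return "; ".join(chained)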
def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.
    '''
    to_cluster = True
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if IOTools.isEmpty(dbfile):
        P.touch(outfile)
        return

    if not os.path.exists(controlfile):
        raise P.PipelineError("control file %s for %s does not exist" %
                              (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.getTempDir(".")
    tmpfile = P.getTempFilename(".")

    for motiffile in motiffiles:
        if IOTools.isEmpty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

        of = IOTools.openFile(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s -ev %(mast_evalue)f %(mast_options)s >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run()

    statement = "gzip < %(tmpfile)s > %(outfile)s"
    P.run()

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)
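# Hedged sketch: the gzipped output written above interleaves
# ":: motif = <name> - foreground ::" / ":: motif = <name> - background ::"
# headers with raw mast.txt blocks.  A hypothetical reader like this one could
# split the concatenated file back into per-motif sections for summarisation.
def iterateMASTSections(lines):
    '''yield (motif, label, section_lines) from concatenated MAST output.'''
    motif, label, section = None, None, []
    for line in lines:
        if line.startswith(":: motif ="):
            if motif is not None:
                yield motif, label, section
            header = line[len(":: motif ="):].strip(" :\n")
            motif, label = [x.strip() for x in header.rsplit(" - ", 1)]
            section = []
        else:
            section.append(line)
    if motif is not None:
        yield motif, label, section

# example use:
# for motif, label, section in iterateMASTSections(IOTools.openFile(outfile)):
#     ...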
def build(self, infile):
    ''' build statement for running Ray '''
    track = self.getTrack(infile)
    format = self.getFormat(infile)
    paired = self.checkPairs(infile)
    tempdir = P.getTempDir()

    # check whether the data are paired-end
    if len(paired) > 1:
        pair = paired[0]
        # Ray doesn't like .fastq.1.gz etc
        read1 = infile
        read2 = paired[1]
        read1_new = os.path.join(tempdir, read1.replace(".fastq.1.gz", ".1.fastq"))
        read2_new = os.path.join(tempdir, read2.replace(".fastq.2.gz", ".2.fastq"))
        files = " ".join([read1_new, read2_new])
        # decompress the mates into the temporary directory
        unzip_statement = '''gunzip -c %(read1)s > %(read1_new)s ;
        gunzip -c %(read2)s > %(read2_new)s ;''' % locals()
    else:
        pair = paired
        files = infile
        # nothing to decompress for single-end input
        unzip_statement = ""

    raydir = os.path.join(os.getcwd(), "ray.dir")

    # Ray picks up file types so should just have to
    # say whether it's paired or not
    print files

    # build statement
    common_options = "-k %(kmer)s"
    if pair == "interleaved":
        filetype = "-i"
    elif not pair:
        filetype = "-s"
    elif pair == "separate":
        filetype = "-p"
    else:
        raise IOError, "do not support file of this type: %s" % infile

    statement = '''%(unzip_statement)s
    %%(ray_executable)s %(common_options)s %(filetype)s %(files)s -o %(raydir)s ; checkpoint;
    mkdir %(raydir)s/graph ;
    mkdir %(raydir)s/assembly ;
    mv %(raydir)s/Scaffolds.fa %(raydir)s/%(track)s.scaffolds.fa ;
    mv %(raydir)s/ScaffoldComponents.txt %(raydir)s/%(track)s.scaffold_components.txt ;
    mv %(raydir)s/ScaffoldLengths.txt %(raydir)s/%(track)s.scaffold_lengths.txt ;
    mv %(raydir)s/ScaffoldLinks.txt %(raydir)s/%(track)s.scaffold_links.txt ;
    mv %(raydir)s/Contigs.fa %(raydir)s/%(track)s.contigs.fa ;
    mv %(raydir)s/OutputNumbers.txt %(raydir)s/%(track)s.numbers.txt ;
    mv %(raydir)s/CoverageDistribution.txt %(raydir)s/graph/%(track)s.coverage_distribution.txt ;
    mv %(raydir)s/CoverageDistributionAnalysis.txt %(raydir)s/graph/%(track)s.coverage_distribution_analysis.txt ;
    mv %(raydir)s/degreeDistribution.txt %(raydir)s/graph/%(track)s.degree_distribution.txt ;
    mv %(raydir)s/Kmers.txt %(raydir)s/graph/%(track)s.kmers.txt ;
    mv %(raydir)s/SeedLengthDistribution.txt %(raydir)s/assembly/%(track)s.seed_length_distribution.txt ;
    mv %(raydir)s/LibraryStatistics.txt %(raydir)s/%(track)s.library_statistics.txt ;
    mv %(raydir)s/LibraryData.xml %(raydir)s/%(track)s.library_data.xml ;
    rm -rf %(tempdir)s''' % locals()

    return statement
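# Hedged sketch: the block of "mv" commands above renames Ray's fixed output
# names to track-specific ones.  A hypothetical helper like this one could
# generate the same renames from a mapping (only a subset is shown), keeping
# the target subdirectories ("", "graph", "assembly") explicit.
def buildRayRenameStatement(raydir, track):
    '''return a "; "-joined list of mv commands for Ray output files.'''
    renames = [
        ("Scaffolds.fa", "", "%s.scaffolds.fa" % track),
        ("Contigs.fa", "", "%s.contigs.fa" % track),
        ("CoverageDistribution.txt", "graph", "%s.coverage_distribution.txt" % track),
        ("SeedLengthDistribution.txt", "assembly", "%s.seed_length_distribution.txt" % track),
    ]
    statements = []
    for src, subdir, dest in renames:
        statements.append("mv %s %s" % (os.path.join(raydir, src),
                                        os.path.join(raydir, subdir, dest)))
    return "; ".join(statements)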