def GATKBaseRecal(infile, outfile, genome, intervals, padding, dbsnp,
                  solid_options=""):
    '''Recalibrates base quality scores using GATK'''

    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.get_temp_dir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''GenomeAnalysisTK -T BaseRecalibrator
                    --out %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -L %(intervals)s
                    -ip %(padding)s
                    -I %(infile)s
                    --knownSites %(dbsnp)s %(solid_options)s ;
                ''' % locals()

    statement += '''GenomeAnalysisTK -T PrintReads -o %(outfile)s
                    -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -I %(infile)s ;
                 ''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()
    P.run(statement)

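# The two GATK calls above are the classic two-pass BQSR pattern:
# BaseRecalibrator builds a recalibration table from sites of known
# variation, then PrintReads rewrites the BAM with adjusted base
# qualities. A hypothetical invocation (all file names below are
# illustrative, not part of the pipeline):
#
#   GATKBaseRecal("sample1.bam", "sample1.bqsr.bam",
#                 "genome.fa", "targets.interval_list", 100,
#                 "dbsnp.vcf")
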
def runTomTom(infile, outfile):
    '''compare ab-initio motifs against tomtom.'''

    tmpdir = P.get_temp_dir(".")
    databases = " ".join(P.as_list(P.get_params()["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "tomtom", outfile)

    if iotools.is_empty(infile):
        E.warn("input is empty - no computation performed")
        iotools.touch_file(outfile)
        return

    statement = '''
    tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s
           > %(outfile)s.log
    '''

    P.run(statement)

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)

    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)

def GATKpreprocessing(infile, outfile):
    '''Reorders BAM according to reference fasta and adds read groups
    using SAMtools, realigns around indels and recalibrates base
    quality scores using GATK'''

    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.get_temp_dir()
    job_memory = PARAMS["gatk_memory"]

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                           PARAMS["genome"])

    outfile1 = outfile.replace(".bqsr", ".readgroups.bqsr")
    outfile2 = outfile.replace(".bqsr", ".realign.bqsr")

    exome.GATKReadGroups(infile, outfile1, genome,
                         PARAMS["readgroup_library"],
                         PARAMS["readgroup_platform"],
                         PARAMS["readgroup_platform_unit"])

    exome.GATKIndelRealign(outfile1, outfile2, genome,
                           PARAMS["gatk_threads"])

    # remove intermediate files as soon as they are no longer needed
    iotools.zap_file(outfile1)

    exome.GATKBaseRecal(outfile2, outfile, genome,
                        PARAMS["gatk_dbsnp"],
                        PARAMS["gatk_solid_options"])
    iotools.zap_file(outfile2)

def deduplicate_reads(infile, outfile):
    '''Remove duplicate reads with Picard MarkDuplicates and index the
    resulting BAM.'''

    tmpdir = P.get_temp_dir(dir=PARAMS["shared_tmpdir"])
    job_memory = "16G"

    statement = '''
    MarkDuplicates
        I=%(infile)s
        O=%(outfile)s
        ASSUME_SORTED=True
        VALIDATION_STRINGENCY=LENIENT
        METRICS_FILE=%(outfile)s.stats
        REMOVE_DUPLICATES=True
        TMP_DIR=%(tmpdir)s
    > %(outfile)s.log &&
    samtools index %(outfile)s &&
    rm -r %(tmpdir)s
    '''

    P.run(statement)

def mergeSampleBams(infile, outfile):
    '''merge control and tumor bams'''
    # Note: need to change readgroup headers for merge and subsequent
    # splitting of bam files

    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]

    tmpdir_gatk = P.get_temp_dir(shared=True)

    outfile_tumor = outfile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])
    infile_tumor = infile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    infile_base = os.path.basename(infile)
    infile_tumor_base = infile_base.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    library = PARAMS["readgroup_library"]
    platform = PARAMS["readgroup_platform"]
    platform_unit = PARAMS["readgroup_platform_unit"]

    control_id = "Control.bam"
    tumor_id = control_id.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    statement = '''picard AddOrReplaceReadGroups
                   INPUT=%(infile)s
                   OUTPUT=%(tmpdir_gatk)s/%(infile_base)s
                   RGLB=%(library)s
                   RGPL=%(platform)s
                   RGPU=%(platform_unit)s
                   RGSM=%(track)s
                   ID=%(track)s
                   VALIDATION_STRINGENCY=SILENT ;'''
    statement += '''picard AddOrReplaceReadGroups
                    INPUT=%(infile_tumor)s
                    OUTPUT=%(tmpdir_gatk)s/%(infile_tumor_base)s
                    RGLB=%(library)s
                    RGPL=%(platform)s
                    RGPU=%(platform_unit)s
                    RGSM=%(track_tumor)s
                    ID=%(track_tumor)s
                    VALIDATION_STRINGENCY=SILENT ;'''
    statement += '''samtools merge -rf
                    %(outfile)s
                    %(tmpdir_gatk)s/%(infile_base)s
                    %(tmpdir_gatk)s/%(infile_tumor_base)s;'''
    statement += "samtools index %(outfile)s; "
    statement += "rm -rf %(tmpdir_gatk)s ;"
    P.run(statement)

    iotools.zap_file(infile)
    iotools.zap_file(infile_tumor)

def runMEME(track, outfile, dbhandle):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used. Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker

    This method is deprecated - use runMEMEOnSequences instead.
    '''
    # job_options = "-l mem_free=8000M"

    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "meme", outfile)

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(P.get_params()["genome_dir"],
                     P.get_params()["genome"]))

    tmpdir = P.get_temp_dir(".")
    tmpfasta = os.path.join(tmpdir, "in.fa")

    nseq = writeSequencesForIntervals(
        track, tmpfasta, dbhandle,
        full=False,
        masker=P.as_list(P.get_params()['motifs_masker']),
        halfwidth=int(P.get_params()["meme_halfwidth"]),
        maxsize=int(P.get_params()["meme_max_size"]),
        proportion=P.get_params()["meme_proportion"],
        min_sequences=P.get_params()["meme_min_sequences"])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)
    else:
        statement = '''
        meme %(tmpfasta)s -dna -revcomp
             -mod %(meme_model)s
             -nmotifs %(meme_nmotifs)s
             -oc %(tmpdir)s
             -maxsize %(meme_max_size)s
             %(meme_options)s
        > %(outfile)s.log
        '''

        P.run(statement)

        collectMEMEResults(tmpdir, target_path, outfile)

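# A hypothetical call to runMEME from a pipeline task; the track name,
# output path and database handle below are illustrative assumptions,
# not part of the original pipeline:
#
#   dbhandle = sqlite3.connect(P.get_params()["database_name"])
#   runMEME("run1_peaks", "meme.dir/run1_peaks.meme", dbhandle)
#
# Because the function touches the outfile when no sequences pass the
# peakval/halfwidth filtering, downstream tasks can depend on the
# outfile unconditionally.
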
def __init__(self, save=True, summarize=False,
             threads=1, qual_format='phred64',
             *args, **kwargs):
    self.save = save
    self.summarize = summarize
    self.threads = threads
    if self.save:
        self.outdir = "processed.dir"
    else:
        self.outdir = P.get_temp_dir(shared=True)

    self.processors = []
    self.qual_format = qual_format

def runPicardOnRealigned(infile, outfile):
    to_cluster = USECLUSTER
    job_memory = PARAMS["gatk_memory"]

    tmpdir_gatk = P.get_temp_dir()

    outfile_tumor = outfile.replace(PARAMS["sample_control"],
                                    PARAMS["sample_tumour"])
    infile_tumor = infile.replace(PARAMS["sample_control"],
                                  PARAMS["sample_tumour"])

    track = P.snip(os.path.basename(infile), ".bam")
    track_tumor = track.replace(PARAMS["sample_control"],
                                PARAMS["sample_tumour"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                           PARAMS["genome"])

    mappingqc.buildPicardAlignmentStats(infile, outfile, genome)
    mappingqc.buildPicardAlignmentStats(infile_tumor, outfile_tumor, genome)

def picardMarkDuplicates(infile, outfile):
    '''
    Yield duplication metrics using Picard Tools.
    '''
    out_dir = os.path.dirname(outfile)

    bam_in = os.path.join(os.path.dirname(outfile),
                          "outs/possorted_genome_bam.bam")
    base_bam = 'marked_duplicates.bam'
    base_metrics = os.path.basename(outfile)

    picard_options = PARAMS["picard_markduplicate_options"]
    barcode_tag = PARAMS["picard_barcode_tag"]
    read_one_barcode_tag = PARAMS["picard_read_one_barcode_tag"]
    read_two_barcode_tag = PARAMS["picard_read_two_barcode_tag"]
    validation_stringency = PARAMS["picard_validation_stringency"]

    job_threads = PICARD_THREADS
    job_memory = PICARD_MEMORY

    local_tmpdir = P.get_temp_dir()

    # the grep pipeline keeps only the metrics header line and the
    # first data row (blank lines and '#' comment lines are dropped)
    statement = '''picard_out=`mktemp -d -p %(local_tmpdir)s`;
                   MarkDuplicates
                       I=%(bam_in)s
                       O=${picard_out}/%(base_bam)s
                       M=${picard_out}/%(base_metrics)s
                       BARCODE_TAG=%(barcode_tag)s
                       READ_ONE_BARCODE_TAG=%(read_one_barcode_tag)s
                       READ_TWO_BARCODE_TAG=%(read_two_barcode_tag)s
                       VALIDATION_STRINGENCY=%(validation_stringency)s
                       %(picard_options)s;
                   grep . ${picard_out}/%(base_metrics)s
                   | grep -v "#"
                   | head -n2
                   > %(outfile)s;
                   rm -rv ${picard_out}
                '''

    P.run(statement)

def GATKReadGroups(infile, outfile, genome,
                   library="unknown", platform="Illumina",
                   platform_unit="1", track="unknown"):
    '''Reorders BAM according to reference fasta and adds read groups'''

    if track == 'unknown':
        track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.get_temp_dir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''picard ReorderSam
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    REFERENCE=%(genome)s
                    ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                    VALIDATION_STRINGENCY=SILENT ;''' % locals()

    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ;
                 ''' % locals()

    statement += '''picard AddOrReplaceReadGroups
                    INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    OUTPUT=%(outfile)s
                    RGLB=%(library)s
                    RGPL=%(platform)s
                    RGPU=%(platform_unit)s
                    RGSM=%(track)s
                    VALIDATION_STRINGENCY=SILENT ;''' % locals()

    statement += '''samtools index %(outfile)s ;
                 ''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()

    P.run(statement)

def runFastqScreen(infiles, outfile):
    '''run FastqScreen on input files.'''

    # configure job_threads with fastq_screen_options from P.get_params()
    job_threads = re.findall(r'--threads \d+',
                             P.get_params()['fastq_screen_options'])
    if len(job_threads) != 1:
        raise ValueError("Wrong number of threads for fastq_screen")

    job_threads = int(re.sub(r'--threads ', '', job_threads[0]))

    tempdir = P.get_temp_dir(".")
    conf_fn = os.path.join(tempdir, "fastq_screen.conf")
    with iotools.open_file(conf_fn, "w") as f:
        for i, k in P.get_params().items():
            if i.startswith("fastq_screen_database"):
                # i[22:] strips the "fastq_screen_database_" prefix,
                # leaving the database name
                f.write("DATABASE\t%s\t%s\n" % (i[22:], k))

    m = mapping.FastqScreen(config_filename=conf_fn)
    statement = m.build((infiles,), outfile)
    P.run(statement, job_memory="8G")
    shutil.rmtree(tempdir)
    iotools.touch_file(outfile)

def runMEMEOnSequences(infile, outfile):
    '''run MEME to find motifs.

    In order to increase the signal/noise ratio, MEME is not run on
    all intervals but only the top 10% of intervals (peakval) are
    used. Also, only the segment of 200 bp around the peak is used
    and not the complete interval.

    * Softmasked sequence is converted to hardmasked sequence to
      avoid the detection of spurious motifs.

    * Sequence is run through dustmasker
    '''
    # job_options = "-l mem_free=8000M"

    nseqs = int(FastaIterator.count(infile))
    if nseqs == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        iotools.touch_file(outfile)
        return

    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "meme", outfile)
    tmpdir = P.get_temp_dir(".")

    statement = '''
    meme %(infile)s -dna -revcomp
         -mod %(meme_model)s
         -nmotifs %(meme_nmotifs)s
         -oc %(tmpdir)s
         -maxsize %(motifs_max_size)s
         %(meme_options)s
    > %(outfile)s.log
    '''

    P.run(statement)

    collectMEMEResults(tmpdir, target_path, outfile)

def run(self, infile, outfile, params):

    if not os.path.exists(params.reference_bam):
        raise OSError("reference bam file {} does not exist".format(
            params.reference_bam))

    tmpdir = P.get_temp_dir(clear=True)

    statement = (
        "mkdir {tmpdir}; "
        "samtools sort -n {infile} > {tmpdir}/comp.bam; "
        "samtools sort -n {params.reference_bam} > {tmpdir}/ref.bam; "
        "{params.path} bam-compare-alignments "
        "--output-filename-pattern={outfile}.daisy_bam_compare_alignments_%%s.tsv "
        "{params.options} "
        "--input-bam={tmpdir}/comp.bam "
        "--reference-bam={tmpdir}/ref.bam "
        ">& {outfile}; "
        "rm -rf {tmpdir}; ".format(**locals()))

    retval = P.run(statement)

    return retval

def runMAST(infiles, outfile):
    '''run mast on all intervals and motifs.

    Collect all results for an E-value up to 10000 so that all
    sequences are output and MAST curves can be computed.

    10000 is a heuristic.
    '''
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles

    if iotools.is_empty(dbfile) or len(motiffiles) == 0:
        iotools.touch_file(outfile)
        return

    if not os.path.exists(controlfile):
        raise ValueError("control file %s for %s does not exist" %
                         (controlfile, dbfile))

    # remove previous results
    if os.path.exists(outfile):
        os.remove(outfile)

    tmpdir = P.get_temp_dir(".")
    tmpfile = P.get_temp_filename(".")

    for motiffile in motiffiles:
        if iotools.is_empty(motiffile):
            L.info("skipping empty motif file %s" % motiffile)
            continue

        of = iotools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - foreground ::\n" % motif)
        of.close()

        # mast bails if the number of nucleotides gets larger than
        # 2186800982?
        # To avoid this, run db and control file separately.
        statement = '''
        cat %(dbfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s
               -ev %(mast_evalue)f %(mast_options)s
               >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

        of = iotools.open_file(tmpfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s - background ::\n" % motif)
        of.close()

        statement = '''
        cat %(controlfile)s
        | mast %(motiffile)s - -nohtml -oc %(tmpdir)s
               -ev %(mast_evalue)f %(mast_options)s
               >> %(outfile)s.log 2>&1;
        cat %(tmpdir)s/mast.txt >> %(tmpfile)s 2>&1
        '''
        P.run(statement)

    P.run("gzip < %(tmpfile)s > %(outfile)s")

    shutil.rmtree(tmpdir)
    os.unlink(tmpfile)

def buildStatement(self, *args, **PARAMS):
    """
    Generate run statement for processing single, paired, or paired
    + singleton samples.

    Required arguments:
        index
        reference
    """

    run_options = PARAMS["sortmerna_run_options"]
    threads = PARAMS["sortmerna_threads"]

    # A comma-separated list of references
    references = PARAMS["sortmerna_reference"]
    references = ' --ref '.join(references.split(','))

    # All listed references must be pre-indexed in this location
    index_dir = PARAMS["sortmerna_index"]  # Check this isn't automatically passed.

    tmpf = P.get_temp_dir('.')
    tmpf_kvdb = os.path.join(tmpf, 'kvdb')
    tmpf_readb = os.path.join(tmpf, 'readb')

    if not self.fastn2:
        # Run sortMeRNA for single reads
        in_fastn1 = self.fastn1
        in_prefix = P.snip(in_fastn1, self.fn_suffix, strip_path=True)
        out_prefix = os.path.join(self.outdir, in_prefix)

        statement = (
            "sortmerna"
            " --index 0"  # skip indexing, assume in idx-dir
            " --fastx"
            " --reads %(in_fastn1)s"
            " --ref %(references)s"
            " --idx-dir %(index_dir)s"  # location of reference indexes
            " --aligned %(out_prefix)s_aligned"  # output location of aligned seq
            " --other %(out_prefix)s_unaligned"  # output location of unaligned seq
            " --readb %(tmpf_readb)s"  # location of tmp file for reads
            " --kvdb %(tmpf_kvdb)s"  # location of tmp file for kv pairs
            " --threads %(threads)s"
            " --zip-out" % locals())

    else:
        # Run sortMeRNA for paired reads
        in_fastn1 = self.fastn1
        in_fastn2 = self.fastn2
        in_prefix = P.snip(in_fastn1, self.fn_suffix, strip_path=True)
        out_prefix = os.path.join(self.outdir, in_prefix)

        statement = (
            "sortmerna"
            " --index 0"  # skip indexing, assume in idx-dir
            " --fastx"
            " --reads %(in_fastn1)s"  # first read file
            " --reads %(in_fastn2)s"  # second read file
            " --ref %(references)s"
            " --idx-dir %(index_dir)s"  # location of reference indexes
            " --aligned %(out_prefix)s_aligned"  # output location of aligned seq
            " --other %(out_prefix)s_unaligned"  # output location of unaligned seq
            " --readb %(tmpf_readb)s"  # location of tmp file for reads
            " --kvdb %(tmpf_kvdb)s"  # location of tmp file for kv pairs
            " --paired_in"  # if one read is aligned, both are output to aligned file
            " --out2"  # output paired reads to separate files
            " --threads %(threads)s"
            " --zip-out" % locals())

        if self.fastn3 and not PARAMS.get('sortmerna_skip_singletons', False):
            in_fastn3 = self.fastn3

            statement_2 = (
                "sortmerna"
                " --index 0"  # skip indexing, assume in idx-dir
                " --fastx"
                " --reads %(in_fastn3)s"
                " --idx-dir %(index_dir)s"  # location of reference indexes
                " --ref %(references)s"
                " --aligned %(out_prefix)s_aligned_singleton"  # output location of aligned seq
                " --other %(out_prefix)s_unaligned_singleton"  # output location of unaligned seq
                " --readb %(tmpf_readb)s"  # location of tmp file for reads
                " --kvdb %(tmpf_kvdb)s"  # location of tmp file for kv pairs
                " --threads %(threads)s"
                " --zip-out" % locals())

            statement = " && ".join([
                statement,
                "rm -rf %(tmpf)s/*" % locals(),  # clear tmp readb & kvdb between runs
                statement_2,
                "rm -rf %(tmpf)s" % locals()
            ])

    return statement, run_options

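# How the comma-separated ``sortmerna_reference`` value expands into
# repeated --ref options in the statements above (file names are
# hypothetical):
#
#   references = "silva-bac-16s.fasta,silva-euk-18s.fasta"
#   ' --ref '.join(references.split(','))
#   -> "silva-bac-16s.fasta --ref silva-euk-18s.fasta"
#
# The first entry carries no leading "--ref" because the statement
# itself prepends one via " --ref %(references)s".
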
def redirect2mounts(config,
                    mountpoint=None,
                    debug=None,
                    mount_write=False,
                    substitute_only=False,
                    always_mount=False):
    """redirect filenames in dictionary config to a mount-point.

    Mount points in the config are indicated by the `arv=` prefix. If
    no option in config requires mounting, no mounting will be done
    and the method returns None.

    :param config: dictionary with config values. Will be modified
        in-place.
    :param mountpoint: if given, paths will be substituted by
        mountpoint. If None, a new mountpoint will be created.
    :param debug: if given, mount in debug mode and save log to
        filename.
    :param mount_write: if True, mount in --read-write mode.
    :param substitute_only: if True, only perform substitution, do
        not mount anything even if mountpoint is None.
    :param always_mount: if True, always mount, no matter if arv=
        prefix is present.

    :return: the mountpoint
    """
    arvados_options = ["--disable-event-listening"]
    if debug:
        arvados_options.append(" --debug --logfile={}".format(debug))

    if mount_write:
        arvados_options.append("--read-write")
        arvados_options = " ".join(arvados_options)
        if not mountpoint:
            mountpoint = P.get_temp_dir() + "/"
            E.info("redirect2mounts: mounting arvados at {} with "
                   "--read-write".format(mountpoint))
            E.run("arv-mount {} {}".format(arvados_options, mountpoint))
            E.info("redirect2mounts: arvados mounted at {} with "
                   "--read-write".format(mountpoint))
    else:
        arvados_options.append("--read-only")

    if always_mount:
        mountpoint = P.get_temp_dir() + "/"
        do_mount = True
    else:
        do_mount = False

    for d, key, value in IOTools.nested_iter(config):
        if isinstance(value, str):
            if "arv=" in value:
                if substitute_only and mountpoint is None:
                    continue
                if not mountpoint:
                    mountpoint = P.get_temp_dir() + "/"
                    do_mount = True
                d[key] = re.sub("arv=", mountpoint, value)

    if do_mount:
        raise NotImplementedError("arvados support disabled")
        # if not arvados.have_arvados():
        #     raise ValueError(
        #         "config file requires arvados access, but arvados "
        #         "not available")
        arvados_options = " ".join(arvados_options)
        E.debug("redirect2mounts: mounting arvados at {} with "
                "options {}".format(mountpoint, arvados_options))
        E.run("arv-mount {} {}".format(arvados_options, mountpoint))
        E.debug("redirect2mounts: arvados mounted at {}".format(mountpoint))

    return mountpoint

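# A minimal sketch of the substitution performed by redirect2mounts,
# with an explicit mountpoint so that nothing is actually mounted; the
# config keys and the "abc123" collection id are hypothetical:
#
#   config = {"reference": {"fasta": "arv=abc123/genome.fa"}}
#   redirect2mounts(config, mountpoint="/tmp/mnt/", substitute_only=True)
#   config["reference"]["fasta"]
#   -> "/tmp/mnt/abc123/genome.fa"
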
def run(self, infiles, outfile, params):

    tmpdir = P.get_temp_dir(clear=True)

    statements = ["mkdir {}".format(tmpdir)]

    if params.remove_fields:
        cleanup_statement = ("| {params.path} annotate "
                             "-x {params.remove_fields} "
                             "2> {outfile}_annotate.log ".format(
                                 **locals()))
    else:
        cleanup_statement = ""

    # the current pattern is probably overly specific and
    # substitutes ./. with 0/0
    if params.set_missing_genotype_to_reference:
        set_genotype = r"| perl -p -e 's/\.\/\./0\/0/g'"
    else:
        set_genotype = ""

    with IOTools.open_file(outfile + ".filelist_blocks", "w") as blockf:
        for start in range(0, len(infiles), self.block_size):
            fn = outfile + ".filelist_{}".format(start)
            fn_vcf = os.path.join(tmpdir, "block_{}.vcf.gz".format(start))
            with IOTools.open_file(fn, "w") as outf:
                end = start + self.block_size
                outf.write("\n".join(infiles[start:end]) + "\n")

            statements.append(
                "{params.path} merge "
                "{params.options} "
                "-O v "
                "--file-list {outfile}.filelist_{start} "
                "2> {outfile}_merge_{start}.log "
                "{cleanup_statement} "
                "{set_genotype} "
                "| bgzip "
                "> {fn_vcf}; "
                "tabix -p vcf {fn_vcf}".format(**locals()))

            blockf.write(fn_vcf + "\n")

    if params.restrict_to_all:
        filter_statement = (
            "| {params.path} filter "
            "--include \"FORMAT/GT != '.'\" "
            "-O v "
            "2> {outfile}_filter.log ".format(**locals()))
    else:
        filter_statement = ""

    statements.append(
        "{params.path} merge "
        "{params.options} "
        "-O v "
        "--file-list {outfile}.filelist_blocks "
        "2> {outfile}_merge.log "
        "{filter_statement} "
        "| bgzip "
        "> {outfile}; "
        "tabix -p vcf {outfile} ".format(**locals()))

    statements.append("rm -rf {}".format(tmpdir))

    statement = "; ".join(statements)
    retvals = P.run(statement, **params._asdict())

    return retvals

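# Sketch of the two-level merge implemented above: inputs are merged in
# blocks of self.block_size and the per-block VCFs are then merged once
# more, which avoids handing a very long file list to a single bcftools
# merge invocation. With five inputs and a hypothetical block_size of 2:
#
#   infiles = ["s0.vcf.gz", "s1.vcf.gz", "s2.vcf.gz", "s3.vcf.gz", "s4.vcf.gz"]
#   [infiles[s:s + 2] for s in range(0, len(infiles), 2)]
#   -> [["s0.vcf.gz", "s1.vcf.gz"], ["s2.vcf.gz", "s3.vcf.gz"], ["s4.vcf.gz"]]
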
def removeHost(fastq1, outfile):
    '''Remove host contamination using bmtagger'''
    outf_host = P.snip(outfile, '_dehost.fastq.1.gz') + '_host.txt'
    outf_host_stub = P.snip(outf_host, '.txt') + '_toremove'

    # Currently disabled. Has no effect. See drop_fastq.py
    # # Whether to keep pair if a read is identified as host.
    # if PARAMS['bmtagger_keep_pairs']:
    #     keep_pairs = True
    #     E.info("BMTagger: reads with a pair identified as host will be"
    #            " discarded")
    # else:
    #     keep_pairs = False
    #     E.info("BMTagger: reads with a pair identified as host will be"
    #            " kept as singletons (assuming they are not also identified"
    #            " as host)")

    if IS_PAIRED:
        fastq2 = P.snip(fastq1, '.1.gz') + '.2.gz'
        fastq3 = P.snip(fastq1, '.1.gz') + '.3.gz'

        to_remove_paired = P.get_temp_filename('.')
        to_remove_singletons = P.get_temp_filename('.')

        # In some cases, it may be desirable to screen against multiple
        # hosts.
        indexes = zip(PARAMS['bmtagger_bitmask'].split(','),
                      PARAMS['bmtagger_srprism'].split(','))
        for n, indexes in enumerate(indexes, 1):
            n = str(n)
            bitmask, srprism = indexes
            # Screen the paired reads, then singletons
            tmpdir1 = P.get_temp_dir('.')
            tmpdir2 = P.get_temp_dir('.')

            tmpf1 = P.get_temp_filename('.')
            tmpf2 = P.get_temp_filename('.')
            tmpf3 = P.get_temp_filename('.')

            # bmtagger truncates fasta headers... sed 's/[[:space:]]\+/__/g'
            # It won't accept... sed 's|[[:space:]].*$|/1|'
            # It also fails if fastq1 header differs from fastq2
            statement1 = ("zcat %(fastq1)s > %(tmpf1)s &&"
                          " zcat %(fastq2)s > %(tmpf2)s &&"
                          " bmtagger.sh"
                          " -b %(bitmask)s"
                          " -x %(srprism)s"
                          " -T %(tmpdir1)s"
                          " -q1"  # Input is fastq
                          " -1 %(tmpf1)s"
                          " -2 %(tmpf2)s"
                          " -o %(outf_host_stub)s_paired%(n)s"
                          " &> %(outfile)s.log &&"
                          " cat %(outf_host_stub)s_paired%(n)s"
                          " >> %(to_remove_paired)s &&"
                          " rm -rf %(tmpdir1)s %(tmpf1)s %(tmpf2)s"
                          " %(outf_host_stub)s_paired%(n)s")
            # Screen the singletons
            if IOTools.open_file(fastq3).read(1):
                statement2 = ("zcat %(fastq3)s > %(tmpf3)s &&"
                              " bmtagger.sh"
                              " -b %(bitmask)s"
                              " -x %(srprism)s"
                              " -T %(tmpdir2)s"
                              " -q1"  # Input is fastq
                              " -1 %(tmpf3)s"
                              " -o %(outf_host_stub)s_singletons%(n)s"
                              " &>> %(outfile)s.log &&"
                              " cat %(outf_host_stub)s_singletons%(n)s"
                              " >> %(to_remove_singletons)s &&"
                              " rm -rf %(tmpdir2)s %(tmpf3)s"
                              " %(outf_host_stub)s_singletons%(n)s")
            else:
                statement2 = ("touch %(to_remove_singletons)s &&"
                              " rm -rf %(tmpdir2)s %(tmpf3)s")

            statement = " && ".join([statement1, statement2])

            P.run(statement, job_options=PARAMS['bmtagger_run_options'])

        # Drop host contaminated reads
        # A hack due to the fact that BMTagger truncates fastq identifiers
        # TO DO: Look at bmtagger/.../bin/extract_fullseq
        drop_script = os.path.join(
            os.path.splitext(__file__)[0], 'drop_fastqs.py')

        fastq1_out = outfile
        fastq2_out = P.snip(outfile, '.1.gz') + '.2.gz'
        fastq3_out = P.snip(outfile, '.1.gz') + '.3.gz'

        fastq1_host = P.snip(outfile, '_dehost.fastq.1.gz') + '_host.fastq.1.gz'
        fastq2_host = P.snip(outfile, '_dehost.fastq.1.gz') + '_host.fastq.2.gz'
        fastq3_host = P.snip(outfile, '_dehost.fastq.1.gz') + '_host.fastq.3.gz'

        statement = ("python %(drop_script)s"
                     " --fastq1 %(fastq1)s"
                     " --fastq2 %(fastq2)s"
                     " --fastq3 %(fastq3)s"
                     " --to-drop-paired %(to_remove_paired)s"
                     " --to-drop-single %(to_remove_singletons)s"
                     " --fastq-out1 %(fastq1_out)s"
                     " --fastq-out2 %(fastq2_out)s"
                     " --fastq-out3 %(fastq3_out)s"
                     " --fastq-drop1 %(fastq1_host)s"
                     " --fastq-drop2 %(fastq2_host)s"
                     " --fastq-drop3 %(fastq3_host)s"
                     " &>> %(outfile)s.log")

        P.run(statement)

        os.unlink(to_remove_paired)
        os.unlink(to_remove_singletons)

    else:
        indexes = zip(PARAMS['bmtagger_bitmask'].split(','),
                      PARAMS['bmtagger_srprism'].split(','))
        to_remove = P.get_temp_filename('.')

        for n, indexes in enumerate(indexes, 1):
            n = str(n)
            bitmask, srprism = indexes
            # Screen the singletons
            tmpdir1 = P.get_temp_dir('.')
            tmpf = P.get_temp_filename('.')

            statement = ("zcat %(fastq1)s > %(tmpf)s &&"
                         " bmtagger.sh"
                         " -b %(bitmask)s"
                         " -x %(srprism)s"
                         " -T %(tmpdir1)s"
                         " -q1"  # Input is fastq
                         " -1 %(tmpf)s"
                         " -o %(outf_host_stub)s_%(n)s"
                         " &>> %(outfile)s.log &&"
                         " cat %(outf_host_stub)s_%(n)s >> %(to_remove)s &&"
                         " rm -rf %(tmpdir1)s %(tmpf)s %(outf_host_stub)s_%(n)s")

            P.run(statement, job_options=PARAMS['bmtagger_run_options'])

        # Drop host contaminated reads
        drop_script = os.path.join(
            os.path.splitext(__file__)[0], 'drop_single_fastqs.py')

        fastq_host = P.snip(outfile, '_dehost.fastq.1.gz') + '_host.fastq.1.gz'

        statement = ("python %(drop_script)s"
                     " --fastq1 %(fastq1)s"
                     " --to-drop-single %(to_remove)s"
                     " --fastq-out1 %(outfile)s"
                     " --fastq-drop1 %(fastq_host)s"
                     " &>> %(outfile)s.log")

        P.run(statement)

        os.unlink(to_remove)

def setUp(self):
    # ignore command line arguments for pytest
    P.initialize(argv=["test"])
    self.work_dir = P.get_temp_dir(shared=True)

def setUp(self):
    self.work_dir = P.get_temp_dir(shared=True)